Institute for Parallel and Distributed Systems (IPVS)

Publications

An overview of publications of the Institute for Parallel and Distributed Systems.

Publications AS: Bibliography 2024 BibTeX

 
@inproceedings {INPROC-2024-11,
   author = {Michael Behringer and Dennis Treder-Tschechlov and Jannis Rapp},
   title = {{Empowering Domain Experts to Enhance Clustering Results Through Interactive Refinement}},
   booktitle = {Database Systems for Advanced Applications (DASFAA 2024)},
   editor = {Onizuka, M. and others},
   address = {Singapore},
   publisher = {Springer},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   series = {Lecture Notes in Computer Science},
   volume = {14856},
   pages = {518--522},
   type = {Conference Paper},
   month = {September},
   year = {2024},
   doi = {10.1007/978-981-97-5575-2_51},
   language = {English},
   cr-category = {I.5.3 Pattern Recognition Clustering},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Data mining is crucial to gain knowledge from large amounts of data. One popular data mining technique is clustering aiming to group similar data together. This technique relies on domain knowledge to interpret the results. However, the initial results are often insufficient and must be refined - taking tremendous time and resources with unclear benefits. In this demo paper, we introduce our novel user-centric approach that supports domain expert in interactively refining clustering results to their needs by merging and splitting clusters, specifying constraints, or by applying active learning - combined in one single tool.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-11&engl=1},
   internal-note = {booktitle, editor, series, volume and address were split out of the original pasted citation string; language changed from German because title and abstract are English -- confirm against the source database}
}
@article {INPROC-2024-10,
   author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
   title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
   journal = {Proceedings of the VLDB Endowment},
   publisher = {VLDB Endowment},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   volume = {17},
   number = {11},
   pages = {2880--2892},
   type = {Conference Paper},
   month = {August},
   year = {2024},
   doi = {10.14778/3681954.3681970},
   language = {English},
   cr-category = {I.5.3 Pattern Recognition Clustering},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice because they scale well for large datasets. However, they are only able to detect simple data characteristics. Ensemble clustering can overcome this limitation by combining multiple results of efficient algorithms. However, analysts face several challenges when applying ensemble clustering, i. e., analysts struggle to (a) efficiently generate an ensemble and (b) combine the ensemble using a suitable consensus function with a corresponding hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble clustering approach to address these challenges. Our approach relies on meta-learning to learn about dataset characteristics and the correlation between generated base clusterings and the performance of consensus functions. We apply the learned knowledge to generate appropriate ensembles and select a suitable consensus function to combine their results. Further, we use a state-of-the-art optimization technique to tune the hyperparameters of the selected consensus function. Our comprehensive evaluation on synthetic and real-world datasets demonstrates that EffEns significantly outperforms state-of-the-art approaches w.r.t. accuracy and runtime.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-10&engl=1},
   internal-note = {PVLDB is a journal with volume and issue, hence the article entry type; the journal name was previously stored in the editor field. The original citation string dated issue 17(11) July 2024 while the month field records August -- confirm which date is intended. INPROC-2024-09, INPROC-2024-08 and INPROC-2024-07 are duplicate records of this work.}
}
@article {INPROC-2024-09,
   author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
   title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
   journal = {Proceedings of the VLDB Endowment},
   publisher = {VLDB Endowment},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   volume = {17},
   number = {11},
   pages = {2880--2892},
   type = {Conference Paper},
   month = {August},
   year = {2024},
   doi = {10.14778/3681954.3681970},
   language = {English},
   cr-category = {I.5.3 Pattern Recognition Clustering},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice because they scale well for large datasets. However, they are only able to detect simple data characteristics. Ensemble clustering can overcome this limitation by combining multiple results of efficient algorithms. However, analysts face several challenges when applying ensemble clustering, i. e., analysts struggle to (a) efficiently generate an ensemble and (b) combine the ensemble using a suitable consensus function with a corresponding hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble clustering approach to address these challenges. Our approach relies on meta-learning to learn about dataset characteristics and the correlation between generated base clusterings and the performance of consensus functions. We apply the learned knowledge to generate appropriate ensembles and select a suitable consensus function to combine their results. Further, we use a state-of-the-art optimization technique to tune the hyperparameters of the selected consensus function. Our comprehensive evaluation on synthetic and real-world datasets demonstrates that EffEns significantly outperforms state-of-the-art approaches w.r.t. accuracy and runtime.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-09&engl=1},
   internal-note = {Duplicate record of INPROC-2024-10 (same authors, title, pages and DOI); key kept so existing citations still resolve. PVLDB is a journal with volume and issue, hence the article entry type; the journal name was previously stored in the editor field. The original citation string dated issue 17(11) July 2024 while the month field records August -- confirm which date is intended.}
}
@article {INPROC-2024-08,
   author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
   title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
   journal = {Proceedings of the VLDB Endowment},
   publisher = {VLDB Endowment},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   volume = {17},
   number = {11},
   pages = {2880--2892},
   type = {Conference Paper},
   month = {August},
   year = {2024},
   doi = {10.14778/3681954.3681970},
   language = {English},
   cr-category = {I.5.3 Pattern Recognition Clustering},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice because they scale well for large datasets. However, they are only able to detect simple data characteristics. Ensemble clustering can overcome this limitation by combining multiple results of efficient algorithms. However, analysts face several challenges when applying ensemble clustering, i. e., analysts struggle to (a) efficiently generate an ensemble and (b) combine the ensemble using a suitable consensus function with a corresponding hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble clustering approach to address these challenges. Our approach relies on meta-learning to learn about dataset characteristics and the correlation between generated base clusterings and the performance of consensus functions. We apply the learned knowledge to generate appropriate ensembles and select a suitable consensus function to combine their results. Further, we use a state-of-the-art optimization technique to tune the hyperparameters of the selected consensus function. Our comprehensive evaluation on synthetic and real-world datasets demonstrates that EffEns significantly outperforms state-of-the-art approaches w.r.t. accuracy and runtime.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-08&engl=1},
   internal-note = {Duplicate record of INPROC-2024-10 (same authors, title, pages and DOI); key kept so existing citations still resolve. PVLDB is a journal with volume and issue, hence the article entry type; the journal name was previously stored in the editor field. The original citation string dated issue 17(11) July 2024 while the month field records August -- confirm which date is intended.}
}
@article {INPROC-2024-07,
   author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
   title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
   journal = {Proceedings of the VLDB Endowment},
   publisher = {VLDB Endowment},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   volume = {17},
   number = {11},
   pages = {2880--2892},
   type = {Conference Paper},
   month = {August},
   year = {2024},
   doi = {10.14778/3681954.3681970},
   language = {English},
   cr-category = {I.5.3 Pattern Recognition Clustering},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice because they scale well for large datasets. However, they are only able to detect simple data characteristics. Ensemble clustering can overcome this limitation by combining multiple results of efficient algorithms. However, analysts face several challenges when applying ensemble clustering, i. e., analysts struggle to (a) efficiently generate an ensemble and (b) combine the ensemble using a suitable consensus function with a corresponding hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble clustering approach to address these challenges. Our approach relies on meta-learning to learn about dataset characteristics and the correlation between generated base clusterings and the performance of consensus functions. We apply the learned knowledge to generate appropriate ensembles and select a suitable consensus function to combine their results. Further, we use a state-of-the-art optimization technique to tune the hyperparameters of the selected consensus function. Our comprehensive evaluation on synthetic and real-world datasets demonstrates that EffEns significantly outperforms state-of-the-art approaches w.r.t. accuracy and runtime.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-07&engl=1},
   internal-note = {Duplicate record of INPROC-2024-10 (same authors, title and pages); key kept so existing citations still resolve. The doi was copied from the duplicate records. PVLDB is a journal with volume and issue, hence the article entry type; the journal name was previously stored in the editor field. The original citation string dated issue 17(11) July 2024 while the month field records August -- confirm which date is intended.}
}
@inproceedings {INPROC-2024-06,
   author = {Christoph Stach and Yunxuan Li and Laura Schuiki and Bernhard Mitschang},
   title = {{LALO---A Virtual Data Lake Zone for Composing Tailor-Made Data Products on Demand}},
   booktitle = {Proceedings of the 35th International Conference on Database and Expert Systems Applications (DEXA 2024)},
   editor = {Christine Strauss and Toshiyuki Amagasa and Giuseppe Manco and Gabriele Kotsis and A Min Tjoa and Ismail Khalil},
   address = {Cham},
   publisher = {Springer},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   series = {Lecture Notes in Computer Science},
   volume = {14911},
   pages = {288--305},
   type = {Conference Paper},
   month = {August},
   year = {2024},
   isbn = {978-3-031-68311-4},
   issn = {0302-9743},
   doi = {10.1007/978-3-031-68312-1_22},
   keywords = {Data Product; Virtual Data Lake Zone; Data Stream Adaptation},
   language = {English},
   cr-category = {H.2.7 Database Administration,     E.2 Data Storage Representations,     H.3.3 Information Search and Retrieval,     H.2.8 Database Applications},
   contact = {Senden Sie eine E-Mail an \<christoph.stach@ipvs.uni-stuttgart.de\>.},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {The emerging paradigm of data products, which has become increasingly popular recently due to the rise of data meshes and data marketplaces, also poses unprecedented challenges for data management. Current data architectures, namely data warehouses and data lakes, are not able to meet these challenges adequately. In particular, these architectures are not designed for a just-in-time provision of highly customized data products tailored perfectly to the needs of customers. In this paper, we therefore present a virtual data lake zone for composing tailor-made data products on demand, called LALO. LALO uses data streaming technologies to enable just-in-time composing of data products without allocating storage space in the data architecture permanently. In order to enable customers to tailor data products to their needs, LALO uses a novel mechanism that enables live adaptation of data streams. Evaluation results show that the overhead for such an adaptation is negligible. Therefore, LALO represents an efficient solution for the appropriate handling of data products, both in terms of storage space and runtime.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-06&engl=1}
}
@inproceedings{INPROC-2024-05,
  author      = {Schneider, Jan and Gr{\"o}ger, Christoph and Lutsch, Arnold},
  title       = {{The Data Platform Evolution: From Data Warehouses over Data Lakes to Lakehouses}},
  booktitle   = {Proceedings of the 34th GI-Workshop on Foundations of Databases (Grundlagen von Datenbanken), Hirsau, Germany},
  editor      = {Schwarz, Holger},
  publisher   = {CEUR Workshop Proceedings},
  institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
  series      = {CEUR Workshop Proceedings},
  volume      = {3714},
  pages       = {67--71},
  type        = {Workshop Paper},
  month       = {July},
  year        = {2024},
  issn        = {1613-0073},
  keywords    = {Lakehouse; Data Warehouse; Data Lake; Data Management; Data Analytics},
  language    = {English},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software,     H.4.2 Information Systems Applications Types of Systems},
  ee          = {https://ceur-ws.org/Vol-3714/invited2.pdf},
  department  = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
  abstract    = {The continuously increasing availability of data and the growing maturity of data-driven analysis techniques have encouraged enterprises to collect and analyze huge amounts of business-relevant data in order to exploit it for competitive advantages. To facilitate these processes, various platforms for analytical data management have been developed: While data warehouses have traditionally been used by business analysts for reporting and OLAP, data lakes emerged as an alternative concept that also supports advanced analytics. As these two common types of data platforms show rather contrary characteristics and target different user groups and analytical approaches, enterprises usually need to employ both of them, resulting in complex, error-prone and cost-expensive architectures. To address these issues, efforts have recently become apparent to combine features of data warehouses and data lakes into so-called lakehouses, which pursue to serve all kinds of analytics from a single data platform. This paper provides an overview on the evolution of analytical data platforms from data warehouses over data lakes to lakehouses and elaborates on the vision and characteristics of the latter. Furthermore, it addresses the question of what aspects common data lakes are currently missing that prevent them from transitioning to lakehouses.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-05&engl=1}
}
@inproceedings {INPROC-2024-04,
   author = {Jan Schneider and Arnold Lutsch and Christoph Gr{\"o}ger and Holger Schwarz and Bernhard Mitschang},
   title = {{First Experiences on the Application of Lakehouses in Industrial Practice}},
   booktitle = {Proceedings of the 35th GI-Workshop on Foundations of Databases (Grundlagen von Datenbanken), Herdecke, Germany},
   editor = {Uta St{\"o}rl},
   publisher = {CEUR Workshop Proceedings},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   series = {CEUR Workshop Proceedings},
   volume = {3710},
   pages = {3--8},
   type = {Workshop Paper},
   month = {June},
   year = {2024},
   issn = {1613-0073},
   keywords = {Data Lakehouse; Data Platform; Platform Architecture; Data Analytics; Case Study; Industry Experience},
   language = {English},
   cr-category = {H.3.4 Information Storage and Retrieval Systems and Software,     H.4.2 Information Systems Applications Types of Systems},
   ee = {https://ceur-ws.org/Vol-3710/paper1.pdf},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {In recent years, so-called lakehouses have emerged as a new type of data platform that intends to combine characteristics of data warehouses and data lakes. Although companies started to employ the associated concepts and technologies as part of their analytics architectures, little is known about their practical medium- and long-term experiences as well as proven architectural decisions. Additionally, there is only limited knowledge about how lakehouses can be utilized effectively in an industrial context. Hence, it remains unclear under which circumstances lakehouses represent a viable alternative to conventional data platforms. To address this gap, we conducted a case study on a real-world industrial case, in which manufacturing data needs to be managed and analytically exploited. Within the scope of this case, a dedicated analytics department has been testing and leveraging a lakehouse approach for several months in a productive environment with high data volumes and various types of analytical workloads. The paper at hand presents the results of our within-case analyses and focuses on the industrial setting of the case as well as the architecture of the utilized lakehouse. This way, it provides preliminary insights on the application of lakehouses in industrial practice and refers to useful architectural decisions.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-04&engl=1}
}
@inproceedings {INPROC-2024-03,
   author = {Andrea Fieschi and Pascal Hirmer and Sachin Agrawal and Christoph Stach and Bernhard Mitschang},
   title = {{HySAAD - A Hybrid Selection Approach for Anonymization by Design in the Automotive Domain}},
   booktitle = {Proceedings of the 25th IEEE International Conference on Mobile Data Management (MDM 2024)},
   editor = {Chiara Renso and Mahmoud Sakr and Walid G Aref and Ashley Song and Cheng Long},
   address = {Los Alamitos, Washington, Tokyo},
   publisher = {IEEE Computer Society Conference Publishing Services},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {203--210},
   type = {Conference Paper},
   month = {June},
   year = {2024},
   isbn = {979-8-3503-7455-1},
   issn = {2375-0324},
   doi = {10.1109/MDM61037.2024.00044},
   keywords = {anonymization; connected vehicles; privacy protection; metrics},
   language = {English},
   cr-category = {K.4.1 Computers and Society Public Policy Issues},
   contact = {Senden Sie eine E-Mail an \<andrea.fieschi@ipvs.uni-stuttgart.de\>.},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {The increasing connectivity and data exchange between vehicles and the cloud have led to growing privacy concerns. To keep on gaining product insights through data collection while guaranteeing privacy protection, an anonymization-by-design approach should be used. A rising number of anonymization methods, not limited to the automotive domain, can be found in the literature and practice. The developers need support to select the suitable anonymization technique. To this end, we make the following two contributions: 1) We apply our knowledge from the automotive domain to outline the usage of qualitative metrics for anonymization techniques assessment; 2) We introduce HySAAD, a hybrid selection approach for anonymization by design that leverages this groundwork by recommending appropriate anonymization techniques for each mobile data analytics use case based on both, qualitative (i.e., ``soft'') metrics and quantitative (i.e., ``hard'') metrics. Using a real-world use case from the automotive, we demonstrate the applicability and effectiveness of HySAAD.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-03&engl=1}
}
@inproceedings{INPROC-2024-02,
  author      = {Li, Yunxuan and Stach, Christoph and Mitschang, Bernhard},
  title       = {{PaDS: An adaptive and privacy-enabling Data Pipeline for Smart Cars}},
  booktitle   = {Proceedings of the 25th IEEE International Conference on Mobile Data Management (MDM 2024)},
  editor      = {Renso, Chiara and Sakr, Mahmoud and Aref, Walid G and Kim, Kyoung-Sook and Papagelis, Manos and Sacharidis, Dimitris},
  address     = {Los Alamitos, Washington, Tokyo},
  publisher   = {IEEE Computer Society Conference Publishing Services},
  institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
  pages       = {41--50},
  type        = {Conference Paper},
  month       = {June},
  year        = {2024},
  isbn        = {979-8-3503-7455-1},
  issn        = {2375-0324},
  doi         = {10.1109/MDM61037.2024.00026},
  keywords    = {smart car; privacy-enabling data pipeline; datastream runtime adaptation; mobile data privacy management},
  language    = {English},
  cr-category = {K.4.1 Computers and Society Public Policy Issues},
  contact     = {Senden Sie eine E-Mail an \<yunxuan.li@ipvs.uni-stuttgart.de\>.},
  department  = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
  abstract    = {The extensive use of onboard sensors in smart cars enables the collection, processing, and dissemination of large amounts of mobile data containing information about the vehicle, its driver, and even bystanders. Despite the undoubted benefits of such smart cars, this leads to significant privacy concerns. Due to their inherent mobility, the situation of smart cars changes frequently, and with it, the appropriate measures to counteract the exposure of private data. However, data management in such vehicles lacks sufficient support for this privacy dynamism. We therefore introduce PaDS, a framework for Privacy adaptive Data Stream. The focus of this paper is to enable adaptive data processing within the vehicle data stream. With PaDS, Privacy-Enhancing Technologies can be deployed dynamically in the data pipeline of a smart car according to the current situation without user intervention. With a comparison of state-of-the-art approaches, we demonstrate that our solution is very efficient as it does not require a complete restart of the data pipeline. Moreover, compared to a static approach, PaDS causes only minimal overhead despite its dynamic adaptation of the data pipeline to react to changing privacy requirements. This renders PaDS an effective privacy solution for smart cars.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-02&engl=1}
}
@inproceedings {INPROC-2024-01,
   author = {Dennis Przytarski and Christoph Stach and Bernhard Mitschang},
   title = {{Assessing Data Layouts to Bring Storage Engine Functionality to Blockchain Technology}},
   booktitle = {Proceedings of the 57th Hawaii International Conference on System Sciences (HICSS '24)},
   editor = {Tung X. Bui},
   publisher = {ScholarSpace},
   institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
   pages = {5091--5100},
   type = {Conference Paper},
   month = {January},
   year = {2024},
   isbn = {978-0-9981331-7-1},
   keywords = {blockchain; storage engine; queries},
   language = {English},
   cr-category = {H.3.1 Content Analysis and Indexing,     H.3.2 Information Storage,     H.3.3 Information Search and Retrieval},
   ee = {https://hdl.handle.net/10125/106995},
   contact = {Senden Sie eine E-Mail an \<Christoph.Stach@ipvs.uni-stuttgart.de\>.},
   department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
   abstract = {Nowadays, modern applications often use blockchains as a secure data store. However, querying blockchain data is more challenging than querying conventional databases due to blockchains being primarily designed for the logging of asset transfers, such as cryptocurrencies, rather than storing and reading generic data. To improve the experience of querying blockchain data and make it comparable to querying conventional databases, new design approaches of the storage engine for blockchain technology are required. An important aspect is the data layout of a block, as it plays a crucial role in facilitating reading of blockchain data. In this paper, we identify a suitable data layout that provides the required query capabilities while preserving the key properties of blockchain technology. Our goal is to overcome the limitations of current data access models in blockchains, such as the reliance on auxiliary data storages and error-prone smart contracts. To this end, we compare four promising data layouts with data models derived from document, row, column, and triple stores in terms of schema flexibility, read pattern generality, and relational algebra suitability. We then assess the most suitable data layout for blockchain technology.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-01&engl=1}
}
@article{ART-2024-01,
  author      = {Schneider, Jan and Gr{\"o}ger, Christoph and Lutsch, Arnold and Schwarz, Holger and Mitschang, Bernhard},
  title       = {{The Lakehouse: State of the Art on Concepts and Technologies}},
  journal     = {SN Computer Science},
  publisher   = {Springer Nature},
  volume      = {5},
  number      = {5},
  pages       = {1--39},
  type        = {Article in Journal},
  month       = {April},
  year        = {2024},
  issn        = {2661-8907},
  doi         = {10.1007/s42979-024-02737-0},
  keywords    = {Data Lakehouse; Data Lake; Data Platform; Data Analytics},
  language    = {English},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software},
  ee          = {https://doi.org/10.1007/s42979-024-02737-0,     https://link.springer.com/content/pdf/10.1007/s42979-024-02737-0.pdf},
  department  = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
  abstract    = {In the context of data analytics, so-called lakehouses refer to novel variants of data platforms that attempt to combine characteristics of data warehouses and data lakes. In this way, lakehouses promise to simplify enterprise analytics architectures, which often suffer from high operational costs, slow analytical processes and further shortcomings resulting from data replication. However, different views and notions on the lakehouse paradigm exist, which are commonly driven by individual technologies and varying analytical use cases. Therefore, it remains unclear what challenges lakehouses address, how they can be characterized and which technologies can be leveraged to implement them. This paper addresses these issues by providing an extensive overview of concepts and technologies that are related to the lakehouse paradigm and by outlining lakehouses as a distinct architectural approach for data platforms. Concepts and technologies from literature with regard to lakehouses are discussed, based on which a conceptual foundation for lakehouses is established. In addition, several popular technologies are evaluated regarding their suitability for the building of lakehouses. All findings are supported and demonstrated with the help of a representative analytics scenario. Typical challenges of conventional data platforms are identified, a new, sharper definition for lakehouses is proposed and technical requirements for lakehouses are derived. As part of an evaluation, these requirements are applied to several popular technologies, of which frameworks for data lakes turn out to be particularly helpful for the construction of lakehouses. Our work provides an overview of the state of the art and a conceptual foundation for the lakehouse paradigm, which can support future research.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2024-01&engl=1}
}
 
To the top of the page