Position within the page tree

Home
Institute
Publications

Institute for Parallel and Distributed Systems (IPVS)

Publications

An overview of publications of the Institute for Parallel and Distributed Systems.

Publications AS: Bibliography 2024 BibTeX

@inproceedings {INPROC-2024-11,
    author = {Michael Behringer and Dennis Treder-Tschechlov and Jannis Rapp},
    title = {{Empowering Domain Experts to Enhance Clustering Results Through Interactive Refinement}},
    booktitle = {Database Systems for Advanced Applications (DASFAA 2024)},
    editor = {Onizuka, M. and others},
    address = {Singapore},
    publisher = {Springer},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    series = {Lecture Notes in Computer Science},
    volume = {14856},
    pages = {518--522},
    type = {Conference Paper},
    month = {September},
    year = {2024},
    doi = {10.1007/978-981-97-5575-2_51},
    language = {English},
    cr-category = {I.5.3 Pattern Recognition Clustering},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {Data mining is crucial to gain knowledge from large amounts of data. One
      popular data mining technique is clustering aiming to group similar data
      together. This technique relies on domain knowledge to interpret the results.
      However, the initial results are often insufficient and must be refined -
      taking tremendous time and resources with unclear benefits. In this demo paper,
      we introduce our novel user-centric approach that supports domain expert in
      interactively refining clustering results to their needs by merging and
      splitting clusters, specifying constraints, or by applying active learning -
      combined in one single tool.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-11&engl=1}
 }

@inproceedings {INPROC-2024-10,
    author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
    title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
    booktitle = {Proceedings of the VLDB Endowment},
    publisher = {ACM},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    volume = {17},
    number = {11},
    pages = {2880--2892},
    type = {Conference Paper},
    month = {August},
    year = {2024},
    doi = {10.14778/3681954.3681970},
    language = {English},
    cr-category = {I.5.3 Pattern Recognition Clustering},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice
      because they scale well for large datasets. However, they are only able to
      detect simple data characteristics. Ensemble clustering can overcome this
      limitation by combining multiple results of efficient algorithms. However,
      analysts face several challenges when applying ensemble clustering, i. e.,
      analysts struggle to (a) efficiently generate an ensemble and (b) combine the
      ensemble using a suitable consensus function with a corresponding
      hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
      clustering approach to address these challenges. Our approach relies on
      meta-learning to learn about dataset characteristics and the correlation
      between generated base clusterings and the performance of consensus functions.
      We apply the learned knowledge to generate appropriate ensembles and select a
      suitable consensus function to combine their results. Further, we use a
      state-of-the-art optimization technique to tune the hyperparameters of the
      selected consensus function. Our comprehensive evaluation on synthetic and
      real-world datasets demonstrates that EffEns significantly outperforms
      state-of-the-art approaches w.r.t. accuracy and runtime.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-10&engl=1}
 }

@inproceedings {INPROC-2024-09,
    author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
    title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
    booktitle = {Proceedings of the VLDB Endowment},
    publisher = {ACM},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    volume = {17},
    number = {11},
    pages = {2880--2892},
    type = {Conference Paper},
    month = {August},
    year = {2024},
    doi = {10.14778/3681954.3681970},
    language = {English},
    cr-category = {I.5.3 Pattern Recognition Clustering},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice
      because they scale well for large datasets. However, they are only able to
      detect simple data characteristics. Ensemble clustering can overcome this
      limitation by combining multiple results of efficient algorithms. However,
      analysts face several challenges when applying ensemble clustering, i. e.,
      analysts struggle to (a) efficiently generate an ensemble and (b) combine the
      ensemble using a suitable consensus function with a corresponding
      hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
      clustering approach to address these challenges. Our approach relies on
      meta-learning to learn about dataset characteristics and the correlation
      between generated base clusterings and the performance of consensus functions.
      We apply the learned knowledge to generate appropriate ensembles and select a
      suitable consensus function to combine their results. Further, we use a
      state-of-the-art optimization technique to tune the hyperparameters of the
      selected consensus function. Our comprehensive evaluation on synthetic and
      real-world datasets demonstrates that EffEns significantly outperforms
      state-of-the-art approaches w.r.t. accuracy and runtime.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-09&engl=1}
 }

@inproceedings {INPROC-2024-08,
    author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
    title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
    booktitle = {Proceedings of the VLDB Endowment},
    publisher = {ACM},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    volume = {17},
    number = {11},
    pages = {2880--2892},
    type = {Conference Paper},
    month = {August},
    year = {2024},
    doi = {10.14778/3681954.3681970},
    language = {English},
    cr-category = {I.5.3 Pattern Recognition Clustering},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice
      because they scale well for large datasets. However, they are only able to
      detect simple data characteristics. Ensemble clustering can overcome this
      limitation by combining multiple results of efficient algorithms. However,
      analysts face several challenges when applying ensemble clustering, i. e.,
      analysts struggle to (a) efficiently generate an ensemble and (b) combine the
      ensemble using a suitable consensus function with a corresponding
      hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
      clustering approach to address these challenges. Our approach relies on
      meta-learning to learn about dataset characteristics and the correlation
      between generated base clusterings and the performance of consensus functions.
      We apply the learned knowledge to generate appropriate ensembles and select a
      suitable consensus function to combine their results. Further, we use a
      state-of-the-art optimization technique to tune the hyperparameters of the
      selected consensus function. Our comprehensive evaluation on synthetic and
      real-world datasets demonstrates that EffEns significantly outperforms
      state-of-the-art approaches w.r.t. accuracy and runtime.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-08&engl=1}
 }

@inproceedings {INPROC-2024-07,
    author = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
    title = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
    booktitle = {Proceedings of the VLDB Endowment},
    publisher = {VLDB Endowment},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    volume = {17},
    number = {11},
    pages = {2880--2892},
    type = {Conference Paper},
    month = {August},
    year = {2024},
    doi = {10.14778/3681954.3681970},
    language = {English},
    cr-category = {I.5.3 Pattern Recognition Clustering},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {Efficient clustering algorithms, such as k-Means, are often used in practice
      because they scale well for large datasets. However, they are only able to
      detect simple data characteristics. Ensemble clustering can overcome this
      limitation by combining multiple results of efficient algorithms. However,
      analysts face several challenges when applying ensemble clustering, i. e.,
      analysts struggle to (a) efficiently generate an ensemble and (b) combine the
      ensemble using a suitable consensus function with a corresponding
      hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
      clustering approach to address these challenges. Our approach relies on
      meta-learning to learn about dataset characteristics and the correlation
      between generated base clusterings and the performance of consensus functions.
      We apply the learned knowledge to generate appropriate ensembles and select a
      suitable consensus function to combine their results. Further, we use a
      state-of-the-art optimization technique to tune the hyperparameters of the
      selected consensus function. Our comprehensive evaluation on synthetic and
      real-world datasets demonstrates that EffEns significantly outperforms
      state-of-the-art approaches w.r.t. accuracy and runtime.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-07&engl=1}
 }

@inproceedings {INPROC-2024-06,
    author = {Christoph Stach and Yunxuan Li and Laura Schuiki and Bernhard Mitschang},
    title = {{LALO---A Virtual Data Lake Zone for Composing Tailor-Made Data Products on Demand}},
    booktitle = {Proceedings of the 35th International Conference on Database and Expert Systems Applications (DEXA 2024)},
    editor = {Christine Strauss and Toshiyuki Amagasa and Giuseppe Manco and Gabriele Kotsis and A Min Tjoa and Ismail Khalil},
    address = {Cham},
    publisher = {Springer},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    series = {Lecture Notes in Computer Science},
    volume = {14911},
    pages = {288--305},
    type = {Conference Paper},
    month = {August},
    year = {2024},
    isbn = {978-3-031-68311-4},
    issn = {0302-9743},
    doi = {10.1007/978-3-031-68312-1_22},
    keywords = {Data Product; Virtual Data Lake Zone; Data Stream Adaptation},
    language = {English},
    cr-category = {H.2.7 Database Administration,
                   E.2 Data Storage Representations,
                   H.3.3 Information Search and Retrieval,
                   H.2.8 Database Applications},
    contact = {Senden Sie eine E-Mail an \<christoph.stach@ipvs.uni-stuttgart.de\>.},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {The emerging paradigm of data products, which has become increasingly popular
      recently due to the rise of data meshes and data marketplaces, also poses
      unprecedented challenges for data management. Current data architectures,
      namely data warehouses and data lakes, are not able to meet these challenges
      adequately. In particular, these architectures are not designed for a
      just-in-time provision of highly customized data products tailored perfectly to
      the needs of customers. In this paper, we therefore present a virtual data lake
      zone for composing tailor-made data products on demand, called LALO. LALO uses
      data streaming technologies to enable just-in-time composing of data products
      without allocating storage space in the data architecture permanently. In order
      to enable customers to tailor data products to their needs, LALO uses a novel
      mechanism that enables live adaptation of data streams. Evaluation results show
      that the overhead for such an adaptation is negligible. Therefore, LALO
      represents an efficient solution for the appropriate handling of data products,
      both in terms of storage space and runtime.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-06&engl=1}
 }

@inproceedings{INPROC-2024-05,
  author      = {Schneider, Jan and Gr{\"o}ger, Christoph and Lutsch, Arnold},
  title       = {{The Data Platform Evolution: From Data Warehouses over Data Lakes to Lakehouses}},
  booktitle   = {Proceedings of the 34th GI-Workshop on Foundations of Databases (Grundlagen von Datenbanken), Hirsau, Germany},
  editor      = {Schwarz, Holger},
  publisher   = {CEUR Workshop Proceedings},
  institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
  series      = {CEUR Workshop Proceedings},
  volume      = {3714},
  pages       = {67--71},
  type        = {Workshop Paper},
  month       = {July},
  year        = {2024},
  issn        = {1613-0073},
  keywords    = {Lakehouse; Data Warehouse; Data Lake; Data Management; Data Analytics},
  language    = {English},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software,
                 H.4.2 Information Systems Applications Types of Systems},
  ee          = {https://ceur-ws.org/Vol-3714/invited2.pdf},
  department  = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
  abstract    = {The continuously increasing availability of data and the growing maturity of
    data-driven analysis techniques have encouraged enterprises to collect and
    analyze huge amounts of business-relevant data in order to exploit it for
    competitive advantages. To facilitate these processes, various platforms for
    analytical data management have been developed: While data warehouses have
    traditionally been used by business analysts for reporting and OLAP, data lakes
    emerged as an alternative concept that also supports advanced analytics. As
    these two common types of data platforms show rather contrary characteristics
    and target different user groups and analytical approaches, enterprises usually
    need to employ both of them, resulting in complex, error-prone and
    cost-expensive architectures. To address these issues, efforts have recently
    become apparent to combine features of data warehouses and data lakes into
    so-called lakehouses, which pursue to serve all kinds of analytics from a
    single data platform. This paper provides an overview on the evolution of
    analytical data platforms from data warehouses over data lakes to lakehouses
    and elaborates on the vision and characteristics of the latter. Furthermore, it
    addresses the question of what aspects common data lakes are currently missing
    that prevent them from transitioning to lakehouses.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-05&engl=1}
}

@inproceedings {INPROC-2024-04,
    author = {Jan Schneider and Arnold Lutsch and Christoph Gr{\"o}ger and Holger Schwarz and Bernhard Mitschang},
    title = {{First Experiences on the Application of Lakehouses in Industrial Practice}},
    booktitle = {Proceedings of the 35th GI-Workshop on Foundations of Databases (Grundlagen von Datenbanken), Herdecke, Germany},
    editor = {Uta St{\"o}rl},
    publisher = {CEUR Workshop Proceedings},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    series = {CEUR Workshop Proceedings},
    volume = {3710},
    pages = {3--8},
    type = {Workshop Paper},
    month = {June},
    year = {2024},
    issn = {1613-0073},
    keywords = {Data Lakehouse; Data Platform; Platform Architecture; Data Analytics; Case Study; Industry Experience},
    language = {English},
    cr-category = {H.3.4 Information Storage and Retrieval Systems and Software,
                   H.4.2 Information Systems Applications Types of Systems},
    ee = {https://ceur-ws.org/Vol-3710/paper1.pdf},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {In recent years, so-called lakehouses have emerged as a new type of data
      platform that intends to combine characteristics of data warehouses and data
      lakes. Although companies started to employ the associated concepts and
      technologies as part of their analytics architectures, little is known about
      their practical medium- and long-term experiences as well as proven
      architectural decisions. Additionally, there is only limited knowledge about
      how lakehouses can be utilized effectively in an industrial context. Hence, it
      remains unclear under which circumstances lakehouses represent a viable
      alternative to conventional data platforms. To address this gap, we conducted a
      case study on a real-world industrial case, in which manufacturing data needs
      to be managed and analytically exploited. Within the scope of this case, a
      dedicated analytics department has been testing and leveraging a lakehouse
      approach for several months in a productive environment with high data volumes
      and various types of analytical workloads. The paper at hand presents the
      results of our within-case analyses and focuses on the industrial setting of
      the case as well as the architecture of the utilized lakehouse. This way, it
      provides preliminary insights on the application of lakehouses in industrial
      practice and refers to useful architectural decisions.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-04&engl=1}
 }

@inproceedings {INPROC-2024-03,
    author = {Andrea Fieschi and Pascal Hirmer and Sachin Agrawal and Christoph Stach and Bernhard Mitschang},
    title = {{HySAAD - A Hybrid Selection Approach for Anonymization by Design in the Automotive Domain}},
    booktitle = {Proceedings of the 25th IEEE International Conference on Mobile Data Management (MDM 2024)},
    editor = {Chiara Renso and Mahmoud Sakr and Walid G Aref and Ashley Song and Cheng Long},
    address = {Los Alamitos, Washington, Tokyo},
    publisher = {IEEE Computer Society Conference Publishing Services},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    pages = {203--210},
    type = {Conference Paper},
    month = {June},
    year = {2024},
    isbn = {979-8-3503-7455-1},
    issn = {2375-0324},
    doi = {10.1109/MDM61037.2024.00044},
    keywords = {anonymization; connected vehicles; privacy protection; metrics},
    language = {English},
    cr-category = {K.4.1 Computers and Society Public Policy Issues},
    contact = {Senden Sie eine E-Mail an \<andrea.fieschi@ipvs.uni-stuttgart.de\>.},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {The increasing connectivity and data exchange between vehicles and the cloud
      have led to growing privacy concerns. To keep on gaining product insights
      through data collection while guaranteeing privacy protection, an
      anonymization-by-design approach should be used. A rising number of
      anonymization methods, not limited to the automotive domain, can be found in
      the literature and practice. The developers need support to select the suitable
      anonymization technique. To this end, we make the following two contributions:
      1) We apply our knowledge from the automotive domain to outline the usage of
      qualitative metrics for anonymization techniques assessment; 2) We introduce
      HySAAD, a hybrid selection approach for anonymization by design that leverages
      this groundwork by recommending appropriate anonymization techniques for each
      mobile data analytics use case based on both, qualitative (i.e., ``soft'') metrics
      and quantitative (i.e., ``hard'') metrics. Using a real-world use case from the
      automotive, we demonstrate the applicability and effectiveness of HySAAD.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-03&engl=1}
 }

@inproceedings{INPROC-2024-02,
  author      = {Li, Yunxuan and Stach, Christoph and Mitschang, Bernhard},
  title       = {{PaDS: An adaptive and privacy-enabling Data Pipeline for Smart Cars}},
  booktitle   = {Proceedings of the 25th IEEE International Conference on Mobile Data Management (MDM 2024)},
  editor      = {Renso, Chiara and Sakr, Mahmoud and Aref, Walid G and Kim, Kyoung-Sook and Papagelis, Manos and Sacharidis, Dimitris},
  address     = {Los Alamitos, Washington, Tokyo},
  publisher   = {IEEE Computer Society Conference Publishing Services},
  institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
  pages       = {41--50},
  type        = {Conference Paper},
  month       = {June},
  year        = {2024},
  isbn        = {979-8-3503-7455-1},
  issn        = {2375-0324},
  doi         = {10.1109/MDM61037.2024.00026},
  keywords    = {smart car; privacy-enabling data pipeline; datastream runtime adaptation; mobile data privacy management},
  language    = {English},
  cr-category = {K.4.1 Computers and Society Public Policy Issues},
  contact     = {Senden Sie eine E-Mail an \<yunxuan.li@ipvs.uni-stuttgart.de\>.},
  department  = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
  abstract    = {The extensive use of onboard sensors in smart cars enables the collection,
    processing, and dissemination of large amounts of mobile data containing
    information about the vehicle, its driver, and even bystanders. Despite the
    undoubted benefits of such smart cars, this leads to significant privacy
    concerns. Due to their inherent mobility, the situation of smart cars changes
    frequently, and with it, the appropriate measures to counteract the exposure of
    private data. However, data management in such vehicles lacks sufficient
    support for this privacy dynamism. We therefore introduce PaDS, a framework for
    Privacy adaptive Data Stream. The focus of this paper is to enable adaptive
    data processing within the vehicle data stream. With PaDS, Privacy-Enhancing
    Technologies can be deployed dynamically in the data pipeline of a smart car
    according to the current situation without user intervention. With a comparison
    of state-of-the-art approaches, we demonstrate that our solution is very
    efficient as it does not require a complete restart of the data pipeline.
    Moreover, compared to a static approach, PaDS causes only minimal overhead
    despite its dynamic adaptation of the data pipeline to react to changing
    privacy requirements. This renders PaDS an effective privacy solution for smart
    cars.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-02&engl=1}
}

@inproceedings {INPROC-2024-01,
    author = {Dennis Przytarski and Christoph Stach and Bernhard Mitschang},
    title = {{Assessing Data Layouts to Bring Storage Engine Functionality to Blockchain Technology}},
    booktitle = {Proceedings of the 57th Hawaii International Conference on System Sciences (HICSS '24)},
    editor = {Tung X. Bui},
    publisher = {ScholarSpace},
    institution = {University of Stuttgart, Faculty of Computer Science, Electrical Engineering, and Information Technology, Germany},
    pages = {5091--5100},
    type = {Conference Paper},
    month = {January},
    year = {2024},
    isbn = {978-0-9981331-7-1},
    keywords = {blockchain; storage engine; queries},
    language = {English},
    cr-category = {H.3.1 Content Analysis and Indexing,
                   H.3.2 Information Storage,
                   H.3.3 Information Search and Retrieval},
    ee = {https://hdl.handle.net/10125/106995},
    contact = {Senden Sie eine E-Mail an \<Christoph.Stach@ipvs.uni-stuttgart.de\>.},
    department = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
    abstract = {Nowadays, modern applications often use blockchains as a secure data store.
      However, querying blockchain data is more challenging than querying
      conventional databases due to blockchains being primarily designed for the
      logging of asset transfers, such as cryptocurrencies, rather than storing and
      reading generic data. To improve the experience of querying blockchain data and
      make it comparable to querying conventional databases, new design approaches of
      the storage engine for blockchain technology are required. An important aspect
      is the data layout of a block, as it plays a crucial role in facilitating
      reading of blockchain data. In this paper, we identify a suitable data layout
      that provides the required query capabilities while preserving the key
      properties of blockchain technology. Our goal is to overcome the limitations of
      current data access models in blockchains, such as the reliance on auxiliary
      data storages and error-prone smart contracts. To this end, we compare four
      promising data layouts with data models derived from document, row, column, and
      triple stores in terms of schema flexibility, read pattern generality, and
      relational algebra suitability. We then assess the most suitable data layout
      for blockchain technology.},
    url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-01&engl=1}
 }

@article{ART-2024-01,
  author      = {Schneider, Jan and Gr{\"o}ger, Christoph and Lutsch, Arnold and Schwarz, Holger and Mitschang, Bernhard},
  title       = {{The Lakehouse: State of the Art on Concepts and Technologies}},
  journal     = {SN Computer Science},
  publisher   = {Springer Nature},
  volume      = {5},
  number      = {5},
  pages       = {1--39},
  type        = {Article in Journal},
  month       = {April},
  year        = {2024},
  issn        = {2661-8907},
  doi         = {10.1007/s42979-024-02737-0},
  keywords    = {Data Lakehouse; Data Lake; Data Platform; Data Analytics},
  language    = {English},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software},
  ee          = {https://doi.org/10.1007/s42979-024-02737-0,
    https://link.springer.com/content/pdf/10.1007/s42979-024-02737-0.pdf},
  department  = {University of Stuttgart, Institute of Parallel and Distributed Systems, Applications of Parallel and Distributed Systems},
  abstract    = {In the context of data analytics, so-called lakehouses refer to novel variants
    of data platforms that attempt to combine characteristics of data warehouses
    and data lakes. In this way, lakehouses promise to simplify enterprise
    analytics architectures, which often suffer from high operational costs, slow
    analytical processes and further shortcomings resulting from data replication.
    However, different views and notions on the lakehouse paradigm exist, which are
    commonly driven by individual technologies and varying analytical use cases.
    Therefore, it remains unclear what challenges lakehouses address, how they can
    be characterized and which technologies can be leveraged to implement them.
    This paper addresses these issues by providing an extensive overview of
    concepts and technologies that are related to the lakehouse paradigm and by
    outlining lakehouses as a distinct architectural approach for data platforms.
    Concepts and technologies from literature with regard to lakehouses are
    discussed, based on which a conceptual foundation for lakehouses is
    established. In addition, several popular technologies are evaluated regarding
    their suitability for the building of lakehouses. All findings are supported
    and demonstrated with the help of a representative analytics scenario. Typical
    challenges of conventional data platforms are identified, a new, sharper
    definition for lakehouses is proposed and technical requirements for lakehouses
    are derived. As part of an evaluation, these requirements are applied to
    several popular technologies, of which frameworks for data lakes turn out to be
    particularly helpful for the construction of lakehouses. Our work provides an
    overview of the state of the art and a conceptual foundation for the lakehouse
    paradigm, which can support future research.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2024-01&engl=1}
}