{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T19:12:14Z","timestamp":1774552334106,"version":"3.50.1"},"reference-count":156,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["390621612"],"award-info":[{"award-number":["390621612"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]},{"name":"European Union Horizon Programme","award":["101093164"],"award-info":[{"award-number":["101093164"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Knowl. Data Eng."],"published-print":{"date-parts":[[2023,12,1]]},"DOI":"10.1109\/tkde.2023.3270101","type":"journal-article","created":{"date-parts":[[2023,4,25]],"date-time":"2023-04-25T18:43:23Z","timestamp":1682448203000},"page":"12571-12590","source":"Crossref","is-referenced-by-count":89,"title":["Data Lakes: A Survey of Functions and Systems"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3720-6585","authenticated-orcid":false,"given":"Rihan","family":"Hai","sequence":"first","affiliation":[{"name":"Department of Software Technology, Delft University of Technology, Delft, CD, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3015-154X","authenticated-orcid":false,"given":"Christos","family":"Koutras","sequence":"additional","affiliation":[{"name":"Department of Software Technology, Delft University of Technology, Delft, CD, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1698-4345","authenticated-orcid":false,"given":"Christoph","family":"Quix","sequence":"additional","affiliation":[{"name":"Hochschule Niederrhein, Krefeld, Germany and Fraunhofer FIT, Hochschule Niederrhein University of Applied Sciences, Krefeld, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6169-2942","authenticated-orcid":false,"given":"Matthias","family":"Jarke","sequence":"additional","affiliation":[{"name":"RWTH Aachen University, Aachen, Germany"}]}],"member":"263","reference":[{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-33223-5_7"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18420\/btw2021-19"},{"key":"ref59","author":"gorelik","year":"2019","journal-title":"The Enterprise Big Data Lake Delivering the Promise of Big Data and Data Science"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-27520-4_13"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3183746"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/191839.191908"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-69751-2_5"},{"key":"ref54","article-title":"Gartner says beware of the data lake fallacy","author":"gartner","year":"2014"},{"key":"ref51","first-page":"473","article-title":"Data wrangling for Big Data: Challenges and opportunities","author":"furche","year":"2016","journal-title":"Proc Int Conf Extending Database Technol"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457552"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CYBER.2015.7288049"},{"key":"ref45","first-page":"198","author":"fagin","year":"2009","journal-title":"chapter Clio Schema Mapping Creation and Data Exchange"},{"key":"ref48","first-page":"1001","article-title":"Aurum: A data discovery system","author":"fernandez","year":"2018","journal-title":"Proc IEEE Int Conf Data Eng"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2899391"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1108\/IJWIS-03-2021-0026"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/2814710.2814713"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-27615-7_29"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59065-9_7"},{"key":"ref49","first-page":"1190","article-title":"Lazo: A cardinality-based method for coupled estimation of jaccard similarity and containment","author":"fernandez","year":"2019","journal-title":"Proc IEEE Int Conf Data Eng"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/1060745.1060840"},{"key":"ref7","article-title":"Lakehouse: A new generation of open platforms that unify data warehousing and advanced analytics","author":"armbrust","year":"2021","journal-title":"Proc 11th Conf Innov Data Syst Res"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3132847.3133171"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3388870"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-32065-2_3"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415560"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-68474-1_20"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.14778\/3229863.3240491"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.07.439"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00046"},{"key":"ref35","article-title":"An approach to extracting thematic views from highly heterogeneous sources of a data lake","author":"diamantini","year":"2018","journal-title":"Proc 26th Italian Symp Adv Database Syst"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00063-9_17"},{"key":"ref37","article-title":"Pentaho, hadoop, and data lakes | james dixon's blog","author":"dixon","year":"2010"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/s10796-020-10010-x"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-012-0302-x"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/HORA52670.2021.9461293"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2018.2849727"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1145\/3372117"},{"key":"ref33","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref146","article-title":"Commentary: The FAIR guiding principles for scientific data management and stewardship","volume":"3","author":"wilkonson","year":"2016","journal-title":"Nature Sci Data"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3542700.3542709"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/DAS49615.2020.9108912"},{"key":"ref39","author":"doan","year":"2012","journal-title":"Principles of Data Integration"},{"key":"ref38","author":"dixon","year":"2014"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.14778\/2994509.2994534"},{"key":"ref156","author":"zikopoulos","year":"2014","journal-title":"Big data beyond the hype a guide to conversations for today's data center"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.2994641"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3300065"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389726"},{"key":"ref152","first-page":"1014","article-title":"Efficient deep learning pipelines for accurate cost estimations over large scale query workload","author":"kang","year":"2021","journal-title":"Proc ACM SIGMOD Int Conf Manage Data"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352095"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.14778\/3137628.3137633"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.3389\/fbioe.2020.553904"},{"key":"ref26","article-title":"Governing and managing big data for analytics and decision makers","author":"chessell","year":"2014"},{"key":"ref25","first-page":"823","article-title":"Data lakes: A survey paper","author":"cherradi","year":"2021","journal-title":"Proc Int Conf Smart City Appl"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ISGT49243.2021.9372181"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/1365815.1365816"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2015.2472010"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/2491245"},{"key":"ref27","volume":"350","author":"conover","year":"1998","journal-title":"Practical Nonparametric Statistics"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18293\/SEKE2019-129"},{"key":"ref13","first-page":"11c","article-title":"A novel big data architecture in support of ADS-B data analytic","author":"boci","year":"2015","journal-title":"Proc IEEE Int Conf Neutron Scattering"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.14778\/3457390.3457403"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3209900.3209911"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-35514-2_32"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE48307.2020.00067"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2882939"},{"key":"ref97","first-page":"148","article-title":"The security data lake","author":"marty","year":"2015","journal-title":"Proc IEEE 4th Int Conf Future Internet Things Cloud Workshops"},{"key":"ref126","first-page":"1","article-title":"On data lake architectures and metadata management","volume":"56","author":"sawadogo","year":"2020","journal-title":"J Intell Inf Syst"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/DASC.2017.8102023"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-30278-8_43"},{"key":"ref11","article-title":"Temporal provenance model (TPM): Model and query language","author":"beheshti","year":"2010"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICDEW.2019.00-37"},{"key":"ref124","article-title":"Optimizing federated queries based on the physical design of a data lake","author":"rohde","year":"2020","journal-title":"Proc Workshops EDBT\/ICDT Joint Conf"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.14778\/3229863.3236230"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/s13222-017-0272-7"},{"key":"ref125","article-title":"Data lakes: Purposes, practices, patterns, and platforms","author":"russom","year":"2017","journal-title":"TDWI"},{"key":"ref17","first-page":"1877","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.58729\/1941-6687.1324"},{"key":"ref19","article-title":"The technology of the business data lake table","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.14778\/1687627.1687750"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2019.2958084"},{"key":"ref133","article-title":"A survey of data provenance techniques","volume":"69","author":"simmhan","year":"2005","journal-title":"Comput Sci Dept Indiana Univ Bloomington"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/3012071.3012077"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.32628\/CSEIT1952121"},{"key":"ref95","author":"manning","year":"2009","journal-title":"An Introduction to Information Retrieval"},{"key":"ref131","author":"sharma","year":"2016","journal-title":"Architecting Data Lakes"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3314132"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/MSST.2010.5496972"},{"key":"ref130","first-page":"521","article-title":"SAP HANA Vora: A distributed computing platform for enterprise data lakes","author":"sengstock","year":"2017","journal-title":"Proc Datenbanksysteme Bus Technol Web"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-91563-0_29"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.14778\/3137765.3137792"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.3115\/1613715.1613817"},{"key":"ref139","article-title":"Multimedia database systems: Issues and research directions","author":"subrahmanian","year":"2012"},{"key":"ref86","author":"lindstedt","year":"2011","journal-title":"Super Charge Your Data Warehouse Invaluable Data Modeling Rules to Implement Your Data Vault"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457250"},{"key":"ref85","first-page":"13","article-title":"DomainNet: Homograph detection for data lake disambiguation","author":"leventidis","year":"2021","journal-title":"Proc Int Conf Extending Database Technol"},{"key":"ref138","article-title":"The enterprise data lake: Better integration and deeper analytics","volume":"1","author":"stein","year":"2014","journal-title":"PwC Technology Forecast Rethinking integration"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3323214"},{"key":"ref135","article-title":"Introduction to the theory of computation","volume":"2","author":"sipser","year":"2006"},{"key":"ref87","author":"lindstedt","year":"2015","journal-title":"Building a Scalable Data Warehouse with Data Vault"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/eScience.2018.00040"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.14778\/3574245.3574274"},{"key":"ref144","article-title":"Needle in a haystack queries in cloud data lakes","author":"weintraub","year":"2021","journal-title":"Proc EDBT\/ICDT Workshops"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3375661"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-61845-6_30"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.14778\/3137628.3137657"},{"key":"ref142","article-title":"Data wrangling: The challenging yourney from the wild to the lake","author":"terrizzano","year":"2015","journal-title":"Proc Conf Innov Data Syst Res"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2017.8258204"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/BDCloud.2015.62"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/eScience.2016.7870919"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2017.8258302"},{"key":"ref80","article-title":"Federated query processing over heterogeneous data sources in a semantic data lake","author":"endris","year":"2020"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2068"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2014.82"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-67271-7_16"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.14778\/3384345.3384346"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.14778\/3192965.3192973"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1145\/3216122.3216130"},{"key":"ref75","article-title":"Dataset relationship management","author":"ives","year":"2019","journal-title":"Proc Conf Innov Data Syst Res"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3380605"},{"key":"ref74","author":"inmon","year":"2016","journal-title":"Data Lake Architecture Designing the Data Lake and avoiding the garbage dump"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352116"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-05153-5"},{"key":"ref102","article-title":"The real questions of enterprise transformation","author":"mueller-wuensch","year":"2022","journal-title":"Panel ICIS"},{"key":"ref76","article-title":"Analyzing and comparing lakehouse storage systems","author":"jain","year":"2023","journal-title":"Proc Conf Innov Data Syst Res"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2858256"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2845915"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476382"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1080\/00437956.1954.11659520"},{"key":"ref111","first-page":"6","article-title":"Data lake governance best practices","volume":"4","author":"patel","year":"2017","journal-title":"The DZone Guide to Big Data - Data Science & Advanced Analytics"},{"key":"ref70","article-title":"Deep lake: A lakehouse for deep learning","author":"hambardzumyan","year":"2023","journal-title":"Proc Conf Innov Data Syst Res"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICPHYS.2019.8780276"},{"key":"ref73","first-page":"1924","article-title":"A study of enterprise data lake solutions","volume":"7","author":"hukkeri","year":"2020","journal-title":"Int Res J Eng Technol"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476317"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476364"},{"key":"ref68","first-page":"5","article-title":"Managing Google's data lake: An overview of the goods system","volume":"39","author":"halevy","year":"2016","journal-title":"IEEE Data Eng Bull"},{"key":"ref119","first-page":"3","article-title":"Data cleaning: Problems and current approaches","volume":"23","author":"rahm","year":"2000","journal-title":"IEEE Data Eng Bull"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2903730"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.7250\/csimq.2016-9.04"},{"key":"ref69","first-page":"1255","article-title":"Split query processing in polybase","author":"halverson","year":"2013","journal-title":"Proc ACM SIGMOD Int Conf Manage Data"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1007\/s007780100057"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-33223-5_19"},{"key":"ref115","first-page":"1","author":"quix","year":"2018","journal-title":"Data lake"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00847-5_28"},{"key":"ref116","first-page":"129","article-title":"GEMMS: A generic and extensible metadata management system for data lakes","author":"quix","year":"2016","journal-title":"Proc 28th Int Conf Adv Inf Syst Eng"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s007780100054"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.14778\/2336664.2336665"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-98398-1_3"},{"key":"ref114","first-page":"37","article-title":"Ontology matching for patent classification","author":"quix","year":"2017","journal-title":"Proc 13th Int Workshop Ontology Matching Co-Located 17th Int Semantic Web Conf"},{"key":"ref60","author":"grover","year":"2015","journal-title":"Hadoop Application Architectures Designing Real-World Big Data Applications"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-27615-7_23"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476353"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.14778\/3342263.3342631"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3056100"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2899389"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77385-4_18"}],"container-title":["IEEE Transactions on Knowledge and Data Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/69\/10311056\/10107808.pdf?arnumber=10107808","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,11]],"date-time":"2023-12-11T20:56:07Z","timestamp":1702328167000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10107808\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,1]]},"references-count":156,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tkde.2023.3270101","relation":{},"ISSN":["1041-4347","1558-2191","2326-3865"],"issn-type":[{"value":"1041-4347","type":"print"},{"value":"1558-2191","type":"electronic"},{"value":"2326-3865","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,1]]}}}