{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T15:02:08Z","timestamp":1773154928847,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science Foundation","award":["IIS-1956096 and IIS-2107248"],"award-info":[{"award-number":["IIS-1956096 and IIS-2107248"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1145\/3555041.3589409","type":"proceedings-article","created":{"date-parts":[[2023,6,5]],"date-time":"2023-06-05T16:25:14Z","timestamp":1685982314000},"page":"69-75","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["Table Discovery in Data Lakes: State-of-the-art and Future Directions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9020-3642","authenticated-orcid":false,"given":"Grace","family":"Fan","sequence":"first","affiliation":[{"name":"Northeastern University, Boston, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3172-6133","authenticated-orcid":false,"given":"Jin","family":"Wang","sequence":"additional","affiliation":[{"name":"Megagon Labs, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0602-149X","authenticated-orcid":false,"given":"Yuliang","family":"Li","sequence":"additional","affiliation":[{"name":"Megagon Labs, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1484-4787","authenticated-orcid":false,"given":"Ren\u00e9e J.","family":"Miller","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, MA, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,6,5]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Parag Agrawal Arvind Arasu and Raghav Kaushik. 2010. On indexing error-tolerant set containment. In SIGMOD. 927--938.","DOI":"10.1145\/1807167.1807267"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Alex Bogatu Alvaro A. A. Fernandes Norman W. Paton and Nikolaos Konstantinou. 2020. Dataset Discovery in Data Lakes. In ICDE. 709--720.","DOI":"10.1109\/ICDE48307.2020.00067"},{"key":"e_1_3_2_1_3_1","volume-title":"Noy","author":"Brickley Dan","year":"2019","unstructured":"Dan Brickley, Matthew Burgess, and Natasha F. Noy. 2019. Google Dataset Search: Building a search engine for datasets in an open Web ecosystem. In WWW. 1365--1375."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.14778\/1687627.1687750"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Riccardo Cappuzzo Paolo Papotti and Saravanan Thirumuruganathan. 2020. Creating Embeddings of Heterogeneous Relational Datasets for Data Integration Tasks. In SIGMOD. 1335--1349.","DOI":"10.1145\/3318464.3389742"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476346"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-019-00564-x"},{"key":"e_1_3_2_1_8_1","volume-title":"Tim Kraska, and David R","author":"Chepurko Nadiia","year":"2020","unstructured":"Nadiia Chepurko, Ryan Marcus, Emanuel Zgraggen, Raul Castro Fernandez, Tim Kraska, and David R. Karger. 2020. ARDA: Automatic Relational Data Augmentation for Machine Learning. Proc. VLDB Endow. , Vol. 13, 9 (2020), 1373--1387."},{"key":"e_1_3_2_1_9_1","volume-title":"How government can promote open data","author":"Chui Michael","year":"2014","unstructured":"Michael Chui, Diana Farrell, and Kate Jackson. 2014. How government can promote open data. McKinsey Company (2014)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2012.228"},{"key":"e_1_3_2_1_11_1","volume-title":"WarpGate: A Semantic Join Discovery System for Cloud Data Warehouses. CoRR","author":"Cong Tianji","year":"2022","unstructured":"Tianji Cong, James Gale, Jason Frantz, H. V. Jagadish, and cC agatay Demiralp. 2022. WarpGate: A Semantic Join Discovery System for Cloud Data Warehouses. CoRR , Vol. abs\/2212.14155 (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Anish Das Sarma Lujun Fang Nitin Gupta Alon Y. Halevy Hongrae Lee Fei Wu Reynold Xin and Cong Yu. 2012. Finding related tables. In SIGMOD. 817--828.","DOI":"10.1145\/2213836.2213962"},{"key":"e_1_3_2_1_13_1","volume-title":"Mirrokni","author":"Datar Mayur","year":"2004","unstructured":"Mayur Datar, Nicole Immorlica, Piotr Indyk, and Vahab S. Mirrokni. 2004. Locality-sensitive hashing scheme based on p-stable distributions. In SCG. ACM, 253--262."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.14778\/3430915.3430921"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Yuyang Dong Kunihiro Takeoka Chuan Xiao and Masafumi Oyamada. 2021. Efficient Joinable Table Discovery in Data Lakes: A High-Dimensional Similarity-Based Approach. In ICDE. 456--467.","DOI":"10.1109\/ICDE51399.2021.00046"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.14778\/3529337.3529353"},{"key":"e_1_3_2_1_17_1","volume-title":"Semantics-aware Dataset Discovery from Data Lakes with Contextualized Column-based Representation Learning. CoRR","author":"Fan Grace","year":"1922","unstructured":"Grace Fan, Jin Wang, Yuliang Li, Dan Zhang, and Ren\u00e9 e J. Miller. 2022. Semantics-aware Dataset Discovery from Data Lakes with Contextualized Column-based Representation Learning. CoRR , Vol. abs\/2210.01922 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Aurum: A Data Discovery System. In ICDE. 1001--1012.","author":"Fernandez Raul Castro","year":"2018","unstructured":"Raul Castro Fernandez, Ziawasch Abedjan, Famien Koko, Gina Yuan, Samuel Madden, and Michael Stonebraker. 2018a. Aurum: A Data Discovery System. In ICDE. 1001--1012."},{"key":"e_1_3_2_1_19_1","volume-title":"Ahmed K. Elmagarmid, Ihab F. Ilyas, Samuel Madden, Mourad Ouzzani, Michael Stonebraker, and Nan Tang.","author":"Fernandez Raul Castro","year":"2018","unstructured":"Raul Castro Fernandez, Essam Mansour, Abdulhakim Ali Qahtan, Ahmed K. Elmagarmid, Ihab F. Ilyas, Samuel Madden, Mourad Ouzzani, Michael Stonebraker, and Nan Tang. 2018b. Seeping Semantics: Linking Datasets Using Word Embeddings for Data Discovery. In ICDE. 989--1000."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Sainyam Galhotra and Udayan Khurana. 2020. Semantic Search over Structured Data. In CIKM.","DOI":"10.1145\/3340531.3417426"},{"key":"e_1_3_2_1_21_1","volume-title":"Christopher Olston, Neoklis Polyzotis, Sudip Roy, and Steven Euijong Whang.","author":"Halevy Alon Y.","year":"2016","unstructured":"Alon Y. Halevy, Flip Korn, Natalya Fridman Noy, Christopher Olston, Neoklis Polyzotis, Sudip Roy, and Steven Euijong Whang. 2016. Goods: Organizing Google's Datasets. In SIGMOD. 795--806."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.5555\/1287369.1287427"},{"key":"e_1_3_2_1_23_1","volume-title":"Michiel A. Bakker, Emanuel Zgraggen, Arvind Satyanarayan, Tim Kraska, cC agatay Demiralp, and C\u00e9 sar A. Hidalgo.","author":"Hulsebos Madelon","year":"2019","unstructured":"Madelon Hulsebos, Kevin Zeng Hu, Michiel A. Bakker, Emanuel Zgraggen, Arvind Satyanarayan, Tim Kraska, cC agatay Demiralp, and C\u00e9 sar A. Hidalgo. 2019. Sherlock: A Deep Learning Approach to Semantic Data Type Detection. In SIGKDD. 1500--1508."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Stratos Idreos and Tim Kraska. 2019. From Auto-tuning One Size Fits All to Self-designed and Learned Data-intensive Systems. In SIGMOD. 2054--2059.","DOI":"10.1145\/3299869.3314034"},{"key":"e_1_3_2_1_25_1","first-page":"64","article-title":"The Periodic Table of Data Structures","volume":"41","author":"Idreos Stratos","year":"2018","unstructured":"Stratos Idreos, Kostas Zoumpatianos, Manos Athanassoulis, Niv Dayan, Brian Hentschel, Michael S. Kester, Demi Guo, Lukas M. Maas, Wilson Qin, Abdul Wasay, and Yiyou Sun. 2018. The Periodic Table of Data Structures. IEEE Data Eng. Bull. , Vol. 41, 3 (2018), 64--75.","journal-title":"IEEE Data Eng. Bull."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Mehdi Kargar Aijun An Nick Cercone Parke Godfrey Jaroslaw Szlichta and Xiaohui Yu. 2014. MeanKS: meaningful keyword search in relational databases with complex schema. In SIGMOD. 905--908.","DOI":"10.1109\/ICDE.2015.7113302"},{"key":"e_1_3_2_1_27_1","volume-title":"SANTOS: Relationship-based Semantic Table Union Search. In SIGMOD.","author":"Khatiwada Aamod","year":"2023","unstructured":"Aamod Khatiwada, Grace Fan, Roee Shraga, Zixuan Chen, Wolfgang Gatterbauer, Ren\u00e9e J. Miller, and Mirek Riedewald. 2023. SANTOS: Relationship-based Semantic Table Union Search. In SIGMOD."},{"key":"e_1_3_2_1_28_1","volume-title":"Valentine: Evaluating Matching Techniques for Dataset Discovery. In ICDE. 468--479.","author":"Koutras Christos","year":"2021","unstructured":"Christos Koutras, George Siachamis, Andra Ionescu, Kyriakos Psarakis, Jerry Brons, Marios Fragkoulis, Christoph Lofi, Angela Bonifati, and Asterios Katsifodimos. 2021. Valentine: Evaluating Matching Techniques for Dataset Discovery. In ICDE. 468--479."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.14778\/3137628.3137657"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Oliver Lehmberg Dominique Ritze Robert Meusel and Christian Bizer. 2016. A Large Public Corpus of Web Tables containing Time and Context Metadata. In WWW. 75--76.","DOI":"10.1145\/2872518.2889386"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.websem.2015.05.001"},{"key":"e_1_3_2_1_32_1","volume-title":"Wolfgang Gatterbauer, Ren\u00e9 e J. Miller, and Mirek Riedewald.","author":"Leventidis Aristotelis","year":"2021","unstructured":"Aristotelis Leventidis, Laura Di Rocco, Wolfgang Gatterbauer, Ren\u00e9 e J. Miller, and Mirek Riedewald. 2021. DomainNet: Homograph Detection for Data Lake Disambiguation. In EDBT. 13--24."},{"key":"e_1_3_2_1_33_1","unstructured":"Keqian Li Yeye He and Kris Ganjam. 2017. Discovering Enterprise Concepts Using Spreadsheet Tables. In SIGKDD. 1873--1882."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.14778\/1920841.1921005"},{"key":"e_1_3_2_1_35_1","unstructured":"Xiao Ling Alon Y. Halevy Fei Wu and Cong Yu. 2013. Synthesizing Union Tables from the Web. In IJCAI. 2677--2683."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.14778\/3231751.3231758"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Jiaheng Lu Chunbin Lin Jin Wang and Chen Li. 2019. Synergy of Database Techniques and Machine Learning Models for String Similarity Search and Join. In CIKM. 2975--2976.","DOI":"10.1145\/3357384.3360319"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889473"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.14778\/3229863.3240491"},{"key":"e_1_3_2_1_40_1","first-page":"59","article-title":"Making Open Data Transparent: Data Discovery on Open Data","volume":"41","author":"Miller Ren\u00e9","year":"2018","unstructured":"Ren\u00e9 e J. Miller, Fatemeh Nargesian, Erkang Zhu, Christina Christodoulakis, Ken Q. Pu, and Periklis Andritsos. 2018. Making Open Data Transparent: Data Discovery on Open Data. IEEE Data Eng. Bull. , Vol. 41, 2 (2018), 59--70.","journal-title":"IEEE Data Eng. Bull."},{"key":"e_1_3_2_1_41_1","first-page":"237","article-title":"Data Lake Organization","volume":"35","author":"Nargesian Fatemeh","year":"2023","unstructured":"Fatemeh Nargesian, Ken Q. Pu, Bahar Ghadiri Bashardoost, Erkang Zhu, and Ren\u00e9 e J. Miller. 2023. Data Lake Organization. IEEE Trans. Knowl. Data Eng. , Vol. 35, 1 (2023), 237--250.","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"e_1_3_2_1_42_1","volume-title":"Bahar Ghadiri Bashardoost, and Ren\u00e9 e J. Miller","author":"Nargesian Fatemeh","year":"2020","unstructured":"Fatemeh Nargesian, Ken Q. Pu, Erkang Zhu, Bahar Ghadiri Bashardoost, and Ren\u00e9 e J. Miller. 2020. Organizing Data Lakes for Navigation. In SIGMOD. 1939--1950."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352116"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.14778\/3192965.3192973"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.14778\/3384345.3384346"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476364"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.14778\/2336664.2336665"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"A\u00e9 cio S. R. Santos Aline Bessa Christopher Musco and Juliana Freire. 2022. A Sketch-based Index for Correlated Dataset Search. In ICDE. 2928--2941.","DOI":"10.1109\/ICDE53745.2022.00264"},{"key":"e_1_3_2_1_49_1","volume-title":"PLEX: Towards Practical Learned Indexing. CoRR","author":"Stoian Mihail","year":"2021","unstructured":"Mihail Stoian, Andreas Kipf, Ryan Marcus, and Tim Kraska. 2021. PLEX: Towards Practical Learned Indexing. CoRR , Vol. abs\/2108.05117 (2021)."},{"key":"e_1_3_2_1_50_1","volume-title":"Chen Chen, and Wang-Chiew Tan.","author":"Suhara Yoshihiko","year":"2022","unstructured":"Yoshihiko Suhara, Jinfeng Li, Yuliang Li, Dan Zhang, cC agatay Demiralp, Chen Chen, and Wang-Chiew Tan. 2022. Annotating Columns with Pre-trained Language Models. In SIGMOD. 1493--1503."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.14778\/2002938.2002939"},{"key":"e_1_3_2_1_52_1","volume-title":"Xin Luna Dong, and Meng Jiang","author":"Wang Daheng","year":"2021","unstructured":"Daheng Wang, Prashant Shiralkar, Colin Lockard, Binxuan Huang, Xin Luna Dong, and Meng Jiang. 2021. TCN: Table Convolutional Network for Web Table Interpretation. In WWW. 4020--4032."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476393"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Mohamed Yakout Kris Ganjam Kaushik Chakrabarti and Surajit Chaudhuri. 2012. InfoGather: entity augmentation and attribute discovery by holistic matching with web tables. In SIGMOD. 97--108.","DOI":"10.1145\/2213836.2213848"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Ce Zhang Jaeho Shin Christopher R\u00e9 Michael J. Cafarella and Feng Niu. 2016. Extracting Databases from Dark Data with DeepDive. In SIGMOD. 847--859.","DOI":"10.1145\/2882903.2904442"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407793"},{"key":"e_1_3_2_1_57_1","volume-title":"Ives","author":"Zhang Yi","year":"2020","unstructured":"Yi Zhang and Zachary G. Ives. 2020. Finding Related Tables in Data Lakes for Interactive Data Science. In SIGMOD. 1951--1966."},{"key":"e_1_3_2_1_58_1","volume-title":"Leva: Boosting Machine Learning Performance with Relational Embedding Data Augmentation. In SIGMOD. 1504--1517.","author":"Zhao Zixuan","year":"2022","unstructured":"Zixuan Zhao and Raul Castro Fernandez. 2022. Leva: Boosting Machine Learning Performance with Relational Embedding Data Augmentation. In SIGMOD. 1504--1517."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3300065"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.14778\/2994509.2994534"}],"event":{"name":"SIGMOD\/PODS '23: International Conference on Management of Data","location":"Seattle WA USA","acronym":"SIGMOD\/PODS '23","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Companion of the 2023 International Conference on Management of Data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3555041.3589409","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3555041.3589409","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T18:43:58Z","timestamp":1750272238000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3555041.3589409"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":60,"alternative-id":["10.1145\/3555041.3589409","10.1145\/3555041"],"URL":"https:\/\/doi.org\/10.1145\/3555041.3589409","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]},"assertion":[{"value":"2023-06-05","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}