{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:32:31Z","timestamp":1772908351898,"version":"3.50.1"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icde65448.2025.00118","type":"proceedings-article","created":{"date-parts":[[2025,8,20]],"date-time":"2025-08-20T18:28:25Z","timestamp":1755714505000},"page":"1523-1536","source":"Crossref","is-referenced-by-count":1,"title":["TabSketchFM: Sketch-Based Tabular Representation Learning for Data Discovery Over Data Lakes"],"prefix":"10.1109","author":[{"given":"Aamod","family":"Khatiwada","sequence":"first","affiliation":[{"name":"Northeastern University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Harsha","family":"Kokel","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ibrahim","family":"Abdelaziz","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Subhajit","family":"Chaudhury","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Julian","family":"Dolby","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Oktie","family":"Hassanzadeh","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenhan","family":"Huang","sequence":"additional","affiliation":[{"name":"Rensselaer Polytechnic Institute"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tejaswini","family":"Pedapati","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Horst","family":"Samulowitz","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kavitha","family":"Srinivas","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352116"},{"issue":"4","key":"ref2","first-page":"932","article-title":"Integrating data lake tables","volume-title":"Proc. VLDB Endow.","volume":"16","author":"Khatiwada","year":"2022"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3555041.3589732"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.14778\/3192965.3192973"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3300065"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3588689"},{"key":"ref7","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","author":"Devlin","year":"2019"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.14778\/3430915.3430921"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.270"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467434"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.745"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.398"},{"key":"ref13","article-title":"LLM2vec: Large language models are secretly powerful text encoders","volume-title":"First Conference on Language Modeling","author":"BehnamGhader","year":"2024"},{"issue":"12","key":"ref14","first-page":"1185","article-title":"LSH ensemble: Internet-scale domain search","volume-title":"Proc. VLDB Endow.","volume":"9","author":"Zhu","year":"2016"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3458456"},{"issue":"7","key":"ref16","first-page":"1726","article-title":"Semantics-aware dataset discovery from data lakes with contextualized column-based representation learning","volume-title":"Proc. VLDB Endow.","volume":"16","author":"Fan","year":"2023"},{"key":"ref17","volume-title":"Lakebench: Benchmarks for data discovery over data lakes","author":"Srinivas","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00544"},{"key":"ref19","article-title":"Electra: Pre-training text encoders as discriminators rather than generators","volume-title":"International Conference on Learning Representations","author":"Clark"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.14778\/1453856.1453916"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313685"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2213836.2213962"},{"issue":"11","key":"ref23","doi-asserted-by":"crossref","first-page":"1502","DOI":"10.14778\/3137628.3137657","article-title":"Stitching web tables for improving matching quality","volume":"10","author":"Lehmberg","year":"2017","journal-title":"Proc. VLDB Endow."},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/icde.2018.00094"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE48307.2020.00067"},{"key":"ref26","doi-asserted-by":"crossref","first-page":"3786","DOI":"10.18653\/v1\/2023.findings-acl.233","article-title":"Automatic table union search with tabular representation learning","volume-title":"Findings of the Association for Computational Linguistics: ACL 2023","author":"Hu","year":"2023, 2023"},{"issue":"10","key":"ref27","doi-asserted-by":"crossref","first-page":"1034","DOI":"10.14778\/3115404.3115409","article-title":"Auto-join: Joining tables by leveraging transformations","volume":"10","author":"Zhu","year":"2017","journal-title":"Proc. VLDB Endow."},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/icde51399.2021.00046"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.14778\/3603581.3603587"},{"key":"ref30","article-title":"Warpgate: A semantic join discovery system for cloud data warehouses","volume-title":"13th Conference on Innovative Data Systems Research, CIDR 2023, Amsterdam, The Netherlands, January 8\u201311","author":"Cong","year":"2023"},{"key":"ref31","first-page":"518","article-title":"Similarity search in high dimensions via hashing","volume-title":"VLDB\u201999, Proceedings of 25th International Conference on Very Large Data Bases, September 7\u201310, 1999","author":"Gionis"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.14778\/3659437.3659461"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3588710"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679157"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/icde51399.2021.00047"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/s007780100057"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3196926"},{"issue":"12","key":"ref38","doi-asserted-by":"crossref","first-page":"1197","DOI":"10.14778\/2994509.2994535","article-title":"Magellan: Toward building entity matching management systems","volume":"9","author":"Konda","year":"2016","journal-title":"Proc. VLDB Endow."},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2872518.2889386"},{"key":"ref40","article-title":"TAPEX: table pre-training via learning a neural SQL executor","volume-title":"The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event","author":"Liu","year":"2022, 2022"},{"key":"ref41","article-title":"Grappa: Grammar-augmented pre-training for table semantic parsing","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Yu","year":"2021, 2021"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462909"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3517906"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3372117"},{"key":"ref45","article-title":"Proceedings of the Semantic Web Challenge on Tabular Data to Knowledge Graph Matching, SemTab 2022","volume-title":"co-located with the 21st International Semantic Web Conference, ISWC 2022, Virtual conference","volume":"3320","author":"Efthymiou","year":"2022"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300892"},{"issue":"11","key":"ref47","doi-asserted-by":"crossref","first-page":"1835","DOI":"10.14778\/3407790.3407793","article-title":"Sato: Contextual semantic type detection in tables","volume":"13","author":"Zhang","year":"2020","journal-title":"Proc. VLDB Endow."},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330993"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.14778\/3659437.3659448"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657877"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139924801"},{"key":"ref52","volume-title":"ekzhu\/datasketch: First stable release","author":"Zhu","year":"2017"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1117\/1.2348895"},{"key":"ref54","volume-title":"ECB Statistical Data Warehouse","year":"2023"},{"key":"ref55","doi-asserted-by":"crossref","first-page":"3911","DOI":"10.18653\/v1\/D18-1425","article-title":"Spider: A large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-SQL task","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","author":"Yu","year":"2018"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/762"},{"key":"ref58","article-title":"BERT loses patience: Fast and robust inference with early exit","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6\u201312, 2020, virtual","author":"Zhou","year":"2020"},{"issue":"8","key":"ref59","doi-asserted-by":"crossref","first-page":"1684","DOI":"10.14778\/3529337.3529353","article-title":"MATE: multi-attribute table extraction","volume":"15","author":"Esmailoghli","year":"2022","journal-title":"Proc. VLDB Endow."},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2018.2889473"}],"event":{"name":"2025 IEEE 41st International Conference on Data Engineering (ICDE)","location":"Hong Kong, Hong Kong","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE 41st International Conference on Data Engineering (ICDE)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11112833\/11112834\/11113110.pdf?arnumber=11113110","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:25:10Z","timestamp":1755753910000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11113110\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/icde65448.2025.00118","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}