{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T15:02:13Z","timestamp":1773154933439,"version":"3.50.1"},"reference-count":80,"publisher":"Association for Computing Machinery (ACM)","issue":"11","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":["Proc. VLDB Endow."],"published-print":{"date-parts":[[2025,7]]},"abstract":"<jats:p>\n            We propose\n            <jats:italic toggle=\"yes\">OmniMatch<\/jats:italic>\n            , a novel joinability discovery technique, specifically tailored for the needs of\n            <jats:italic toggle=\"yes\">data products<\/jats:italic>\n            : cohesive curated collections of tabular datasets.\n            <jats:italic toggle=\"yes\">OmniMatch<\/jats:italic>\n            combines multiple column-pair similarity measures leveraging self-supervised Graph Neural Networks (GNNs).\n            <jats:italic toggle=\"yes\">OmniMatch<\/jats:italic>\n            's GNN captures column relatedness by leveraging graph neighborhood information, significantly improving the recall of joinability discovery tasks. At the same time,\n            <jats:italic toggle=\"yes\">OmniMatch<\/jats:italic>\n            increases its precision by augmenting its training data with negative column join examples through an automated negative example generation process. Compared to the state-of-the-art,\n            <jats:italic toggle=\"yes\">OmniMatch<\/jats:italic>\n            exhibits up to 14% higher effectiveness in F1 score and AUC without relying on individual, user-provided thresholds for each similarity metric.\n          <\/jats:p>","DOI":"10.14778\/3749646.3749715","type":"journal-article","created":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T17:55:06Z","timestamp":1757008506000},"page":"4588-4601","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["OmniMatch: Joinability Discovery in Data Products"],"prefix":"10.14778","volume":"18","author":[{"given":"Christos","family":"Koutras","sequence":"first","affiliation":[{"name":"TU Delft"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiani","family":"Zhang","sequence":"additional","affiliation":[{"name":"Google"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiao","family":"Qin","sequence":"additional","affiliation":[{"name":"AWS"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chuan","family":"Lei","sequence":"additional","affiliation":[{"name":"AWS"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vasileios","family":"Ioannidis","sequence":"additional","affiliation":[{"name":"AWS"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christos","family":"Faloutsos","sequence":"additional","affiliation":[{"name":"AWS &amp; CMU"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"George","family":"Karypis","sequence":"additional","affiliation":[{"name":"AWS"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Asterios","family":"Katsifodimos","sequence":"additional","affiliation":[{"name":"AWS &amp; TU Delft"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,9,4]]},"reference":[{"key":"e_1_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-015-0389-y"},{"key":"e_1_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2012.66"},{"key":"e_1_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00084"},{"key":"e_1_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00544"},{"key":"e_1_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.14778\/3457390.3457403"},{"key":"e_1_2_1_6_1","volume-title":"Norman W Paton, and Nikolaos Konstantinou.","author":"Bogatu Alex","year":"2020","unstructured":"Alex Bogatu, Alvaro AA Fernandes, Norman W Paton, and Nikolaos Konstantinou. 2020. Dataset Discovery in Data Lakes. In IEEE ICDE."},{"key":"e_1_2_1_7_1","volume-title":"Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26","author":"Bordes Antoine","year":"2013","unstructured":"Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, and Oksana Yakhnenko. 2013. Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389742"},{"key":"e_1_2_1_9_1","volume-title":"International conference on machine learning. PMLR, 1597\u20131607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597\u20131607."},{"key":"e_1_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.14778\/3352063.3352128"},{"key":"e_1_2_1_11_1","volume-title":"Proceedings of the VLDB Endowment 13","author":"Chepurko Nadiia","year":"2021","unstructured":"Nadiia Chepurko, Ryan Marcus, Emanuel Zgraggen, Raul Castro Fernandez, Tim Kraska, and David Karger. 2021. ARDA: Automatic Relational Data Augmentation for Machine Learning. Proceedings of the VLDB Endowment 13, 9 (2021)."},{"key":"e_1_2_1_12_1","unstructured":"Tianji Cong James Gale Jason Frantz H. V. Jagadish and \u00c7agatay Demiralp. 2023. WarpGate: A Semantic Join Discovery System for Cloud Data Warehouses. In CIDR."},{"key":"e_1_2_1_13_1","volume-title":"Pylon: Semantic Table Union Search in Data Lakes. arXiv preprint arXiv:2301.04901","author":"Cong Tianji","year":"2023","unstructured":"Tianji Cong, Fatemeh Nargesian, and HV Jagadish. 2023. Pylon: Semantic Table Union Search in Data Lakes. arXiv preprint arXiv:2301.04901 (2023)."},{"key":"e_1_2_1_14_1","unstructured":"Zhamak Dehghani. 2022. Data mesh: Delivering Data-Driven Value at Scale. O' Reilly Media."},{"key":"e_1_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.14778\/3115404.3115413"},{"key":"e_1_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3542700.3542709"},{"key":"e_1_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-155860869-6\/50060-3"},{"key":"e_1_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00046"},{"key":"e_1_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/3603581.3603587"},{"key":"e_1_2_1_20_1","volume-title":"In Situ Neural Relational Schema Matcher. In 2024 IEEE 40th International Conference on Data Engineering (ICDE). IEEE, 138\u2013150","author":"Du Xingyu","year":"2024","unstructured":"Xingyu Du, Gongsheng Yuan, Sai Wu, Gang Chen, and Peng Lu. 2024. In Situ Neural Relational Schema Matcher. In 2024 IEEE 40th International Conference on Data Engineering (ICDE). IEEE, 138\u2013150."},{"key":"e_1_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2008.4497410"},{"key":"e_1_2_1_22_1","volume-title":"BTW workshops","volume":"7","author":"Engmann Daniel","year":"2007","unstructured":"Daniel Engmann and Sabine Massmann. 2007. Instance Matching with COMA++.. In BTW workshops, Vol. 7. 28\u201337."},{"key":"e_1_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.14778\/3587136.3587146"},{"key":"e_1_2_1_24_1","volume-title":"2018 IEEE 34th International Conference on Data Engineering (ICDE). IEEE, 1001\u20131012","author":"Fernandez Raul Castro","year":"2018","unstructured":"Raul Castro Fernandez, Ziawasch Abedjan, Famien Koko, Gina Yuan, Samuel Madden, and Michael Stonebraker. 2018. Aurum: A data discovery system. In 2018 IEEE 34th International Conference on Data Engineering (ICDE). IEEE, 1001\u20131012."},{"key":"e_1_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3329859.3329877"},{"key":"e_1_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2018.00093"},{"key":"e_1_2_1_27_1","volume-title":"2019 IEEE 35th International Conference on Data Engineering (ICDE). IEEE, 1190\u20131201","author":"Fernandez Raul Castro","year":"2019","unstructured":"Raul Castro Fernandez, Jisoo Min, Demitri Nava, and Samuel Madden. 2019. Lazo: A cardinality-based method for coupled estimation of jaccard similarity and containment. In 2019 IEEE 35th International Conference on Data Engineering (ICDE). IEEE, 1190\u20131201."},{"key":"e_1_2_1_28_1","volume-title":"Advances in Database Technology: EDBT 2021, 24th International Conference on Extending Database Technology: Nicosia, Cyprus, March 23\u201326, 2021: proceedings. OpenProceedings, 433\u2013438","author":"de Jes\u00fas Flores Herrera Javier","year":"2021","unstructured":"Javier de Jes\u00fas Flores Herrera, Sergi Nadal Francesch, and \u00d3scar Romero Moral. 2021. Towards scalable data discovery. In Advances in Database Technology: EDBT 2021, 24th International Conference on Extending Database Technology: Nicosia, Cyprus, March 23\u201326, 2021: proceedings. OpenProceedings, 433\u2013438."},{"key":"e_1_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671471"},{"key":"e_1_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588722"},{"key":"e_1_2_1_31_1","volume-title":"MEDTO: Medical Data to Ontology Matching Using Hybrid Graph Neural Networks. In ACM SIGKDD. 2946\u20132954.","author":"Hao Junheng","year":"2021","unstructured":"Junheng Hao, Chuan Lei, Vasilis Efthymiou, Abdul Quamar, Fatma \u00d6zcan, Yizhou Sun, and Wei Wang. 2021. MEDTO: Medical Data to Ontology Matching Using Hybrid Graph Neural Networks. In ACM SIGKDD. 2946\u20132954."},{"key":"e_1_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476317"},{"key":"e_1_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.233"},{"key":"e_1_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330993"},{"key":"e_1_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.270"},{"key":"e_1_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00150"},{"key":"e_1_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2068"},{"key":"e_1_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588689"},{"key":"e_1_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.14778\/3574245.3574274"},{"key":"e_1_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.14778\/3401960.3401970"},{"key":"e_1_2_1_41_1","volume-title":"Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, Conference Track Proceedings.","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, Conference Track Proceedings."},{"key":"e_1_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-industry.21"},{"key":"e_1_2_1_43_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Kong Kezhi","year":"2024","unstructured":"Kezhi Kong, Jiani Zhang, Zhengyuan Shen, Balasubramaniam Srinivasan, Chuan Lei, Christos Faloutsos, Huzefa Rangwala, and George Karypis. 2024. OpenTab: Advancing Large Language Models as Open-domain Table Reasoners. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_2_1_44_1","volume-title":"REMA: Graph Embeddings-based Relational Schema Matching.","author":"Koutras Christos","year":"2020","unstructured":"Christos Koutras, Marios Fragkoulis, Asterios Katsifodimos, and Christoph Lofi. 2020. REMA: Graph Embeddings-based Relational Schema Matching."},{"key":"e_1_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00047"},{"key":"e_1_2_1_46_1","volume-title":"Wolfgang Gatterbauer, Ren\u00e9e J Miller, and Mirek Riedewald.","author":"Leventidis Aristotelis","year":"2021","unstructured":"Aristotelis Leventidis, Laura Di Rocco, Wolfgang Gatterbauer, Ren\u00e9e J Miller, and Mirek Riedewald. 2021. DomainNet: Homograph Detection for Data Lake Disambiguation. EDBT 2021 (2021)."},{"key":"e_1_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3452824"},{"key":"e_1_2_1_48_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_2_1_49_1","volume-title":"Magneto: Combining Small and Large Language Models for Schema Matching. arXiv preprint arXiv:2412.08194","author":"Liu Yurong","year":"2024","unstructured":"Yurong Liu, Eduardo Pena, Aecio Santos, Eden Wu, and Juliana Freire. 2024. Magneto: Combining Small and Large Language Models for Schema Matching. arXiv preprint arXiv:2412.08194 (2024)."},{"key":"e_1_2_1_50_1","volume-title":"VLDB","volume":"1","author":"Madhavan Jayant","year":"2001","unstructured":"Jayant Madhavan, Philip A Bernstein, and Erhard Rahm. 2001. Generic schema matching with cupid. In VLDB, Vol. 1. Citeseer, 49\u201358."},{"key":"e_1_2_1_51_1","volume-title":"Foundations of statistical natural language processing","author":"Manning Christopher","unstructured":"Christopher Manning and Hinrich Schutze. 1999. Foundations of statistical natural language processing. MIT press."},{"key":"e_1_2_1_52_1","volume-title":"FREYJA: Efficient Join Discovery in Data Lakes. arXiv preprint arXiv:2412.06637","author":"Maynou Marc","year":"2024","unstructured":"Marc Maynou, Sergi Nadal, Raquel Panadero, Javier Flores, Oscar Romero, and Anna Queralt. 2024. FREYJA: Efficient Join Discovery in Data Lakes. arXiv preprint arXiv:2412.06637 (2024)."},{"key":"e_1_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2002.994702"},{"key":"e_1_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Fatemeh Nargesian Erkang Zhu Ken Q Pu and Ren\u00e9e J Miller. 2018. Table union search on open data. In VLDB.","DOI":"10.14778\/3192965.3192973"},{"key":"e_1_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.14778\/1988776.1988777"},{"key":"e_1_2_1_56_1","volume-title":"Scikit-learn: Machine learning in Python. the Journal of machine Learning research 12","author":"Pedregosa Fabian","year":"2011","unstructured":"Fabian Pedregosa, Ga\u00ebl Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, Peter Prettenhofer, Ron Weiss, Vincent Dubourg, et al. 2011. Scikit-learn: Machine learning in Python. the Journal of machine Learning research 12 (2011), 2825\u20132830."},{"key":"e_1_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_2_1_58_1","volume-title":"A survey of approaches to automatic schema matching. the VLDB Journal 10, 4","author":"Rahm Erhard","year":"2001","unstructured":"Erhard Rahm and Philip A Bernstein. 2001. A survey of approaches to automatic schema matching. the VLDB Journal 10, 4 (2001), 334\u2013350."},{"key":"e_1_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.14963588"},{"key":"e_1_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"e_1_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.14778\/3397230.3397237"},{"key":"e_1_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401120"},{"key":"e_1_2_1_64_1","volume-title":"Chess: Contextual harnessing for efficient sql synthesis. arXiv preprint arXiv:2405.16755","author":"Talaei Shayan","year":"2024","unstructured":"Shayan Talaei, Mohammadreza Pourreza, Yu-Chen Chang, Azalia Mirhoseini, and Amin Saberi. 2024. Chess: Contextual harnessing for efficient sql synthesis. arXiv preprint arXiv:2405.16755 (2024)."},{"key":"e_1_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457328"},{"key":"e_1_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2011.5767865"},{"key":"e_1_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2019.00042"},{"key":"e_1_2_1_68_1","unstructured":"Minjie Wang Da Zheng Zihao Ye Quan Gan Mufei Li Xiang Song Jinjing Zhou Chao Ma Lingfan Yu Yu Gai et al. 2019. Deep graph library: A graph-centric highly-performant package for graph neural networks. arXiv preprint arXiv:1909.01315 (2019)."},{"key":"e_1_2_1_69_1","doi-asserted-by":"crossref","unstructured":"Zhichun Wang Qingsong Lv Xiaohan Lan and Yu Zhang. 2018. Cross-lingual Knowledge Graph Alignment via Graph Convolutional Networks. In EMNLP. 349\u2013357.","DOI":"10.18653\/v1\/D18-1032"},{"key":"e_1_2_1_70_1","unstructured":"Yuting Wu Xiao Liu Yansong Feng Zheng Wang and Dongyan Zhao. 2020. Neighborhood Matching Network for Entity Alignment. In ACL. 6477\u20136487."},{"key":"e_1_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2213836.2213848"},{"key":"e_1_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.745"},{"key":"e_1_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.820"},{"key":"e_1_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.14778\/1920841.1920944"},{"key":"e_1_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/1989323.1989336"},{"key":"e_1_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178876.3186067"},{"key":"e_1_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE55515.2023.00123"},{"key":"e_1_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389726"},{"key":"e_1_2_1_79_1","volume-title":"Leva: Boosting Machine Learning Performance with Relational Embedding Data Augmentation.","author":"Zhao Zixuan","year":"2022","unstructured":"Zixuan Zhao and Raul Castro Fernandez. 2022. Leva: Boosting Machine Learning Performance with Relational Embedding Data Augmentation. (2022)."},{"key":"e_1_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3300065"}],"container-title":["Proceedings of the VLDB Endowment"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.14778\/3749646.3749715","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T03:33:40Z","timestamp":1757043220000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.14778\/3749646.3749715"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":80,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["10.14778\/3749646.3749715"],"URL":"https:\/\/doi.org\/10.14778\/3749646.3749715","relation":{},"ISSN":["2150-8097"],"issn-type":[{"value":"2150-8097","type":"print"}],"subject":[],"published":{"date-parts":[[2025,7]]},"assertion":[{"value":"2025-09-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}