{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T20:55:32Z","timestamp":1771707332545,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,27]],"date-time":"2023-02-27T00:00:00Z","timestamp":1677456000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100006785","name":"Google","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006785","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2107290,1741022,1934565,2107050"],"award-info":[{"award-number":["2107290,1741022,1934565,2107050"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,27]]},"DOI":"10.1145\/3539597.3572727","type":"proceedings-article","created":{"date-parts":[[2023,2,22]],"date-time":"2023-02-22T23:27:00Z","timestamp":1677108420000},"page":"1256-1259","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Next-generation Challenges of Responsible Data Integration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4710-8719","authenticated-orcid":false,"given":"Fatemeh","family":"Nargesian","sequence":"first","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5251-6186","authenticated-orcid":false,"given":"Abolfazl","family":"Asudeh","sequence":"additional","affiliation":[{"name":"University of Illinois Chicago, Chicago, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0724-5214","authenticated-orcid":false,"given":"H. V.","family":"Jagadish","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,2,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-015-0389-y"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Swarup Acharya Phillip B. Gibbons Viswanath Poosala and Sridhar Ramaswamy. 1999. Join Synopses for Approximate Query Answering. In SIGMOD Alex Delis Christos Faloutsos and Shahram Ghandeharizadeh (Eds.). 275--286.","DOI":"10.1145\/304181.304207"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415566"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Abolfazl Asudeh Zhongjun Jin and H. V. Jagadish. 2019. Assessing and Remedying Coverage for a Given Dataset. In ICDE. 554--565.","DOI":"10.1109\/ICDE.2019.00056"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551858"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Abolfazl Asudeh Nima Shahbazi Zhongjun Jin and HV Jagadish. 2021. Identifying Insufficient Data Coverage for Ordinal Continuous-Valued Attributes. In SIGMOD. 129--141.","DOI":"10.1145\/3448016.3457315"},{"key":"e_1_3_2_2_7_1","unstructured":"Solon Barocas Moritz Hardt and Arvind Narayanan. 2019. Fairness and machine learning: Limitations and opportunities. fairmlbook.org."},{"key":"e_1_3_2_2_8_1","first-page":"671","article-title":"Big data's disparate impact","volume":"104","author":"Barocas Solon","year":"2016","unstructured":"Solon Barocas and Andrew D Selbst. 2016. Big data's disparate impact. Calif. L. Rev., Vol. 104 (2016), 671.","journal-title":"Calif. L. Rev."},{"key":"e_1_3_2_2_9_1","volume-title":"A study of the behavior of several methods for balancing machine learning training data. ACM SIGKDD explorations newsletter","author":"Batista Gustavo EAPA","year":"2004","unstructured":"Gustavo EAPA Batista, Ronaldo C Prati, and Maria Carolina Monard. 2004. A study of the behavior of several methods for balancing machine learning training data. ACM SIGKDD explorations newsletter, Vol. 6, 1 (2004), 20--29."},{"key":"e_1_3_2_2_10_1","volume-title":"Fairness-Aware Machine Learning: Practical Challenges and Lessons Learned","author":"Bird Sarah","unstructured":"Sarah Bird, Krishnaram Kenthapadi, Emre Kiciman, and Margaret Mitchell. 2019. Fairness-Aware Machine Learning: Practical Challenges and Lessons Learned. In WSDM, J. Shane Culpepper, Alistair Moffat, Paul N. Bennett, and Kristina Lerman (Eds.). ACM, 834--835."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Alex Bogatu Alvaro A. A. Fernandes Norman W. Paton and Nikolaos Konstantinou. 2020. Dataset Discovery in Data Lakes. In ICDE. 709--720.","DOI":"10.1109\/ICDE48307.2020.00067"},{"key":"e_1_3_2_2_12_1","volume-title":"William Spoth, and Ying Yang.","author":"Brachmann Mike","year":"2019","unstructured":"Mike Brachmann, Carlos Bautista, Sonia Castelo, Su Feng, Juliana Freire, Boris Glavic, Oliver Kennedy, Heiko Mueller, R\u00e9 mi Rampin, William Spoth, and Ying Yang. 2019. Data Debugging and Exploration with Vizier. In SIGMOD. 1877--1880."},{"key":"e_1_3_2_2_13_1","volume-title":"Noy","author":"Brickley Dan","year":"2019","unstructured":"Dan Brickley, Matthew Burgess, and Natasha F. Noy. 2019. Google Dataset Search: Building a search engine for datasets in an open Web ecosystem. In WWW. 1365--1375."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476346"},{"key":"e_1_3_2_2_15_1","unstructured":"Chengliang Chai Ju Fan and Guoliang Li. 2018. Incentive-Based Entity Collection Using Crowdsourcing. In ICDE. 341--352."},{"key":"e_1_3_2_2_16_1","unstructured":"Chengliang Chai Guoliang Li Jian Li Dong Deng and Jianhua Feng. 2016. Cost-Effective Crowdsourced Entity Resolution: A Partial-Order Approach. In SIGMOD. 969--984."},{"key":"e_1_3_2_2_17_1","volume-title":"Narasayya","author":"Chaudhuri Surajit","year":"1999","unstructured":"Surajit Chaudhuri, Rajeev Motwani, and Vivek R. Narasayya. 1999. On Random Sampling over Joins. In SIGMOD, Alex Delis, Christos Faloutsos, and Shahram Ghandeharizadeh (Eds.). 263--274."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.953"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2016.2611509"},{"key":"e_1_3_2_2_20_1","volume-title":"Ahmed K. Elmagarmid, Ihab F. Ilyas, Samuel Madden, Mourad Ouzzani, Michael Stonebraker, and Nan Tang.","author":"Fernandez Raul Castro","year":"2018","unstructured":"Raul Castro Fernandez, Essam Mansour, Abdulhakim Ali Qahtan, Ahmed K. Elmagarmid, Ihab F. Ilyas, Samuel Madden, Mourad Ouzzani, Michael Stonebraker, and Nan Tang. 2018. Seeping Semantics: Linking Datasets Using Word Embeddings for Data Discovery. In ICDE. 989--1000."},{"key":"e_1_3_2_2_21_1","volume-title":"Lazo: A Cardinality-Based Method for Coupled Estimation of Jaccard Similarity and Containment. In ICDE. 1190--1201.","author":"Fernandez Raul Castro","year":"2019","unstructured":"Raul Castro Fernandez, Jisoo Min, Demitri Nava, and Samuel Madden. 2019. Lazo: A Cardinality-Based Method for Coupled Estimation of Jaccard Similarity and Containment. In ICDE. 1190--1201."},{"key":"e_1_3_2_2_22_1","volume-title":"Hanna Wallach, Hal Daum\u00e9 III, and Kate Crawford.","author":"Gebru Timnit","year":"2021","unstructured":"Timnit Gebru, Jamie Morgenstern, Briana Vecchione, Jennifer Wortman Vaughan, Hanna Wallach, Hal Daum\u00e9 III, and Kate Crawford. 2021. Datasheets for Datasets. Commun. ACM, Vol. 64, 12 (2021), 86?-\u00ec92."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3384689"},{"key":"e_1_3_2_2_24_1","volume-title":"ITCS (LIPIcs","volume":"23","author":"Kleinberg Jon M.","year":"2017","unstructured":"Jon M. Kleinberg, Sendhil Mullainathan, and Manish Raghavan. 2017. Inherent Trade-Offs in the Fair Determination of Risk Scores. In ITCS (LIPIcs, Vol. 67), Christos H. Papadimitriou (Ed.). Schloss Dagstuhl - Leibniz-Zentrum f\u00fc r Informatik, 43:1--43:23."},{"key":"e_1_3_2_2_25_1","volume-title":"Letter to the editor","author":"Kullback Solomon","year":"1987","unstructured":"Solomon Kullback. 1987. Letter to the editor: The Kullback-Leibler distance. (1987)."},{"key":"e_1_3_2_2_26_1","volume-title":"Wander Join: Online Aggregation via Random Walks. In SIGMOD. 615--629.","author":"Li Feifei","year":"2016","unstructured":"Feifei Li, Bin Wu, Ke Yi, and Zhuoyue Zhao. 2016. Wander Join: Online Aggregation via Random Walks. In SIGMOD. 615--629."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.14778\/3467861.3467872"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407821"},{"key":"e_1_3_2_2_29_1","volume-title":"Declarative Machine Learning Systems. arXiv preprint arXiv:2107.08148","author":"Molino Piero","year":"2021","unstructured":"Piero Molino and Christopher R\u00e9. 2021. Declarative Machine Learning Systems. arXiv preprint arXiv:2107.08148 (2021)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476249.3476299"},{"key":"e_1_3_2_2_31_1","volume-title":"Bahar Ghadiri Bashardoost, and Ren\u00e9 e J. Miller","author":"Nargesian Fatemeh","year":"2020","unstructured":"Fatemeh Nargesian, Ken Q. Pu, Erkang Zhu, Bahar Ghadiri Bashardoost, and Ren\u00e9 e J. Miller. 2020. Organizing Data Lakes for Navigation. In SIGMOD, David Maier, Rachel Pottinger, AnHai Doan, Wang-Chiew Tan, Abdussalam Alawini, and Hung Q. Ngo (Eds.). 1939--1950."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.14778\/3192965.3192973"},{"key":"e_1_3_2_2_33_1","first-page":"2863","article-title":"RONIN","volume":"14","author":"Ouellette Paul","year":"2021","unstructured":"Paul Ouellette, Aidan Sciortino, Fatemeh Nargesian, Bahar Ghadiri Bashardoost, Erkang Zhu, Ken Pu, and Ren\u00e9 e J. Miller. 2021. RONIN: Data Lake Exploration. PVLDB, Vol. 14, 12 (2021), 2863--2866.","journal-title":"Data Lake Exploration. PVLDB"},{"key":"e_1_3_2_2_34_1","volume-title":"Statistical inference based on divergence measures","author":"Pardo Leandro","unstructured":"Leandro Pardo. 2018. Statistical inference based on divergence measures. CRC press."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3319901"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"A\u00e9 cio S. R. Santos Aline Bessa Fernando Chirigati Christopher Musco and Juliana Freire. 2021. Correlation Sketches for Approximate Join-Correlation Queries. In SIGMOD Guoliang Li Zhanhuai Li Stratos Idreos and Divesh Srivastava (Eds.). 1531--1544.","DOI":"10.1145\/3448016.3458456"},{"key":"e_1_3_2_2_37_1","unstructured":"Sebastian Schelter Yuxuan He Jatin Khilnani and Julia Stoyanovich. 2020. FairPrep: Promoting Data to a First-Class Citizen in Studies on Fairness-Enhancing Interventions. In EDBT. 395--398."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","unstructured":"Nima Shahbazi Yin Lin Abolfazl Asudeh and H. V. Jagadish. 2022. A Survey on Techniques for Identifying and Resolving Representation Bias in Data. CoRR Vol. abs\/2203.11852 (2022). https:\/\/doi.org\/10.48550\/arxiv.2203.11852","DOI":"10.48550\/arxiv.2203.11852"},{"key":"e_1_3_2_2_39_1","volume-title":"Fairness-Aware Range Queries for Selecting Unbiased Data. ICDE","author":"Shetiya Suraj","year":"2022","unstructured":"Suraj Shetiya, Ian Swift, Abolfazl Asudeh, and Gautam Das. 2022. Fairness-Aware Range Queries for Selecting Unbiased Data. ICDE (2022)."},{"key":"e_1_3_2_2_40_1","unstructured":"William Spoth Poonam Kumari Oliver Kennedy and Fatemeh Nargesian. [n. d.]. Loki: Streamlining Integration and Enrichment. In HILDA@SIGMOD."},{"key":"e_1_3_2_2_41_1","unstructured":"Chenkai Sun Abolfazl Asudeh H. V. Jagadish Bill Howe and Julia Stoyanovich. 2019. MithraLabel: Flexible Dataset Nutritional Labels for Responsible Data Science. In CIKM. 2893--2896."},{"key":"e_1_3_2_2_42_1","volume-title":"Hyunsu Kim, and Steven Euijong Whang.","author":"Tae Ki Hyun","year":"2019","unstructured":"Ki Hyun Tae, Yuji Roh, Young Hun Oh, Hyunsu Kim, and Steven Euijong Whang. 2019. Data Cleaning for Accurate, Fair, and Robust Models: A Big Data - AI Integration Approach. In DEEM@SIGMOD. 5:1--5:4."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3452792"},{"key":"e_1_3_2_2_44_1","volume-title":"Deep Learning: Systems and Responsibility. In SIGMOD. 2867--2875.","author":"Wasay Abdul","year":"2021","unstructured":"Abdul Wasay, Subarna Chatterjee, and Stratos Idreos. 2021. Deep Learning: Systems and Responsibility. In SIGMOD. 2867--2875."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415562"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","unstructured":"Zhuoyue Zhao Robert Christensen Feifei Li Xiao Hu and Ke Yi. 2018. Random Sampling over Joins Revisited. In SIGMOD. 1525--1539.","DOI":"10.1145\/3183713.3183739"},{"key":"e_1_3_2_2_47_1","volume-title":"Miller","author":"Zhu Erkang","year":"2019","unstructured":"Erkang Zhu, Dong Deng, Fatemeh Nargesian, and Ren\u00e9e J. Miller. 2019. JOSIE: Overlap Set Similarity Search for Finding Joinable Tables in Data Lakes. In SIGMOD. 847--864."},{"key":"e_1_3_2_2_48_1","first-page":"1185","article-title":"LSH Ensemble","volume":"9","author":"Zhu Erkang","year":"2016","unstructured":"Erkang Zhu, Fatemeh Nargesian, Ken Q. Pu, and Ren\u00e9e J. Miller. 2016. LSH Ensemble: Internet-Scale Domain Search. PVLDB, Vol. 9, 12 (2016), 1185--1196.","journal-title":"Internet-Scale Domain Search. PVLDB"}],"event":{"name":"WSDM '23: The Sixteenth ACM International Conference on Web Search and Data Mining","location":"Singapore Singapore","acronym":"WSDM '23","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the Sixteenth ACM International Conference on Web Search and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539597.3572727","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539597.3572727","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:15Z","timestamp":1750186935000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539597.3572727"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,27]]},"references-count":48,"alternative-id":["10.1145\/3539597.3572727","10.1145\/3539597"],"URL":"https:\/\/doi.org\/10.1145\/3539597.3572727","relation":{},"subject":[],"published":{"date-parts":[[2023,2,27]]},"assertion":[{"value":"2023-02-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}