{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:53:33Z","timestamp":1773482013970,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,10]],"date-time":"2022-06-10T00:00:00Z","timestamp":1654819200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Google research scholar award"},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["1934565,1741022,2107290"],"award-info":[{"award-number":["1934565,1741022,2107290"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,10]]},"DOI":"10.1145\/3514221.3522567","type":"proceedings-article","created":{"date-parts":[[2022,6,12]],"date-time":"2022-06-12T02:33:49Z","timestamp":1655001229000},"page":"2458-2464","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Responsible Data Integration: Next-generation Challenges"],"prefix":"10.1145","author":[{"given":"Fatemeh","family":"Nargesian","sequence":"first","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abolfazl","family":"Asudeh","sequence":"additional","affiliation":[{"name":"University of Illinois at Chicago, Chicago, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"H. V.","family":"Jagadish","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor, MI, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-015-0389-y"},{"key":"e_1_3_2_1_2_1","volume-title":"EDBT\/ICDT Workshops.","author":"Accinelli Chiara","year":"2021","unstructured":"Chiara Accinelli, Barbara Catania, Giovanna Guerrini, and Simone Minisi. 2021. The impact of rewriting on coverage constraint satisfaction.. In EDBT\/ICDT Workshops."},{"key":"e_1_3_2_1_3_1","volume-title":"EDBT\/ICDT Workshops.","author":"Accinelli Chiara","year":"2020","unstructured":"Chiara Accinelli, Simone Minisi, and Barbara Catania. 2020. Coverage-based Rewriting for Data Preparation.. In EDBT\/ICDT Workshops."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Swarup Acharya Phillip B. Gibbons Viswanath Poosala and Sridhar Ramaswamy. 1999. Join Synopses for Approximate Query Answering. In SIGMOD Alex Delis Christos Faloutsos and Shahram Ghandeharizadeh (Eds.). 275--286.","DOI":"10.1145\/304181.304207"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415566"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Abolfazl Asudeh Zhongjun Jin and H. V. Jagadish. 2019. Assessing and Remedying Coverage for a Given Dataset. In ICDE. 554--565.","DOI":"10.1109\/ICDE.2019.00056"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Abolfazl Asudeh Nima Shahbazi Zhongjun Jin and HV Jagadish. 2021. Identifying Insufficient Data Coverage for Ordinal Continuous-Valued Attributes. In SIGMOD. 129--141.","DOI":"10.1145\/3448016.3457315"},{"key":"e_1_3_2_1_8_1","unstructured":"Solon Barocas Moritz Hardt and Arvind Narayanan. 2019. Fairness and machine learning: Limitations and opportunities. fairmlbook.org."},{"key":"e_1_3_2_1_9_1","first-page":"671","article-title":"Big data's disparate impact","volume":"104","author":"Barocas Solon","year":"2016","unstructured":"Solon Barocas and Andrew D Selbst. 2016. Big data's disparate impact. Calif. L. Rev., Vol. 104 (2016), 671.","journal-title":"Calif. L. Rev."},{"key":"e_1_3_2_1_10_1","volume-title":"A study of the behavior of several methods for balancing machine learning training data. ACM SIGKDD explorations newsletter","author":"Batista Gustavo EAPA","year":"2004","unstructured":"Gustavo EAPA Batista, Ronaldo C Prati, and Maria Carolina Monard. 2004. A study of the behavior of several methods for balancing machine learning training data. ACM SIGKDD explorations newsletter, Vol. 6, 1 (2004), 20--29."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Alex Bogatu Alvaro A. A. Fernandes Norman W. Paton and Nikolaos Konstantinou. 2020. Dataset Discovery in Data Lakes. In ICDE. 709--720.","DOI":"10.1109\/ICDE48307.2020.00067"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Mike Brachmann Carlos Bautista Sonia Castelo Su Feng Juliana Freire Boris Glavic Oliver Kennedy Heiko Mueller R\u00e9mi Rampin William Spoth and Ying Yang. 2019. Data Debugging and Exploration with Vizier. In SIGMOD. 1877--1880.","DOI":"10.1145\/3299869.3320246"},{"key":"e_1_3_2_1_13_1","volume-title":"Noy","author":"Brickley Dan","year":"2019","unstructured":"Dan Brickley, Matthew Burgess, and Natasha F. Noy. 2019. Google Dataset Search: Building a search engine for datasets in an open Web ecosystem. In WWW. 1365--1375."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476346"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 1349--1359","author":"Celis L Elisa","year":"2020","unstructured":"L Elisa Celis, Vijay Keswani, and Nisheeth Vishnoi. 2020. Data preprocessing to mitigate bias: A maximum entropy based approach. In International Conference on Machine Learning. PMLR, 1349--1359."},{"key":"e_1_3_2_1_16_1","unstructured":"Chengliang Chai Ju Fan and Guoliang Li. 2018. Incentive-Based Entity Collection Using Crowdsourcing. In ICDE. 341--352."},{"key":"e_1_3_2_1_17_1","unstructured":"Chengliang Chai Guoliang Li Jian Li Dong Deng and Jianhua Feng. 2016. Cost-Effective Crowdsourced Entity Resolution: A Partial-Order Approach. In SIGMOD. 969--984."},{"key":"e_1_3_2_1_18_1","volume-title":"Narasayya","author":"Chaudhuri Surajit","year":"1999","unstructured":"Surajit Chaudhuri, Rajeev Motwani, and Vivek R. Narasayya. 1999. On Random Sampling over Joins. In SIGMOD, Alex Delis, Christos Faloutsos, and Shahram Ghandeharizadeh (Eds.). 263--274."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.953"},{"key":"e_1_3_2_1_20_1","volume-title":"Breast cancer disparities among women in underserved communities in the USA. Current breast cancer reports","author":"Beti Thompson","year":"2018","unstructured":"Beti Thompson et. al. 2018. Breast cancer disparities among women in underserved communities in the USA. Current breast cancer reports, Vol. 10, 3 (2018), 131--141."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2016.2611509"},{"key":"e_1_3_2_1_22_1","volume-title":"Ahmed K. Elmagarmid, Ihab F. Ilyas, Samuel Madden, Mourad Ouzzani, Michael Stonebraker, and Nan Tang.","author":"Fernandez Raul Castro","year":"2018","unstructured":"Raul Castro Fernandez, Essam Mansour, Abdulhakim Ali Qahtan, Ahmed K. Elmagarmid, Ihab F. Ilyas, Samuel Madden, Mourad Ouzzani, Michael Stonebraker, and Nan Tang. 2018. Seeping Semantics: Linking Datasets Using Word Embeddings for Data Discovery. In ICDE. 989--1000."},{"key":"e_1_3_2_1_23_1","volume-title":"Lazo: A Cardinality-Based Method for Coupled Estimation of Jaccard Similarity and Containment. In ICDE. 1190--1201.","author":"Fernandez Raul Castro","year":"2019","unstructured":"Raul Castro Fernandez, Jisoo Min, Demitri Nava, and Samuel Madden. 2019. Lazo: A Cardinality-Based Method for Coupled Estimation of Jaccard Similarity and Containment. In ICDE. 1190--1201."},{"key":"e_1_3_2_1_24_1","volume-title":"On the (im) possibility of fairness. arXiv preprint arXiv:1609.07236","author":"Friedler Sorelle A","year":"2016","unstructured":"Sorelle A Friedler, Carlos Scheidegger, and Suresh Venkatasubramanian. 2016. On the (im) possibility of fairness. arXiv preprint arXiv:1609.07236 (2016)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458723"},{"key":"e_1_3_2_1_26_1","first-page":"333","article-title":"Methods of weighting for unit non-response","volume":"40","author":"Holt David","year":"1991","unstructured":"David Holt and David Elliot. 1991. Methods of weighting for unit non-response. Journal of the Royal Statistical Society: Series D (The Statistician), Vol. 40, 3 (1991), 333--342.","journal-title":"Journal of the Royal Statistical Society: Series D (The Statistician)"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3384689"},{"key":"e_1_3_2_1_28_1","volume-title":"Inherent trade-offs in the fair determination of risk scores. arXiv preprint arXiv:1609.05807","author":"Kleinberg Jon","year":"2016","unstructured":"Jon Kleinberg, Sendhil Mullainathan, and Manish Raghavan. 2016. Inherent trade-offs in the fair determination of risk scores. arXiv preprint arXiv:1609.05807 (2016)."},{"key":"e_1_3_2_1_29_1","volume-title":"Letter to the editor","author":"Kullback Solomon","year":"1987","unstructured":"Solomon Kullback. 1987. Letter to the editor: The Kullback-Leibler distance. (1987)."},{"key":"e_1_3_2_1_30_1","volume-title":"Wander Join: Online Aggregation via Random Walks. In SIGMOD. 615--629.","author":"Li Feifei","year":"2016","unstructured":"Feifei Li, Bin Wu, Ke Yi, and Zhuoyue Zhao. 2016. Wander Join: Online Aggregation via Random Walks. In SIGMOD. 615--629."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.14778\/3297753.3297757"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.14778\/3467861.3467872"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407821"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476333"},{"key":"e_1_3_2_1_35_1","volume-title":"Naughton","author":"Luo Gang","year":"2002","unstructured":"Gang Luo, Curt J. Ellmann, Peter J. Haas, and Jeffrey F. Naughton. 2002. A scalable hash ripple join algorithm. In SIGMOD. 252--262."},{"key":"e_1_3_2_1_36_1","volume-title":"Declarative Machine Learning Systems. arXiv preprint arXiv:2107.08148","author":"Molino Piero","year":"2021","unstructured":"Piero Molino and Christopher R\u00e9. 2021. Declarative Machine Learning Systems. arXiv preprint arXiv:2107.08148 (2021)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476249.3476299"},{"key":"e_1_3_2_1_38_1","volume-title":"Bahar Ghadiri Bashardoost, and Ren\u00e9 e J. Miller","author":"Nargesian Fatemeh","year":"2020","unstructured":"Fatemeh Nargesian, Ken Q. Pu, Erkang Zhu, Bahar Ghadiri Bashardoost, and Ren\u00e9 e J. Miller. 2020. Organizing Data Lakes for Navigation. In SIGMOD, David Maier, Rachel Pottinger, AnHai Doan, Wang-Chiew Tan, Abdussalam Alawini, and Hung Q. Ngo (Eds.). 1939--1950."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.14778\/3192965.3192973"},{"key":"e_1_3_2_1_41_1","unstructured":"Laurel J. Orr Magdalena Balazinska and Dan Suciu. 2020. Sample Debiasing in the Themis Open World Database System. In SIGMOD. 257--268."},{"key":"e_1_3_2_1_42_1","first-page":"2863","article-title":"RONIN","volume":"14","author":"Ouellette Paul","year":"2021","unstructured":"Paul Ouellette, Aidan Sciortino, Fatemeh Nargesian, Bahar Ghadiri Bashardoost, Erkang Zhu, Ken Pu, and Ren\u00e9 e J. Miller. 2021. RONIN: Data Lake Exploration. PVLDB, Vol. 14, 12 (2021), 2863--2866.","journal-title":"Data Lake Exploration. PVLDB"},{"key":"e_1_3_2_1_43_1","volume-title":"Statistical inference based on divergence measures","author":"Pardo Leandro","unstructured":"Leandro Pardo. 2018. Statistical inference based on divergence measures .CRC press."},{"key":"e_1_3_2_1_44_1","unstructured":"Tara Parker-Pope. Dec. 23 2020. Pulse Oximeters May Be Less Accurate for Black People. Should You Use One? The NewYork Times."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aap.2019.05.014"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299869.3319901"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"A\u00e9 cio S. R. Santos Aline Bessa Fernando Chirigati Christopher Musco and Juliana Freire. 2021. Correlation Sketches for Approximate Join-Correlation Queries. In SIGMOD Guoliang Li Zhanhuai Li Stratos Idreos and Divesh Srivastava (Eds.). 1531--1544.","DOI":"10.1145\/3448016.3458456"},{"key":"e_1_3_2_1_48_1","unstructured":"Sebastian Schelter Yuxuan He Jatin Khilnani and Julia Stoyanovich. 2020. FairPrep: Promoting Data to a First-Class Citizen in Studies on Fairness-Enhancing Interventions. In EDBT. 395--398."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","unstructured":"Nima Shahbazi Yin Lin Abolfazl Asudeh and H. V. Jagadish. 2022. A Survey on Techniques for Identifying and Resolving Representation Bias in Data. CoRR Vol. abs\/2203.11852 (2022). https:\/\/doi.org\/10.48550\/arxiv.2203.11852","DOI":"10.48550\/arxiv.2203.11852"},{"key":"e_1_3_2_1_50_1","volume-title":"Fairness-Aware Range Queries for Selecting Unbiased Data. ICDE","author":"Shetiya Suraj","year":"2022","unstructured":"Suraj Shetiya, Ian Swift, Abolfazl Asudeh, and Gautam Das. 2022. Fairness-Aware Range Queries for Selecting Unbiased Data. ICDE (2022)."},{"key":"e_1_3_2_1_51_1","unstructured":"Mallory Simon. 2009. HP looking into claim webcams can't see black people. CNN."},{"key":"e_1_3_2_1_52_1","unstructured":"William Spoth Poonam Kumari Oliver Kennedy and Fatemeh Nargesian. [n.d.]. Loki: Streamlining Integration and Enrichment. In HILDA@SIGMOD."},{"key":"e_1_3_2_1_53_1","unstructured":"Chenkai Sun Abolfazl Asudeh H. V. Jagadish Bill Howe and Julia Stoyanovich. 2019. MithraLabel: Flexible Dataset Nutritional Labels for Responsible Data Science. In CIKM. 2893--2896."},{"key":"e_1_3_2_1_54_1","volume-title":"Hyunsu Kim, and Steven Euijong Whang.","author":"Tae Ki Hyun","year":"2019","unstructured":"Ki Hyun Tae, Yuji Roh, Young Hun Oh, Hyunsu Kim, and Steven Euijong Whang. 2019. Data Cleaning for Accurate, Fair, and Robust Models: A Big Data - AI Integration Approach. In DEEM@SIGMOD. 5:1--5:4."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3452792"},{"key":"e_1_3_2_1_56_1","unstructured":"Tess Townsend. 2017. Most engineers are white and so are the faces they use to train software. Recode."},{"key":"e_1_3_2_1_57_1","volume-title":"Deep Learning: Systems and Responsibility. In SIGMOD. 2867--2875.","author":"Wasay Abdul","year":"2021","unstructured":"Abdul Wasay, Subarna Chatterjee, and Stratos Idreos. 2021. Deep Learning: Systems and Responsibility. In SIGMOD. 2867--2875."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415562"},{"key":"e_1_3_2_1_59_1","unstructured":"Yiliang Zhang and Qi Long. 2021. Assessing Fairness in the Presence of Missing Data. In NeurIPS."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Zhuoyue Zhao Robert Christensen Feifei Li Xiao Hu and Ke Yi. 2018. Random Sampling over Joins Revisited. In SIGMOD. 1525--1539.","DOI":"10.1145\/3183713.3183739"},{"key":"e_1_3_2_1_61_1","volume-title":"Miller","author":"Zhu Erkang","year":"2019","unstructured":"Erkang Zhu, Dong Deng, Fatemeh Nargesian, and Ren\u00e9e J. Miller. 2019. JOSIE: Overlap Set Similarity Search for Finding Joinable Tables in Data Lakes. In SIGMOD. 847--864."},{"key":"e_1_3_2_1_62_1","first-page":"1185","article-title":"LSH Ensemble","volume":"9","author":"Zhu Erkang","year":"2016","unstructured":"Erkang Zhu, Fatemeh Nargesian, Ken Q. Pu, and Ren\u00e9e J. Miller. 2016. LSH Ensemble: Internet-Scale Domain Search. PVLDB, Vol. 9, 12 (2016), 1185--1196.","journal-title":"Internet-Scale Domain Search. PVLDB"}],"event":{"name":"SIGMOD\/PODS '22: International Conference on Management of Data","location":"Philadelphia PA USA","acronym":"SIGMOD\/PODS '22","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2022 International Conference on Management of Data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3514221.3522567","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3514221.3522567","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3514221.3522567","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:10:07Z","timestamp":1750183807000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3514221.3522567"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,10]]},"references-count":61,"alternative-id":["10.1145\/3514221.3522567","10.1145\/3514221"],"URL":"https:\/\/doi.org\/10.1145\/3514221.3522567","relation":{},"subject":[],"published":{"date-parts":[[2022,6,10]]},"assertion":[{"value":"2022-06-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}