{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T22:30:28Z","timestamp":1774909828917,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,1,8]],"date-time":"2022-01-08T00:00:00Z","timestamp":1641600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,1,8]]},"DOI":"10.1145\/3493700.3493774","type":"proceedings-article","created":{"date-parts":[[2022,1,7]],"date-time":"2022-01-07T23:54:21Z","timestamp":1641599661000},"page":"354-357","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Automatic Assessment of Quality of your Data for AI"],"prefix":"10.1145","author":[{"given":"Hima","family":"Patel","sequence":"first","affiliation":[{"name":"IBM Research, IN"}]},{"given":"Nitin","family":"Gupta","sequence":"additional","affiliation":[{"name":"IBM, IN"}]},{"given":"Naveen","family":"Panwar","sequence":"additional","affiliation":[{"name":"IBM Research, India, IN"}]},{"given":"Ruhi","family":"Sharma Mittal","sequence":"additional","affiliation":[{"name":"IBM Research, India, IN"}]},{"given":"Sameep","family":"Mehta","sequence":"additional","affiliation":[{"name":"IBM, India Research Lab, IN"}]},{"given":"Shanmukha","family":"Guttula","sequence":"additional","affiliation":[{"name":"IBM, IN"}]},{"given":"Shashank","family":"Mujumdar","sequence":"additional","affiliation":[{"name":"IBM Research, India, IN"}]},{"given":"Shazia","family":"Afzal","sequence":"additional","affiliation":[{"name":"IBM Research, India, IN"}]},{"given":"Srikanta","family":"Bedathur","sequence":"additional","affiliation":[{"name":"IIT-Delhi, IN"}]},{"given":"Vitobha","family":"Munigala","sequence":"additional","affiliation":[{"name":"IBM Research Labs - India, IN"}]}],"member":"320","published-online":{"date-parts":[[2022,1,8]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"SOCO","author":"Ali Aida","year":"2015","unstructured":"Aida Ali, Siti Mariyam\u00a0Hj. Shamsuddin, and Anca\u00a0L. Ralescu. 2015. Classification with class imbalance problem: a review. In SOCO 2015."},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon. 2021. Test data quality at scale with Deequ. https:\/\/aws.amazon.com\/blogs\/big-data\/test-data-quality-at-scale-with-deequ\/."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-014-0446-y"},{"key":"e_1_3_2_1_4_1","volume-title":"1st International Workshop on Data Assessment and Readiness for AI.. In PAKDD (Workshops). 117\u2013120","author":"Bandyopadhyay Bortik","year":"2021","unstructured":"Bortik Bandyopadhyay, Sambaran Bandyopadhyay, Srikanta Bedathur, Nitin Gupta, Sameep Mehta, Shashank Mujumdar, Srinivasan Parthasarathy, and Hima Patel. 2021. 1st International Workshop on Data Assessment and Readiness for AI.. In PAKDD (Workshops). 117\u2013120."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313602"},{"key":"e_1_3_2_1_6_1","volume-title":"Data Validation for Machine Learning. In Conference on Systems and Machine Learning (SysML).","author":"Breck Eric","year":"2019","unstructured":"Eric Breck, Neoklis Polyzotis, Sudip Roy, Steven Whang, and Martin Zinkevich. 2019. Data Validation for Machine Learning. In Conference on Systems and Machine Learning (SysML)."},{"key":"e_1_3_2_1_7_1","unstructured":"Simon Brugman. 2021. Pandas Profiling. https:\/\/pandas-profiling.github.io\/pandas-profiling\/docs\/master\/rtd\/."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2006.132"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-13059-5_22"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 34th International Conference on Machine Learning-Volume 70","author":"Devlin Jacob","year":"2017","unstructured":"Jacob Devlin, Jonathan Uesato, Surya Bhupatiraju, Rishabh Singh, Abdel-rahman Mohamed, and Pushmeet Kohli. 2017. Robustfill: Neural program learning under noisy i\/o. In Proceedings of the 34th International Conference on Machine Learning-Volume 70. 990\u2013998."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-8655(02)00303-3"},{"key":"e_1_3_2_1_12_1","unstructured":"Amirata Ghorbani and James Zou. 2019. Data shapley: Equitable valuation of data for machine learning. arXiv preprint arXiv:1904.02868(2019)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/1925844.1926423"},{"key":"e_1_3_2_1_14_1","unstructured":"Nitin Gupta Hima Patel Shazia Afzal Naveen Panwar Ruhi\u00a0Sharma Mittal Shanmukha Guttula Abhinav Jain Lokesh Nagalapatti Sameep Mehta Sandeep Hans 2021. Data Quality Toolkit: Automatic assessment of data quality and remediation for machine learning datasets. arXiv preprint arXiv:2108.05935(2021)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3163\/1536-5050.101.3.020"},{"key":"e_1_3_2_1_16_1","unstructured":"IBM. 2020. Overview and Importance of Data Quality for Machine Learning Tasks. https:\/\/researcher.watson.ibm.com\/researcher\/view_group.php?id=10456."},{"key":"e_1_3_2_1_17_1","unstructured":"IBM. 2021. Data Quality for AI. https:\/\/developer.ibm.com\/apis\/catalog\/dataquality4ai--data-quality-for-ai\/Introduction\/."},{"key":"e_1_3_2_1_18_1","unstructured":"IBM. 2021. Data Quality for Machine Learning Tasks. https:\/\/researcher.watson.ibm.com\/researcher\/view_group_subpage.php?id=10754."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1978942.1979444"},{"key":"e_1_3_2_1_20_1","unstructured":"Hamid Karimi Tyler Derr and Jiliang Tang. 2019. Characterizing the decision boundary of deep neural networks. arXiv preprint arXiv:1912.11460(2019)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939502.2939511"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3347711"},{"key":"e_1_3_2_1_23_1","volume-title":"Bayes Imbalance Impact Index: A Measure of Class Imbalanced Data Set for Classification Problem","author":"Lu Y.","year":"2019","unstructured":"Y. Lu, Y. Cheung, and Y.\u00a0Y. Tang. 2019. Bayes Imbalance Impact Index: A Measure of Class Imbalanced Data Set for Classification Problem. IEEE Transactions on Neural Networks and Learning Systems (2019), 1\u201315."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300356"},{"key":"e_1_3_2_1_25_1","volume-title":"Confident Learning: Estimating Uncertainty in Dataset Labels. arXiv preprint arXiv:1911.00068(2019).","author":"Northcutt G","year":"2019","unstructured":"Curtis\u00a0G Northcutt, Lu Jiang, and Isaac\u00a0L Chuang. 2019. Confident Learning: Estimating Uncertainty in Dataset Labels. arXiv preprint arXiv:1911.00068(2019)."},{"key":"e_1_3_2_1_26_1","volume-title":"A Taxonomy of Data Quality Problems. Journal of Data and Information Quality - JDIQ (01","author":"Oliveira Paulo","year":"2005","unstructured":"Paulo Oliveira, F\u00e1tima Rodrigues, Pedro Rangel\u00a0Henriques, and Helena Galhardas. 2005. A Taxonomy of Data Quality Problems. Journal of Data and Information Quality - JDIQ (01 2005)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3469468"},{"key":"e_1_3_2_1_28_1","unstructured":"Erhard Rahm and Hong\u00a0Hai Do. 2000. Data cleaning: Problems and current approaches. (2000)."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning. PMLR, 5351\u20135360","author":"Ramamurthy Karthikeyan\u00a0Natesan","year":"2019","unstructured":"Karthikeyan\u00a0Natesan Ramamurthy, Kush Varshney, and Krishnan Mody. 2019. Topological data analysis of decision boundaries with application to model selection. In International Conference on Machine Learning. PMLR, 5351\u20135360."},{"key":"e_1_3_2_1_30_1","unstructured":"Vijayshankar Raman and Joseph\u00a0M Hellerstein. 2001. Potter\u2019s wheel: An interactive data cleaning system. In VLDB Vol.\u00a01. 381\u2013390."},{"key":"e_1_3_2_1_31_1","volume-title":"Holoclean: Holistic data repairs with probabilistic inference. arXiv preprint arXiv:1702.00820(2017).","author":"Rekatsinas Theodoros","year":"2017","unstructured":"Theodoros Rekatsinas, Xu Chu, Ihab\u00a0F Ilyas, and Christopher R\u00e9. 2017. Holoclean: Holistic data repairs with probabilistic inference. arXiv preprint arXiv:1702.00820(2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3328519.3329133"},{"key":"e_1_3_2_1_33_1","volume-title":"An instance level analysis of data complexity. Machine learning 95, 2","author":"Smith R","year":"2014","unstructured":"Michael\u00a0R Smith, Tony Martinez, and Christophe Giraud-Carrier. 2014. An instance level analysis of data complexity. Machine learning 95, 2 (2014), 225\u2013256."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178708"},{"key":"e_1_3_2_1_35_1","unstructured":"Jinsung Yoon Sercan\u00a0O Arik and Tomas Pfister. 2019. Data Valuation using Reinforcement Learning. arXiv preprint arXiv:1909.11671(2019)."}],"event":{"name":"CODS-COMAD 2022: 5th Joint International Conference on Data Science & Management of Data (9th ACM IKDD CODS and 27th COMAD)","location":"Bangalore India","acronym":"CODS-COMAD 2022","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the 5th Joint International Conference on Data Science &amp; Management of Data (9th ACM IKDD CODS and 27th COMAD)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3493700.3493774","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3493700.3493774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:44Z","timestamp":1750188644000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3493700.3493774"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,8]]},"references-count":35,"alternative-id":["10.1145\/3493700.3493774","10.1145\/3493700"],"URL":"https:\/\/doi.org\/10.1145\/3493700.3493774","relation":{},"subject":[],"published":{"date-parts":[[2022,1,8]]},"assertion":[{"value":"2022-01-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}