{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T14:20:14Z","timestamp":1780582814520,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,8,14]],"date-time":"2021-08-14T00:00:00Z","timestamp":1628899200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,8,14]]},"DOI":"10.1145\/3447548.3470817","type":"proceedings-article","created":{"date-parts":[[2021,8,12]],"date-time":"2021-08-12T06:12:03Z","timestamp":1628748723000},"page":"4040-4041","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":86,"title":["Data Quality for Machine Learning Tasks"],"prefix":"10.1145","author":[{"given":"Nitin","family":"Gupta","sequence":"first","affiliation":[{"name":"IBM Research India, Delhi, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shashank","family":"Mujumdar","sequence":"additional","affiliation":[{"name":"IBM Research India, Delhi, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hima","family":"Patel","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Satoshi","family":"Masuda","sequence":"additional","affiliation":[{"name":"IBM Research Japan, Tokyo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Naveen","family":"Panwar","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sambaran","family":"Bandyopadhyay","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sameep","family":"Mehta","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shanmukha","family":"Guttula","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shazia","family":"Afzal","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ruhi","family":"Sharma Mittal","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vitobha","family":"Munigala","sequence":"additional","affiliation":[{"name":"IBM Research India, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,8,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume":"2014","author":"Adikaram KKLB","unstructured":"KKLB Adikaram , MA Hussein , M Effenberger , and T Becker . [n.d.]. Outlier detection method in linear regression based on sum of arithmetic progression. The Scientific World Journal 2014 ([n. d.]). KKLB Adikaram, MA Hussein, M Effenberger, and T Becker. [n.d.]. Outlier detection method in linear regression based on sum of arithmetic progression. The Scientific World Journal 2014 ([n. d.]).","journal-title":"The Scientific World Journal"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Ateret Anaby-Tavor Boaz Carmeli Esther Goldbraich Amir Kantor George Kour Segev Shlomov Naama Tepper and Naama Zwerdling. 2020. Do Not Have Enough Data? Deep Learning to the Rescue!. In AAAI. 7383--7390.  Ateret Anaby-Tavor Boaz Carmeli Esther Goldbraich Amir Kantor George Kour Segev Shlomov Naama Tepper and Naama Zwerdling. 2020. Do Not Have Enough Data? Deep Learning to the Rescue!. In AAAI. 7383--7390.","DOI":"10.1609\/aaai.v34i05.6233"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Gowtham Atluri Anuj Karpatne and Vipin Kumar. 2018. Spatio-Temporal Data Mining: A Survey of Problems and Methods. ACM Comput. Surv. (2018).  Gowtham Atluri Anuj Karpatne and Vipin Kumar. 2018. Spatio-Temporal Data Mining: A Survey of Problems and Methods. ACM Comput. Surv. (2018).","DOI":"10.1145\/3161602"},{"key":"e_1_3_2_1_4_1","volume-title":"1st International Workshop on Data Assessment and Readiness for AI.. In PAKDD (Workshops).","author":"Bandyopadhyay Bortik","year":"2021","unstructured":"Bortik Bandyopadhyay , Sambaran Bandyopadhyay , Srikanta Bedathur , Nitin Gupta , Sameep Mehta , Shashank Mujumdar , Srinivasan Parthasarathy , and Hima Patel . 2021 . 1st International Workshop on Data Assessment and Readiness for AI.. In PAKDD (Workshops). Bortik Bandyopadhyay, Sambaran Bandyopadhyay, Srikanta Bedathur, Nitin Gupta, Sameep Mehta, Shashank Mujumdar, Srinivasan Parthasarathy, and Hima Patel. 2021. 1st International Workshop on Data Assessment and Readiness for AI.. In PAKDD (Workshops)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313602"},{"key":"e_1_3_2_1_6_1","volume-title":"Evolutionary Data Measures: Understanding the Difficulty of Text Classification Tasks. arXiv","author":"Collins Edward","year":"2018","unstructured":"Edward Collins , Nikolai Rozanov , and Bingbing Zhang . 2018. Evolutionary Data Measures: Understanding the Difficulty of Text Classification Tasks. arXiv ( 2018 ). Edward Collins, Nikolai Rozanov, and Bingbing Zhang. 2018. Evolutionary Data Measures: Understanding the Difficulty of Text Classification Tasks. arXiv (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"Juan Carlos Corrales, and Agapito Ledezma","author":"Corrales David Camilo","year":"2018","unstructured":"David Camilo Corrales , Juan Carlos Corrales, and Agapito Ledezma . 2018 . How to address the data quality issues in regression models: a guided process for data cleaning. Symmetry ( 2018). David Camilo Corrales, Juan Carlos Corrales, and Agapito Ledezma. 2018. How to address the data quality issues in regression models: a guided process for data cleaning. Symmetry (2018)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-13059-5_22"},{"key":"e_1_3_2_1_9_1","volume-title":"Foundations of data quality management. Synthesis Lectures on Data Management","author":"Fan Wenfei","year":"2012","unstructured":"Wenfei Fan and Floris Geerts . 2012. Foundations of data quality management. Synthesis Lectures on Data Management ( 2012 ). Wenfei Fan and Floris Geerts. 2012. Foundations of data quality management. Synthesis Lectures on Data Management (2012)."},{"key":"e_1_3_2_1_10_1","volume":"201","author":"Gupta M.","unstructured":"M. Gupta , J. Gao , C. C. Aggarwal , and J. Han. 201 4. Outlier Detection for Temporal Data: A Survey. IEEE Transactions on Knowledge and Data Engineering (2014). M. Gupta, J. Gao, C. C. Aggarwal, and J. Han. 2014. Outlier Detection for Temporal Data: A Survey. IEEE Transactions on Knowledge and Data Engineering (2014).","journal-title":"J. Han."},{"key":"e_1_3_2_1_11_1","volume-title":"2nd International Workshop on Data Quality Assessment for Machine Learning. In KDD (Workshops).","author":"Gupta Nitin","year":"2021","unstructured":"Nitin Gupta , Hima Patel , Srikanta Bedathur , Sameep Mehta , Shashank Mujumdar , Fuyuki Ishikawa , Laure Berti-Equille , Shazia Afzal , Satoshi Masuda , and Yasuharu Nishi . 2021 . 2nd International Workshop on Data Quality Assessment for Machine Learning. In KDD (Workshops). Nitin Gupta, Hima Patel, Srikanta Bedathur, Sameep Mehta, Shashank Mujumdar, Fuyuki Ishikawa, Laure Berti-Equille, Shazia Afzal, Satoshi Masuda, and Yasuharu Nishi. 2021. 2nd International Workshop on Data Quality Assessment for Machine Learning. In KDD (Workshops)."},{"key":"e_1_3_2_1_12_1","volume-title":"Ruhi Sharma Mittal, and Vitobha Munigala","author":"Jain Abhinav","year":"2020","unstructured":"Abhinav Jain , Hima Patel , Lokesh Nagalapatti , Nitin Gupta , Sameep Mehta , Shanmukha Guttula , Shashank Mujumdar , Shazia Afzal , Ruhi Sharma Mittal, and Vitobha Munigala . 2020 . Overview and Importance of Data Quality for Machine Learning Tasks. In KDD. Abhinav Jain, Hima Patel, Lokesh Nagalapatti, Nitin Gupta, Sameep Mehta, Shanmukha Guttula, Shashank Mujumdar, Shazia Afzal, Ruhi Sharma Mittal, and Vitobha Munigala. 2020. Overview and Importance of Data Quality for Machine Learning Tasks. In KDD."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Ramakrishnan Kannan Hyenkyun Woo Charu C Aggarwal and Haesun Park. 2017. Outlier detection for text data. In ICDM.  Ramakrishnan Kannan Hyenkyun Woo Charu C Aggarwal and Haesun Park. 2017. Outlier detection for text data. In ICDM.","DOI":"10.1137\/1.9781611974973.55"},{"key":"e_1_3_2_1_14_1","unstructured":"Edwin M. Knorr and Raymond T. Ng. [n.d.]. Algorithms for Mining DistanceBased Outliers in Large Datasets. In VLDB.  Edwin M. Knorr and Raymond T. Ng. [n.d.]. Algorithms for Mining DistanceBased Outliers in Large Datasets. In VLDB."},{"key":"e_1_3_2_1_15_1","volume-title":"Jens Lehmann, Marcilio CP Souto, and Tin Kam Ho.","author":"Lorena Ana C","year":"2019","unstructured":"Ana C Lorena , Lu\u00eds PF Garcia , Jens Lehmann, Marcilio CP Souto, and Tin Kam Ho. 2019 . How Complex is your classification problem? A survey on measuring classification complexity. CSUR ( 2019). Ana C Lorena, Lu\u00eds PF Garcia, Jens Lehmann, Marcilio CP Souto, and Tin Kam Ho. 2019. How Complex is your classification problem? A survey on measuring classification complexity. CSUR (2019)."},{"key":"e_1_3_2_1_16_1","unstructured":"Y. Lu Y. Cheung and Y. Y. Tang. 2019. Bayes Imbalance Impact Index: A Measure of Class Imbalanced Data Set for Classification Problem. TNNLS (2019).  Y. Lu Y. Cheung and Y. Y. Tang. 2019. Bayes Imbalance Impact Index: A Measure of Class Imbalanced Data Set for Classification Problem. TNNLS (2019)."},{"key":"e_1_3_2_1_17_1","volume-title":"Confident Learning: Estimating Uncertainty in Dataset Labels. arXiv","author":"Northcutt Curtis G","year":"2019","unstructured":"Curtis G Northcutt , Lu Jiang , and Isaac L Chuang . 2019 . Confident Learning: Estimating Uncertainty in Dataset Labels. arXiv (2019). Curtis G Northcutt, Lu Jiang, and Isaac L Chuang. 2019. Confident Learning: Estimating Uncertainty in Dataset Labels. arXiv (2019)."},{"key":"e_1_3_2_1_18_1","unstructured":"Marco Ribeiro Tongshuang Wu Carlos Guestrin and Sameer Singh. [n.d.]. Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. arXiv ([n. d.]).  Marco Ribeiro Tongshuang Wu Carlos Guestrin and Sameer Singh. [n.d.]. Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. arXiv ([n. d.])."},{"key":"e_1_3_2_1_19_1","volume-title":"Spatiotemporal Data Mining: A Computational Perspective. ISPRS International Journal of Geo-Information","author":"Shekhar Shashi","year":"2015","unstructured":"Shashi Shekhar , Zhe Jiang , Reem Y. Ali , Emre Eftelioglu , Xun Tang , Venkata M. V. Gunturi , and Xun Zhou . 2015. Spatiotemporal Data Mining: A Computational Perspective. ISPRS International Journal of Geo-Information ( 2015 ). Shashi Shekhar, Zhe Jiang, Reem Y. Ali, Emre Eftelioglu, Xun Tang, Venkata M. V. Gunturi, and Xun Zhou. 2015. Spatiotemporal Data Mining: A Computational Perspective. ISPRS International Journal of Geo-Information (2015)."},{"key":"e_1_3_2_1_20_1","volume-title":"Dataset cartography: Mapping and diagnosing datasets with training dynamics. arXiv","author":"Swayamdipta Swabha","year":"2020","unstructured":"Swabha Swayamdipta , Roy Schwartz , Nicholas Lourie , Yizhong Wang , Hannaneh Hajishirzi , Noah A Smith , and Yejin Choi . 2020. Dataset cartography: Mapping and diagnosing datasets with training dynamics. arXiv ( 2020 ). Swabha Swayamdipta, Roy Schwartz, Nicholas Lourie, Yizhong Wang, Hannaneh Hajishirzi, Noah A Smith, and Yejin Choi. 2020. Dataset cartography: Mapping and diagnosing datasets with training dynamics. arXiv (2020)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-40669-0_33"},{"key":"e_1_3_2_1_22_1","unstructured":"S. Wang J. Cao and P. Yu. 2020. Deep Learning for Spatio-Temporal Data Mining: A Survey. TKDE (2020).  S. Wang J. Cao and P. Yu. 2020. Deep Learning for Spatio-Temporal Data Mining: A Survey. TKDE (2020)."},{"key":"e_1_3_2_1_23_1","volume-title":"Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv","author":"Wei Jason","year":"2019","unstructured":"Jason Wei and Kai Zou . 2019 . Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv (2019). Jason Wei and Kai Zou. 2019. Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv (2019)."},{"key":"e_1_3_2_1_24_1","unstructured":"Jinsung Yoon Sercan Arik and Tomas Pfister. 2020. Data valuation using reinforcement learning. In ICML.  Jinsung Yoon Sercan Arik and Tomas Pfister. 2020. Data valuation using reinforcement learning. In ICML."}],"event":{"name":"KDD '21: The 27th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Virtual Event Singapore","acronym":"KDD '21","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery &amp; Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3447548.3470817","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3447548.3470817","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:32Z","timestamp":1750191512000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3447548.3470817"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,14]]},"references-count":24,"alternative-id":["10.1145\/3447548.3470817","10.1145\/3447548"],"URL":"https:\/\/doi.org\/10.1145\/3447548.3470817","relation":{},"subject":[],"published":{"date-parts":[[2021,8,14]]},"assertion":[{"value":"2021-08-14","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}