{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T15:14:01Z","timestamp":1768403641861,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T00:00:00Z","timestamp":1652659200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"State of Upper Austria","award":["FFG-Nr. 865891 & 888127"],"award-info":[{"award-number":["FFG-Nr. 865891 & 888127"]}]},{"name":"Austrian Federal Ministry for Climate Action, Environment, Energy, Mobility, Innovation and Technology"},{"name":"Austrian Federal Ministry for Digital and Economic Affairs"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,5,16]]},"DOI":"10.1145\/3522664.3528590","type":"proceedings-article","created":{"date-parts":[[2022,10,17]],"date-time":"2022-10-17T16:30:14Z","timestamp":1666024214000},"page":"229-239","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":38,"title":["Data smells"],"prefix":"10.1145","author":[{"given":"Harald","family":"Foidl","sequence":"first","affiliation":[{"name":"University of Innsbruck, Austria"}]},{"given":"Michael","family":"Felderer","sequence":"additional","affiliation":[{"name":"University of Innsbruck, Austria"}]},{"given":"Rudolf","family":"Ramler","sequence":"additional","affiliation":[{"name":"Hagenberg GmbH, Austria"}]}],"member":"320","published-online":{"date-parts":[[2022,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.14778\/2994509.2994518"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-021-00419-9"},{"key":"e_1_3_2_1_3_1","volume-title":"CheckCell. In Proceedings of the 2014 ACM International Conference on Object Oriented Programming Systems Languages & Applications, Andrew Black and Todd Millstein (Eds.). ACM, 507--523","author":"Barowy Daniel W.","unstructured":"Daniel W. Barowy, Dimitar Gochev, and Emery D. Berger. 2014. CheckCell. In Proceedings of the 2014 ACM International Conference on Object Oriented Programming Systems Languages & Applications, Andrew Black and Todd Millstein (Eds.). ACM, 507--523."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Carlo Batini Monica Scannapieco et al. 2016. Data and information quality. Springer Cham Switzerland.","DOI":"10.1007\/978-3-319-24106-7"},{"key":"e_1_3_2_1_5_1","first-page":"51","article-title":"Automated Data Validation in Machine Learning Systems","volume":"44","author":"Biessmann Felix","year":"2021","unstructured":"Felix Biessmann, Jacek Golebiowski, Tammo Rukat, Dustin Lange, and Philipp Schmidt. 2021. Automated Data Validation in Machine Learning Systems. Bulletin of the IEEE Computer Society Technical Committee on Data Engineering 44, 1 (2021), 51--65.","journal-title":"Bulletin of the IEEE Computer Society Technical Committee on Data Engineering"},{"key":"e_1_3_2_1_6_1","volume-title":"Characterizing Technical Debt and Antipatterns in AI-Based Systems: A Systematic Mapping Study. arXiv preprint arXiv:2103.09783","author":"Bogner Justus","year":"2021","unstructured":"Justus Bogner, Roberto Verdecchia, and Ilias Gerostathopoulos. 2021. Characterizing Technical Debt and Antipatterns in AI-Based Systems: A Systematic Mapping Study. arXiv preprint arXiv:2103.09783 (2021)."},{"key":"e_1_3_2_1_7_1","volume-title":"Helena Holmstr\u00f6m Olsson, and Ivica Crnkovic","author":"Bosch Jan","year":"2021","unstructured":"Jan Bosch, Helena Holmstr\u00f6m Olsson, and Ivica Crnkovic. 2021. Engineering AI systems: A research agenda. In Artificial Intelligence Paradigms for Smart Cyber-Physical Systems. IGI Global, 1--19."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2020.110542"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 2nd SysML Conference.","author":"Breck Eric","year":"2019","unstructured":"Eric Breck, Neoklis Polyzotis, Sudip Roy, Steven Whang, and Martin Zinkevich. 2019. Data Validation for Machine Learning. In Proceedings of the 2nd SysML Conference."},{"key":"e_1_3_2_1_10_1","volume-title":"SysML Conference.","author":"Breck Eric","year":"2018","unstructured":"Eric Breck, Neoklis Polyzotis, Sudip Roy, Steven Euijong Whang, and Martin Zinkevich. 2018. Data infrastructure for machine learning. In SysML Conference."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3384707"},{"key":"e_1_3_2_1_12_1","unstructured":"coronadatascraper. 2020. Data has a gap between 2020-3-11 and 2020-3-24 #375: GitHub. https:\/\/github.com\/covidatlas\/coronadatascraper\/issues\/375"},{"key":"e_1_3_2_1_13_1","unstructured":"covid19india react. 2020. Rajasthan District names are wrong #321: GitHub. https:\/\/github.com\/covid19india\/covid19india-react\/issues\/321"},{"key":"e_1_3_2_1_14_1","volume-title":"The Wall Street Journal 1","author":"Davenport Thomas H.","year":"2015","unstructured":"Thomas H. Davenport. 2015. Lessons-from-the-Cognitive-Front-Lines-Early-Adopters-of-IBMs-Watson. The Wall Street Journal 1 (2015). https:\/\/www.tomdavenport.com\/wp-content\/uploads\/2019\/01\/Lessons-from-the-Cognitive-Front-Lines-Early-Adopters-of-IBMs-Watson.pdf"},{"key":"e_1_3_2_1_15_1","first-page":"42","article-title":"Validating Data and Models in Continuous ML pipelines","volume":"44","author":"Dreves Mike","year":"2021","unstructured":"Mike Dreves, Gene Huang, Zhuo Peng, Neoklis Polyzotis, Evan Rosen, and Paul Suganthan GC. 2021. Validating Data and Models in Continuous ML pipelines. Bulletin of the IEEE Computer Society Technical Committee on Data Engineering 44, 1 (2021), 42--50.","journal-title":"Bulletin of the IEEE Computer Society Technical Committee on Data Engineering"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDIM.2018.8846984"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340482.3342743"},{"key":"e_1_3_2_1_18_1","volume-title":"Information Processing and Management of Uncertainty in Knowledge-Based Systems","author":"Foorthuis Ralph","unstructured":"Ralph Foorthuis. 2018. A Typology of Data Anomalies. In Information Processing and Management of Uncertainty in Knowledge-Based Systems. Theory and Foundations, Jes\u00fas Medina, Manuel Ojeda-Aciego, Jos\u00e9 Luis Verdegay, David A. Pelta, Inma P. Cabrera, Bernadette Bouchon-Meunier, and Ronald R. Yager (Eds.). Communications in Computer and Information Science, Vol. 854. Springer International Publishing, Cham, 26--38."},{"key":"e_1_3_2_1_19_1","unstructured":"Martin Fowler Kent Beck J. Brant W. Opdyke and D. Roberts. 1999. Refactoring: improving the design of existing code ser. In Addison Wesley object technology series. Addison-Wesley."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5381\/jot.2013.12.2.a1"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2018.09.006"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 2007 MIT International Conference on Information Quality (ICIQ).","author":"Ge Mouzhi","year":"2007","unstructured":"Mouzhi Ge and Markus Helfert. 2007. A review of Information Quality Research: Develop a Research Agenda. In Proceedings of the 2007 MIT International Conference on Information Quality (ICIQ)."},{"key":"e_1_3_2_1_23_1","unstructured":"Great Expectations. 2022. https:\/\/greatexpectations.io\/"},{"key":"e_1_3_2_1_24_1","volume-title":"Shanmukha Guttula, Abhinav Jain, Lokesh Nagalapatti, Sameep Mehta, Sandeep Hans, et al.","author":"Gupta Nitin","year":"2021","unstructured":"Nitin Gupta, Hima Patel, Shazia Afzal, Naveen Panwar, Ruhi Sharma Mittal, Shanmukha Guttula, Abhinav Jain, Lokesh Nagalapatti, Sameep Mehta, Sandeep Hans, et al. 2021. Data Quality Toolkit: Automatic assessment of data quality and remediation for machine learning datasets. arXiv preprint arXiv:2108.05935 (2021)."},{"key":"e_1_3_2_1_25_1","unstructured":"Jacob Harris. 2014. Distrust Your Data. https:\/\/source.opennews.org\/articles\/distrust-your-data\/"},{"key":"e_1_3_2_1_26_1","volume-title":"31st Conference on Neural Information Processing Systems (NIPS): Workshop on ML Systems.","author":"Hynes Nick","year":"2017","unstructured":"Nick Hynes, D. Sculley, and Michael Terry. 2017. The data linter: Lightweight, automated sanity checking for ml data sets. In 31st Conference on Neural Information Processing Systems (NIPS): Workshop on ML Systems."},{"key":"e_1_3_2_1_27_1","volume-title":"Ilyas and Xu Chu","author":"Ihab","year":"2019","unstructured":"Ihab F. Ilyas and Xu Chu. 2019. Data cleaning. Morgan & Claypool."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3338906.3338955"},{"key":"e_1_3_2_1_29_1","unstructured":"Nikolas Iubel. 2014. Ensuring Accuracy in Data Journalism. https:\/\/github.com\/nikeiubel\/data-smells\/wiki\/Ensuring-Accuracy-in-Data-Journalism"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-31128-4_15"},{"key":"e_1_3_2_1_31_1","volume-title":"DBKDA 2019: The Eleventh International Conference on Advances in Databases, Knowledge, and Data Applications. 57--60","author":"Borovina Josko Joao Marcelo","year":"2019","unstructured":"Joao Marcelo Borovina Josko, Lisa Ehrlinger, and Wolfram W\u00f6\u00df. 2019. Towards a Knowledge Graph to Describe and Process Data Defects. In DBKDA 2019: The Eleventh International Conference on Advances in Databases, Knowledge, and Data Applications. 57--60."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"Mark Kasunic James McCurley Dennis Goldenson and David Zubrow. 2011. An Investigation of Techniques for Detecting Data Anomalies in Earned Value Management Data. 10.21236\/ADA591417","DOI":"10.21236\/ADA591417"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1021564703268"},{"key":"e_1_3_2_1_34_1","volume-title":"Characterizing and Detecting Mismatch in Machine-Learning-Enabled Systems. arXiv preprint arXiv:2103.14101","author":"Lewis Grace A.","year":"2021","unstructured":"Grace A. Lewis, Stephany Bellomo, and Ipek Ozkaya. 2021. Characterizing and Detecting Mismatch in Machine-Learning-Enabled Systems. arXiv preprint arXiv:2103.14101 (2021)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.5176\/2010-2283_1.2.52"},{"key":"e_1_3_2_1_36_1","unstructured":"Steve Lohr. 2021. What Ever Happened to IBM's Watson? https:\/\/www.nytimes.com\/2021\/07\/16\/technology\/what-happened-ibm-watson.html"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-SEIP52600.2021.00034"},{"key":"e_1_3_2_1_38_1","volume-title":"Anna Maria Vollmer, and Stefan Wagner","author":"Mart\u00ednez-Fern\u00e1ndez Silverio","year":"2021","unstructured":"Silverio Mart\u00ednez-Fern\u00e1ndez, Justus Bogner, Xavier Franch, Marc Oriol, Julien Siebert, Adam Trendowicz, Anna Maria Vollmer, and Stefan Wagner. 2021. Software Engineering for AI-Based Systems: A Survey. arXiv preprint arXiv:2105.01984 (2021)."},{"key":"e_1_3_2_1_39_1","volume-title":"Impact of Data Quality in Real-Time Big Data Systems. In CEUR Workshop Proceedings.","volume":"2716","author":"Merino Jorge","year":"2020","unstructured":"Jorge Merino, Xiang Xie, Ian Lewis, Ajith Parlikad, and Duncan McFarlane. 2020. Impact of Data Quality in Real-Time Big Data Systems. In CEUR Workshop Proceedings. Vol. 2716. 73--86."},{"key":"e_1_3_2_1_40_1","unstructured":"MobyDQ. 2022. https:\/\/ubisoft.github.io\/mobydq\/"},{"key":"e_1_3_2_1_41_1","volume-title":"2nd Int. Workshop on Data and Information Quality. 219--233","author":"Oliveira Paulo","year":"2005","unstructured":"Paulo Oliveira, Fatima Rodrigues, Pedro Henriques, and Helena Galhardas. 2005. A taxonomy of data quality problems. In 2nd Int. Workshop on Data and Information Quality. 219--233."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patter.2021.100336"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3299887.3299891"},{"key":"e_1_3_2_1_44_1","volume-title":"What can Data-Centric AI Learn from Data and ML Engineering? arXiv preprint arXiv:2112.06439","author":"Polyzotis Neoklis","year":"2021","unstructured":"Neoklis Polyzotis and Matei Zaharia. 2021. What can Data-Centric AI Learn from Data and ML Engineering? arXiv preprint arXiv:2112.06439 (2021)."},{"key":"e_1_3_2_1_45_1","volume-title":"An Empirical Study of Bugs in COVID-19 Software Projects. Journal of Software Engineering Research and Development 9","author":"Ur Rahman Akond Ashfaque","year":"2021","unstructured":"Akond Ashfaque Ur Rahman and Effat Farhana. 2021. An Empirical Study of Bugs in COVID-19 Software Projects. Journal of Software Engineering Research and Development 9 (2021)."},{"key":"e_1_3_2_1_46_1","unstructured":"redgate. 2017. SQLCode Smells. http:\/\/assets.red-gate.com\/community\/books\/sql-code-smells.pdf"},{"key":"e_1_3_2_1_47_1","unstructured":"Sergey Redyuk Zoi Kaoudi Volker Markl and Sebastian Schelter. 2021. Automating Data Quality Validation for Dynamic Data Ingestion. In EDBT. 61--72."},{"key":"e_1_3_2_1_48_1","unstructured":"Casey Ross and Ike Swetlitz. 2017. IBM pitched its Watson supercomputer as a revolution in cancer care. It's nowhere close. (2017)."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the 2021 CHI 2021. 1--15","author":"Sambasivan Nithya","unstructured":"Nithya Sambasivan, Shivani Kapania, Hannah Highfill, Diana Akrong, Praveen Paritosh, and Lora M. Aroyo. 2021. \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In Proceedings of the 2021 CHI 2021. 1--15."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2018.07.035"},{"key":"e_1_3_2_1_51_1","volume-title":"Machine Learning Systems workshop at the conference on Neural Information Processing Systems (NeurIPS).","author":"Schelter Sebastian","year":"2018","unstructured":"Sebastian Schelter, Stefan Grafberger, Philipp Schmidt, Tammo Rukat, Mario Kiessling, Andrey Taptunov, Felix Biessmann, and Dustin Lange. 2018. Deequdata quality validation for machine learning pipelines. In Machine Learning Systems workshop at the conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.14778\/3229863.3229867"},{"key":"e_1_3_2_1_53_1","volume-title":"24th International Conference on Extending Database Technology (EDBT), March 23--26","author":"Schelter Sebastian","year":"2021","unstructured":"Sebastian Schelter, Tammo Rukat, and Felix Biessmann. 2021. JENGA - A Framework to Study the Impact of Data Errors on the Predictions of Machine Learning Models. In 24th International Conference on Extending Database Technology (EDBT), March 23--26, 2021. 529--534."},{"key":"e_1_3_2_1_54_1","volume-title":"Hidden technical debt in machine learning systems. Advances in neural information processing systems 28","author":"Sculley David","year":"2015","unstructured":"David Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-Francois Crespo, and Dan Dennison. 2015. Hidden technical debt in machine learning systems. Advances in neural information processing systems 28 (2015), 2503--2511."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3183519.3183529"},{"key":"e_1_3_2_1_56_1","volume-title":"2019 IEEE International Conference on Big Data. 2913--2922","author":"Shrivastava Shrey","unstructured":"Shrey Shrivastava, Dhaval Patel, Anuradha Bhamidipaty, Wesley M. Gifford, Stuart A. Siegel, Venkata Sitaramagiridharganesh Ganapavarapu, and Jayant R. Kalagnanam. 2019. Dqa: Scalable, automated and interactive data quality advisor. In 2019 IEEE International Conference on Big Data. 2913--2922."},{"key":"e_1_3_2_1_57_1","volume-title":"Tabular Data Anomaly Patterns. In International Conference on Big Data Innovations and Applications (Innovate-Data)","author":"Sukhobok Dina","year":"2017","unstructured":"Dina Sukhobok, Nikolay Nikolov, and Dumitru Roman. 2017. Tabular Data Anomaly Patterns. In International Conference on Big Data Innovations and Applications (Innovate-Data), 21-23 August 2017. 25--34."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE48307.2020.00140"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1200\/jco.2014.32.15_suppl.6506"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/WCRE.2012.69"},{"key":"e_1_3_2_1_61_1","volume-title":"Understanding the Challenges and Assisting Developers with Developing Spark Applications","author":"Wang Zehao","unstructured":"Zehao Wang. 2021. Understanding the Challenges and Assisting Developers with Developing Spark Applications. In IEEE\/ACM 43rd International Conference on Software Engineering: Companion Proceedings (ICSE-Companion). 132--134."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2601248.2601268"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE51524.2021.9678520"}],"event":{"name":"CAIN '22: 1st Conference on AI Engineering - Software Engineering for AI","location":"Pittsburgh Pennsylvania","acronym":"CAIN '22","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","IEEE TCSC IEEE Technical Committee on Scalable Computing"]},"container-title":["Proceedings of the 1st International Conference on AI Engineering: Software Engineering for AI"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3522664.3528590","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3522664.3528590","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:09:34Z","timestamp":1750183774000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3522664.3528590"}},"subtitle":["categories, causes and consequences, and detection of suspicious data in AI-based systems"],"short-title":[],"issued":{"date-parts":[[2022,5,16]]},"references-count":63,"alternative-id":["10.1145\/3522664.3528590","10.1145\/3522664"],"URL":"https:\/\/doi.org\/10.1145\/3522664.3528590","relation":{},"subject":[],"published":{"date-parts":[[2022,5,16]]},"assertion":[{"value":"2022-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}