{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,18]],"date-time":"2025-10-18T10:57:53Z","timestamp":1760785073762,"version":"3.37.3"},"reference-count":73,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001665","name":"French Agence Nationale de la Recherche","doi-asserted-by":"publisher","award":["ANR-17-CE23-0018 (DirtyData)","ANR-20-CHIA-0026 (LearnI)"],"award-info":[{"award-number":["ANR-17-CE23-0018 (DirtyData)","ANR-20-CHIA-0026 (LearnI)"]}],"id":[{"id":"10.13039\/501100001665","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/access.2022.3168013","type":"journal-article","created":{"date-parts":[[2022,4,18]],"date-time":"2022-04-18T20:13:24Z","timestamp":1650312804000},"page":"42420-42431","source":"Crossref","is-referenced-by-count":3,"title":["Analytics on Non-Normalized Data Sources: More Learning, Rather Than More Cleaning"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2643-1848","authenticated-orcid":false,"given":"Alexis","family":"Cvetkov-Iliev","sequence":"first","affiliation":[{"name":"Soda, Inria Saclay, Palaiseau, France"}]},{"given":"Alexandre","family":"Allauzen","sequence":"additional","affiliation":[{"name":"ESPCI Paris, Paris, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1076-5122","authenticated-orcid":false,"given":"Ga\u00ebl","family":"Varoquaux","sequence":"additional","affiliation":[{"name":"Soda, Inria Saclay, Palaiseau, France"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/2339530.2339602"},{"key":"ref2","article-title":"Data engineering for data analytics: A classification of the issues, and case studies","volume-title":"arXiv:2004.12929","author":"Nazabal","year":"2020"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-018-0563-7"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM.2018.8621574"},{"volume-title":"Governing and managing big data for analytics and decision makers","year":"2014","author":"Chessell","key":"ref5"},{"volume-title":"Kaggle Industry Survey","year":"2018","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.2200\/S00655ED1V01Y201507WBE013"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313578"},{"volume-title":"Government Salaries Explorer","year":"2021","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.2307\/1924267"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/S0272-7757(01)00049-8"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1257\/jel.20160995"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/1978942.1979444"},{"key":"ref14","first-page":"1","article-title":"Data curation at scale: The data Tamer system","volume-title":"Proc. CIDR","author":"Stonebraker"},{"volume-title":"Using OpenRefine","year":"2013","author":"Verborgh","key":"ref15"},{"issue":"2","key":"ref16","first-page":"3","article-title":"Data integration: The current status and the way forward","volume":"41","author":"Stonebraker","year":"2018","journal-title":"IEEE Data Eng. Bull."},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3197387"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.14778\/2994509.2994514"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3310205"},{"issue":"3","key":"ref20","first-page":"59","article-title":"SampleClean: Fast and reliable analytics on dirty data","volume":"38","author":"Krishnan","year":"2015","journal-title":"IEEE Data Eng. Bull."},{"key":"ref21","article-title":"BoostClean: Automated error detection and repair for machine learning","volume-title":"arXiv:1711.01299","author":"Krishnan","year":"2017"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313602"},{"issue":"1","key":"ref23","first-page":"83","article-title":"Semantic integration research in the database community: A brief survey","volume":"26","author":"Doan","year":"2005","journal-title":"AI Mag."},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2007.250581"},{"key":"ref25","article-title":"The state of record linkage and current research problems","volume-title":"Statistical Research Division","author":"Winkler","year":"1999"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-018-5724-2"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1080\/01621459.1969.10501049"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/956750.956759"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21236\/ADA440386"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3183713.3196926"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/775047.775087"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00051"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.14778\/3236187.3236198"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1586"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1038\/scientificamerican0577-119"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3076246.3076251"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-45528-0"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.2992529"},{"issue":"85","key":"ref39","first-page":"2825","article-title":"Scikit-learn: Machine learning in Python","volume":"12","author":"Pedregosa","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"ref40","article-title":"On the consistency of supervised learning with missing values","volume-title":"arXiv:1902.06931","author":"Josse","year":"2020"},{"key":"ref41","first-page":"1","article-title":"NeuMiss networks: Differentiable programming for supervised learning with missing values","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Le Morvan"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1198\/jasa.2011.r10138"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.2307\/1913643"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.2307\/144855"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.2307\/2525981"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1037\/h0037350"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139025751"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/1102351.1102430"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1093\/aje\/kwq439"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1111\/ectj.12097"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1093\/ije\/dyz132"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1093\/pan\/mpp036"},{"key":"ref53","first-page":"279","article-title":"Against cleaning","volume-title":"Debates in the Digital Humanities","author":"Rawson","year":"2019"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-37177-7_5"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-008-0119-9"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2018.2865785"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.14778\/3137628.3137631"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.2200\/s00362ed1v01y201105dtm016"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.2200\/s00206ed1v01y200907aim007"},{"issue":"1","key":"ref60","first-page":"3846","article-title":"Hinge-loss Markov random fields and probabilistic soft logic","volume":"18","author":"Bach","year":"2017","journal-title":"J. Mach. Learn. Res."},{"key":"ref61","volume-title":"Fuzzy Databases: Principles and Applications","volume":"5","author":"Petry","year":"2012"},{"volume-title":"Deep Learning","year":"2016","author":"Goodfellow","key":"ref62"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2017.2754499"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389742"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2010.5447873"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-015-5900-5"},{"key":"ref67","first-page":"7712","article-title":"Probabilistic logic neural networks for reasoning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Qu"},{"key":"ref68","first-page":"1","article-title":"Graph convolutional transformer: Learning the graphical structure of electronic health records","volume-title":"Proc. AAAI","author":"Choi"},{"volume-title":"Interpretable Machine Learning","year":"2020","author":"Molnar","key":"ref69"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3336191.3371824"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btq134"},{"key":"ref72","first-page":"1","article-title":"Data wrangling: The challenging journey from the wild to the lake","volume-title":"Proc. CIDR","author":"Terrizzano"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.4018\/978-1-60960-593-3.ch008"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/9668973\/09758752.pdf?arnumber=9758752","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,22]],"date-time":"2024-01-22T20:57:54Z","timestamp":1705957074000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9758752\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":73,"URL":"https:\/\/doi.org\/10.1109\/access.2022.3168013","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2022]]}}}