{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,15]],"date-time":"2025-06-15T03:40:04Z","timestamp":1749958804600,"version":"3.41.0"},"reference-count":35,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016,12]]},"DOI":"10.1109\/bigdata.2016.7840769","type":"proceedings-article","created":{"date-parts":[[2017,2,7]],"date-time":"2017-02-07T21:46:59Z","timestamp":1486504019000},"page":"1595-1602","source":"Crossref","is-referenced-by-count":6,"title":["Data quality: Experiences and lessons from operationalizing big data"],"prefix":"10.1109","author":[{"given":"Archana","family":"Ganapathi","sequence":"first","affiliation":[]},{"given":"Yanpei","family":"Chen","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"journal-title":"Wikipedia","article-title":"Data Janitor","year":"0","key":"ref33"},{"key":"ref32","article-title":"Data Scientists are Like Forrest Gump, Scrubbing Data with Toothbrushes","volume":"28","author":"popescu","year":"2013","journal-title":"Silicon Angle"},{"key":"ref31","article-title":"Data scientist: The sexiest job of the 21st century","volume":"10","author":"davenport","year":"2012","journal-title":"Harvard Business Review"},{"key":"ref30","article-title":"For big-data scientists, janitor work is key hurdle to insights","author":"lohr","year":"2014","journal-title":"New York Times"},{"key":"ref35","doi-asserted-by":"crossref","DOI":"10.1111\/j.2517-6161.1951.tb00088.x","article-title":"The interpretation of interaction in contingency tables","author":"simpson","year":"1951","journal-title":"Journal of the Royal Statistical Society"},{"key":"ref34","article-title":"Relational data cleaning: A statistical perspective","author":"krishnan","year":"2016","journal-title":"Invited academia exchange seminar 
Splunk"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522737"},{"year":"0","key":"ref11","article-title":"Apache Kafka"},{"key":"ref12","article-title":"MLbase: A Distributed Machine Learning System","author":"kraska","year":"2013","journal-title":"CIDR"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/2465351.2465355"},{"key":"ref14","article-title":"TensorFlow: Large-scale machine learning on heterogeneous distributed systems","author":"dean","year":"2015","journal-title":"Google Research Whitepaper"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2043556.2043566"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2008.277"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1111\/rssb.12050"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1561\/1900000045"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2012.219"},{"key":"ref4","article-title":"Impala: A Modern, Open-Source SQL Engine for Hadoop","author":"kornacker","year":"2015","journal-title":"CIDR"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/2939502.2939511"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/1294261.1294281"},{"key":"ref6","doi-asserted-by":"crossref","DOI":"10.1145\/2670979.2670985","article-title":"Tachyon: Reliable, Memory Speed Storage for Cluster Computing Frameworks","author":"li","year":"2014","journal-title":"SOCC"},{"key":"ref29","article-title":"Cleaning big data: Most time-consuming, least enjoyable data science task, survey says","author":"press","year":"2016","journal-title":"Forbes"},{"key":"ref5","article-title":"Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster 
Computing","author":"zaharia","year":"2012","journal-title":"NSDI"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/1807167.1807184"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1272996.1273005"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/945449.945450"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.14778\/2212351.2212354"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1327452.1327492"},{"key":"ref20","article-title":"Quantitative data cleaning for large databases","author":"hellerstein","year":"2008","journal-title":"United Nations Economic Commission for Europe (UNECE)"},{"journal-title":"Advanced Analytics with Spark","year":"2015","author":"ryza","key":"ref22"},{"journal-title":"Robust Regression and Outlier Detection","year":"2005","author":"rousseeuw","key":"ref21"},{"key":"ref24","article-title":"Data cleaning: Problems and current approaches","volume":"23","author":"rahm","year":"2000","journal-title":"IEEE Data Engineering Bulletin"},{"journal-title":"Data Science from Scratch","year":"2015","author":"grus","key":"ref23"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2912574"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/872757.872875"}],"event":{"name":"2016 IEEE International Conference on Big Data (Big Data)","start":{"date-parts":[[2016,12,5]]},"location":"Washington DC,USA","end":{"date-parts":[[2016,12,8]]}},"container-title":["2016 IEEE International Conference on Big Data (Big 
Data)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7818133\/7840573\/07840769.pdf?arnumber=7840769","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,15]],"date-time":"2025-06-15T03:03:19Z","timestamp":1749956599000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7840769\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,12]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/bigdata.2016.7840769","relation":{},"subject":[],"published":{"date-parts":[[2016,12]]}}}