{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,23]],"date-time":"2025-12-23T15:33:49Z","timestamp":1766504029077,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,6,27]],"date-time":"2017-06-27T00:00:00Z","timestamp":1498521600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-AC02-06CH11357"],"award-info":[{"award-number":["DE-AC02-06CH11357"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1148484"],"award-info":[{"award-number":["1148484"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,6,27]]},"DOI":"10.1145\/3085504.3091116","type":"proceedings-article","created":{"date-parts":[[2017,6,5]],"date-time":"2017-06-05T12:50:05Z","timestamp":1496667005000},"page":"1-4","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Skluma"],"prefix":"10.1145","author":[{"given":"Paul","family":"Beckman","sequence":"first","affiliation":[{"name":"Computation Institute, University of Chicago and Argonne National Laboratory, Chicago, IL"}]},{"given":"Tyler J.","family":"Skluzacek","sequence":"additional","affiliation":[{"name":"Computation Institute, University of Chicago and Argonne National Laboratory, Chicago, IL"}]},{"given":"Kyle","family":"Chard","sequence":"additional","affiliation":[{"name":"Computation Institute, University of Chicago and Argonne National Laboratory, Chicago, IL"}]},{"given":"Ian","family":"Foster","sequence":"additional","affiliation":[{"name":"Computation Institute, University of Chicago and Argonne National Laboratory, Chicago, IL"}]}],"member":"320","published-online":{"date-parts":[[2017,6,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/www.petrel.alcf.anl.gov. Visited","author":"Management Petrel Data","year":"2017","unstructured":"Petrel Data Management and Sharing Pilot . (????). https:\/\/www.petrel.alcf.anl.gov. Visited Feb. 28, 2017 . Petrel Data Management and Sharing Pilot. (????). https:\/\/www.petrel.alcf.anl.gov. Visited Feb. 28, 2017."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDMW.2016.0055"},{"volume-title":"Proceedings of the IEEE International Conference on Big Data (Big Data). 302--310","author":"Babuji Y. N.","key":"e_1_3_2_1_3_1","unstructured":"Y. N. Babuji , K. Chard , A. Gerow , and E. Duede . 2016. Cloud Kotta: Enabling secure and scalable data analytics in the cloud . In Proceedings of the IEEE International Conference on Big Data (Big Data). 302--310 . Y. N. Babuji, K. Chard, A. Gerow, and E. Duede. 2016. Cloud Kotta: Enabling secure and scalable data analytics in the cloud. In Proceedings of the IEEE International Conference on Big Data (Big Data). 302--310."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1093\/nar\/28.1.235"},{"key":"e_1_3_2_1_5_1","first-page":"993","article-title":"Latent dirichlet allocation","author":"Blei David M","year":"2003","unstructured":"David M Blei , Andrew Y Ng , and Michael I Jordan . 2003 . Latent dirichlet allocation . Journal of machine Learning research 3 , Jan (2003), 993 -- 1022 . David M Blei, Andrew Y Ng, and Michael I Jordan. 2003. Latent dirichlet allocation. Journal of machine Learning research 3, Jan (2003), 993--1022.","journal-title":"Journal of machine Learning research 3"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.3998\/3336451.0014.103"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/eScience.2015.68"},{"volume-title":"Cloudera RecordBreaker GitHub Repository","year":"2014","key":"e_1_3_2_1_8_1","unstructured":"Cloudera. 2014. RecordBreaker. Cloudera RecordBreaker GitHub Repository ( 2014 ). https:\/\/github.com\/cloudera\/RecordBreaker\/tree\/master\/src Cloudera. 2014. RecordBreaker. Cloudera RecordBreaker GitHub Repository (2014). https:\/\/github.com\/cloudera\/RecordBreaker\/tree\/master\/src"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1175\/1520-0477(1993)074<0645:RAAUWD>2.0.CO;2"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 8th Biennial Conference on Innovative Data Systems Research (CIDR).","author":"Deng Dong","year":"2017","unstructured":"Dong Deng , Raul Castro Fernandez , Ziawasch Abedjan , Sibo Wang , Michael Stonebraker , Ahmed Elmagarmid , Ihab F Ilyasl , Samuel Madden , Mourad Ouzzani , and Nan Tang . 2017 . The Data Civilizer System . In Proceedings of the 8th Biennial Conference on Innovative Data Systems Research (CIDR). Dong Deng, Raul Castro Fernandez, Ziawasch Abedjan, Sibo Wang, Michael Stonebraker, Ahmed Elmagarmid, Ihab F Ilyasl, Samuel Madden, Mourad Ouzzani, and Nan Tang. 2017. The Data Civilizer System. In Proceedings of the 8th Biennial Conference on Innovative Data Systems Research (CIDR)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1938551.1938556"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIC.2011.64"},{"key":"e_1_3_2_1_13_1","volume-title":"RFC","author":"Freed N.","year":"2045","unstructured":"N. Freed and N. Borenstein . 1996. Multipurpose Internet Mail Extensions (MIME) . RFC 2045 . IETF. N. Freed and N. Borenstein. 1996. Multipurpose Internet Mail Extensions (MIME). RFC 2045. IETF."},{"volume-title":"Proceedings of the 3rd ACM\/IEEE-CS Joint Conference on Digital Libraries (JCDL '03)","author":"Han Hui","key":"e_1_3_2_1_14_1","unstructured":"Hui Han , C. Lee Giles , Eren Manavoglu , Hongyuan Zha , Zhenyue Zhang , and Edward A. Fox . 2003. Automatic Document Metadata Extraction Using Support Vector Machines . In Proceedings of the 3rd ACM\/IEEE-CS Joint Conference on Digital Libraries (JCDL '03) . IEEE Computer Society, Washington, DC, USA, 37--48. http:\/\/dl.acm.org\/citation.cfm?id=827140.827146 Hui Han, C. Lee Giles, Eren Manavoglu, Hongyuan Zha, Zhenyue Zhang, and Edward A. Fox. 2003. Automatic Document Metadata Extraction Using Support Vector Machines. In Proceedings of the 3rd ACM\/IEEE-CS Joint Conference on Digital Libraries (JCDL '03). IEEE Computer Society, Washington, DC, USA, 37--48. http:\/\/dl.acm.org\/citation.cfm?id=827140.827146"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 8th Biennial Conference on Innovative Data Systems Research (CIDR).","author":"Hellerstein Joseph M","year":"2017","unstructured":"Joseph M Hellerstein , Vikram Sreekanti , Joseph E Gonzalez , James Dalton , Akon Dey , Sreyashi Nag , Krishna Ramachandran , Sudhanshu Arora , Arka Bhattacharyya , Shirshanka Das , 2017 . Ground: A Data Context Service . In Proceedings of the 8th Biennial Conference on Innovative Data Systems Research (CIDR). Joseph M Hellerstein, Vikram Sreekanti, Joseph E Gonzalez, James Dalton, Akon Dey, Sreyashi Nag, Krishna Ramachandran, Sudhanshu Arora, Arka Bhattacharyya, Shirshanka Das, et al. 2017. Ground: A Data Context Service. In Proceedings of the 8th Biennial Conference on Innovative Data Systems Research (CIDR)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/3019046.3019052"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Hiroyasu Sugano Adrian Bateman Wayne Carr Jon Peterson Shingo Fujimoto and Graham Klyne. 2004. Presence information data format (PIDF). RFC 3863. IETF.  Hiroyasu Sugano Adrian Bateman Wayne Carr Jon Peterson Shingo Fujimoto and Graham Klyne. 2004. Presence information data format (PIDF). RFC 3863. IETF.","DOI":"10.17487\/rfc3863"},{"key":"e_1_3_2_1_18_1","volume-title":"Conf. on Innovative Data Systems Research.","author":"Terrizzano Ignacio","year":"2015","unstructured":"Ignacio Terrizzano , Peter M Schwarz , Mary Roth , and John E Colino . 2015 . Data Wrangling: The Challenging Journey from the Wild to the Lake .. In Conf. on Innovative Data Systems Research. Ignacio Terrizzano, Peter M Schwarz, Mary Roth, and John E Colino. 2015. Data Wrangling: The Challenging Journey from the Wild to the Lake.. In Conf. on Innovative Data Systems Research."},{"key":"e_1_3_2_1_19_1","volume-title":"of Energy","author":"Dept U.S.","year":"2017","unstructured":"U.S. Dept . of Energy . 2017 . Carbon Dioxide Information Analysis Center . (Jan 2017). ftp:\/\/cdiac.ornl.gov U.S. Dept. of Energy. 2017. Carbon Dioxide Information Analysis Center. (Jan 2017). ftp:\/\/cdiac.ornl.gov"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0029715"}],"event":{"name":"SSDBM '17: 29th International Conference on Scientific and Statistical Database Management","sponsor":["Northwestern University Northwestern University"],"location":"Chicago IL USA","acronym":"SSDBM '17"},"container-title":["Proceedings of the 29th International Conference on Scientific and Statistical Database Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3085504.3091116","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3085504.3091116","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3085504.3091116","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T21:36:57Z","timestamp":1750282617000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3085504.3091116"}},"subtitle":["A Statistical Learning Pipeline for Taming Unkempt Data Repositories"],"short-title":[],"issued":{"date-parts":[[2017,6,27]]},"references-count":20,"alternative-id":["10.1145\/3085504.3091116","10.1145\/3085504"],"URL":"https:\/\/doi.org\/10.1145\/3085504.3091116","relation":{},"subject":[],"published":{"date-parts":[[2017,6,27]]},"assertion":[{"value":"2017-06-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}