{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T12:05:32Z","timestamp":1751889932615,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,8,1]],"date-time":"2020-08-01T00:00:00Z","timestamp":1596240000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1619028 and 1619371"],"award-info":[{"award-number":["1619028 and 1619371"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000208","name":"Institute of Museum and Library Services","doi-asserted-by":"publisher","award":["LG-71-16-0037-16"],"award-info":[{"award-number":["LG-71-16-0037-16"]}],"id":[{"id":"10.13039\/100000208","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,8]]},"DOI":"10.1145\/3383583.3398542","type":"proceedings-article","created":{"date-parts":[[2020,8,1]],"date-time":"2020-08-01T04:18:32Z","timestamp":1596255512000},"page":"177-186","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["The Case For Alternative Web Archival Formats To Expedite The Data-To-Insight Cycle"],"prefix":"10.1145","author":[{"given":"Xinyue","family":"Wang","sequence":"first","affiliation":[{"name":"Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}]},{"given":"Zhiwu","family":"Xie","sequence":"additional","affiliation":[{"name":"Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"}]}],"member":"320","published-online":{"date-parts":[[2020,8]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"12","article-title":"NoDB","volume":"58","author":"Alagiannis Ioannis","year":"2015","journal-title":"Efficient Query Execution on Raw Data Files. Commun. ACM"},{"volume-title":"Proceedings of the 16th ACM \/IEEE -CS Joint Conference on Digital Libraries (JCDL '16)","author":"Alam Sawood","key":"e_1_3_2_1_2_1"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00799-016-0184-4"},{"volume-title":"MementoMap Framework for Flexible and Adaptive Web Archive Profiling. In 2019 ACM \/IEEE Joint Conference on Digital Libraries (JCDL ). IEEE, 172--181","year":"2019","author":"Alam Sawood","key":"e_1_3_2_1_4_1"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1038\/nprot.2015.111"},{"key":"e_1_3_2_1_6_1","unstructured":"Apache. 2020 a. Apache\/Parquet-Format. The Apache Software Foundation. https:\/\/github.com\/apache\/parquet-format Retrieved Jan. 20 2020 from Apache. 2020 a. Apache\/Parquet-Format. The Apache Software Foundation. https:\/\/github.com\/apache\/parquet-format Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_7_1","unstructured":"Apache. 2020 b. Avro. The Apache Software Foundation. https:\/\/github.com\/apache\/avro Retrieved Jan. 20 2020 from Apache. 2020 b. Avro. The Apache Software Foundation. https:\/\/github.com\/apache\/avro Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_8_1","unstructured":"Internet Archive. 2020 a. Internet Archive. https:\/\/archive.org\/ Retrieved Jan. 20 2020 from Internet Archive. 2020 a. Internet Archive. https:\/\/archive.org\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_9_1","unstructured":"Internet Archive. 2020 b. Wayback Machine. https:\/\/archive.org\/web\/ Retrieved Jan. 20 2020 from Internet Archive. 2020 b. Wayback Machine. https:\/\/archive.org\/web\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Niels Br\u00fcgger and Ian Milligan (Eds.). 2019. The SAGE Handbook of Web History first ed.). SAGE Publications Ltd . Niels Br\u00fcgger and Ian Milligan (Eds.). 2019. The SAGE Handbook of Web History first ed.). SAGE Publications Ltd .","DOI":"10.4135\/9781526470546"},{"key":"e_1_3_2_1_11_1","first-page":"2012","article-title":"Space Data and Information Transfer Systems - Open Archival Information System (OAIS ) - Reference Model","volume":"14721","author":"CCSDS.","year":"2012","journal-title":"Standard ISO"},{"key":"e_1_3_2_1_12_1","unstructured":"International Internet Preservation Consortium. 2020 a. The CDX File Format. https:\/\/iipc.github.io\/warc-specifications\/specifications\/cdx-format\/cdx-2006\/ Retrieved Jan. 20 2020 from International Internet Preservation Consortium. 2020 a. The CDX File Format. https:\/\/iipc.github.io\/warc-specifications\/specifications\/cdx-format\/cdx-2006\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_13_1","unstructured":"International Internet Preservation Consortium. 2020 b. Openwayback. https:\/\/github.com\/iipc\/openwayback Retrieved Jan. 20 2020 from International Internet Preservation Consortium. 2020 b. Openwayback. https:\/\/github.com\/iipc\/openwayback Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_14_1","unstructured":"International Internet Preservation Consortium. 2020 c. OpenWayback CDXJ File Format. https:\/\/iipc.github.io\/warc-specifications\/specifications\/cdx-format\/openwayback-cdxj\/ Retrieved Jan. 20 2020 from International Internet Preservation Consortium. 2020 c. OpenWayback CDXJ File Format. https:\/\/iipc.github.io\/warc-specifications\/specifications\/cdx-format\/openwayback-cdxj\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00799-016-0171-9"},{"key":"e_1_3_2_1_16_1","unstructured":"Common Crawl. 2020. Common Crawl. https:\/\/commoncrawl.org\/ Retrieved Jan. 20 2020 from Common Crawl. 2020. Common Crawl. https:\/\/commoncrawl.org\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"H. Van de Sompel M. Nelson and R. Sanderson. 2013. HTTP Framework for Time-Based Access to Resource States -- Memento. RFC 7089. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC7089 H. Van de Sompel M. Nelson and R. Sanderson. 2013. HTTP Framework for Time-Based Access to Resource States -- Memento. RFC 7089. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC7089","DOI":"10.17487\/rfc7089"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00799-016-0206-2"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Roy T. Fielding James Gettys Jeffrey C. Mogul etal 1999. Hypertext Transfer Protocol -- HTTP\/1.1. RFC 2616. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC2616 Roy T. Fielding James Gettys Jeffrey C. Mogul et al. 1999. Hypertext Transfer Protocol -- HTTP\/1.1. RFC 2616. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC2616","DOI":"10.17487\/rfc2616"},{"volume-title":"Multipurpose Internet Mail Extensions (MIME) Part Two: Media Types. RFC 2046","year":"2046","author":"Freed Ned","key":"e_1_3_2_1_20_1"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1038\/nmeth.3041"},{"key":"e_1_3_2_1_22_1","unstructured":"Vinay Goel. 2011. Web Archive Metadata File Specification. https:\/\/webarchive.jira.com\/wiki\/spaces\/Iresearch\/pages\/13467719\/Web+Archive+Metadata+File+Specification Retrieved Jan. 20 2020 from Vinay Goel. 2011. Web Archive Metadata File Specification. https:\/\/webarchive.jira.com\/wiki\/spaces\/Iresearch\/pages\/13467719\/Web+Archive+Metadata+File+Specification Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Daniel Gomes Jo ao Miranda and Miguel Costa. 2011. A Survey on Web Archiving Initiatives. In Research and Advanced Technology for Digital Libraries Stefan Gradmann Francesca Borri Carlo Meghini and Heiko Schuldt (Eds.). Springer Berlin Heidelberg 408--420. Daniel Gomes Jo ao Miranda and Miguel Costa. 2011. A Survey on Web Archiving Initiatives. In Research and Advanced Technology for Digital Libraries Stefan Gradmann Francesca Borri Carlo Meghini and Heiko Schuldt (Eds.). Springer Berlin Heidelberg 408--420.","DOI":"10.1007\/978-3-642-24469-8_41"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Shawn Graham Ian Milligan and Scott Weingart. 2015. Exploring Big Historical Data : The Historian's Macroscope reprint ed.). Imperial College Press London . Shawn Graham Ian Milligan and Scott Weingart. 2015. Exploring Big Historical Data : The Historian's Macroscope reprint ed.). Imperial College Press London .","DOI":"10.1142\/p981"},{"key":"e_1_3_2_1_25_1","volume-title":"Astronomy and Computing","volume":"12","author":"Greenfield P.","year":"2015"},{"key":"e_1_3_2_1_26_1","unstructured":"Object Management Group. 2012. Common Object Request Broker Architecture. Standard 3.3. https:\/\/www.omg.org\/spec\/CORBA\/3.3\/ Object Management Group. 2012. Common Object Request Broker Architecture. Standard 3.3. https:\/\/www.omg.org\/spec\/CORBA\/3.3\/"},{"volume-title":"Extraction and Derivation. In 2016 ACM \/IEEE Joint Conference on Digital Libraries (JCDL ). ACM","year":"2016","author":"Holzmann Helge","key":"e_1_3_2_1_27_1"},{"key":"e_1_3_2_1_28_1","first-page":"2009","article-title":"Information and Documentation - WARC File Format","volume":"28500","author":"ISO.","year":"2009","journal-title":"Standard ISO"},{"key":"e_1_3_2_1_29_1","unstructured":"Ian Jacobs and Norman Walsh. 2004. Architecture of the World Wide Web Volume One. W3C Recommendation 15 December 2004. World Wide Web Consortium (2004). http:\/\/www.w3.org\/TR\/webarch\/ Ian Jacobs and Norman Walsh. 2004. Architecture of the World Wide Web Volume One. W3C Recommendation 15 December 2004. World Wide Web Consortium (2004). http:\/\/www.w3.org\/TR\/webarch\/"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.14778\/2732977.2732986"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0115253"},{"key":"e_1_3_2_1_32_1","first-page":"10","article-title":"Mison","volume":"10","author":"Li Yinan","year":"2017","journal-title":"A Fast JSON Parser for Data Analytics. Proc. VLDB Endow."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2567948.2579045"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097570"},{"volume-title":"CUG2016 Proceedings. Cray User Group","year":"2016","author":"Liu Jialin","key":"e_1_3_2_1_35_1"},{"volume-title":"The SAGE Handbook of Web History","author":"Mallapragada Madhavi","key":"e_1_3_2_1_36_1"},{"volume-title":"Adam: Genomics Formats and Processing Patterns for Cloud Scale Computing","year":"2013","author":"Massie Matt","key":"e_1_3_2_1_37_1"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.14778\/1920841.1920886"},{"key":"e_1_3_2_1_39_1","unstructured":"Xiangrui Meng Joseph Bradley Burak Yavuz etal 2015. MLlib : Machine Learning in Apache Spark. (May 2015). arxiv: 1505.06807 Xiangrui Meng Joseph Bradley Burak Yavuz et al. 2015. MLlib : Machine Learning in Apache Spark. (May 2015). arxiv: 1505.06807"},{"key":"e_1_3_2_1_40_1","unstructured":"Stephen Merity. 2014. Navigating the WARC File Format. Library Catalog: commoncrawl.org. Stephen Merity. 2014. Navigating the WARC File Format. Library Catalog: commoncrawl.org."},{"volume-title":"RPC: Remote Procedure Call Protocol specification. RFC 1050. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC1050","year":"1988","author":"Microsystems Sun","key":"e_1_3_2_1_41_1"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Ian Milligan. 2019. History in the Age of Abundance ?: How the Web Is Transforming Historical Research .McGill-Queen's University Press Montreal . Ian Milligan. 2019. History in the Age of Abundance ?: How the Web Is Transforming Historical Research .McGill-Queen's University Press Montreal .","DOI":"10.1515\/9780773558212"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ascom.2015.07.001"},{"key":"e_1_3_2_1_44_1","unstructured":"Sebastian Nagel. 2018. Index to WARC Files and URLs in Columnar Format. https:\/\/commoncrawl.org\/2018\/03\/index-to-warc-files-and-urls-in-columnar-format\/ Retrieved Jan. 20 2020 from Sebastian Nagel. 2018. Index to WARC Files and URLs in Columnar Format. https:\/\/commoncrawl.org\/2018\/03\/index-to-warc-files-and-urls-in-columnar-format\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2723372.2742787"},{"key":"e_1_3_2_1_46_1","first-page":"11","article-title":"Filter Before You Parse","volume":"11","author":"Palkar Shoumik","year":"2018","journal-title":"Faster Analytics on Raw Data with Sparser. Proc. VLDB Endow."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/1559845.1559865"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41781-018-0014-z"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"P. Resnick. 2001. Internet Message Format. RFC 2822. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC2822 P. Resnick. 2001. Internet Message Format. RFC 2822. Internet Engineering Task Force. https:\/\/doi.org\/10.17487\/RFC2822","DOI":"10.17487\/rfc2822"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Hany M. SalahEldeen and Michael L. Nelson. 2012. Losing My Revolution : How Many Resources Shared on Social Media Have Been Lost ?. In Theory and Practice of Digital Libraries (Lecture Notes in Computer Science ) Panayiotis Zaphiris George Buchanan Edie Rasmussen and Fernando Loizides (Eds.). Springer Berlin Heidelberg 125--137. https:\/\/doi.org\/10.1007\/978--3--642--33290--6_14 Hany M. SalahEldeen and Michael L. Nelson. 2012. Losing My Revolution : How Many Resources Shared on Social Media Have Been Lost ?. In Theory and Practice of Digital Libraries (Lecture Notes in Computer Science ) Panayiotis Zaphiris George Buchanan Edie Rasmussen and Fernando Loizides (Eds.). Springer Berlin Heidelberg 125--137. https:\/\/doi.org\/10.1007\/978--3--642--33290--6_14","DOI":"10.1007\/978-3-642-33290-6_14"},{"key":"e_1_3_2_1_51_1","unstructured":"Reza Shiftehfar. 2018. Uber's Big Data Platform : 100+ Petabytes with Minute Latency. https:\/\/eng.uber.com\/uber-big-data-platform\/ Retrieved Jan. 20 2020 from Reza Shiftehfar. 2018. Uber's Big Data Platform : 100+ Petabytes with Minute Latency. https:\/\/eng.uber.com\/uber-big-data-platform\/ Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_52_1","unstructured":"Archives Unleashed Team. 2020. Archivesunleashed\/Aut. Archives Unleashed. https:\/\/github.com\/archivesunleashed\/aut Retrieved Jan. 20 2020 from Archives Unleashed Team. 2020. Archivesunleashed\/Aut. Archives Unleashed. https:\/\/github.com\/archivesunleashed\/aut Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ascom.2015.01.009"},{"volume-title":"Memento: Time Travel for the Web.","year":"2009","author":"de Sompel Herbert Van","key":"e_1_3_2_1_54_1"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1080\/19312458.2018.1447657"},{"key":"e_1_3_2_1_56_1","unstructured":"Webrecorder. 2020. Pywb. Webrecorder. https:\/\/github.com\/webrecorder\/pywb Retrieved Jan. 20 2020 from Webrecorder. 2020. Pywb. Webrecorder. https:\/\/github.com\/webrecorder\/pywb Retrieved Jan. 20 2020 from"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.3233\/ISU-170853"},{"volume-title":"SDAC : Porting Scientific Data to Spark RDDs. In Network and Parallel Computing (Lecture Notes in Computer Science)","year":"2017","author":"Yang Tian","key":"e_1_3_2_1_58_1"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2015.2427795"}],"event":{"name":"JCDL '20: The ACM\/IEEE Joint Conference on Digital Libraries in 2020","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGIR ACM Special Interest Group on Information Retrieval","IEEE Institute of Electrical and Electronics Engineers"],"location":"Virtual Event China","acronym":"JCDL '20"},"container-title":["Proceedings of the ACM\/IEEE Joint Conference on Digital Libraries in 2020"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3383583.3398542","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3383583.3398542","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3383583.3398542","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:01:11Z","timestamp":1750197671000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3383583.3398542"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,8]]},"references-count":59,"alternative-id":["10.1145\/3383583.3398542","10.1145\/3383583"],"URL":"https:\/\/doi.org\/10.1145\/3383583.3398542","relation":{},"subject":[],"published":{"date-parts":[[2020,8]]},"assertion":[{"value":"2020-08-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}