{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T01:46:11Z","timestamp":1765503971217,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761551","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:18:04Z","timestamp":1762561084000},"page":"5988-5996","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Zipf-Gramming: Scaling Byte N-Grams Up to Production Sized Malware Corpora"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9900-1972","authenticated-orcid":false,"given":"Edward","family":"Raff","sequence":"first","affiliation":[{"name":"CrowdStrike, Austin, TX, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9903-8214","authenticated-orcid":false,"given":"Ryan R.","family":"Curtin","sequence":"additional","affiliation":[{"name":"Booz Allen Hamilton, McLean, VA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3593-5255","authenticated-orcid":false,"given":"Derek","family":"Everett","sequence":"additional","affiliation":[{"name":"Booz Allen Hamilton, McLean, VA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7168-1237","authenticated-orcid":false,"given":"Robert J.","family":"Joyce","sequence":"additional","affiliation":[{"name":"Booz Allen Hamilton, McLean, VA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6368-8696","authenticated-orcid":false,"given":"James","family":"Holt","sequence":"additional","affiliation":[{"name":"Laboratory for Physical Sciences, College Park, MD, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2018. VirusTotal-Free online virus malware and URL scanner. https:\/\/www. virustotal.com"},{"volume-title":"With Formulas, Graphs, and Mathematical Tables","author":"Abramowitz Milton","key":"e_1_3_2_1_2_1","unstructured":"Milton Abramowitz. 1974. Handbook of Mathematical Functions, With Formulas, Graphs, and Mathematical Tables,. Dover Publications, Inc., USA."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/0167--9473(94)90172--4"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11416-015-0260-0"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cose.2021.102500"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1137\/070710111"},{"key":"e_1_3_2_1_7_1","first-page":"2007","article-title":"What every programmer should know about memory","volume":"11","author":"Drepper Ulrich","year":"2007","unstructured":"Ulrich Drepper. 2007. What every programmer should know about memory. Red Hat, Inc 11, 2007 (2007), 2007.","journal-title":"Red Hat, Inc"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3624567"},{"key":"e_1_3_2_1_9_1","volume-title":"AMD and VIA CPUs","author":"Fog Agner","year":"2022","unstructured":"Agner Fog. 2022. Instruction tables: Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD and VIA CPUs. Technical University of Denmark (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11409"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData62323.2024.10825735"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2006.4"},{"key":"e_1_3_2_1_13_1","volume-title":"The Next Wave: Cyber Analytics Research","author":"Holt James","year":"2024","unstructured":"James Holt and Edward Raff. 2024. Malware Bytes. In The Next Wave: Cyber Analytics Research, Vol. 25. National Security Agency (NSA). Issue 1. https:\/\/www.govinfo.gov\/app\/details\/GPO-TNW-25--1--2024\/GPO-TNW-25--1--2024--6"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2046707.2046742"},{"key":"e_1_3_2_1_15_1","volume-title":"MOTIF: A Large Malware Reference Dataset with Ground Truth Family Labels. In The AAAI-22 Workshop on Artificial Intelligence for Cyber Security (AICS). doi:10","author":"Joyce Robert J","year":"2022","unstructured":"Robert J Joyce, Dev Amlani, Charles Nicholas, and Edward Raff. 2022. MOTIF: A Large Malware Reference Dataset with Ground Truth Family Labels. In The AAAI-22 Workshop on Artificial Intelligence for Cyber Security (AICS). doi:10. 48550\/arXiv.2111.15031 arXiv: 2111.15031v1."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3701716.3715212"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737431"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605764.3623907"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474369.3486867"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the Conference on Applied Machine Learning for Information Security. arXiv: 2201","author":"Joyce Robert J","year":"2021","unstructured":"Robert J Joyce, Edward Raff, and Charles Nicholas. 2021. Rank-1 Similarity Matrix Decomposition For Modeling Changes in Antivirus Consensus Through Time. In Proceedings of the Conference on Applied Machine Learning for Information Security. arXiv: 2201.00757v1."},{"key":"e_1_3_2_1_21_1","unstructured":"Daniel Lemire. 2024. How fast is rolling Karp-Rabin hashing? https:\/\/lemire. me\/blog\/2024\/02\/04\/how-fast-is-rolling-karp-rabin-hashing\/. [Accessed 04-02- 2025]."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3576915.3616625"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/18"},{"key":"e_1_3_2_1_24_1","volume-title":"Advances in Neural Information Processing Systems","volume":"20","author":"Linstead Erik","year":"2007","unstructured":"Erik Linstead, Paul Rigor, Sushil Bajracharya, Cristina Lopes, and Pierre Baldi. 2007. Mining Internet-Scale Software Repositories. In Advances in Neural Information Processing Systems, Vol. 20. Curran Associates, Inc. https:\/\/papers.nips.cc\/paper_files\/paper\/2007\/hash\/ a532400ed62e772b9dc0b86f46e583ff-Abstract.html"},{"key":"e_1_3_2_1_25_1","volume-title":"James Holt, and Kristopher Micinski.","author":"Liu Chang","year":"2024","unstructured":"Chang Liu, Rebecca Saul, Yihao Sun, Edward Raff, Maya Fuchs, Townsend Southard Pantano, James Holt, and Kristopher Micinski. 2024. Assemblage: Automatic Binary Dataset Construction for Machine Learning. Advances in Neural Information Processing Systems 37 (Dec. 2024), 58698--58715. https:\/\/proceedings. neurips.cc\/paper_files\/paper\/2024\/hash\/6bbefc73a187dd42e0dc065b4e7a0615- Abstract-Datasets_and_Benchmarks_Track.html"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3672038"},{"key":"e_1_3_2_1_27_1","unstructured":"Fred Lu Ryan R. Curtin Edward Raff Francis Ferraro and James Holt. 2024. Optimizing the OptimalWeighted Average: Efficient Distributed Sparse Classification. arXiv:2406.01753 [cs.LG]"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--540--30570--5_27"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1080\/00107510500052444"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData47090.2019.9006132"},{"key":"e_1_3_2_1_31_1","volume-title":"Edward Raff, Charles Nicholas, and James Holt.","author":"Nguyen Andre T","year":"2022","unstructured":"Andre T Nguyen, Richard Zak, Luke E Richards, Maya Fuchs, Fred Lu, Robert Brandon, Gary Lopez Munoz, Edward Raff, Charles Nicholas, and James Holt. 2022. Minimizing Compute Costs: When Should We Run More Expensive Malware Analysis?, Vol. 3391. CEUR. https:\/\/ceur-ws.org\/Vol-3391\/paper6.pdf"},{"key":"e_1_3_2_1_32_1","volume-title":"Small Effect Sizes in Malware Detection? Make Harder Train\/Test Splits! Proceedings of the Conference on Applied Machine Learning in Information Security","author":"Patel Tirth","year":"2023","unstructured":"Tirth Patel, Fred Lu, Edward Raff, Charles Nicholas, Cynthia Matuszek, and James Holt. 2023. Small Effect Sizes in Malware Detection? Make Harder Train\/Test Splits! Proceedings of the Conference on Applied Machine Learning in Information Security (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1080\/01621459.1968.10480938"},{"key":"e_1_3_2_1_34_1","volume-title":"28th USENIX Security Symposium (USENIX Security 19)","author":"Pendlebury Feargus","year":"2019","unstructured":"Feargus Pendlebury, Fabio Pierazzi, Roberto Jordaney, Johannes Kinder, and Lorenzo Cavallaro. 2019. TESSERACT: Eliminating Experimental Bias in Malware Classification across Space and Time. In 28th USENIX Security Symposium (USENIX Security 19). USENIX Association, Santa Clara, CA, 729--746. https:\/\/www.usenix.org\/conference\/usenixsecurity19\/presentation\/pendlebury"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.3758\/s13423-014-0585--6"},{"key":"e_1_3_2_1_36_1","first-page":"1","article-title":"JSAT: Java Statistical Analysis Tool, a Library for Machine Learning","volume":"18","author":"Raff Edward","year":"2017","unstructured":"Edward Raff. 2017. JSAT: Java Statistical Analysis Tool, a Library for Machine Learning. Journal of Machine Learning Research 18, 23 (2017), 1--5. http:\/\/jmlr.org\/papers\/v18\/16--131.html","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of KDD 2019 Workshop on Learning and Mining for Cybersecurity (LEMINCS'19)","author":"Raff Edward","year":"2019","unstructured":"Edward Raff, William Fleming, Richard Zak, Hyrum Anderson, Bill Finlayson, Charles K. Nicholas, Mark Mclean, William Fleming, Charles K. Nicholas, Richard Zak, and Mark Mclean. 2019. KiloGrams: Very Large N-Grams for Malware Classification. In Proceedings of KDD 2019 Workshop on Learning and Mining for Cybersecurity (LEMINCS'19). https:\/\/arxiv.org\/abs\/1908.00200"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2018.8622043"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209280.3229085"},{"key":"e_1_3_2_1_40_1","volume-title":"NeurIPS 2020 Workshop: ML Retrospectives, Surveys & Meta-Analyses (ML-RSA). arXiv:2006","author":"Raff Edward","year":"2020","unstructured":"Edward Raff and Charles Nicholas. 2020. A Survey of Machine Learning Methods and Challenges for Windows Malware Classification. In NeurIPS 2020 Workshop: ML Retrospectives, Surveys & Meta-Analyses (ML-RSA). arXiv:2006.09271 http: \/\/arxiv.org\/abs\/2006.09271 Best Paper Award."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2018.8622172"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11416-016-0283-1"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411508.3421372"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.3102\/10769986013002173"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/s100510050359"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3074801"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btt020"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3615669"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3494110.3528242"},{"key":"e_1_3_2_1_50_1","volume-title":"Zipf's law holds for phrases, not words. Scientific Reports 5, 12209 (Aug","author":"Williams Jake Ryland","year":"2015","unstructured":"Jake Ryland Williams, Paul R Lessard, Suma Desu, Eric M Clark, James P Bagrow, Christopher M Danforth, and Peter Sheridan Dodds. 2015. Zipf's law holds for phrases, not words. Scientific Reports 5, 12209 (Aug. 2015). https:\/\/www.nature. com\/articles\/srep12209#supplementary-information Publisher: The Author(s)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btac180"},{"key":"e_1_3_2_1_52_1","volume-title":"Is Function Similarity Over- Engineered? Building a Benchmark. Advances in Neural Information Processing Systems 37 (Dec","author":"Saul Rebecca","year":"2024","unstructured":"Rebecca Saul, Chang Liu, Noah Fleischmann, Richard Zak, Kristopher Micinski, Edward Raff, and James Holt. 2024. Is Function Similarity Over- Engineered? Building a Benchmark. Advances in Neural Information Processing Systems 37 (Dec. 2024), 21636--21655. https:\/\/proceedings.neurips.cc\/ paper_files\/paper\/2024\/hash\/2663c994c84a79b338bca613fe1ae223-Abstract- Datasets_and_Benchmarks_Track.html"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/MALWARE.2015.7413680"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1417"},{"key":"e_1_3_2_1_55_1","unstructured":"Matthew Sills. 2024. Optimizing Rabin-Karp Hashing. https:\/\/mattsills.github.io\/ 2024\/03\/02\/rabin-karp\/. [Accessed 04-02--2025]."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2660193.2660195"},{"key":"e_1_3_2_1_57_1","first-page":"949","article-title":"Mal-ID: Automatic Malware Detection Using Common Segment Analysis and Meta-Features","volume":"13","author":"Tahan Gil","year":"2012","unstructured":"Gil Tahan, Lior Rokach, and Yuval Shahar. 2012. Mal-ID: Automatic Malware Detection Using Common Segment Analysis and Meta-Features. Journal of Machine Learning Research 13, 33 (2012), 949--979. http:\/\/jmlr.org\/papers\/v13\/ tahan12a.html","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cose.2018.11.001"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517312.2517316"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/2020408.2020421"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/MALWARE.2017.8323963"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3271688"},{"volume-title":"Human behavior and the principle of least effort","author":"Zipf George Kingsley","key":"e_1_3_2_1_63_1","unstructured":"George Kingsley Zipf. 1949. Human behavior and the principle of least effort. Addison-Wesley Press, Oxford, England. Publication Title: Human behavior and the principle of least effort."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761551","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T01:41:11Z","timestamp":1765503671000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761551"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":63,"alternative-id":["10.1145\/3746252.3761551","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761551","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}