{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T10:02:13Z","timestamp":1743069733149,"version":"3.40.3"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031824807"},{"type":"electronic","value":"9783031824814"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-82481-4_27","type":"book-chapter","created":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T15:45:51Z","timestamp":1741016751000},"page":"390-404","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluation of\u00a0Document Deduplication Algorithms for\u00a0Large Text Corpora"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0603-4191","authenticated-orcid":false,"given":"Johannes","family":"Leveling","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2411-4540","authenticated-orcid":false,"given":"Lennard","family":"Helmer","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8843-5917","authenticated-orcid":false,"given":"Benny Joerg","family":"Stein","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8678-2093","authenticated-orcid":false,"given":"Dennis","family":"Wegener","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5216-4758","authenticated-orcid":false,"given":"Zoha","family":"Sheikh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5601-2093","authenticated-orcid":false,"given":"Elanton","family":"Fernandes","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6283-2310","authenticated-orcid":false,"given":"Hammam","family":"Abdelwahab","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,4]]},"reference":[{"key":"27_CR1","unstructured":"Abadji, J., et al. (eds.) Proceedings of LREC 2022, pp. 4344\u20134355. ELRA, Marseille, France (2022)"},{"key":"27_CR2","doi-asserted-by":"publisher","unstructured":"Abadji, J., Ortiz Su\u00e1rez, P.J., Romary, L., Sagot, B.: Ungoliant: An optimized pipeline for the generation of a very large-scale multilingual web corpus. In: L\u00fcngen, H., Kupietz, M., Baski, P., Barbaresi, A., Clematide, S., Pisetta, I. (eds.) Proceedings of CMLC-9, pp.\u00a01\u20139. Leibniz-Institut f\u00fcr Deutsche Sprache, Mannheim (2021). https:\/\/doi.org\/10.14618\/ids-pub-10468","DOI":"10.14618\/ids-pub-10468"},{"issue":"6","key":"27_CR3","doi-asserted-by":"publisher","first-page":"255","DOI":"10.1016\/j.ipl.2006.10.007","volume":"101","author":"PS Almeida","year":"2007","unstructured":"Almeida, P.S., Baquero, C., Pregui\u00e7a, N., Hutchison, D.: Scalable bloom filters. Inf. Process. Lett. 101(6), 255\u2013261 (2007). https:\/\/doi.org\/10.1016\/j.ipl.2006.10.007","journal-title":"Inf. Process. Lett."},{"key":"27_CR4","doi-asserted-by":"publisher","unstructured":"Biderman, S., Schoelkopf, H., Anthony, Q., Bradley, H., OBrien, K., Hallahan, E., et\u00a0al.: Pythia: a suite for analyzing large language models across training and scaling. In: Proceedings of the 40th International Conference on Machine Learning. ICML\u201923, JMLR.org (2023).https:\/\/doi.org\/10.48550\/arXiv.2304.01373","DOI":"10.48550\/arXiv.2304.01373"},{"key":"27_CR5","doi-asserted-by":"publisher","unstructured":"Bloom, B.H.: Space\/time trade-offs in hash coding with allowable errors. CACM 13(7), 422 \u2013 426 (1970). https:\/\/doi.org\/10.1145\/362686.362692","DOI":"10.1145\/362686.362692"},{"key":"27_CR6","doi-asserted-by":"publisher","unstructured":"Broder, A.: On the resemblance and containment of documents. In: Proceedings of the Compression and Complexity of SEQUENCES 1997, pp. 21\u201329. IEEE Computer Society, USA (1997). https:\/\/doi.org\/10.1109\/SEQUEN.1997.666900","DOI":"10.1109\/SEQUEN.1997.666900"},{"key":"27_CR7","doi-asserted-by":"publisher","unstructured":"Carlini, N., Ippolito, D., Jagielski, M., Lee, K., Tramer, F., Zhang, C.: Quantifying Memorization Across Neural Language Models (2023). https:\/\/doi.org\/10.48550\/arXiv.2202.07646","DOI":"10.48550\/arXiv.2202.07646"},{"issue":"1","key":"27_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3047307","volume":"50","author":"L Chi","year":"2017","unstructured":"Chi, L., Zhu, X.: Hashing techniques: a survey and taxonomy. ACM Comput. Surv. 50(1), 1\u201336 (2017). https:\/\/doi.org\/10.1145\/3047307","journal-title":"ACM Comput. Surv."},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Chum, O., Philbin, J., Zisserman, A.: Near duplicate image detection: min-hash and tf-idf weighting. In: British Machine Vision Conference (2008)","DOI":"10.5244\/C.22.50"},{"key":"27_CR10","doi-asserted-by":"publisher","unstructured":"Conrad, J.G., Raymond, E.L.: Essential deduplication functions for transactional databases in law firms. In: Proceedings of ICAIL 2007, pp. 261\u2013270. Stanford Law School, Stanford, California, USA (2007). https:\/\/doi.org\/10.1145\/1276318.1276368","DOI":"10.1145\/1276318.1276368"},{"key":"27_CR11","doi-asserted-by":"publisher","unstructured":"Coupette, C., Singh, J., Spamann, H.: Simplify your law: using information theory to deduplicate legal documents. In: International Conference on Data Mining Workshops (ICDMW), pp. 631\u2013638 (2021). https:\/\/doi.org\/10.1109\/ICDMW53433.2021.00083","DOI":"10.1109\/ICDMW53433.2021.00083"},{"key":"27_CR12","unstructured":"Gibson, J., Wellner, B., Lubar, S.: Identification of duplicate news stories in web pages. In: Proceedings of the 4th Web as Corpus Workshop (WAC-4) (2008)"},{"key":"27_CR13","doi-asserted-by":"publisher","unstructured":"Hernandez, D., Brown, T., Conerly, T., DasSarma, N., Drain, D., El-Showk, S., et\u00a0al.: Scaling Laws and Interpretability of Learning from Repeated Data (2022). https:\/\/doi.org\/10.48550\/arXiv.2205.10487","DOI":"10.48550\/arXiv.2205.10487"},{"key":"27_CR14","unstructured":"Kandpal, N., Wallace, E., Raffel, C.: Deduplicating training data mitigates privacy risks in language models. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesvari, C., Niu, G., Sabato, S. (eds.) Proceedings of the 39th International Conference on Machine Learning, vol.\u00a0162, pp. 10697\u201310707 (2022)"},{"key":"27_CR15","unstructured":"Kocetkov, D., Li, R., Ben Allal, L., Li, J., Mou, C., Jernite, Y., et\u00a0al.: The Stack: 3 TB of permissively licensed source code. Transactions on Machine Learning Research (2023)"},{"key":"27_CR16","unstructured":"Lauren\u00e7on, H., Saulnier, L., Wang, T., Akiki, C., del Moral, A.V., Scao, T.L., et\u00a0al.: The BigScience ROOTS corpus: A 1.6TB composite multilingual dataset. In: Proceedings of the 36th International Conference on Neural Information Processing Systems, pp. 31809\u201331826. Curran Associates Inc., Red Hook, NY, USA (2022)"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Lee, K., et al.: Deduplicating training data makes language models better. In: Annual Meeting of the Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2022.acl-long.577"},{"key":"27_CR18","unstructured":"Lison, P., Tiedemann, J.: OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In: Calzolari, N., et al. (eds.) Proceedings of the Tenth International Conference on Language Resources and Evaluation. pp. 923\u2013929. ELRA, Portoro\u017e, Slovenia (2016)"},{"issue":"2","key":"27_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3078837","volume":"13","author":"J Ma","year":"2017","unstructured":"Ma, J., et al.: Lazy exact deduplication. ACM Trans. Storage (TOS) 13(2), 1\u201326 (2017). https:\/\/doi.org\/10.1145\/3078837","journal-title":"ACM Trans. Storage (TOS)"},{"issue":"5","key":"27_CR20","doi-asserted-by":"publisher","first-page":"935","DOI":"10.1137\/0222058","volume":"22","author":"U Manber","year":"1993","unstructured":"Manber, U., Myers, G.: Suffix arrays: a new method for on-line string searches. SIAM J. Comput. 22(5), 935\u2013948 (1993). https:\/\/doi.org\/10.1137\/0222058","journal-title":"SIAM J. Comput."},{"key":"27_CR21","doi-asserted-by":"publisher","unstructured":"Mou, C., Ha, C., Enevoldsen, K., Liu, P.: Chenghaomou\/text-dedup: Reference snapshot (Sep 2023). https:\/\/doi.org\/10.5281\/zenodo.8364980","DOI":"10.5281\/zenodo.8364980"},{"key":"27_CR22","unstructured":"Muennighoff, N., Rush, A.M., Barak, B., Scao, T.L., Piktus, A., Tazi, N., et\u00a0al.: Scaling data-constrained language models. In: Advances in Neural Information Processing Systems. vol.\u00a036, pp. 50358\u201350376. Curran Associates, Inc. (2023)"},{"key":"27_CR23","unstructured":"Nguyen, T., Nguyen, C.V., Lai, V.D., Man, H., Ngo, N.T., Dernoncourt, F., et\u00a0al.: CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Calzolari, N., Kan, M.Y., Hoste, V., Lenci, A., Sakti, S., Xue, N. (eds.) Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, pp. 4226\u20134237. ELRA and ICCL, Torino, Italia (2024)"},{"key":"27_CR24","doi-asserted-by":"publisher","unstructured":"Ortiz\u00a0Su\u00e1rez, P.J., Romary, L., Sagot, B.: A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of ACL 2020, pp. 1703\u20131714. Association for Computational Linguistics (2020).https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.156","DOI":"10.18653\/v1\/2020.acl-main.156"},{"key":"27_CR25","doi-asserted-by":"publisher","unstructured":"Penedo, G., Kydl\u00edek, H., et al.: The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale (2024). https:\/\/doi.org\/10.48550\/arXiv.2406.17557","DOI":"10.48550\/arXiv.2406.17557"},{"key":"27_CR26","unstructured":"Penedo, G., Malartic, Q., Hesslow, D., Cojocaru, R., Cappelli, A., Alobeidli, H., et\u00a0al.: The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data Only. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a036, pp. 79155\u201379172. Curran Associates, Inc. (2023)"},{"key":"27_CR27","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et\u00a0al.: Language Models are Unsupervised Multitask Learners (2019). https:\/\/openai.com\/index\/better-language-models\/"},{"key":"27_CR28","doi-asserted-by":"publisher","unstructured":"Rae, J.W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, H.F., et\u00a0al.: Scaling Language Models: Methods, Analysis & Insights from Training Gopher (2022). https:\/\/doi.org\/10.48550\/arXiv.2112.11446","DOI":"10.48550\/arXiv.2112.11446"},{"key":"27_CR29","doi-asserted-by":"crossref","unstructured":"Silcock, E., D\u2019Amico-Wong, L., Yang, J., Dell, M.: Noise-robust de-duplication at scale. In: The Eleventh International Conference on Learning Representations (2023)","DOI":"10.3386\/w30726"},{"key":"27_CR30","doi-asserted-by":"publisher","unstructured":"Stein, B., Meyer\u00a0zu Eissen, S., Potthast, M.: Strategies for retrieving plagiarized documents. In: Proceedings of the 30th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 825\u2013826. ACM (2007). https:\/\/doi.org\/10.1145\/1277741.1277928","DOI":"10.1145\/1277741.1277928"},{"key":"27_CR31","unstructured":"Tirumala, K., Simig, D., Aghajanyan, A., Morcos, A.S.: D4: Improving LLM Pretraining via Document De-Duplication and Diversification. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems 36 (NeurIPS 2023) Datasets and Benchmarks Track, pp. 53983\u201353995. Curran Associates, Inc. (2023)"},{"key":"27_CR32","unstructured":"Wenzek, G., et al.: CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data. In: Proceedings of the Twelfth Language Resources and Evaluation Conference, pp. 4003\u20134012. ELRA, Marseille, France (2020)"},{"issue":"1","key":"27_CR33","doi-asserted-by":"publisher","first-page":"156","DOI":"10.1109\/TCC.2021.3081702","volume":"11","author":"D Zhang","year":"2023","unstructured":"Zhang, D., Le, J., Mu, N., Wu, J., Liao, X.: Secure and efficient data deduplication in jointcloud storage. IEEE Trans. Cloud Comput. 11(1), 156\u2013167 (2023). https:\/\/doi.org\/10.1109\/TCC.2021.3081702","journal-title":"IEEE Trans. Cloud Comput."}],"container-title":["Lecture Notes in Computer Science","Machine Learning, Optimization, and Data Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-82481-4_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T15:45:58Z","timestamp":1741016758000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-82481-4_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031824807","9783031824814"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-82481-4_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"4 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"LOD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Machine Learning, Optimization, and Data Science","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Castiglione della Pescaia","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mod2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/lod2024.icas.events\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}