{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T17:50:53Z","timestamp":1749577853621,"version":"3.40.3"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031416811"},{"type":"electronic","value":"9783031416828"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-41682-8_22","type":"book-chapter","created":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T07:02:59Z","timestamp":1692342179000},"page":"348-365","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["CCpdf: Building a\u00a0High Quality Corpus for\u00a0Visually Rich Documents from\u00a0Web Crawl Data"],"prefix":"10.1007","author":[{"given":"Micha\u0142","family":"Turski","sequence":"first","affiliation":[]},{"given":"Tomasz","family":"Stanis\u0142awek","sequence":"additional","affiliation":[]},{"given":"Karol","family":"Kaczmarek","sequence":"additional","affiliation":[]},{"given":"Pawe\u0142","family":"Dyda","sequence":"additional","affiliation":[]},{"given":"Filip","family":"Grali\u0144ski","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,19]]},"reference":[{"key":"22_CR1","unstructured":"Abadji, J., Suarez, P.O., Romary, L., Sagot, B.: Towards a cleaner document-oriented multilingual crawled corpus. ArXiv abs\/2201.06642 (2022)"},{"key":"22_CR2","doi-asserted-by":"publisher","unstructured":"Allison, T., et al.: Research report: Building a wide reach corpus for secure parser development. In: 2020 IEEE Security and Privacy Workshops (SPW), pp. 318\u2013326 (2020). https:\/\/doi.org\/10.1109\/SPW50608.2020.00066","DOI":"10.1109\/SPW50608.2020.00066"},{"key":"22_CR3","doi-asserted-by":"publisher","unstructured":"Ammar, W., et al.: Construction of the literature graph in semantic scholar. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 3 (Industry Papers), pp. 84\u201391. Association for Computational Linguistics, New Orleans - Louisiana (2018). https:\/\/doi.org\/10.18653\/v1\/N18-3011. https:\/\/aclanthology.org\/N18-3011","DOI":"10.18653\/v1\/N18-3011"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Biten, A.F., Tito, R., Gomez, L., Valveny, E., Karatzas, D.: OCR-IDR: OCR annotations for industry document library dataset. arXiv preprint arXiv:2202.12985 (2022)","DOI":"10.1007\/978-3-031-25069-9_16"},{"key":"22_CR5","unstructured":"Borchmann, \u0141., et al.: DUE: end-to-end document understanding benchmark. In: NeurIPS Datasets and Benchmarks (2021)"},{"key":"22_CR6","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems. vol.\u00a033, pp. 1877\u20131901. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"22_CR7","doi-asserted-by":"publisher","unstructured":"Dodge, J., et al.: Documenting large webtext corpora: a case study on the colossal clean crawled corpus. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 1286\u20131305. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic (2021). https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.98. https:\/\/aclanthology.org\/2021.emnlp-main.98","DOI":"10.18653\/v1\/2021.emnlp-main.98"},{"key":"22_CR8","unstructured":"Gao, L., et al.: The Pile: an 800gb dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027 (2020)"},{"key":"22_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1007\/978-3-030-86549-8_34","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"\u0141 Garncarek","year":"2021","unstructured":"Garncarek, \u0141, et al.: LAMBERT: layout-aware language modeling for information extraction. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12821, pp. 532\u2013547. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86549-8_34"},{"key":"22_CR10","doi-asserted-by":"publisher","unstructured":"Gururangan, S., et al.: Don\u2019t stop pretraining: adapt language models to domains and tasks. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.740. http:\/\/dx.doi.org\/10.18653\/v1\/2020.acl-main.740","DOI":"10.18653\/v1\/2020.acl-main.740"},{"key":"22_CR11","unstructured":"Habernal, I., Zayed, O., Gurevych, I.: C4Corpus: Multilingual web-size corpus with free license. In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC2016), pp. 914\u2013922. European Language Resources Association (ELRA), Portoro\u017e, Slovenia (2016). https:\/\/aclanthology.org\/L16-1146"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: ICDAR (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Huber, P., et al.: CCQA: a new web-scale question answering dataset for model pre-training (2021)","DOI":"10.18653\/v1\/2022.findings-naacl.184"},{"key":"22_CR14","doi-asserted-by":"publisher","unstructured":"Lewis, D., Agam, G., Argamon, S., Frieder, O., Grossman, D., Heard, J.: Building a test collection for complex document information processing. In: Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 665\u2013666. SIGIR 2006, Association for Computing Machinery, New York, NY, USA (2006). https:\/\/doi.org\/10.1145\/1148170.1148307","DOI":"10.1145\/1148170.1148307"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: DocBank: a benchmark dataset for document layout analysis (2020)","DOI":"10.18653\/v1\/2020.coling-main.82"},{"key":"22_CR16","unstructured":"Liu, V., Curran, J.R.: Web text corpus for natural language processing. In: 11th Conference of the European Chapter of the Association for Computational Linguistics. Association for Computational Linguistics, Trento, Italy (2006). https:\/\/www.aclweb.org\/anthology\/E06-1030"},{"key":"22_CR17","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach (2019)"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Luccioni, A.S., Viviano, J.D.: What\u2019s in the box? An analysis of undesirable content in the Common Crawl corpus. In: ACL (2021)","DOI":"10.18653\/v1\/2021.acl-short.24"},{"key":"22_CR19","unstructured":"Masson, C., Paroubek, P.: NLP analytics in finance with DoRe: a French 250M tokens corpus of corporate annual reports. In: Proceedings of The 12th Language Resources and Evaluation Conference, pp. 2261\u20132267. European Language Resources Association, Marseille, France (2020). https:\/\/www.aclweb.org\/anthology\/2020.lrec-1.275"},{"key":"22_CR20","unstructured":"Patterson, D.A., et al.: Carbon emissions and large neural network training. ArXiv abs\/2104.10350 (2021)"},{"key":"22_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"732","DOI":"10.1007\/978-3-030-86331-9_47","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"R Powalski","year":"2021","unstructured":"Powalski, R., Borchmann, \u0141, Jurkiewicz, D., Dwojak, T., Pietruszka, M., Pa\u0142ka, G.: Going full-TILT boogie on document understanding with text-image-layout transformer. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12822, pp. 732\u2013747. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86331-9_47"},{"key":"22_CR22","unstructured":"Qi, D., Su, L., Song, J., Cui, E., Bharti, T., Sacheti, A.: ImageBERT: cross-modal pre-training with large-scale weak-supervised image-text data (2020)"},{"key":"22_CR23","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer (2019)"},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Schwenk, H., Wenzek, G., Edunov, S., Grave, E., Joulin, A.: CCMatrix: mining billions of high-quality parallel sentences on the web. In: ACL (2021)","DOI":"10.18653\/v1\/2021.acl-long.507"},{"key":"22_CR25","unstructured":"Smith, J.R., Saint-Amand, H., Plamada, M., Koehn, P., Callison-Burch, C., Lopez, A.: Dirt cheap web-scale parallel text from the common Crawl. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1374\u20131383. Association for Computational Linguistics, Sofia, Bulgaria (2013). https:\/\/www.aclweb.org\/anthology\/P13-1135"},{"key":"22_CR26","unstructured":"Smith, R.: Tesseract open source OCR engine (2022). https:\/\/github.com\/tesseract-ocr\/tesseract"},{"key":"22_CR27","unstructured":"Turc, I., Chang, M.W., Lee, K., Toutanova, K.: Well-read students learn better: On the importance of pre-training compact models. arXiv preprint arXiv:1908.08962v2 (2019)"},{"key":"22_CR28","unstructured":"Wenzek, G., et al.: CCNet: extracting high quality monolingual datasets from web crawl data (2019)"},{"key":"22_CR29","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: ACL-IJCNLP 2021 (2021)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"22_CR31","unstructured":"Xu, Y., et al.: LayoutXLM: multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2004.21040 (2021)"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: mT5: a massively multilingual pre-trained text-to-text transformer. In: NAACL (2021)","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"key":"22_CR33","doi-asserted-by":"publisher","unstructured":"Zhong, X., Tang, J., Yepes, A.J.: PubLayNet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1015\u20131022. IEEE (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00166","DOI":"10.1109\/ICDAR.2019.00166"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2023"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-41682-8_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T07:20:41Z","timestamp":1692343241000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-41682-8_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031416811","9783031416828"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-41682-8_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"19 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"San Jos\u00e9, CA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 August 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"316","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"154","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"49% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.89","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1.50","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Number and type of other papers accepted : IJDAR track papers","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}