{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T15:28:25Z","timestamp":1782401305448,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761503","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T01:03:42Z","timestamp":1762563822000},"page":"6075-6083","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["End-to-end Information Extraction from Archival Records with Multimodal Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7706-8340","authenticated-orcid":false,"given":"Mahsa","family":"Vafaie","sequence":"first","affiliation":[{"name":"FIZ-Karlsruhe - Leibniz Institute for Information Infrastructure, Eggenstein-Leopoldshafen, Germany and Karlsruhe Institute of Technology, Karlsruhe, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0333-5888","authenticated-orcid":false,"given":"Sven","family":"Hertling","sequence":"additional","affiliation":[{"name":"Data and Web Science Group, University of Mannheim, Mannheim, Germany and FIZ Karlsruhe - Leibniz Institute for Information Infrastructure, Eggenstein-Leopoldshafen, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4871-2646","authenticated-orcid":false,"given":"Inger","family":"Banse-Strobel","sequence":"additional","affiliation":[{"name":"Bundesarchiv, Koblenz, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8747-1072","authenticated-orcid":false,"given":"Kevin","family":"Dubout","sequence":"additional","affiliation":[{"name":"Bundesarchiv, Koblenz, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7069-9804","authenticated-orcid":false,"given":"Harald","family":"Sack","sequence":"additional","affiliation":[{"name":"FIZ Karlsruhe - Leibniz Institute for Information Infrastructure, Eggenstein-Leopoldshafen, Germany and Karlsruhe Institute of Technology, Karlsruhe, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_2_1","volume-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling. arXiv preprint arXiv:2412.05271","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Weiyun Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Erfei Cui, Jinguo Zhu, Shenglong Ye, Hao Tian, Zhaoyang Liu, et al. Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling. arXiv preprint arXiv:2412.05271, 2024."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"e_1_3_2_1_5_1","first-page":"73","volume-title":"IIWeb","volume":"3","author":"Cohen William W","year":"2003","unstructured":"William W Cohen, Pradeep Ravikumar, Stephen E Fienberg, et al. A comparison of string distance metrics for name-matching tasks. In IIWeb, volume 3, pages 73--78, 2003."},{"key":"e_1_3_2_1_6_1","volume-title":"Lmdeploy: A toolkit for compressing, deploying, and serving llm. https:\/\/github.com\/InternLM\/lmdeploy","author":"Contributors Deploy","year":"2023","unstructured":"LMDeploy Contributors. Lmdeploy: A toolkit for compressing, deploying, and serving llm. https:\/\/github.com\/InternLM\/lmdeploy, 2023."},{"key":"e_1_3_2_1_7_1","volume-title":"Deep learning based visually rich document content understanding: A survey. arXiv preprint arXiv:2408.01287","author":"Ding Yihao","year":"2024","unstructured":"Yihao Ding, Jean Lee, and Soyeon Caren Han. Deep learning based visually rich document content understanding: A survey. arXiv preprint arXiv:2408.01287, 2024."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685520"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.17723\/0360-9081.78.1.38"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10502-012-9198-x"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1108\/EUM0000000007162"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3086512.3086520"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"e_1_3_2_1_16_1","volume-title":"Cost-effective end-to-end information extraction for semi-structured document images. arXiv preprint arXiv:2104.08041","author":"Hwang Wonseok","year":"2021","unstructured":"Wonseok Hwang, Hyunji Lee, Jinyeong Yim, Geewook Kim, and Minjoon Seo. Cost-effective end-to-end information extraction for semi-structured document images. arXiv preprint arXiv:2104.08041, 2021."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"e_1_3_2_1_18_1","volume-title":"Donut: Document understanding transformer without OCR. CoRR, abs\/2111.15664","author":"Kim Geewook","year":"2021","unstructured":"Geewook Kim, Teakgyu Hong, Moonbin Yim, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, and Seunghyun Park. Donut: Document understanding transformer without OCR. CoRR, abs\/2111.15664, 2021. URL https:\/\/arxiv.org\/abs\/2111.15664."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1017\/S1351324914000114"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_21_1","first-page":"707","volume-title":"Soviet physics doklady","author":"Levenshtein Vladimir I","year":"1966","unstructured":"Vladimir I Levenshtein et al. Binary codes capable of correcting deletions, insertions, and reversals. In Soviet physics doklady, volume 10, pages 707--710. Soviet Union, 1966."},{"key":"e_1_3_2_1_22_1","volume-title":"Docbank: A benchmark dataset for document layout analysis. arXiv preprint arXiv:2006.01038","author":"Li Minghao","year":"2020","unstructured":"Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. arXiv preprint arXiv:2006.01038, 2020."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00560"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1136\/amiajnl-2012-001487"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475345"},{"key":"e_1_3_2_1_26_1","volume-title":"Textmonkey: An ocr-free large multimodal model for understanding document. arXiv preprint arXiv:2403.04473","author":"Liu Yuliang","year":"2024","unstructured":"Yuliang Liu, Biao Yang, Qiang Liu, Zhang Li, Zhiyin Ma, Shuo Zhang, and Xiang Bai. Textmonkey: An ocr-free large multimodal model for understanding document. arXiv preprint arXiv:2403.04473, 2024."},{"key":"e_1_3_2_1_27_1","volume-title":"Survey of post-ocr processing approaches. ACM Computing Surveys (CSUR), 54 (6):1--37","author":"Hai Nguyen Thi Tuyet","year":"2021","unstructured":"Thi Tuyet Hai Nguyen, Adam Jatowt, Mickael Coustaty, and Antoine Doucet. Survey of post-ocr processing approaches. ACM Computing Surveys (CSUR), 54 (6):1--37, 2021."},{"key":"e_1_3_2_1_28_1","first-page":"2012","article-title":"Sharing cultural heritage the linked open data way: why you should sign up","author":"Oomen Johan","year":"2012","unstructured":"Johan Oomen, MGJ van Erp, and L Baltussen. Sharing cultural heritage the linked open data way: why you should sign up. In Museums and the Web 2012. 2012.","journal-title":"Museums and the Web"},{"key":"e_1_3_2_1_29_1","volume-title":"Gpt-4o mini. https:\/\/platform.openai.com\/docs\/models\/gpt-4o-mini","author":"AI.","year":"2024","unstructured":"OpenAI. Gpt-4o mini. https:\/\/platform.openai.com\/docs\/models\/gpt-4o-mini, 2024. Accessed May 2025."},{"key":"e_1_3_2_1_30_1","volume-title":"Workshop on Document Intelligence at NeurIPS 2019","author":"Park Seunghyun","year":"2019","unstructured":"Seunghyun Park, Seung Shin, Bado Lee, Junyeop Lee, Jaeheung Surh, Minjoon Seo, and Hwalsuk Lee. Cord: a consolidated receipt dataset for post-ocr parsing. In Workshop on Document Intelligence at NeurIPS 2019, 2019."},{"key":"e_1_3_2_1_31_1","volume-title":"Zilong Wang, Zifeng Wang, Jiaqi Mu, Hao Zhang, et al. Lmdx: Language model-based document information extraction and localization. arXiv preprint arXiv:2309.10952","author":"Perot Vincent","year":"2023","unstructured":"Vincent Perot, Kai Kang, Florian Luisier, Guolong Su, Xiaoyu Sun, Ramya Sree Boppana, Zilong Wang, Zifeng Wang, Jiaqi Mu, Hao Zhang, et al. Lmdx: Language model-based document information extraction and localization. arXiv preprint arXiv:2309.10952, 2023."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1086\/684145"},{"key":"e_1_3_2_1_33_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748--8763. PmLR, 2021."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00255"},{"key":"e_1_3_2_1_35_1","volume-title":"Sriparna Saha, Vinija Jain, Samrat Mondal, and Aman Chadha. A systematic survey of prompt engineering in large language models: Techniques and applications. arXiv preprint arXiv:2402.07927","author":"Sahoo Pranab","year":"2024","unstructured":"Pranab Sahoo, Ayush Kumar Singh, Sriparna Saha, Vinija Jain, Samrat Mondal, and Aman Chadha. A systematic survey of prompt engineering in large language models: Techniques and applications. arXiv preprint arXiv:2402.07927, 2024."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86549-8_36"},{"key":"e_1_3_2_1_37_1","volume-title":"CEUR Workshop Proceedings","volume":"2981","author":"Vafaie Mahsa","year":"2021","unstructured":"Mahsa Vafaie, Bruns Oleksandra, Nastasja Pilz, Danilo Dess\u00ed, Harald Sack, et al. Modelling archival hierarchies in practice: Key aspects and lessons learned. In CEUR Workshop Proceedings, volume 2981. CEUR-WS, 2021."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587259.3627562"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.2352\/issn.2168-3204.2023.20.1.7"},{"key":"e_1_3_2_1_40_1","volume-title":"Mary Ann Tan, and Harald Sack. Digitalisation workflows in the age of transformer models: A case study in digital cultural heritage","author":"Vafaie Mahsa","year":"2024","unstructured":"Mahsa Vafaie, Mary Ann Tan, and Harald Sack. Digitalisation workflows in the age of transformer models: A case study in digital cultural heritage. 2024."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-011-0733-1"},{"key":"e_1_3_2_1_42_1","volume-title":"Enhancing the reasoning ability of multimodal large language models via mixed preference optimization. arXiv preprint arXiv:2411.10442","author":"Wang Weiyun","year":"2024","unstructured":"Weiyun Wang, Zhe Chen, Wenhai Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Jinguo Zhu, Xizhou Zhu, Lewei Lu, Yu Qiao, and Jifeng Dai. Enhancing the reasoning ability of multimodal large language models via mixed preference optimization. arXiv preprint arXiv:2411.10442, 2024."},{"key":"e_1_3_2_1_43_1","volume-title":"Denny Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, 35:24824--24837","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, 35:24824--24837, 2022."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_45_1","volume-title":"Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836","author":"Xu Yiheng","year":"2021","unstructured":"Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, and Furu Wei. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836, 2021."},{"key":"e_1_3_2_1_46_1","volume-title":"mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Yuhao Dan, Chenlin Zhao, Guohai Xu, Chenliang Li, Junfeng Tian, et al. mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499, 2023."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.1078"}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","location":"Seoul Republic of Korea","acronym":"CIKM '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761503","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T01:14:16Z","timestamp":1765502056000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761503"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":47,"alternative-id":["10.1145\/3746252.3761503","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761503","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}