{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T00:26:47Z","timestamp":1765499207324,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","funder":[{"name":"European Union HORIZON-WIDERA-2023-TALENTS-01-01","award":["101186647 \u2014 AI4DH"],"award-info":[{"award-number":["101186647 \u2014 AI4DH"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761295","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T23:59:18Z","timestamp":1762559958000},"page":"2366-2376","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Robustness of LLMs in Question Answering on Multilingual Noisy OCR Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3578-2393","authenticated-orcid":false,"given":"Bhawna","family":"Piryani","sequence":"first","affiliation":[{"name":"University of Innsbruck, Innsbruck, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4850-9239","authenticated-orcid":false,"given":"Jamshid","family":"Mozafari","sequence":"additional","affiliation":[{"name":"University of Innsbruck, Innsbruck, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8747-4927","authenticated-orcid":false,"given":"Abdelrahman","family":"Abdallah","sequence":"additional","affiliation":[{"name":"University of Innsbruck, Innsbruck, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6160-3356","authenticated-orcid":false,"given":"Antoine","family":"Doucet","sequence":"additional","affiliation":[{"name":"University of La Rochelle, La Rochelle, France and University of Ljubljana, Ljubljana, Slovenia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7235-0665","authenticated-orcid":false,"given":"Adam","family":"Jatowt","sequence":"additional","affiliation":[{"name":"University of Innsbruck, Innsbruck, Austria"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/JCDL.2017.7991582"},{"key":"e_1_3_2_1_2_1","volume-title":"Symposium on Document Analysis and Information Retrieval. 115-126","author":"Croft W Bruce","year":"1994","unstructured":"W Bruce Croft, SM Harding, Kazem Taghva, and Julie Borsack. 1994. An evaluation of information retrieval accuracy with simulated OCR output. In Symposium on Document Analysis and Information Retrieval. 115-126."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00799-023-00345-6"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.107"},{"key":"e_1_3_2_1_5_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3604931"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58219-7_21"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings 2005 Symposium on Document Image Understanding Technology. UMD, 103","author":"Faisal Farooq^1, SUNY","year":"2005","unstructured":"Faisal Farooq^1, SUNY CEDAR, and Yaser Al-Onaizan. 2005. Effect of degraded input on statistical machine translation. In Proceedings 2005 Symposium on Document Image Understanding Technology. UMD, 103."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/SMC53992.2023.10394665"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC`08)","author":"Grover Claire","year":"2008","unstructured":"Claire Grover, Sharon Givon, Richard Tobin, and Julian Ball. 2008. Named Entity Recognition for Digitised Historical Texts. In Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC`08), Nicoletta Calzolari, Khalid Choukri, Bente Maegaard, Joseph Mariani, Jan Odijk, Stelios Piperidis, and Daniel Tapias (Eds.). European Language Resources Association (ELRA), Marrakech, Morocco. https:\/\/aclanthology.org\/L08-1253\/"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-54956-5_7"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1017\/S1351324922000110"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1017\/S1351324922000110"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1093\/llc\/fqz024"},{"key":"e_1_3_2_1_15_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al., 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the Symposium on Document Image Understanding Technology. 111-119","author":"Jing Hongyan","year":"2003","unstructured":"Hongyan Jing, Daniel Lopresti, and Chilin Shih. 2003. Summarizing noisy documents. In Proceedings of the Symposium on Document Image Understanding Technology. 111-119."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-2709"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i27.35038"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.46298\/jdmdh.5864"},{"key":"e_1_3_2_1_20_1","first-page":"102","volume-title":"ICADL 2019, Kuala Lumpur, Malaysia, November 4-7, 2019, Proceedings 21","author":"Pontes Elvys Linhares","year":"2019","unstructured":"Elvys Linhares Pontes, Ahmed Hamdi, Nicolas Sidere, and Antoine Doucet. 2019. Impact of OCR quality on named entity linking. In Digital Libraries at the Crossroads of Digital Information for the Future: 21st International Conference on Asia-Pacific Digital Libraries, ICADL 2019, Kuala Lumpur, Malaysia, November 4-7, 2019, Proceedings 21. Springer, 102-115."},{"key":"e_1_3_2_1_21_1","volume-title":"Levenshtein Distance: Information theory, Computer science, String (computer science), String metric, Damerau?Levenshtein distance, Spell checker, Hamming distance","author":"Miller Frederic P.","year":"2009","unstructured":"Frederic P. Miller, Agnes F. Vandome, and John McBrewster. 2009. Levenshtein Distance: Information theory, Computer science, String (computer science), String metric, Damerau?Levenshtein distance, Spell checker, Hamming distance. Alpha Press."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.mrqa-1.4"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-04257-8_1"},{"volume-title":"Natural language processing for historical texts","author":"Piotrowski Michael","key":"e_1_3_2_1_24_1","unstructured":"Michael Piotrowski. 2012. Natural language processing for historical texts. Vol. 17. Morgan & Claypool Publishers."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657891"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00255"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.21248\/jlcl.33.2018.220"},{"key":"e_1_3_2_1_29_1","volume-title":"A survey of deep learning approaches for ocr and document understanding. arXiv preprint arXiv:2011.13534","author":"Subramani Nishant","year":"2020","unstructured":"Nishant Subramani, Alexandre Matton, Malcolm Greaves, and Adrian Lam. 2020. A survey of deep learning approaches for ocr and document understanding. arXiv preprint arXiv:2011.13534 (2020)."},{"key":"e_1_3_2_1_30_1","volume-title":"Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\u00e9onard Hussenot, Thomas Mesnard, Bobak Shahriari, Alexandre Ram\u00e9, et al., 2024. Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"THE RISE OF DIGITIZATION. Digitisation Perspectives","author":"MELISSA M TERRAS.","year":"2011","unstructured":"MELISSA M TERRAS. 2011. 1. THE RISE OF DIGITIZATION. Digitisation Perspectives (2011), 3."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24592-8_19"},{"key":"e_1_3_2_1_33_1","volume-title":"Assessing the Impact of OCR Quality on Downstream NLP Tasks. In International Conference on Agents and Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:215756646","author":"van Strien Daniel Alexander","year":"2020","unstructured":"Daniel Alexander van Strien, Kaspar Beelen, Mariona Coll Ardanuy, Kasra Hosseini, Barbara McGillivray, and Giovanni Colavizza. 2020. Assessing the Impact of OCR Quality on Downstream NLP Tasks. In International Conference on Agents and Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:215756646"},{"key":"e_1_3_2_1_34_1","volume-title":"Evaluating the Impact of OCR Quality on Short Texts Classification Task. In Mexican International Conference on Artificial Intelligence. Springer, 163-177","author":"Vitman Oxana","year":"2022","unstructured":"Oxana Vitman, Yevhen Kostiuk, Paul Plachinda, Alisa Zhila, Grigori Sidorov, and Alexander Gelbukh. 2022. Evaluating the Impact of OCR Quality on Short Texts Classification Task. In Mexican International Conference on Artificial Intelligence. Springer, 163-177."},{"key":"e_1_3_2_1_35_1","unstructured":"Haoyu Wang Guozheng Ma Cong Yu Ning Gui Linrui Zhang Zhiqi Huang Suwei Ma Yongzhe Chang Sen Zhang Li Shen et al. 2023. Are large language models really robust to word-level perturbations? arXiv preprint arXiv:2309.11166 (2023)."},{"key":"e_1_3_2_1_36_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"volume-title":"Proceedings of the 5th ACL-HLT Workshop on Language Technology for Cultural Heritage","author":"Yang I","key":"e_1_3_2_1_37_1","unstructured":"Tze-I Yang, Andrew Torget, and Rada Mihalcea. 2011. Topic Modeling on Historical Newspapers. In Proceedings of the 5th ACL-HLT Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities, Kalliopi Zervanou and Piroska Lendvai (Eds.). Association for Computational Linguistics, Portland, OR, USA, 96-104. https:\/\/aclanthology.org\/W11-1513\/"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 35th International Conference on Neural Information Processing Systems (NIPS '21)","author":"Yuan Weizhe","year":"2021","unstructured":"Weizhe Yuan, Graham Neubig, and Pengfei Liu. 2021. BARTSCORE: evaluating generated text as text generation. In Proceedings of the 35th International Conference on Neural Information Processing Systems (NIPS '21). Curran Associates Inc., Red Hook, NY, USA, Article 2088, 15 pages."},{"key":"e_1_3_2_1_39_1","unstructured":"Shengyu Zhang Linfeng Dong Xiaoya Li Sen Zhang Xiaofei Sun Shuhe Wang Jiwei Li Runyi Hu Tianwei Zhang Fei Wu et al. 2023. Instruction tuning for large language models: A survey. arXiv preprint arXiv:2308.10792 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-91669-5_30"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-30483-8_49"}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761295","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T00:22:28Z","timestamp":1765498948000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761295"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":41,"alternative-id":["10.1145\/3746252.3761295","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761295","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}