{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:31:31Z","timestamp":1765506691407,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","funder":[{"name":"Funda\u00e7\u00e3o para a Ci\u00eancia e a Tecnologia, I.P.\/ MCTES","award":["UID\/00027"],"award-info":[{"award-number":["UID\/00027"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761633","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:52:37Z","timestamp":1762563157000},"page":"6361-6366","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Portuguese post-OCR Resources for Text Optimisation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2036-3197","authenticated-orcid":false,"given":"Tom\u00e1s","family":"Freitas Os\u00f3rio","sequence":"first","affiliation":[{"name":"LIACC, Faculdade de Engenharia, Universidade do Porto, Porto, Portugal"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1252-7515","authenticated-orcid":false,"given":"Henrique","family":"Lopes Cardoso","sequence":"additional","affiliation":[{"name":"LIACC, Faculdade de Engenharia, Universidade do Porto, Porto, Portugal"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n.d.]. Arquivo dos A\u00e7ores. https:\/\/hdl.handle.net\/21.11129\/0000-000D-F8C0--2. Accessed: 16--5--2023."},{"key":"e_1_3_2_1_2_1","unstructured":"[n.d.]. Corpus Hist\u00f3rico da Linguagem da Medicina em Portugu\u00eas (S\u00e9culo XVIII): Terminologia Diacr\u00f4nica e Humanidades Digitais. https:\/\/sites.google.com\/view\/projeto38597. Accessed: 16--5--2023."},{"key":"e_1_3_2_1_3_1","unstructured":"[n.d.]. What are the top 200 most spoken languages? https:\/\/www.ethnologue.com\/guides\/ethnologue200. Accessed: 26-01--2023."},{"key":"e_1_3_2_1_4_1","unstructured":"Michael Arrigo Stephanie Strassel Nolan King Thao Tran and Lisa Mason. 2022. CAMIO: A Corpus for OCR in Multiple Languages. In Proceedings of the Thirteenth Language Resources and Evaluation Conference Nicoletta Calzolari Fr\u00e9d\u00e9ric B\u00e9chet Philippe Blache Khalid Choukri Christopher Cieri Thierry Declerck Sara Goggi Hitoshi Isahara Bente Maegaard Joseph Mariani H\u00e9l\u00e8ne Mazo Jan Odijk and Stelios Piperidis (Eds.). European Language Resources Association Marseille France 1209--1216. https:\/\/aclanthology.org\/2022.lrec-1.129\/"},{"key":"e_1_3_2_1_5_1","volume-title":"Danny Suarez Vargas, and Viviane P. Moreira.","author":"Bazzo Guilherme Torresan","year":"2020","unstructured":"Guilherme Torresan Bazzo, Gustavo Acauan Lorentz, Danny Suarez Vargas, and Viviane P. Moreira. 2020. Assessing the Impact of OCR Errors in Information Retrieval. In Advances in Information Retrieval, Joemon M. Jose, Emine Yilmaz, Jo\u00e3o Magalh\u00e3es, Pablo Castells, Nicola Ferro, M\u00e1rio J. Silva, and Fl\u00e1vio Martins (Eds.). Springer International Publishing, Cham, 102--109."},{"volume-title":"Computer Vision -- ECCV 2022 Workshops","author":"Biten Ali Furkan","key":"e_1_3_2_1_6_1","unstructured":"Ali Furkan Biten, Rub\u00e8n Tito, Lluis Gomez, Ernest Valveny, and Dimosthenis Karatzas. 2023. OCR-IDL: OCR Annotations for Industry Document Library Dataset. In Computer Vision -- ECCV 2022 Workshops, Leonid Karlinsky, Tomer Michaeli, and Ko Nishino (Eds.). Springer Nature Switzerland, Cham, 241--252."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.58"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.232"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/JCDL.2017.7991582"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2015.7333898"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","unstructured":"DBNL. 2019. DBNL OCR Data set. doi:10.5281\/zenodo.3239290","DOI":"10.5281\/zenodo.3239290"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00799-023-00345--6"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)","author":"D'hondt Eva","year":"2017","unstructured":"Eva D'hondt, Cyril Grouin, and Brigitte Grau. 2017. Generating a Training Corpus for OCR Post-Correction Using Encoder-Decoder Model. Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers) (2017), 1006--1014."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1220"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.dss.2021.113662"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3604931"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2595188.2595200"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.25747\/WPNA-JE39"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the Conference on Language Techonologies & Digital Humanities","author":"Finatto Maria Jos\u00e9","year":"2018","unstructured":"Maria Jos\u00e9 Finatto, Paulo Quaresma, and Maria Filomena Gon\u00e7alves. 2018. Portuguese corpora of the 18th century: old Medicine texts for teaching and research. Proceedings of the Conference on Language Techonologies & Digital Humanities (2018)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.46298\/jdmdh.6492"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1075\/lv.00004.gal"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2015.7333841"},{"key":"e_1_3_2_1_23_1","volume-title":"Benjamin Han, and Soundar Srinivasan.","author":"Gupte Amit","year":"2021","unstructured":"Amit Gupte, Alexey Romanov, Sahitya Mantravadi, Dalitso Banda, Jianjie Liu, Raza Khan, Lakshmanan Ramu Meenal, Benjamin Han, and Soundar Srinivasan. 2021. Lights, Camera, Action! A Framework to Improve NLP Accuracy over OCR documents. ArXiv abs\/2108.02899 (2021). https:\/\/api.semanticscholar.org\/CorpusID:236950757"},{"key":"e_1_3_2_1_24_1","unstructured":"Anne G\u00f6hring and Martin Volk. 2011. The TextBerg Corpus An Alpine French- German Parallel Resource. Actes de la 18e conf\u00e9rence sur le Traitement Automatique des Langues Naturelles. Articles courts (2011)"},{"volume-title":"Digital Libraries for Open Knowledge, Mark Hall, Tanja Mer\u010dun","author":"Hamdi Ahmed","key":"e_1_3_2_1_25_1","unstructured":"Ahmed Hamdi, Axel Jean-Caurant, Nicolas Sid\u00e8re, Micka\u00ebl Coustaty, and Antoine Doucet. 2020. Assessing and Minimizing the Impact of OCR Quality on Named Entity Recognition. In Digital Libraries for Open Knowledge, Mark Hall, Tanja Mer\u010dun, Thomas Risse, and Fabien Duchateau (Eds.). Springer International Publishing, Cham, 87--101."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s42001-021-00149--1"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1023\/a:1009902609570"},{"key":"e_1_3_2_1_28_1","unstructured":"Kimmo Kettunen Heikki Keskustalo Sanna Kumpulainen Tuula P\u00e4\u00e4kk\u00f6nen and Juha Rautiainen. 2022. OCR quality affects perceived usefulness of historical newspaper clippings -- a user study. arXiv:2203.03557 [cs.IR] https:\/\/arxiv.org\/abs\/2203.03557"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAS.2016.44"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3103010.3121032"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2018.06.001"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the Australasian Language Technology Association Workshop 2017","author":"Moll\u00e1 Diego","year":"2017","unstructured":"Diego Moll\u00e1 and Steve Cassidy. 2017. Overview of the 2017 ALTA Shared Task: Correcting OCR Errors. Proceedings of the Australasian Language Technology Association Workshop 2017 (2017), 115--118. https:\/\/www.nla.gov.au\/"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)","author":"Nastase Vivi","year":"2018","unstructured":"Vivi Nastase and Julian Hitschler. 2018. Correction of OCR Word Segmentation Errors in Articles from the ACL Collection through Neural Machine Translation Methods. Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018) (2018). http:\/\/www.cl.uniheidelberg.de\/english\/"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3476887.3476888"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030-04257--8_29"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453476"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1086\/686075"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-024-09757--5"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2501115.2501130"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5220\/0010177403410349"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1108\/00330331011039481"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00255"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.478"},{"key":"e_1_3_2_1_44_1","volume-title":"Lucas Lima de Oliveira, and Viviane Pereira Moreira","author":"Santos Moniele Kunrath","year":"2023","unstructured":"Moniele Kunrath Santos, Guilherme Bazzo, Lucas Lima de Oliveira, and Viviane Pereira Moreira. 2023. ESTER-Pt: An Evaluation Suite for TExt Recognition in Portuguese. In Document Analysis and Recognition - ICDAR 2023, Gernot A. Fink, Rajiv Jain, Koichi Kise, and Richard Zanibbi (Eds.). Springer Nature Switzerland, Cham, 366--383."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2015.7333865"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.21248\/jlcl.33.2018.220"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--319--24592--8_19"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.5220\/0009169004840496"},{"key":"e_1_3_2_1_49_1","unstructured":"L. Wilms R. Nijssen and T. Koster. 2020. Historical newspaper OCR ground-truth data set. KB Lab: The Hague."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2011.157"},{"volume-title":"Towards Open and Trustworthy Digital Societies, Hao-Ren Ke","author":"Zosa Elaine","key":"e_1_3_2_1_51_1","unstructured":"Elaine Zosa, Stephen Mutuvi, Mark Granroth-Wilding, and Antoine Doucet. 2021. Evaluating the Robustness of Embedding-Based Topic Models to OCR Noise. In Towards Open and Trustworthy Digital Societies, Hao-Ren Ke, Chei Sian Lee, and Kazunari Sugiyama (Eds.). Springer International Publishing, Cham, 392--400."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761633","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:27:15Z","timestamp":1765506435000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761633"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":51,"alternative-id":["10.1145\/3746252.3761633","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761633","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}