{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T15:32:39Z","timestamp":1772119959433,"version":"3.50.1"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031704413","type":"print"},{"value":"9783031704420","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70442-0_13","type":"book-chapter","created":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T08:09:40Z","timestamp":1725955780000},"page":"213-228","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Confidence-Aware Document OCR Error Detection"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7550-2568","authenticated-orcid":false,"given":"Arthur","family":"Hemmer","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0123-439X","authenticated-orcid":false,"given":"Micka\u00ebl","family":"Coustaty","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8049-9611","authenticated-orcid":false,"given":"Nicola","family":"Bartolo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5666-475X","authenticated-orcid":false,"given":"Jean-Marc","family":"Ogier","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,11]]},"reference":[{"key":"13_CR1","first-page":"17","volume":"9","author":"Y Adesam","year":"2019","unstructured":"Adesam, Y., Dann\u00e9lls, D., Tahmasebi, N.: Exploring the quality of the digital historical newspaper archive KubHist. DHN 9, 17 (2019)","journal-title":"DHN"},{"issue":"1","key":"13_CR2","doi-asserted-by":"publisher","first-page":"49","DOI":"10.21248\/jlcl.33.2018.218","volume":"33","author":"C Amrhein","year":"2018","unstructured":"Amrhein, C., Clematide, S.: Supervised OCR error detection and correction using statistical and neural machine translation methods. J. Lang. Technol. Comput. Linguist. (JLCL) 33(1), 49\u201376 (2018)","journal-title":"J. Lang. Technol. Comput. Linguist. (JLCL)"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Arachchige, P., Randika, A.: Unknown-box approximation to improve optical character recognition performance (2021)","DOI":"10.1007\/978-3-030-86549-8_31"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Baek, Y., et al.: CLEval: character-level evaluation for text detection and recognition tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 564\u2013565 (2020)","DOI":"10.1109\/CVPRW50498.2020.00290"},{"issue":"3","key":"13_CR5","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1007\/s00799-022-00325-2","volume":"23","author":"E Boros","year":"2022","unstructured":"Boros, E., Nguyen, N.K., Lejeune, G., Doucet, A.: Assessing the impact of OCR noise on multilingual event detection over digitised documents. Int. J. Digit. Libr. 23(3), 241\u2013266 (2022)","journal-title":"Int. J. Digit. Libr."},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Brill, E., Moore, R.C.: An improved error model for noisy channel spelling correction. In: Proceedings of the 38th Annual Meeting of the Association for Computational Linguistics, pp. 286\u2013293 (2000)","DOI":"10.3115\/1075218.1075255"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Chiron, G., Doucet, A., Coustaty, M., Moreux, J.P.: ICDAR2017 competition on post-OCR text correction. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 1423\u20131428. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.232"},{"key":"13_CR8","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1007\/BF01889984","volume":"1","author":"KW Church","year":"1991","unstructured":"Church, K.W., Gale, W.A.: Probability scoring for spelling correction. Stat. Comput. 1, 93\u2013103 (1991)","journal-title":"Stat. Comput."},{"key":"13_CR9","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-031-41734-4_7","volume-title":"Document Analysis and Recognition","author":"M Cuper","year":"2023","unstructured":"Cuper, M., van Dongen, C., Koster, T.: Unraveling confidence: examining confidence scores as proxy for OCR quality. In: Fink, G.A., Jain, R., Kise, K., Zanibbi, R. (eds.) ICDAR 2023. LNCS, vol. 14191, pp. 104\u2013120. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-41734-4_7"},{"key":"13_CR10","unstructured":"Du, Y., et al.: PP-OCR: a practical ultra lightweight OCR system. arXiv preprint arXiv:2009.09941 (2020)"},{"key":"13_CR11","unstructured":"Fleischhacker, D., Goederle, W., Kern, R.: Improving OCR quality in 19th century historical documents using a combined machine learning based approach. arXiv preprint arXiv:2401.07787 (2024)"},{"key":"13_CR12","unstructured":"Guo, C., Pleiss, G., Sun, Y., Weinberger, K.Q.: On calibration of modern neural networks. In: International Conference on Machine Learning, pp. 1321\u20131330. PMLR (2017)"},{"key":"13_CR13","doi-asserted-by":"crossref","unstructured":"Gupta, A., et al.: Automatic assessment of OCR quality in historical documents. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a029 (2015)","DOI":"10.1609\/aaai.v29i1.9487"},{"key":"13_CR14","series-title":"Lecture Notes in Networks and Systems","doi-asserted-by":"publisher","first-page":"1045","DOI":"10.1007\/978-3-030-80119-9_69","volume-title":"Intelligent Computing","author":"M Hajiali","year":"2022","unstructured":"Hajiali, M., Fonseca Cacho, J.R., Taghva, K.: Generating correction candidates for OCR errors using BERT language model and FastText SubWord embeddings. In: Arai, K. (ed.) Intelligent Computing. LNNS, vol. 283, pp. 1045\u20131053. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-80119-9_69"},{"key":"13_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/978-3-030-54956-5_7","volume-title":"Digital Libraries for Open Knowledge","author":"A Hamdi","year":"2020","unstructured":"Hamdi, A., Jean-Caurant, A., Sid\u00e8re, N., Coustaty, M., Doucet, A.: Assessing and minimizing the impact of OCR quality on named entity recognition. In: Hall, M., Mer\u010dun, T., Risse, T., Duchateau, F. (eds.) TPDL 2020. LNCS, vol. 12246, pp. 87\u2013101. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-54956-5_7"},{"issue":"2","key":"13_CR16","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1017\/S1351324922000110","volume":"29","author":"A Hamdi","year":"2023","unstructured":"Hamdi, A., Pontes, E.L., Sidere, N., Coustaty, M., Doucet, A.: In-depth analysis of the impact of OCR errors on named entity recognition and linking. Nat. Lang. Eng. 29(2), 425\u2013448 (2023)","journal-title":"Nat. Lang. Eng."},{"key":"13_CR17","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/978-3-031-42430-4_6","volume-title":"Recent Challenges in Intelligent Information and Database Systems","author":"A Hemmer","year":"2023","unstructured":"Hemmer, A., Brachat, J., Coustaty, M., Ogier, J.M.: Estimating post-OCR denoising complexity on numerical texts. In: Nguyen, N.T., et al. (eds.) ACIIDS 2023. CCIS, vol. 1863, pp. 67\u201379. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-42430-4_6"},{"issue":"4","key":"13_CR18","doi-asserted-by":"publisher","first-page":"825","DOI":"10.1093\/llc\/fqz024","volume":"34","author":"MJ Hill","year":"2019","unstructured":"Hill, M.J., Hengchen, S.: Quantifying the impact of dirty OCR on historical text analysis: eighteenth century collections online as a case study. Digit. Scholarsh. Humanit. 34(4), 825\u2013843 (2019)","journal-title":"Digit. Scholarsh. Humanit."},{"key":"13_CR19","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: ICDAR2019 competition on scanned receipt OCR and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"13_CR20","doi-asserted-by":"crossref","unstructured":"Jatowt, A., Coustaty, M., Nguyen, N.V., Doucet, A., et al.: Deep statistical analysis of OCR errors for effective post-OCR processing. In: 2019 ACM\/IEEE Joint Conference on Digital Libraries (JCDL), pp. 29\u201338. IEEE (2019)","DOI":"10.1109\/JCDL.2019.00015"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Jatowt, A., Coustaty, M., Nguyen, N.V., Doucet, A., et al.: Post-OCR error detection by generating plausible candidates. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 876\u2013881. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00145"},{"key":"13_CR22","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.P.: FUNSD: a dataset for form understanding in noisy scanned documents. In: 2019 International Conference on Document Analysis and Recognition Workshops (ICDARW), vol.\u00a02, pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"13_CR23","doi-asserted-by":"publisher","first-page":"498","DOI":"10.1007\/978-3-031-19815-1_29","volume-title":"European Conference on Computer Vision","author":"G Kim","year":"2022","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13688, pp. 498\u2013517. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_29"},{"key":"13_CR24","unstructured":"Mindee: doctr: Document text recognition (2021). https:\/\/github.com\/mindee\/doctr"},{"key":"13_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-04257-8_1","volume-title":"Maturity and Innovation in Digital Libraries","author":"S Mutuvi","year":"2018","unstructured":"Mutuvi, S., Doucet, A., Odeo, M., Jatowt, A.: Evaluating the impact of OCR errors on topic modeling. In: Dobreva, M., Hinze, A., \u017dumer, M. (eds.) ICADL 2018. LNCS, vol. 11279, pp. 3\u201314. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-04257-8_1"},{"key":"13_CR26","doi-asserted-by":"crossref","unstructured":"Neudecker, C., Baierer, K., Gerber, M., Clausner, C., Antonacopoulos, A., Pletschacher, S.: A survey of OCR evaluation tools and metrics. In: The 6th International Workshop on Historical Document Imaging and Processing, pp. 13\u201318 (2021)","DOI":"10.1145\/3476887.3476888"},{"key":"13_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"278","DOI":"10.1007\/978-3-030-04257-8_29","volume-title":"Maturity and Innovation in Digital Libraries","author":"T-T-H Nguyen","year":"2018","unstructured":"Nguyen, T.-T.-H., Coustaty, M., Doucet, A., Jatowt, A., Nguyen, N.-V.: Adaptive edit-distance and regression approach for post-OCR text correction. In: Dobreva, M., Hinze, A., \u017dumer, M. (eds.) ICADL 2018. LNCS, vol. 11279, pp. 278\u2013289. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-04257-8_29"},{"issue":"6","key":"13_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3453476","volume":"54","author":"TTH Nguyen","year":"2021","unstructured":"Nguyen, T.T.H., Jatowt, A., Coustaty, M., Doucet, A.: Survey of post-OCR processing approaches. ACM Comput. Surv. (CSUR) 54(6), 1\u201337 (2021)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"13_CR29","doi-asserted-by":"crossref","unstructured":"Nguyen, T.T.H., Jatowt, A., Nguyen, N.V., Coustaty, M., Doucet, A.: Neural machine translation with BERT for post-OCR error detection and correction. In: Proceedings of the ACM\/IEEE Joint Conference on Digital Libraries in 2020, pp. 333\u2013336 (2020)","DOI":"10.1145\/3383583.3398605"},{"key":"13_CR30","unstructured":"Olejniczak, K., \u0160ulc, M.: Text detection forgot about document OCR. arXiv preprint arXiv:2210.07903 (2022)"},{"issue":"1","key":"13_CR31","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1007\/s00799-023-00345-6","volume":"24","author":"LL de Oliveira","year":"2023","unstructured":"de Oliveira, L.L., et al.: Evaluating and mitigating the impact of OCR errors on information retrieval. Int. J. Digit. Libr. 24(1), 45\u201362 (2023)","journal-title":"Int. J. Digit. Libr."},{"key":"13_CR32","unstructured":"Park, S., et al.: CORD: a consolidated receipt dataset for post-OCR parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"13_CR33","doi-asserted-by":"crossref","unstructured":"Ramirez-Orta, J.A., Xamena, E., Maguitman, A., Milios, E., Soto, A.J.: Post-OCR document correction with large ensembles of character sequence-to-sequence models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 11192\u201311199 (2022)","DOI":"10.1609\/aaai.v36i10.21369"},{"key":"13_CR34","doi-asserted-by":"crossref","unstructured":"Rigaud, C., Doucet, A., Coustaty, M., Moreux, J.P.: ICDAR 2019 competition on post-OCR text correction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1588\u20131593. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00255"},{"key":"13_CR35","unstructured":"Rotman, D., Azulai, O., Shapira, I., Burshtein, Y., Barzelay, U.: Detection masking for improved OCR on noisy documents. arXiv preprint arXiv:2205.08257 (2022)"},{"issue":"3","key":"13_CR36","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1002\/j.1538-7305.1948.tb01338.x","volume":"27","author":"C Shannon","year":"1948","unstructured":"Shannon, C.: A mathematical theory of communication. Bell Syst. Tech. J. 27(3), 379\u2013423 (1948)","journal-title":"Bell Syst. Tech. J."},{"key":"13_CR37","doi-asserted-by":"crossref","unstructured":"Spithourakis, G.P., Riedel, S.: Numeracy for language models: evaluating and improving their ability to predict numbers. arXiv preprint arXiv:1805.08154 (2018)","DOI":"10.18653\/v1\/P18-1196"},{"key":"13_CR38","unstructured":"Springmann, U., Fink, F., Schulz, K.U.: Automatic quality evaluation and (semi-)automatic improvement of OCR models for historical printings. arXiv preprint arXiv:1606.05157 (2016)"},{"key":"13_CR39","unstructured":"Subramani, N., Matton, A., Greaves, M., Lam, A.: A survey of deep learning approaches for OCR and document understanding. arXiv preprint arXiv:2011.13534 (2020)"},{"key":"13_CR40","doi-asserted-by":"crossref","unstructured":"Todorov, K., Colavizza, G.: An assessment of the impact of OCR noise on language models. arXiv preprint arXiv:2202.00470 (2022)","DOI":"10.5220\/0010945100003116"},{"key":"13_CR41","doi-asserted-by":"crossref","unstructured":"Top\u00e7u, A.\u0130., T\u00f6reyin, B.U.: Neural machine translation approaches for post-OCR text processing. In: 2022 30th Signal Processing and Communications Applications Conference (SIU), pp.\u00a01\u20134. IEEE (2022)","DOI":"10.1109\/SIU55565.2022.9864878"},{"key":"13_CR42","doi-asserted-by":"crossref","unstructured":"Van\u00a0Strien, D., Beelen, K., Ardanuy, M.C., Hosseini, K., McGillivray, B., Colavizza, G.: Assessing the impact of OCR quality on downstream NLP tasks (2020)","DOI":"10.5220\/0009169004840496"},{"key":"13_CR43","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1007\/978-3-031-41501-2_6","volume-title":"Document Analysis and Recognition","author":"N Yasin","year":"2023","unstructured":"Yasin, N., Siddiqi, I., Moetesum, M., Rauf, S.A.: Transformer-based neural machine translation for post-OCR error correction in cursive text. In: Coustaty, M., Forn\u00e9s, A. (eds.) ICDAR 2023. LNCS, vol. 14194, pp. 80\u201393. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-41501-2_6"},{"key":"13_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 19\u201327 (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Document Analysis Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70442-0_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T23:12:30Z","timestamp":1732749150000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70442-0_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031704413","9783031704420"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70442-0_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"11 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DAS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on Document Analysis Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 August 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"das2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/das2024.seecs.edu.pk\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}