{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:48:50Z","timestamp":1771951730793,"version":"3.50.1"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030801182","type":"print"},{"value":"9783030801199","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,7,13]],"date-time":"2021-07-13T00:00:00Z","timestamp":1626134400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,7,13]],"date-time":"2021-07-13T00:00:00Z","timestamp":1626134400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-030-80119-9_69","type":"book-chapter","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T19:14:28Z","timestamp":1626117268000},"page":"1045-1053","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Generating Correction Candidates for OCR Errors using BERT Language Model and FastText SubWord Embeddings"],"prefix":"10.1007","author":[{"given":"Mahdi","family":"Hajiali","sequence":"first","affiliation":[]},{"given":"Jorge Ram\u00f3n","family":"Fonseca Cacho","sequence":"additional","affiliation":[]},{"given":"Kazem","family":"Taghva","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,7,13]]},"reference":[{"key":"69_CR1","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching word vectors with subword information. Trans. Assoc. Comput. Linguist. 5, 135\u2013146 (2017)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"69_CR2","doi-asserted-by":"crossref","unstructured":"Cappelatti, E.: Post-correction of OCR errors using pyenchant spelling suggestions selected through a modified needleman-wunsch algorithm. In: International Conference on Human-Computer Interaction, pp. 3\u201310. Springer (2018)","DOI":"10.1007\/978-3-319-92270-6_1"},{"key":"69_CR3","unstructured":"Croft, W.B., Harding, S.M., Taghva, K., Borsack, J.: An evaluation of information retrieval accuracy with simulated OCR output. In: Symposium on Document Analysis and Information Retrieval, pp. 115\u2013126 (1994)"},{"key":"69_CR4","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"69_CR5","unstructured":"Evert, S.: Google web 1t 5-grams made easy (but not for the computer). In: Proceedings of the NAACL HLT 2010 Sixth Web as Corpus Workshop, pp. 32\u201340. Association for Computational Linguistics (2010)"},{"key":"69_CR6","unstructured":"Cacho, J.R.F.: Improving OCR post processing with machine learning tools. University of Nevada, Las Vegas, Phd diss. (2019)"},{"key":"69_CR7","doi-asserted-by":"crossref","unstructured":"Cacho, J.R.F., Taghva, K.: OCR post processing using support vector machines. In: Science and Information Conference, pp. 694\u2013713. Springer (2020)","DOI":"10.1007\/978-3-030-52246-9_51"},{"key":"69_CR8","doi-asserted-by":"crossref","unstructured":"Cacho, J.R.F., Taghva, K.: The state of reproducible research in computer science. In: 17th International Conference on Information Technology-New Generations (ITNG 2020), pp. 519\u2013524. Springer (2020)","DOI":"10.1007\/978-3-030-43020-7_68"},{"key":"69_CR9","doi-asserted-by":"crossref","unstructured":"Cacho, J.R.F., Taghva, K., Alvarez, D.: Using the google web 1t 5-gram corpus for OCR error correction. In: 16th International Conference on Information Technology-New Generations (ITNG 2019), pp. 505\u2013511. Springer (2019)","DOI":"10.1007\/978-3-030-14070-0_71"},{"key":"69_CR10","unstructured":"Furrer, L., Volk, M.: Reducing OCR errors in gothic-script documents (2011)"},{"key":"69_CR11","doi-asserted-by":"crossref","unstructured":"Hajiali, M.: Big data and sentiment analysis: a comprehensive and systematic literature review. Concurrency Comput. Pract. Exper. 32(14), e5671 (2020)","DOI":"10.1002\/cpe.5671"},{"key":"69_CR12","doi-asserted-by":"crossref","unstructured":"Heidari, M., Rafatirad, S.: Using transfer learning approach to implement convolutional neural network model to recommend airline tickets by using online reviews. In: 2020 15th International Workshop on Semantic and Social Media Adaptation and Personalization, SMA, pp. 1\u20136 (2020)","DOI":"10.1109\/SMAP49528.2020.9248443"},{"key":"69_CR13","unstructured":"Levenshtein, V.I.: Binary codes capable of correcting deletions, insertions, and reversals. In: Soviet physics doklady, vol. 10, pp. 707\u2013710 (1966)"},{"key":"69_CR14","doi-asserted-by":"crossref","unstructured":"Mei, J., Islam, A., Moh\u2019d, A., Wu, Y., Milios, E.E.: Mibio: a dataset for OCR post-processing evaluation. Data Brief 21, 251\u2013255 (2018)","DOI":"10.1016\/j.dib.2018.08.099"},{"key":"69_CR15","unstructured":"Mei, J., Islam, A., Wu, Y., Moh\u2019d, A., Milios, E.E.: Statistical learning for OCR text correction. arXiv preprint arXiv:1611.06950 (2016)"},{"key":"69_CR16","unstructured":"Mihov, S., Koeva, S., Ringlstetter, C., Schulz, K.U., Strohmaier, C.: Precise and efficient text correction using levenshtein automata, dynamic web dictionaries and optimized correction models. In: Proceedings of Workshop on International Proofing Tools and Language Technologies (2004)"},{"key":"69_CR17","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)"},{"key":"69_CR18","doi-asserted-by":"crossref","unstructured":"Nguyen, T.T.H., Jatowt, A., Nguyen, N.-V., Coustaty, M., Doucet, A.: Neural machine translation with Bert for post-OCR error detection and correction. In: Proceedings of the ACM\/IEEE Joint Conference on Digital Libraries in 2020, pp. 333\u2013336 (2020)","DOI":"10.1145\/3383583.3398605"},{"key":"69_CR19","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"69_CR20","doi-asserted-by":"crossref","unstructured":"Schulz, K., Mihov, S., Mitankin, P.: Fast selection of small and precise candidate sets from dictionaries for text correction tasks. In: Ninth International Conference on Document Analysis and Recognition (ICDAR 2007), vol. 1, pp. 471\u2013475. IEEE (2007)","DOI":"10.1109\/ICDAR.2007.4378754"},{"key":"69_CR21","doi-asserted-by":"crossref","unstructured":"Taghva, K., Agarwal, S.: Utilizing web data in identification and correction of OCR errors. In: Document Recognition and Retrieval XXI, vo. 9021, p. 902109. International Society for Optics and Photonics (2014)","DOI":"10.1117\/12.2042403"},{"issue":"1","key":"69_CR22","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/214174.214180","volume":"14","author":"K Taghva","year":"1996","unstructured":"Taghva, K., Borsack, J., Condit, A.: Evaluation of model-based retrieval effectiveness with OCR text. ACM Trans. Inf. Syst. (TOIS) 14(1), 64\u201393 (1996)","journal-title":"ACM Trans. Inf. Syst. (TOIS)"},{"key":"69_CR23","doi-asserted-by":"crossref","unstructured":"Taghva, K., Borsack, J., Condit, A.: Information retrieval and OCR. In: Handbook of Character Recognition and Document Image Analysis, pp. 755\u2013777. World Scientific (1997)","DOI":"10.1142\/9789812830968_0029"},{"issue":"1","key":"69_CR24","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1002\/(SICI)1097-4571(199401)45:1<50::AID-ASI6>3.0.CO;2-B","volume":"45","author":"K Taghva","year":"1994","unstructured":"Taghva, K., Borsack, J., Condit, A., Erva, S.: The effects of noisy data on text retrieval. J. Am. Soc. Inf. Sci. 45(1), 50\u201358 (1994)","journal-title":"J. Am. Soc. Inf. Sci."},{"key":"69_CR25","doi-asserted-by":"crossref","unstructured":"Taghva, K., Nartker, T.A., Borsack, J., Condit, A.: UNLV-ISRI document collection for research in OCR and information retrieval. In: Document recognition and retrieval VII, vol. 3967, pp. 157\u2013164. International Society for Optics and Photonics (1999)","DOI":"10.1117\/12.373489"},{"issue":"3","key":"69_CR26","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1007\/PL00013558","volume":"3","author":"K Taghva","year":"2001","unstructured":"Taghva, K., Stofsky, E.: Ocrspell: an interactive spelling correction system for OCR errors in text. Int. J. Doc. Anal. Recogn. 3(3), 125\u2013137 (2001)","journal-title":"Int. J. Doc. Anal. Recogn."},{"key":"69_CR27","doi-asserted-by":"crossref","unstructured":"van Strien, D., Beelen, K., Ardanuy, M.C., Hosseini, K., McGillivray, B., Colavizza, G.: Assessing the impact of OCR quality on downstream NLP tasks. In: ICAART (1), pp. 484\u2013496 (2020)","DOI":"10.5220\/0009169004840496"},{"key":"69_CR28","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"69_CR29","unstructured":"Wolf, T., et al.: Transformers: state-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 38\u201345 (2020)"},{"key":"69_CR30","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 19\u201327 (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Networks and Systems","Intelligent Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-80119-9_69","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,8,18]],"date-time":"2021-08-18T13:23:16Z","timestamp":1629292996000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-80119-9_69"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,13]]},"ISBN":["9783030801182","9783030801199"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-80119-9_69","relation":{},"ISSN":["2367-3370","2367-3389"],"issn-type":[{"value":"2367-3370","type":"print"},{"value":"2367-3389","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,7,13]]},"assertion":[{"value":"13 July 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}