{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:48:04Z","timestamp":1767340084061,"version":"3.40.3"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030665265"},{"type":"electronic","value":"9783030665272"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-66527-2_2","type":"book-chapter","created":{"date-parts":[[2020,12,30]],"date-time":"2020-12-30T18:04:51Z","timestamp":1609351491000},"page":"17-30","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["How to Improve Optical Character Recognition of Historical Finnish Newspapers Using Open Source Tesseract OCR Engine \u2013 Final Notes on Development and Evaluation"],"prefix":"10.1007","author":[{"given":"Mika","family":"Koistinen","sequence":"first","affiliation":[]},{"given":"Kimmo","family":"Kettunen","sequence":"additional","affiliation":[]},{"given":"Jukka","family":"Kervinen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,12,31]]},"reference":[{"key":"2_CR1","unstructured":"Kettunen, K., Honkela, T., Lind\u00e9n, K., Kauppinen, P., P\u00e4\u00e4kk\u00f6nen, T., Kervinen, J.: Analyzing and improving the quality of a historical news collection using language technology and statistical machine learning methods. In: IFLA World Library and Information Congress, Lyon (2014). http:\/\/www.ifla.org\/files\/assets\/newspapers\/Geneva_2014\/s6-honkela-en.pdf"},{"key":"2_CR2","unstructured":"Kettunen, K., P\u00e4\u00e4kk\u00f6nen, T.: Measuring lexical quality of a historical finnish newspaper collection \u2013 analysis of garbled OCR data with basic language technology tools and means. In: Calzolari, N., et al. (ed.) Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016) (2016). http:\/\/www.lrec-conf.org\/proceedings\/lrec2016\/pdf\/17_Paper.pdf"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"P\u00e4\u00e4kk\u00f6nen, T., Kervinen, J., Nivala, A., Kettunen, K., M\u00e4kel\u00e4, E.: Exporting Finnish digitized historical newspaper contents for offline use. D-Lib Mag. 22, July\/August 2016 (2016)","DOI":"10.1045\/july2016-paakkonen"},{"key":"2_CR4","doi-asserted-by":"publisher","unstructured":"P\u00e4\u00e4kk\u00f6nen, T., Kettunen, K.: Kansalliskirjaston sanomalehtiaineistot: k\u00e4ytt\u00e4j\u00e4t ja tutkijat kes\u00e4ll\u00e4 2018. Informaatiotutkimus 37(3), 15\u201319 (2018). https:\/\/doi.org\/10.23978\/inf.76067","DOI":"10.23978\/inf.76067"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Piotrowski, M.: Natural language processing for historical texts. Synthesis Lectures on Human Language Technologies. Morgan & Claypool Publishers, San Rafael (2012)","DOI":"10.2200\/S00436ED1V01Y201207HLT017"},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Holley, R.: How good can it get? Analysing and improving OCR accuracy in large scale historic newspaper digitisation programs. D-Lib Mag. 15(3\/4) (2009)","DOI":"10.1045\/march2009-holley"},{"key":"2_CR7","unstructured":"Springmann, U., L\u00fcdeling, A.: OCR of historical printings with an application to building diachronic corpora: a case study using the RIDGES herbal corpus. Digit. Humanit. Q. 11(2) (2017)"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Tanner, S., Mu\u00f1oz, T., Ros, P.H.: Measuring mass text digitization quality and usefulness. Lessons learned from assessing the OCR accuracy of the british library\u2019s 19th century online newspaper archive. D-Lib Mag. 15(8) (2009)","DOI":"10.1045\/july2009-munoz"},{"key":"2_CR9","unstructured":"Clematide, S., Furrer, L., Volk, M.: Crowdsourcing an OCR gold standard for a german and french heritage corpus. In: Calzolari, N., et al. (eds.) Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016) (2016). http:\/\/www.lrec-conf.org\/proceedings\/lrec2016\/pdf\/917_Paper.pdf"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Reynaert, M.: Non-interactive OCR post-correction for giga-scale digitization projects. In: Proceedings of the 9th International Conference on Computational linguistics and Intelligent Text Processing, CICLing\u201908, pp. 617\u2013630 (2008)","DOI":"10.1007\/978-3-540-78135-6_53"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Holley, R.: Crowdsourcing: how and why should libraries do it? D-Lib Mag. 16(3\/4) (2010)","DOI":"10.1045\/march2010-holley"},{"key":"2_CR12","unstructured":"Chrons, O., Sundell, S.: Digitalkoot: making old archives accessible using crowdsourcing. In: Human Computation, Papers from the 2011 AAAI Workshop (2011). http:\/\/www.aaai.org\/ocs\/index.php\/WS\/AAAIW11\/paper\/view\/3813\/4246"},{"key":"2_CR13","unstructured":"Silfverberg, M., Kauppinen, P., Linden, K.: Data-driven spelling correction using weighted finite-state methods. In: Proceedings of the ACL Workshop on Statistical NLP and Weighted Automata, pp. 51\u201359 (2016). https:\/\/aclweb.org\/anthology\/W\/W16\/W16-2406.pdf"},{"key":"2_CR14","unstructured":"Drobac, S., Kauppinen, P., Lind\u00e9n, K.: OCR and post-correction of historical Finnish texts. In: Tiedemann, J. (ed.) Proceedings of the 21st Nordic Conference on Computational Linguistics, NoDaLiDa, 22\u201324 May 2017, Gothenburg, Sweden, pp. 70\u201376 (2017)"},{"issue":"5","key":"2_CR15","doi-asserted-by":"publisher","first-page":"108","DOI":"10.17148\/IARJSET.2016.3523","volume":"3","author":"ML Smitha","year":"2016","unstructured":"Smitha, M.L., Antony, P.J., Sachin, D.J.: Document image analysis using imagemagick and tesseract-ocr. Int. Adv. Res. J. Sci. Eng. Technol. 3(5), 108\u2013112 (2016)","journal-title":"Int. Adv. Res. J. Sci. Eng. Technol."},{"key":"2_CR16","unstructured":"Koistinen, M., Kettunen, K., P\u00e4\u00e4kk\u00f6nen, T.: Improving optical character recognition of finnish historical newspapers with a combination of fraktur & antiqua models and image preprocessing. In: Tiedemann, J. (ed.) Proceedings of the 21st Nordic Conference on Computational Linguistics, NoDaLiDa, 22\u201324 May 2017, Gothenburg, Sweden, pp. 277\u2013283 (2017)"},{"key":"2_CR17","unstructured":"Breuel, T.: The hOCR microformat for OCR workflow and results. Document analysis and recognition, 2007. In: ICDAR 2007, Ninth International Conference on Document Analysis and Recognition (2007). http:\/\/ieeexplore.ieee.org\/stamp\/stamp.jsp?arnumber=4377078"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Kettunen, K., Koistinen, M., Kervinen, J: Ground truth OCR sample data of finnish historical newspapers and journals in data improvement validation of a re-OCRing process. LIBER Q. 30(1), 1\u201320 (2020). http:\/\/doi.org\/10.18352\/lq.10322","DOI":"10.18352\/lq.10322"},{"key":"2_CR19","volume-title":"Foundations of Statistical Natural Language Processing","author":"CD Manning","year":"1999","unstructured":"Manning, C.D., Sch\u00fctze, H.: Foundations of Statistical Natural Language Processing. The MIT Press, Cambridge (1999)"},{"key":"2_CR20","series-title":"Communications in Computer and Information Science","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1007\/978-3-319-41938-1_11","volume-title":"Digital Libraries on the Move","author":"K Kettunen","year":"2016","unstructured":"Kettunen, K.: Keep, change or delete? Setting up a low resource OCR post-correction framework for a digitized old finnish newspaper collection. In: Calvanese, D., De Nart, D., Tasso, C. (eds.) IRCDL 2015. CCIS, vol. 612, pp. 95\u2013103. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-41938-1_11"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Carrasco, R.C.: An open-source OCR evaluation tool. In: Proceeding DATeCH \u201814 Proceedings of the First International Conference on Digital Access to Textual Cultural Heritage, pp. 179\u2013184 (2014)","DOI":"10.1145\/2595188.2595221"},{"key":"2_CR22","unstructured":"Kettunen, K., Kervinen, J., Koistinen, M.: Creating and using ground truth OCR sample data for Finnish historical newspapers and journals. In: DHN2018, Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference, pp. 162\u2013169. http:\/\/ceur-ws.org\/Vol-2084\/)"},{"issue":"1","key":"2_CR23","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/214174.214180","volume":"14","author":"K Taghva","year":"1996","unstructured":"Taghva, K., Borsack, J., Condit, A.: Evaluation of model-based retrieval effectiveness with OCR text. ACM Trans. Inf. Syst. 14(1), 64\u201393 (1996)","journal-title":"ACM Trans. Inf. Syst."},{"key":"2_CR24","doi-asserted-by":"publisher","first-page":"165","DOI":"10.1023\/A:1009902609570","volume":"2","author":"PB Kantor","year":"2000","unstructured":"Kantor, P.B., Voorhees, E.M.: The TREC-5 confusion track: comparing retrieval meth-ods for scanned texts. Inf. Retrieval 2, 165\u2013176 (2000)","journal-title":"Inf. Retrieval"},{"issue":"12","key":"2_CR25","doi-asserted-by":"publisher","first-page":"2928","DOI":"10.1002\/asi.23379","volume":"67","author":"A J\u00e4rvelin","year":"2016","unstructured":"J\u00e4rvelin, A., Keskustalo, H., Sormunen, E., Saastamoinen, M., Kettunen, K.: Information retrieval from historical newspaper collections in highly inflectional languages: a query ex-pansion approach. J. Assoc. Inf. Sci. Technol. 67(12), 2928\u20132946 (2016)","journal-title":"J. Assoc. Inf. Sci. Technol."},{"key":"2_CR26","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1007\/s10032-009-0094-8","volume":"12","author":"D Lopresti","year":"2009","unstructured":"Lopresti, D.: Optical character recognition errors and their effects on natural language pro-cessing. Int. J. Doc. Anal. Recogn. 12, 141\u2013151 (2009)","journal-title":"Int. J. Doc. Anal. Recogn."},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Jarlbrink, J., Snickars, P.: Cultural heritage as digital noise: nineteenth century newspapers in the digital archive. J. Doc. 73(6), 1228\u20131243 (2017)","DOI":"10.1108\/JD-09-2016-0106"}],"container-title":["Lecture Notes in Computer Science","Human Language Technology. Challenges for Computer Science and Linguistics"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-66527-2_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,12,30]],"date-time":"2020-12-30T18:05:19Z","timestamp":1609351519000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-66527-2_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030665265","9783030665272"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-66527-2_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"31 December 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"LTC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Language and Technology Conference","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pozna\u0144","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Poland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2017","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 2017","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2017","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ltconf2017","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ltc.amu.edu.pl\/a2017\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"97","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"26","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}