{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T17:40:15Z","timestamp":1756489215450,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,18]],"date-time":"2024-04-18T00:00:00Z","timestamp":1713398400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,18]]},"DOI":"10.1145\/3603287.3651200","type":"proceedings-article","created":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T12:06:34Z","timestamp":1714219594000},"page":"50-59","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Challenges of Automatic Document Processing with Historical Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-2907-4494","authenticated-orcid":false,"given":"Katerina","family":"Vilkomir","sequence":"first","affiliation":[{"name":"East Carolina University, Greenville, North Carolina, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9712-148X","authenticated-orcid":false,"given":"Nic","family":"Herndon","sequence":"additional","affiliation":[{"name":"East Carolina University, Greenville, North Carolina, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"unstructured":"[n. d.]. The Evolution of the English Spelling System. https:\/\/www.timetoast.com\/timelines\/the-evolution-of-the-english-spelling-system","key":"e_1_3_2_1_1_1"},{"key":"e_1_3_2_1_2_1","volume-title":"Muhammad Tanvir Afzal, and Muhammad Abdul Qadir","author":"Ahmad Riaz","year":"2016","unstructured":"Riaz Ahmad, Muhammad Tanvir Afzal, and Muhammad Abdul Qadir. 2016. Information Extraction from PDF Sources Based on Rule-based System Using Integrated Formats. In Semantic Web Challenges: Third SemWebEval Challenge at ESWC 2016, Heraklion, Crete, Greece, May 29-June 2, 2016, Revised Selected Papers 3. Springer, Crete, Greece, 293--308."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1007\/s11042-024-18187-y"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1007\/s00799-020-00280-w"},{"key":"e_1_3_2_1_5_1","volume-title":"A Large-scale Comparison of Historical Text Normalization Systems. arXiv preprint arXiv:1904.02036","author":"Bollmann Marcel","year":"2019","unstructured":"Marcel Bollmann. 2019. A Large-scale Comparison of Historical Text Normalization Systems. arXiv preprint arXiv:1904.02036 (2019)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_6_1","DOI":"10.1007\/s00146-016-0686-z"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1145\/3352631.3352643"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the international association for pattern recognition (IAPR)","author":"Clausner Christian","year":"2014","unstructured":"Christian Clausner, Stefan Pletschacher, and Apostolos Antonacopoulos. 2014. Efficient OCR Training Data Generation With Aletheia. Proceedings of the international association for pattern recognition (IAPR), Tours, France (2014), 7--10."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1145\/2494266.2494271"},{"key":"e_1_3_2_1_10_1","volume-title":"Matteo Romanello, and Antoine Doucet.","author":"Ehrmann Maud","year":"2021","unstructured":"Maud Ehrmann, Ahmed Hamdi, Elvys Linhares Pontes, Matteo Romanello, and Antoine Doucet. 2021. Named Entity Recognition and Classification on Historical Documents: A Survey. arXiv preprint arXiv:2109.11406 (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"Generation of Synthetic Training Data for Handwritten Indic Script Recognition. In 2015 13th International Conference on Document Analysis and Recognition (ICDAR). IEEE","author":"Gaur Shivansh","year":"2015","unstructured":"Shivansh Gaur, Siddhant Sonkar, and Partha Pratim Roy. 2015. Generation of Synthetic Training Data for Handwritten Indic Script Recognition. In 2015 13th International Conference on Document Analysis and Recognition (ICDAR). IEEE, Nancy, France, 491--495."},{"volume-title":"An Automated Pipeline for Robust Image Processing and Optical Character Recognition of Historical Documents","author":"Gruber Ivan","unstructured":"Ivan Gruber, Pavel Ircing, Petr Neduchal, Marek Hr\u00faz, Miroslav Hlav\u00e1\u010d, Zbyn\u011bk Zaj\u00edc, Jan \u0160vec, and Martin Bul\u00edn. 2020. An Automated Pipeline for Robust Image Processing and Optical Character Recognition of Historical Documents. Vol. 12335. Springer, Cham. 166--175 pages.","key":"e_1_3_2_1_12_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1038\/s41378-022-00432-9"},{"unstructured":"Justus Friedrich Carl Hecker. 1844. The Epidemics of the Middle Ages. G. Woodfall.","key":"e_1_3_2_1_14_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1016\/j.patcog.2013.02.008"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1093\/llc\/fqz024"},{"key":"e_1_3_2_1_17_1","volume-title":"Johann Gutenberg: The Man and His Invention. (No Title)","author":"Kapr Albert","year":"1996","unstructured":"Albert Kapr and Douglas Martin. 1996. Johann Gutenberg: The Man and His Invention. (No Title) (1996)."},{"key":"e_1_3_2_1_18_1","volume-title":"University of Pennsylvania LJS 25: Liber metaphisice","author":"Kislak Center","year":"1853","unstructured":"Kislak Center. 2015. University of Pennsylvania LJS 25: Liber metaphisice; Liber ethicorum, fol. 69r. https:\/\/www.flickr.com\/photos\/130418531@N02\/16990603432."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_19_1","DOI":"10.1007\/11735106_36"},{"doi-asserted-by":"crossref","unstructured":"Sara Lafia David A Bleckley and J Trent Alexander. 2023. Digitizing and Parsing Semi-structured Historical Administrative Documents from the GI Bill Mortgage Guarantee Program. (2023).","key":"e_1_3_2_1_20_1","DOI":"10.1108\/JD-03-2023-0055"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.3390\/jimaging6100110"},{"key":"e_1_3_2_1_22_1","first-page":"4","article-title":"Ethical Issues in Digitization of Cultural Heritage","volume":"4","author":"Man\u017euch Zinaida","year":"2017","unstructured":"Zinaida Man\u017euch. 2017. Ethical Issues in Digitization of Cultural Heritage. Journal of Contemporary Archival Studies 4, 2 (2017), 4.","journal-title":"Journal of Contemporary Archival Studies"},{"key":"e_1_3_2_1_23_1","volume-title":"Hybrid Training Data for Historical Text OCR. In 2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE","author":"Mart\u00ednek Ji\u0159\u00ed","year":"2019","unstructured":"Ji\u0159\u00ed Mart\u00ednek, Ladislav Lenc, Pavel Kr\u00e1l, Anguelos Nicolaou, and Vincent Christlein. 2019. Hybrid Training Data for Historical Text OCR. In 2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, Sydney, Australia, 565--570."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1145\/3453476"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_25_1","DOI":"10.1073\/pnas.96.14.8028"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_26_1","DOI":"10.1890\/ES13-00295.1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.1109\/ICDAR.2019.00028"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.1145\/2037342.2037363"},{"unstructured":"William John Simpson. 1903. Report on the Causes and Continuance of Plague in Hongkong and Suggestions as to Remedial Measures. Waterlow.","key":"e_1_3_2_1_29_1"},{"volume-title":"Epidemiological, Clinical, Therapeutic and Preventive Aspects of the Disease","author":"Simpson William John","unstructured":"William John Simpson. 1905. A Treatise on Plague Dealing with the Historical, Epidemiological, Clinical, Therapeutic and Preventive Aspects of the Disease. University Press.","key":"e_1_3_2_1_30_1"},{"key":"e_1_3_2_1_31_1","volume-title":"A Research Agenda for Historical and Multilingual Optical Character Recognition. NUlab","author":"Smith David A","year":"2018","unstructured":"David A Smith and Ryan Cordell. 2018. A Research Agenda for Historical and Multilingual Optical Character Recognition. NUlab, Northeastern University (2018), 36."},{"key":"e_1_3_2_1_32_1","volume-title":"Non-destructive Digitization of Soiled Historical Chinese Bamboo Scrolls. In 2018 13th IAPR International Workshop on Document Analysis Systems (DAS). IEEE","author":"Stromer Daniel","year":"2018","unstructured":"Daniel Stromer, Vincent Christlein, Andreas Maier, Patrick Zippert, Eric Helmecke, Tino Hausotte, and Xiaolin Huang. 2018. Non-destructive Digitization of Soiled Historical Chinese Bamboo Scrolls. In 2018 13th IAPR International Workshop on Document Analysis Systems (DAS). IEEE, Vienna, Austria, 55--60."},{"unstructured":"J.D. Thomas. Sep 9 2015. The Rise of the Spelling Reform Movement. https:\/\/wordsfrom.us\/2015\/09\/the-rise-of-the-spelling-reform-movement\/","key":"e_1_3_2_1_33_1"},{"key":"e_1_3_2_1_34_1","volume-title":"Jacco Van Ossenbruggen, and Lynda Hardman","author":"Traub Myriam C","year":"2015","unstructured":"Myriam C Traub, Jacco Van Ossenbruggen, and Lynda Hardman. 2015. Impact Analysis of OCR Quality on Research Tasks in Digital Archives. In Research and Advanced Technology for Digital Libraries: 19th International Conference on Theory and Practice of Digital Libraries, TPDL 2015, Pozna\u0144, Poland, September 14-18, 2015, Proceedings 19. Springer, Pozna\u0144, Poland, 252--263."},{"unstructured":"University of Glasgow Library. 2009. The Workes of our Ancient and Learned English Poet Geffrey Chaucer - printed by Adam Islip 1602. https:\/\/www.flickr.com\/photos\/35401416@N08\/3654028802.","key":"e_1_3_2_1_35_1"},{"key":"e_1_3_2_1_36_1","volume-title":"Kasra Hosseini, Barbara McGillivray, and Giovanni Colavizza.","author":"Strien Daniel Van","year":"2020","unstructured":"Daniel Van Strien, Kaspar Beelen, Mariona Coll Ardanuy, Kasra Hosseini, Barbara McGillivray, and Giovanni Colavizza. 2020. Assessing the Impact of OCR Quality on Downstream NLP Tasks. (2020)."},{"unstructured":"Walters Art Museum Illuminated Manuscripts. 2012. Two Leaves from the Mirror of Human Salvation The Marriage of the Virgin and the Marriage of Sarah and Tobias Walters Manuscript W.149 fol. 1v. https:\/\/www.flickr.com\/photos\/medmss\/8094009545\/.","key":"e_1_3_2_1_37_1"},{"unstructured":"Walters Art Museum Illuminated Manuscripts. 2013. Misbound Liturgical Psalter Initial D with a Jewish Priest Walters Manuscript W.70 fol. 56r. https:\/\/www.flickr.com\/photos\/medmss\/8738198593.","key":"e_1_3_2_1_38_1"},{"unstructured":"World Intellectual Property Organization. 2017. Intellectual Property and Genetic Resources Traditional Knowledge and Traditional Cultural Expressions. In Research Handbook on the World Intellectual Property Organization. WIPO.int.","key":"e_1_3_2_1_39_1"}],"event":{"sponsor":["ACM Association for Computing Machinery"],"acronym":"ACM SE '24","name":"ACM SE '24: 2024 ACM Southeast Conference","location":"Marietta GA USA"},"container-title":["Proceedings of the 2024 ACM Southeast Conference on ZZZ"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3603287.3651200","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3603287.3651200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T17:07:10Z","timestamp":1756487230000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3603287.3651200"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,18]]},"references-count":39,"alternative-id":["10.1145\/3603287.3651200","10.1145\/3603287"],"URL":"https:\/\/doi.org\/10.1145\/3603287.3651200","relation":{},"subject":[],"published":{"date-parts":[[2024,4,18]]},"assertion":[{"value":"2024-04-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}