{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T05:21:10Z","timestamp":1769923270156,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","license":[{"start":{"date-parts":[[2008,7,24]],"date-time":"2008-07-24T00:00:00Z","timestamp":1216857600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000144","name":"Division of Computer and Network Systems","doi-asserted-by":"publisher","award":["CNS-0430178"],"award-info":[{"award-number":["CNS-0430178"]}],"id":[{"id":"10.13039\/100000144","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2008,7,24]]},"DOI":"10.1145\/1390749.1390753","type":"proceedings-article","created":{"date-parts":[[2008,7,25]],"date-time":"2008-07-25T17:17:47Z","timestamp":1217006267000},"page":"9-16","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Optical character recognition errors and their effects on natural language processing"],"prefix":"10.1145","author":[{"given":"Daniel","family":"Lopresti","sequence":"first","affiliation":[{"name":"Lehigh University, Bethlehem, PA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2008,7,24]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/844379.844617"},{"key":"e_1_3_2_1_3_1","first-page":"401","volume-title":"Proceedings of the Third Annual Symposium on Document Analysis and Information Retrieval","author":"Esakov J.","year":"1994","unstructured":"J. Esakov , D. P. Lopresti , J. S. Sandberg , and J. Zhou . Issues in automatic OCR error classification . In Proceedings of the Third Annual Symposium on Document Analysis and Information Retrieval , pages 401 -- 412 , April 1994 . J. Esakov, D. P. Lopresti, J. S. Sandberg, and J. Zhou. Issues in automatic OCR error classification. In Proceedings of the Third Annual Symposium on Document Analysis and Information Retrieval, pages 401--412, April 1994."},{"key":"e_1_3_2_1_4_1","first-page":"103","volume-title":"Proceedings of the Symposium on Document Image Understanding Technology","author":"Farooq F.","year":"2005","unstructured":"F. Farooq and Y. Al-Onaizan . Effect of degraded input on statistical machine translation . In Proceedings of the Symposium on Document Image Understanding Technology , pages 103 -- 109 , November 2005 . F. Farooq and Y. Al-Onaizan. Effect of degraded input on statistical machine translation. In Proceedings of the Symposium on Document Image Understanding Technology, pages 103--109, November 2005."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the Workshop on Analytics for Noisy Unstructured Text Data","author":"Foster J.","year":"2007","unstructured":"J. Foster . Treebanks gone bad: Generating a treebank of ungrammatical English . In Proceedings of the Workshop on Analytics for Noisy Unstructured Text Data , Hyderabad, India , January 2007 . J. Foster. Treebanks gone bad: Generating a treebank of ungrammatical English. In Proceedings of the Workshop on Analytics for Noisy Unstructured Text Data, Hyderabad, India, January 2007."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1117\/12.234714"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1117\/12.373506","volume-title":"Proceedings of Document Recognition and Retrieval VII (IS&T\/SPIE Electronic Imaging)","volume":"3967","author":"Hu J.","year":"2000","unstructured":"J. Hu , R. Kashi , D. Lopresti , and G. Wilfong . Medium-independent table detection. In D. P. Lopresti and J. Zhou, editors , Proceedings of Document Recognition and Retrieval VII (IS&T\/SPIE Electronic Imaging) , volume 3967 , pages 291 -- 302 , San Jose, CA , January 2000 . J. Hu, R. Kashi, D. Lopresti, and G. Wilfong. Medium-independent table detection. In D. P. Lopresti and J. Zhou, editors, Proceedings of Document Recognition and Retrieval VII (IS&T\/SPIE Electronic Imaging), volume 3967, pages 291--302, San Jose, CA, January 2000."},{"key":"e_1_3_2_1_8_1","first-page":"111","volume-title":"Proceedings of the Symposium on Document Image Understanding Technology","author":"Jing H.","year":"2003","unstructured":"H. Jing , D. Lopresti , and C. Shih . Summarizing noisy documents . In Proceedings of the Symposium on Document Image Understanding Technology , pages 111 -- 119 , April 2003 . H. Jing, D. Lopresti, and C. Shih. Summarizing noisy documents. In Proceedings of the Symposium on Document Image Understanding Technology, pages 111--119, April 2003."},{"key":"e_1_3_2_1_9_1","volume-title":"Distribution 1.0","author":"Lewis D. D.","year":"2008","unstructured":"D. D. Lewis . Reuters-21578 Test Collection , Distribution 1.0 , May 2008 . http:\/\/www.daviddlewis.com\/resources\/testcollections\/reuters21578\/. D. D. Lewis. Reuters-21578 Test Collection, Distribution 1.0, May 2008. http:\/\/www.daviddlewis.com\/resources\/testcollections\/reuters21578\/."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1066677.1066851"},{"key":"e_1_3_2_1_11_1","first-page":"0G","volume-title":"Proceedings of Document Recognition and Retrieval XV (IS&T\/SPIE Electronic Imaging)","volume":"6815","author":"Lopresti D.","unstructured":"D. Lopresti . Measuring the impact of character recognition errors on downstream text analysis . In Proceedings of Document Recognition and Retrieval XV (IS&T\/SPIE Electronic Imaging) , volume 6815 , pages 0G .01--0G.11, San Jose, CA, January 2008. D. Lopresti. Measuring the impact of character recognition errors on downstream text analysis. In Proceedings of Document Recognition and Retrieval XV (IS&T\/SPIE Electronic Imaging), volume 6815, pages 0G.01--0G.11, San Jose, CA, January 2008."},{"key":"e_1_3_2_1_12_1","volume-title":"May","author":"Lopresti D.","year":"2008","unstructured":"D. Lopresti . Noisy OCR text dataset , May 2008 . http:\/\/www.cse.lehigh.edu\/~lopresti\/noisytext.html. D. Lopresti. Noisy OCR text dataset, May 2008. http:\/\/www.cse.lehigh.edu\/~lopresti\/noisytext.html."},{"key":"e_1_3_2_1_13_1","volume-title":"Penn Treebank tokenizer (sed script source code)","author":"MacIntyre R.","year":"1995","unstructured":"R. MacIntyre . Penn Treebank tokenizer (sed script source code) , 1995 . http:\/\/www.cis.upenn.edu\/~treebank\/tokenizer.sed. R. MacIntyre. Penn Treebank tokenizer (sed script source code), 1995. http:\/\/www.cis.upenn.edu\/~treebank\/tokenizer.sed."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.3115\/974147.974191"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3115\/1072133.1072186"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the Empirical Methods in Natural Language Processing Conference","author":"Ratnaparkhi A.","year":"1996","unstructured":"A. Ratnaparkhi . A maximum entropy part-of-speech tagger . In Proceedings of the Empirical Methods in Natural Language Processing Conference , May 1996 . ftp:\/\/ftp.cis.upenn.edu\/pub\/adwait\/jmx\/jmx.tar.gz. A. Ratnaparkhi. A maximum entropy part-of-speech tagger. In Proceedings of the Empirical Methods in Natural Language Processing Conference, May 1996. ftp:\/\/ftp.cis.upenn.edu\/pub\/adwait\/jmx\/jmx.tar.gz."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.3115\/974557.974561"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/0306-4573(95)00058-5"},{"key":"e_1_3_2_1_19_1","unstructured":"Tesseract open source OCR engine May 2008. http:\/\/code.google.com\/p\/tesseract-ocr\/.  Tesseract open source OCR engine May 2008. http:\/\/code.google.com\/p\/tesseract-ocr\/."},{"key":"e_1_3_2_1_20_1","unstructured":"Workshop on Analytics for Noisy Unstructured Text Data. Hyderabad India January 2007. http:\/\/research.ihost.com\/and2007\/. Workshop on Analytics for Noisy Unstructured Text Data . Hyderabad India January 2007. http:\/\/research.ihost.com\/and2007\/."}],"event":{"name":"AND '08: Second Workshop on Analytics for Noisy Unstructured Text Data","location":"Singapore","acronym":"AND '08"},"container-title":["Proceedings of the second workshop on Analytics for noisy unstructured text data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1390749.1390753","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/1390749.1390753","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T14:47:10Z","timestamp":1750258030000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1390749.1390753"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2008,7,24]]},"references-count":19,"alternative-id":["10.1145\/1390749.1390753","10.1145\/1390749"],"URL":"https:\/\/doi.org\/10.1145\/1390749.1390753","relation":{},"subject":[],"published":{"date-parts":[[2008,7,24]]},"assertion":[{"value":"2008-07-24","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}