{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:48:49Z","timestamp":1771951729934,"version":"3.50.1"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030042561","type":"print"},{"value":"9783030042578","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-04257-8_1","type":"book-chapter","created":{"date-parts":[[2018,11,14]],"date-time":"2018-11-14T09:23:25Z","timestamp":1542187405000},"page":"3-14","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Evaluating the Impact of OCR Errors on Topic Modeling"],"prefix":"10.1007","author":[{"given":"Stephen","family":"Mutuvi","sequence":"first","affiliation":[]},{"given":"Antoine","family":"Doucet","sequence":"additional","affiliation":[]},{"given":"Moses","family":"Odeo","sequence":"additional","affiliation":[]},{"given":"Adam","family":"Jatowt","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,11,15]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Silfverberg, M., Rueter, J.: Can morphological analyzers improve the quality of optical character recognition? In: Septentrio Conference Series, vol. 2, pp. 45\u201356 (2015)","DOI":"10.7557\/5.3467"},{"key":"1_CR2","unstructured":"Rosen-Zvi, M., Griffiths, T., Steyvers, M., Smyth, P.: The author-topic model for authors and documents. In: Proceedings of the 20th Conference on Uncertainty in Artificial Intelligence, pp. 487\u2013494. AUAI Press (2004)"},{"key":"1_CR3","first-page":"993","volume":"3","author":"DM Blei","year":"2003","unstructured":"Blei, D.M., Ng, A.Y., Jordan, M.I.: Latent dirichlet allocation. J. Mach. Learn. Res. 3, 993\u20131022 (2003)","journal-title":"J. Mach. Learn. Res."},{"issue":"6","key":"1_CR4","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1002\/asi.20342","volume":"57","author":"DJ Newman","year":"2006","unstructured":"Newman, D.J., Block, S.: Probabilistic topic decomposition of an eighteenth-century American newspaper. J. Assoc. Inf. Sci. Technol. 57(6), 753\u2013767 (2006)","journal-title":"J. Assoc. Inf. Sci. Technol."},{"key":"1_CR5","unstructured":"Nelson, R.K.: Mining the dispatch (2010)"},{"key":"1_CR6","unstructured":"Yang, T.I., Torget, A.J., Mihalcea, R.: Topic modeling on historical newspapers. In: Proceedings of the 5th ACL-HLT Workshop on Language Technology for Cultural Heritage, Social Sciences and Humanities, pp. 96\u2013104. Association for Computational Linguistics (2011)"},{"key":"1_CR7","unstructured":"Chang, J., Gerrish, S., Wang, C., Boyd-Graber, J.L., Blei, D.M.: Reading tea leaves: how humans interpret topic models. In: Advances in Neural Information Processing Systems, pp. 288\u2013296 (2009)"},{"key":"1_CR8","unstructured":"McCallum, A.K.: Mallet: a machine learning for language toolkit (2002)"},{"key":"1_CR9","unstructured":"Walker, D.D., Lund, W.B., Ringger, E.K.: Evaluating models of latent document semantics in the presence of OCR errors. In: Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing, pp. 240\u2013250. Association for Computational Linguistics (2010)"},{"key":"1_CR10","unstructured":"Blevins, C.: Topic modeling Martha Ballard\u2019s diary. http:\/\/historying.org\/2010\/04\/01\/topic-modeling-martha-ballards-diary . Accessed 23 Feb 2018"},{"key":"1_CR11","doi-asserted-by":"publisher","first-page":"788","DOI":"10.1038\/44565","volume":"401","author":"DD Lee","year":"1999","unstructured":"Lee, D.D., Seung, H.S.: Learning the parts of objects by non-negative matrix factorization. Nature 401, 788\u201391 (1999)","journal-title":"Nature"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Arora, S., Ge, R., Moitra, A.: Learning topic models - going beyond SVD. In: Proceedings of 53rd Symposium on Foundations of Computer Science, pp. 1\u201310. IEEE (2012)","DOI":"10.1109\/FOCS.2012.49"},{"key":"1_CR13","doi-asserted-by":"publisher","first-page":"215","DOI":"10.1007\/978-3-319-09259-1_7","volume-title":"Partitional Clustering Algorithms","author":"D Kuang","year":"2015","unstructured":"Kuang, D., Choo, J., Park, H.: Nonnegative matrix factorization for interactive topic modeling and document clustering. In: Celebi, M.E. (ed.) Partitional Clustering Algorithms, pp. 215\u2013243. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-09259-1_7"},{"key":"1_CR14","doi-asserted-by":"publisher","first-page":"159","DOI":"10.1016\/j.eswa.2017.08.047","volume":"91","author":"M Belford","year":"2018","unstructured":"Belford, M., Mac Namee, B., Greene, D.: Stability of topic modeling via matrix factorization. Expert Syst. Appl. 91, 159\u2013169 (2018)","journal-title":"Expert Syst. Appl."},{"key":"1_CR15","series-title":"LNCS (LNAI)","doi-asserted-by":"publisher","first-page":"498","DOI":"10.1007\/978-3-662-44848-9_32","volume-title":"ECML PKDD 2014","author":"D Greene","year":"2014","unstructured":"Greene, D., O\u2019Callaghan, D., Cunningham, P.: How many topics? Stability analysis for topic models. In: Calders, T., Esposito, F., H\u00fcllermeier, E., Meo, R. (eds.) ECML PKDD 2014. LNCS (LNAI), vol. 8724, pp. 498\u2013513. Springer, Heidelberg (2014). https:\/\/doi.org\/10.1007\/978-3-662-44848-9_32"},{"issue":"6","key":"1_CR16","doi-asserted-by":"publisher","first-page":"1299","DOI":"10.1162\/089976604773717621","volume":"16","author":"T Lange","year":"2004","unstructured":"Lange, T., Roth, V., Braun, M.L., Buhmann, J.M.: Stability-based validation of clustering solutions. Neural Comput. 16(6), 1299\u20131323 (2004)","journal-title":"Neural Comput."},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Fang, A., Macdonald, C., Ounis, I., Habel, P.: Using word embedding to evaluate the coherence of topics from Twitter data. In: Proceedings of the 39th International ACM SIGIR Conference on Research and Development in Information Retrieval - SIGIR 2016, pp. 1057\u20131060 (2016)","DOI":"10.1145\/2911451.2914729"},{"issue":"13","key":"1_CR18","doi-asserted-by":"publisher","first-page":"5645","DOI":"10.1016\/j.eswa.2015.02.055","volume":"42","author":"D O\u2019Callaghan","year":"2015","unstructured":"O\u2019Callaghan, D., Greene, D., Carthy, J., Cunningham, P.: An analysis of the coherence of descriptors in topic modeling. Expert Syst. Appl. 42(13), 5645\u20135657 (2015)","journal-title":"Expert Syst. Appl."},{"key":"1_CR19","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1017\/pan.2016.7","volume":"25","author":"D Greene","year":"2017","unstructured":"Greene, D., Cross, J.P.: Exploring the political agenda of the European parliament using a dynamic topic modeling approach. Polit. Anal. 25, 77\u201394 (2017)","journal-title":"Polit. Anal."},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Chiron, G., Doucet, A., Coustaty, M., Visani, M., Moreux, J.P.: Impact of OCR errors on the use of digital libraries: towards a better access to information. In: Proceedings of the ACM\/IEEE Joint Conference on Digital Libraries (2017)","DOI":"10.1109\/JCDL.2017.7991582"},{"key":"1_CR21","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space (2013)"},{"key":"1_CR22","unstructured":"Afli, H., Barrault, L., Schwenk, H.: OCR error correction using statistical machine translation. In: 16th International Conference Intelligent Text Processing Computational Linguistics (CICLing 2015), vol. 7, pp. 175\u2013191 (2015)"},{"issue":"3-4","key":"1_CR23","doi-asserted-by":"publisher","first-page":"127","DOI":"10.1007\/s10032-007-0058-9","volume":"10","author":"Craig Knoblock","year":"2007","unstructured":"Knoblock, C., Lopresti, D., Roy, S., Subramaniam, V.: Special issue on noisy text analytics. Int. J. Doc. Anal. Recogn. 10(3\u20134), 127\u2013128 (2007)","journal-title":"International Journal of Document Analysis and Recognition (IJDAR)"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Eder, M.: Mind your corpus: systematic errors in authorship attribution. Literary Linguist. Comput. 10, 1093 (2013)","DOI":"10.1093\/llc\/fqt039"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Lopresti, D.: Optical character recognition errors and their effects on natural language processing. Presented at The Second Workshop on Analytics for Noisy Unstructured Text Data, Sponsored by ACM (2008)","DOI":"10.1145\/1390749.1390753"},{"key":"1_CR26","first-page":"202","volume-title":"SIGIR 1994","author":"K Taghva","year":"1994","unstructured":"Taghva, K., Borsack, J., Condit, A.: Results of applying probabilistic IR to OCR text. In: Croft, B.W., van Rijsbergen, C.J. (eds.) SIGIR 1994, pp. 202\u2013211. Springer, New York (1994)"},{"key":"1_CR27","unstructured":"Beitzel, S., Jensen, E.C., Grossman, D.A.: A survey of retrieval strategies for OCR text collections. In: Proceedings of 2003 Symposium on Document Image Understanding Technology (2003)"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Taghva, K., Nartker, T., Borsack, J., Lumos, S., Condit, A., Young, R.: Evaluating text categorization in the presence of OCR errors. In: Document Recognition and Retrieval VIII. International Society for Optics and Photonics, vol. 4307, pp. 68\u201375 (2000)","DOI":"10.1117\/12.410861"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Agarwal, S., Godbole, S., Punjani, D., Roy, S.: How much noise is too much: a study in automatic text classification. In: Proceedings of the Seventh IEEE International Conference on Data Mining, ICDM 2007, pp. 3\u201312 (2007)","DOI":"10.1109\/ICDM.2007.21"},{"key":"1_CR30","unstructured":"Steyvers, M., Griffiths, T.: Probabilistic topic models. In: Handbook of Latent Semantic Analysis, vol. 427, no. 7, pp. 424\u2013440 (2007)"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Walker, D., Ringger, E., Seppi, K.: Evaluating supervised topic models in the presence of OCR errors. In: Document Recognition and Retrieval XX, vol. 8658, p. 865812. International Society for Optics and Photonics (2013)","DOI":"10.1117\/12.2008345"}],"container-title":["Lecture Notes in Computer Science","Maturity and Innovation in Digital Libraries"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-04257-8_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,11,2]],"date-time":"2019-11-02T02:12:12Z","timestamp":1572660732000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-04257-8_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030042561","9783030042578"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-04257-8_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"ICADL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Asian Digital Libraries","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hamilton","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Zealand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 November 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icadl2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icadl2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}