{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,16]],"date-time":"2025-10-16T20:15:49Z","timestamp":1760645749779},"publisher-location":"Berlin, Heidelberg","reference-count":20,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642125492"},{"type":"electronic","value":"9783642125508"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2010]]},"DOI":"10.1007\/978-3-642-12550-8_23","type":"book-chapter","created":{"date-parts":[[2010,4,10]],"date-time":"2010-04-10T00:32:38Z","timestamp":1270859558000},"page":"281-290","source":"Crossref","is-referenced-by-count":3,"title":["On the Assessment of Text Corpora"],"prefix":"10.1007","author":[{"given":"David","family":"Pinto","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Paolo","family":"Rosso","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"H\u00e9ctor","family":"Jim\u00e9nez-Salazar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","reference":[{"issue":"6","key":"23_CR1","doi-asserted-by":"publisher","first-page":"584","DOI":"10.1002\/asi.20147","volume":"56","author":"F. Debole","year":"2005","unstructured":"Debole, F., Sebastiani, F.: An analysis of the relative hardness of Reuters-21578 subsets. Journal of the American Society for Information Science and Technology\u00a056(6), 584\u2013596 (2005)","journal-title":"Journal of the American Society for Information Science and Technology"},{"key":"23_CR2","unstructured":"Wibowo, W., Williams, H.: On using hierarchies for document classification. In: Proc. of the Australian Document Computing Symposium, pp. 31\u201337 (1999)"},{"key":"23_CR3","volume-title":"Type-Token Mathematics: A Textbook of Mathematical Linguistics","author":"G. Herdan","year":"1960","unstructured":"Herdan, G.: Type-Token Mathematics: A Textbook of Mathematical Linguistics. Mouton & Co., The Hague (1960)"},{"issue":"5","key":"23_CR4","doi-asserted-by":"publisher","first-page":"323","DOI":"10.1023\/A:1001749303137","volume":"32","author":"F.J. Tweedie","year":"1998","unstructured":"Tweedie, F.J., Baayen, R.H.: How variable may a constant be?: Measures of lexical richness in perspective. Computers and the Humanities\u00a032(5), 323\u2013352 (1998)","journal-title":"Computers and the Humanities"},{"issue":"2","key":"23_CR5","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1023\/A:1022673822140","volume":"37","author":"D.L. Hoover","year":"2004","unstructured":"Hoover, D.L.: Another perspective on vocabulary richness. Computers and the Humanities\u00a037(2), 151\u2013178 (2004)","journal-title":"Computers and the Humanities"},{"key":"23_CR6","unstructured":"Japkowicz, N.: The class imbalance problem: Significance and strategies. In: Proc. of the 2000 International Conference on Artificial Intelligence (IC-AI 2000), vol.\u00a01, pp. 111\u2013117 (2000)"},{"key":"23_CR7","unstructured":"Montejo-R\u00e1ez, A.: Automatic text categorization of documents in the High Energy Physics domain. Phd thesis, Granada University, Spain (2006)"},{"issue":"1-2","key":"23_CR8","first-page":"109","volume":"19","author":"J. Diederich","year":"2004","unstructured":"Diederich, J., Kindermann, J., Leopold, E., Paass, G.: Authorship attribution with support vector machines. Applied Intelligence\u00a019(1-2), 109\u2013123 (2004)","journal-title":"Applied Intelligence"},{"issue":"1","key":"23_CR9","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1023\/B:CHUM.0000009225.28847.77","volume":"38","author":"F. Can","year":"2004","unstructured":"Can, F., Patton, J.M.: Change of writing style with time. Computers and the Humanities\u00a038(1), 61\u201382 (2004)","journal-title":"Computers and the Humanities"},{"issue":"2","key":"23_CR10","first-page":"174","volume":"41","author":"D.L. Hoover","year":"2007","unstructured":"Hoover, D.L.: Corpus stylistics, stylometry, and the styles of henry james. Style\u00a041(2), 174\u2013203 (2007)","journal-title":"Style"},{"key":"23_CR11","unstructured":"Brants, T., Popat, A.C., Xu, P., Och, F.J., Dean, J.: Large language models in machine translation. In: Proc. of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL), pp. 858\u2013867 (2007)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"M\u00e0rquez, L., Padr\u00f3, L.: A flexible pos tagger using an automatically acquired language model. In: Proc. of the 35th annual meeting on Association for Computational Linguistics, pp. 238\u2013245 (1997)","DOI":"10.3115\/976909.979648"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Ponte, J.M., Croft, W.B.: A language modeling approach to information retrieval. In: Research and Development in Information Retrieval, pp. 275\u2013281 (1998)","DOI":"10.1145\/290941.291008"},{"issue":"2","key":"23_CR14","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1109\/TPAMI.1983.4767370","volume":"5","author":"L.R. Bahl","year":"1983","unstructured":"Bahl, L.R., Jelinek, E., Mercer, R.L.: A maximum likelihood approach to continuous speech recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence\u00a05(2), 179\u2013190 (1983)","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"4","key":"23_CR15","first-page":"467","volume":"18","author":"P.F. Brown","year":"1992","unstructured":"Brown, P.F., Pietra, V.J.D., de Souza, P.V., Lai, J.C., Mercer, R.L.: Class-based n-gram models of natural language. Computational Linguistics\u00a018(4), 467\u2013479 (1992)","journal-title":"Computational Linguistics"},{"key":"23_CR16","volume-title":"Human behaviour and the principle of least effort","author":"G.K. Zipf","year":"1949","unstructured":"Zipf, G.K.: Human behaviour and the principle of least effort. Addison-Wesley, Reading (1949)"},{"key":"23_CR17","unstructured":"Cardoso-Cachopo, A., Oliveira, A.: Combining LSI with other classifiers to improve accuracy of single-label text categorization. In: First European Workshop on Latent Semantic Analysis in Technology Enhanced Learning - EWLSATEL 2007 (2007)"},{"key":"23_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1007\/978-3-540-70939-8_54","volume-title":"Computational Linguistics and Intelligent Text Processing","author":"D. Pinto","year":"2007","unstructured":"Pinto, D., Bened\u00ed, J.M., Rosso, P.: Clustering narrow-domain short texts by using the Kullback-Leibler distance. In: Gelbukh, A. (ed.) CICLing 2007. LNCS, vol.\u00a04394, pp. 611\u2013622. Springer, Heidelberg (2007)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Agirre, E., Soroa, A.: Semeval-2007 task 2: Evaluating word sense induction and discrimination systems. In: Proc. of the 4th International Workshop on Semantic Evaluations - SemEval 2007, pp. 7\u201312. Association for Computational Linguistics (2007)","DOI":"10.3115\/1621474.1621476"},{"key":"23_CR20","doi-asserted-by":"crossref","first-page":"81","DOI":"10.1093\/biomet\/30.1-2.81","volume":"30","author":"M. Kendall","year":"1938","unstructured":"Kendall, M.: A new measure of rank correlation. Biometrika\u00a030, 81\u201389 (1938)","journal-title":"Biometrika"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Information Systems"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-12550-8_23.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,11,24]],"date-time":"2020-11-24T02:55:07Z","timestamp":1606186507000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-12550-8_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010]]},"ISBN":["9783642125492","9783642125508"],"references-count":20,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-12550-8_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2010]]}}}