{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T09:38:47Z","timestamp":1772789927179,"version":"3.50.1"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2020,3,19]],"date-time":"2020-03-19T00:00:00Z","timestamp":1584576000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,3,19]],"date-time":"2020-03-19T00:00:00Z","timestamp":1584576000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"publisher","award":["CZ.02.1.01\/0.0\/0.0\/16_013\/0001758"],"award-info":[{"award-number":["CZ.02.1.01\/0.0\/0.0\/16_013\/0001758"]}],"id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006108","name":"Kult\u00farna a Edukacn\u00e1 Grantov\u00e1 Agent\u00fara M\u0160VVa\u0160 SR","doi-asserted-by":"publisher","award":["K-16-022-00"],"award-info":[{"award-number":["K-16-022-00"]}],"id":[{"id":"10.13039\/501100006108","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006109","name":"Vedeck\u00e1 Grantov\u00e1 Agent\u00fara M\u0160VVa\u0160 SR a SAV","doi-asserted-by":"publisher","award":["2\/0017\/17"],"award-info":[{"award-number":["2\/0017\/17"]}],"id":[{"id":"10.13039\/501100006109","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Lang Resources &amp; Evaluation"],"published-print":{"date-parts":[[2020,9]]},"DOI":"10.1007\/s10579-020-09487-4","type":"journal-article","created":{"date-parts":[[2020,3,19]],"date-time":"2020-03-19T19:03:38Z","timestamp":1584644618000},"page":"713-745","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["Comparing web-crawled and traditional corpora"],"prefix":"10.1007","volume":"54","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3977-2393","authenticated-orcid":false,"given":"V\u00e1clav","family":"Cvr\u010dek","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1170-9344","authenticated-orcid":false,"given":"Zuzana","family":"Komrskov\u00e1","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0429-6542","authenticated-orcid":false,"given":"David","family":"Luke\u0161","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3707-6466","authenticated-orcid":false,"given":"Petra","family":"Poukarov\u00e1","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6676-317X","authenticated-orcid":false,"given":"Anna","family":"\u0158eho\u0159kov\u00e1","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9348-5833","authenticated-orcid":false,"given":"Adrian Jan","family":"Zasina","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4600-5515","authenticated-orcid":false,"given":"Vladim\u00edr","family":"Benko","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,3,19]]},"reference":[{"key":"9487_CR1","unstructured":"Anthony, L. (2018). AntCorGen. Tokyo: Waseda University. Retrieved November 23, 2018, from http:\/\/www.laurenceanthony.net\/software."},{"key":"9487_CR2","volume-title":"The BNC handbook: Exploring the British national corpus with SARA","author":"G Aston","year":"1998","unstructured":"Aston, G., & Burnard, L. (1998). The BNC handbook: Exploring the British national corpus with SARA. Edinburgh: Edinburgh University Press."},{"key":"9487_CR3","volume-title":"Always on: Language in an online and mobile world","author":"N Baron","year":"2010","unstructured":"Baron, N. (2010). Always on: Language in an online and mobile world (1st ed.). Oxford: Oxford University Press.","edition":"1"},{"issue":"3","key":"9487_CR4","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1007\/s10579-009-9081-4","volume":"43","author":"M Baroni","year":"2009","unstructured":"Baroni, M., Bernardini, S., Ferraresi, A., & Zanchetta, E. (2009). The WaCky wide web: A collection of very large linguistically processed web-crawled corpora. Language Resources and Evaluation,43(3), 209\u2013226. https:\/\/doi.org\/10.1007\/s10579-009-9081-4.","journal-title":"Language Resources and Evaluation"},{"key":"9487_CR5","unstructured":"Baroni, M., Kilgarriff, A., Pomik\u00e1lek, J., & Rychl\u00fd, P. (2006). WebBootCaT: a web tool for instant corpora. In Proceeding of the EuraLex Conference (pp. 123\u2013132)."},{"key":"9487_CR6","unstructured":"Bene\u0161ov\u00e1, L., K\u0159en, M., & Waclawicov\u00e1, M. (2013). ORAL2013: Representative corpus of informal spoken Czech. czech, Praha: Institute of the Czech National Corpus. FF UK. Retrieved March 18, 2020, from http:\/\/www.korpus.cz."},{"key":"9487_CR7","unstructured":"Benko, V. (2014). Aranea: Yet another family of (comparable) web corpora. In International Conference on Text, Speech, and Dialogue (pp. 257\u2013264). Berlin: Springer."},{"key":"9487_CR8","unstructured":"Benko, V. (2016a). Two years of Aranea: Increasing counts and tuning the pipeline. In LREC (pp. 4245\u20134248)."},{"key":"9487_CR9","first-page":"19","volume":"10","author":"V Benko","year":"2016","unstructured":"Benko, V. (2016b). Feeding the \u201cBrno Pipeline\u201d: The case of Araneum Slovacum. RASLAN 2016 Recent Advances in Slavonic Natural Language Processing,10, 19\u201327.","journal-title":"RASLAN 2016 Recent Advances in Slavonic Natural Language Processing"},{"key":"9487_CR10","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511621024","volume-title":"Variation across speech and writing","author":"D Biber","year":"1988","unstructured":"Biber, D. (1988). Variation across speech and writing. Cambridge: Cambridge University Press."},{"issue":"4","key":"9487_CR11","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1093\/llc\/8.4.243","volume":"8","author":"D Biber","year":"1993","unstructured":"Biber, D. (1993). Representativeness in corpus design. Literary and Linguistic Computing,8(4), 243\u2013257.","journal-title":"Literary and Linguistic Computing"},{"key":"9487_CR12","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511519871","volume-title":"Dimensions of register variation: A cross-linguistic comparison","author":"D Biber","year":"1995","unstructured":"Biber, D. (1995). Dimensions of register variation: A cross-linguistic comparison. Cambridge: Cambridge University Press."},{"issue":"1","key":"9487_CR13","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1075\/lic.14.1.02bib","volume":"14","author":"D Biber","year":"2014","unstructured":"Biber, D. (2014). Using multi-dimensional analysis to explore cross-linguistic universals of register variation. Languages in Contrast,14(1), 7\u201334. https:\/\/doi.org\/10.1075\/lic.14.1.02bib.","journal-title":"Languages in Contrast"},{"issue":"2","key":"9487_CR14","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1177\/0075424216628955","volume":"44","author":"D Biber","year":"2016","unstructured":"Biber, D., & Egbert, J. (2016). Register variation on the searchable web: A multi-dimensional analysis. Journal of English Linguistics,44(2), 95\u2013137. https:\/\/doi.org\/10.1177\/0075424216628955.","journal-title":"Journal of English Linguistics"},{"key":"9487_CR15","unstructured":"\u010cerm\u00e1k, F., Adamovi\u010dov\u00e1, A., & Pe\u0161i\u010dka, J. (2001). PMK: Prague spoken corpus. czech, Praha: Institute of the Czech National Corpus. FF UK. Retrieved March 18, 2020, from http:\/\/www.korpus.cz."},{"issue":"2","key":"9487_CR16","first-page":"83","volume":"77","author":"V Cvr\u010dek","year":"2016","unstructured":"Cvr\u010dek, V., \u010cerm\u00e1kov\u00e1, A., & K\u0159en, M. (2016). Nov\u00e1 koncepce synchronn\u00edch korpus\u016f psan\u00e9 \u010de\u0161tiny. Slovo a slovesnost,77(2), 83\u2013101.","journal-title":"Slovo a slovesnost"},{"issue":"4","key":"9487_CR17","first-page":"293","volume":"79","author":"V Cvr\u010dek","year":"2018","unstructured":"Cvr\u010dek, V., Komrskov\u00e1, Z., Luke\u0161, D., Poukarov\u00e1, P., \u0158eho\u0159kov\u00e1, A., & Zasina, A. J. (2018a). Variabilita \u010de\u0161tiny: multidimenzion\u00e1ln\u00ed anal\u00fdza [Variability of Czech: A multi-dimensional analysis]. Slovo a slovesnost,79(4), 293\u2013321.","journal-title":"Slovo a slovesnost"},{"key":"9487_CR18","doi-asserted-by":"publisher","DOI":"10.1515\/cllt-2018-0020","author":"V Cvr\u010dek","year":"2018","unstructured":"Cvr\u010dek, V., Komrskov\u00e1, Z., Luke\u0161, D., Poukarov\u00e1, P., \u0158eho\u0159kov\u00e1, A., & Zasina, A. J. (2018b). From extra- to intratextual characteristics: Charting the space of variation in Czech through MDA. Corpus Linguistics and Linguistic Theory. https:\/\/doi.org\/10.1515\/cllt-2018-0020.","journal-title":"Corpus Linguistics and Linguistic Theory"},{"key":"9487_CR19","unstructured":"Cvr\u010dek, V., Komrskov\u00e1, Z., Luke\u0161, D., Poukarov\u00e1, P., \u0158eho\u0159kov\u00e1, A., & Zasina, A. J. (forthcoming). Register variability of elicited texts."},{"key":"9487_CR20","unstructured":"Davies, M. (2018). The 14 Billion Word iWeb Corpus. Retrieved May 10, 2019, from https:\/\/www.english-corpora.org\/iweb\/."},{"key":"9487_CR21","unstructured":"Francis, W. N., & Ku\u010dera, H. (1964, 1979). Manual of information to accompany A Standard Corpus of Present-Day Edited American English, for use with Digital Computers. Brown Corpus Manual. Retrieved December 13, 2018, from http:\/\/clu.uni.no\/icame\/manuals\/BROWN\/INDEX.HTM."},{"key":"9487_CR22","first-page":"25","volume-title":"Narodowy korpus j\u0119zyka polskiego: praca zbiorowa","author":"RL G\u00f3rski","year":"2012","unstructured":"G\u00f3rski, R. L., & \u0141azi\u0144ski, M. (2012). Reprezentatywno\u015b\u0107 i zr\u00f3wnowa\u017cenie korpusu. In A. Przepi\u00f3rkowski, M. Ba\u0144ko, R. L. G\u00f3rski, & B. Lewandowska-Tomaszczyk (Eds.), Narodowy korpus j\u0119zyka polskiego: praca zbiorowa (pp. 25\u201336). Warszawa: Wydawnictwo Naukowe PWN."},{"issue":"4","key":"9487_CR23","doi-asserted-by":"publisher","first-page":"430","DOI":"10.1037\/1082-989X.6.4.430","volume":"6","author":"JW Grice","year":"2001","unstructured":"Grice, J. W. (2001). Computing and evaluating factor scores. Psychological Methods,6(4), 430\u2013450.","journal-title":"Psychological Methods"},{"key":"9487_CR24","unstructured":"Herring, S. C. (2010). Computer-mediated conversation Part I: Introduction and overview. Language@ internet, 7(2). Retrieved March 18, 2020, from https:\/\/www.languageatinternet.org\/articles\/2010\/2801."},{"key":"9487_CR25","unstructured":"Hladk\u00e1, Z. (2002). BMK: Brno spoken corpus. Praha: Institute of the Czech National Corpus. FF UK. Retrieved March 18, 2020, from http:\/\/www.korpus.cz."},{"key":"9487_CR26","volume-title":"Stylistika mluven\u00e9 a psan\u00e9 \u010de\u0161tiny","author":"J Hoffmannov\u00e1","year":"2016","unstructured":"Hoffmannov\u00e1, J., Homol\u00e1\u010d, J., Chvalovsk\u00e1, E., J\u00edlkov\u00e1, L., Kaderka, P., Mare\u0161, P., et al. (2016). Stylistika mluven\u00e9 a psan\u00e9 \u010de\u0161tiny (1st ed.). Praha: Academia.","edition":"1"},{"key":"9487_CR27","unstructured":"Ide, N., Reppen, R., & Suderman, K. (2002). The American National Corpus: More Than the Web Can Provide. In Proceedings of the Third Language Resources and Evaluation Conference (LREC) (pp. 839\u2013844). Presented at the LREC 2002, Las Palmas, Canary Islands, Spain: Citeseer. Retrieved March 18, 2020, from http:\/\/www.lrec-conf.org\/proceedings\/lrec2002\/pdf\/303.pdf."},{"key":"9487_CR28","unstructured":"Jakub\u00ed\u010dek, M., Kilgarriff, A., Kov\u00e1\u0159, V., Rychl\u00fd, P., & Suchomel, V. (2013). The tenten corpus family. In 7th International Corpus Linguistics Conference CL (pp. 125\u2013127)."},{"key":"9487_CR29","unstructured":"Kaderka, P. (2012). Dialog: corpus of broadcasted Czech discussions. czech, Praha: \u00dastav pro jazyk \u010desk\u00fd, AV \u010cR. Retrieved March 18, 2020, from http:\/\/www.korpus.cz."},{"issue":"1","key":"9487_CR30","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1075\/ijcl.6.1.05kil","volume":"6","author":"A Kilgarriff","year":"2001","unstructured":"Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics,6(1), 97\u2013133.","journal-title":"International Journal of Corpus Linguistics"},{"key":"9487_CR31","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-642-32790-2_1","volume-title":"Text, speech and dialogue","author":"A Kilgarriff","year":"2012","unstructured":"Kilgarriff, A. (2012). Getting to know your corpus. In P. Sojka, A. Hor\u00e1k, I. Kope\u010dek, & K. Pala (Eds.), Text, speech and dialogue (pp. 3\u201315). Berlin: Springer."},{"key":"9487_CR32","unstructured":"Kilgarriff, A., Reddy, S., Pomik\u00e1lek, J., & Avinesh, P. V. S. (2010). A corpus factory for many languages. In Proceedings of the International Conference on Language Resources and Evaluation, LREC 2010, 17\u201323 May 2010, Valletta, Malta (pp. 17\u201323). Valleta, Malta. Retrieved March 18, 2020, from http:\/\/www.lrec-conf.org\/proceedings\/lrec2010\/summaries\/79.html."},{"key":"9487_CR33","unstructured":"K\u0159en, M., Cvr\u010dek, V., \u010capka, T., \u010cerm\u00e1kov\u00e1, A., Hn\u00e1tkov\u00e1, M., Jel\u00ednek, T., et al. (2016). SYN2015: Representative corpus of contemporary written Czech. In Proceedings of the Tenth International Conference on Language Resources and Evaluation (pp. 2522\u20132528). Presented at the LREC\u201916, Portoro\u017e: ELRA."},{"key":"9487_CR34","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1163\/9789401203791_009","volume-title":"Corpus Linguistics and the Web","author":"G Leech","year":"2007","unstructured":"Leech, G. (2007). New resources, or just better old ones? The Holy Grail of representativeness. In M. Hundt, N. Nesselhauf, & C. Biewer (Eds.), Corpus Linguistics and the Web (pp. 133\u2013149). Amsterdam: Rodopi."},{"key":"9487_CR35","unstructured":"Michelfeit, J., Pomik\u00e1lek, J., & Suchomel, V. (2014). Text tokenisation using unitok. In 8th Workshop on Recent Advances in Slavonic Natural Language Processing, Brno, Tribun EU (pp. 71\u201375). Presented at the RASLAN 2014, Brno: NLP Consulting."},{"key":"9487_CR36","unstructured":"Piperski, A. (2017). Sum of Minimum Frequencies as a Measure of Corpus Similarity. Presented at the Corpus Linguistics 2017, Birmingham. Retrieved March 18, 2020, from https:\/\/www.birmingham.ac.uk\/Documents\/college-artslaw\/corpus\/conference-archives\/2017\/general\/paper143.pdf."},{"key":"9487_CR37","unstructured":"Piperski, A. (2018). Corpus size and the robustness of measures of corpus distance. In Computational Linguistics and Intellectual Technologies (pp. 590\u2013600). Presented at the Dialogue 2018, Moscow. http:\/\/www.dialog-21.ru\/media\/4327\/piperskiach.pdf."},{"key":"9487_CR38","unstructured":"Pomik\u00e1lek, J. (2011). Removing boilerplate and duplicate content from web corpora (PhD Thesis). Masarykova univerzita, Fakulta informatiky, Brno. Retrieved March 18, 2020, from https:\/\/is.muni.cz\/th\/o6om2\/phdthesis.pdf."},{"key":"9487_CR39","unstructured":"R Core Team. (2018). R: A language and environment for statistical computing. Vienna: R Foundation for Statistical Computing. Retrieved March 18, 2020, from https:\/\/www.R-project.org\/."},{"key":"9487_CR40","doi-asserted-by":"publisher","unstructured":"Rayson, P., & Garside, R. (2000). Comparing corpora using frequency profiling. In Proceedings of the Workshop on Comparing Corpora\u2014Volume 9 (pp. 1\u20136). Stroudsburg, PA, USA: Association for Computational Linguistics. https:\/\/doi.org\/10.3115\/1117729.1117730.","DOI":"10.3115\/1117729.1117730"},{"key":"9487_CR41","unstructured":"Revelle, W. (2018). psych: Procedures for Psychological, Psychometric, and Personality Research. Evanston, IL: Northwestern University. Retrieved March 18, 2020, from https:\/\/CRAN.R-project.org\/package=psych."},{"issue":"1","key":"9487_CR42","doi-asserted-by":"publisher","first-page":"65","DOI":"10.3366\/cor.2018.0136","volume":"13","author":"S Sharoff","year":"2018","unstructured":"Sharoff, S. (2018). Functional text dimensions for the annotation of web corpora. Corpora,13(1), 65\u201395. https:\/\/doi.org\/10.3366\/cor.2018.0136.","journal-title":"Corpora"},{"key":"9487_CR43","unstructured":"Suchomel, V., & Pomik\u00e1lek, J. (2012). Efficient web crawling for large text corpora. In Proceedings of the seventh Web as Corpus Workshop (WAC7) (pp. 39\u201343). Lyon."},{"key":"9487_CR44","unstructured":"V\u00e1lkov\u00e1, L., Waclawicov\u00e1, M., & K\u0159en, M. (2012). Balanced data repository of spontaneous spoken Czech. In Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC\u201912) (pp. 3345\u20133349). Presented at the LREC\u201912, Istanbul: ELRA. Retrieved March 18, 2020, from http:\/\/www.lrec-conf.org\/proceedings\/lrec2012\/pdf\/179_Paper.pdf."},{"issue":"1","key":"9487_CR45","first-page":"127","volume":"10","author":"AJ Zasina","year":"2019","unstructured":"Zasina, A. J., & Komrskov\u00e1, Z. (2019). Koditex \u2014 korpus diverzifikovan\u00fdch text\u016f. Studie z aplikovan\u00e9 lingvistiky - Studies in Applied Linguistics,10(1), 127\u2013132.","journal-title":"Studie z aplikovan\u00e9 lingvistiky - Studies in Applied Linguistics"},{"key":"9487_CR46","unstructured":"Zasina, A. J., Luke\u0161, D., Komrskov\u00e1, Z., Poukarov\u00e1, P., & \u0158eho\u0159kov\u00e1, A. (2018). Koditex: corpus of diversified texts. Czech, Prague: Institute of the Czech National Corpus. FF UK. Retrieved November 26, 2018, from http:\/\/www.korpus.cz."}],"container-title":["Language Resources and Evaluation"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10579-020-09487-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10579-020-09487-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10579-020-09487-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,3,19]],"date-time":"2021-03-19T00:44:58Z","timestamp":1616114698000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10579-020-09487-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3,19]]},"references-count":46,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2020,9]]}},"alternative-id":["9487"],"URL":"https:\/\/doi.org\/10.1007\/s10579-020-09487-4","relation":{},"ISSN":["1574-020X","1574-0218"],"issn-type":[{"value":"1574-020X","type":"print"},{"value":"1574-0218","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3,19]]},"assertion":[{"value":"19 March 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with ethical standards"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}