{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T02:51:22Z","timestamp":1768791082460,"version":"3.49.0"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031705625","type":"print"},{"value":"9783031705632","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70563-2_5","type":"book-chapter","created":{"date-parts":[[2024,8,31]],"date-time":"2024-08-31T22:29:51Z","timestamp":1725143391000},"page":"55-70","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["The Aranea Corpora Family: Ten+ Years of\u00a0Processing Web-Crawled Data"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4600-5515","authenticated-orcid":false,"given":"Vladim\u00edr","family":"Benko","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,1]]},"reference":[{"key":"5_CR1","unstructured":"Abadji, J., Suarez, P.O., Romary, L., Sagot, B.: Towards a cleaner document-oriented multilingual crawled corpus. In: Thirteenth Language Resources and Evaluation Conference-LREC 2022 (2022)"},{"key":"5_CR2","unstructured":"Baroni, M., Bernardini, S.: BootCaT: bootstrapping corpora and terms from the web. In: LREC, pp. 1313\u20131316 (2004)"},{"key":"5_CR3","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1007\/s10579-009-9081-4","volume":"43","author":"M Baroni","year":"2009","unstructured":"Baroni, M., Bernardini, S., Ferraresi, A., Zanchetta, E.: The WaCky wide web: a collection of very large linguistically processed web-crawled corpora. Lang. Resour. Eval. 43, 209\u2013226 (2009)","journal-title":"Lang. Resour. Eval."},{"key":"5_CR4","unstructured":"Benko, V.: Data Deduplication in Slovak Corpora. In: Slovko 2013: Natural Language Processing, Corpus Linguistics, E-learning, pp. 27\u201339. RAM-Verlag: L\u00fcdenscheid (2013)"},{"key":"5_CR5","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1007\/978-3-319-10816-2_31","volume-title":"Text, Speech and Dialogue","author":"V Benko","year":"2014","unstructured":"Benko, V.: Aranea: yet\u00a0another\u00a0family\u00a0of\u00a0(comparable)\u00a0web\u00a0corpora. In: Sojka, P., Hor\u00e1k, A., Kope\u010dek, I., Pala, K. (eds.) TSD 2014. LNCS (LNAI), vol. 8655, pp. 247\u2013256. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10816-2_31"},{"key":"5_CR6","unstructured":"Benko, V.: Aranea Go Middle East: Persicum. In: RASLAN, pp. 113\u2013121 (2022)"},{"key":"5_CR7","unstructured":"Benko, V., Kunilovskaya, M.: Comparable web-crawled corpora as a resource for contrastive studies. In: Accepted for presentation at the Between Languages: Methods of Contrastive Research Using Corpora Workshop, Biennial of Czech Linguistics (2024)"},{"key":"5_CR8","unstructured":"Benko, V., Zakharov, V.: Very Large Russian Corpora: New Opportunities and New Challenges. In: Kompjuternaja lingvistika i intellektual\u2019nyje technologii, pp. 83\u201398 (2016)"},{"key":"5_CR9","unstructured":"Davies, M.: The best of both worlds: multi-billion word \u201cdynamic\u201d corpora. In: Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-7), pp. 23\u201328 (2019)"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Hal\u00e1csy, P., Kornai, A., Oravecz, C.: HunPos \u2013 an open source trigram tagger. In: Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions, pp. 209\u2013212 (2007)","DOI":"10.3115\/1557769.1557830"},{"key":"5_CR11","first-page":"125","volume":"2013","author":"M Jakub\u00ed\u010dek","year":"2013","unstructured":"Jakub\u00ed\u010dek, M., Kilgarriff, A., Kov\u00e1\u0159, V., Rychl\u00fd, P., Suchomel, V.: The TenTen corpus family. Corpus. Linguistics 2013, 125 (2013)","journal-title":"Linguistics"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Jongejan, B., Dalianis, H.: Automatic training of lemmatization rules that handle morphological changes in pre-, in-and suffixes alike. In: Proceedings of the Joint Conference of the 47th Annual Meeting of the ACL and the 4th International Joint Conference on Natural Language Processing of the AFNLP, pp. 145\u2013153 (2009)","DOI":"10.3115\/1687878.1687900"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Khanna, T., et al.: Recent advances in Apertium, a free\/open-source rule-based machine translation platform for low-resource languages. Mach. Translation 35(4), 475\u2013502 (2021)","DOI":"10.1007\/s10590-021-09260-6"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Ljube\u0161i\u0107, N., Klubi\u010dka, F.: bs, hr, srWaC \u2013 Web corpora of Bosnian, Croatian and Serbian. In: Proceedings of the 9th web as corpus workshop (WaC-9), pp. 29\u201335 (2014)","DOI":"10.3115\/v1\/W14-0405"},{"key":"5_CR15","unstructured":"McDonald, R., et\u00a0al.: Universal dependency annotation for multilingual parsing. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 92\u201397 (2013)"},{"key":"5_CR16","unstructured":"Michelfeit, J., Pomik\u00e1lek, J., Suchomel, V.: Text Tokenisation Using unitok. In: RASLAN, pp. 71\u201375 (2014)"},{"key":"5_CR17","unstructured":"Paikens, P.: Deep neural learning approaches for Latvian morphological tagging. In: Human Language Technologies\u2013The Baltic Perspective, pp. 160\u2013166. IOS Press (2016)"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Patel, J.M., Patel, J.M.: Introduction to common crawl datasets. Getting structured data from the internet: running web crawlers\/scrapers on a big data production scale, pp. 277\u2013324 (2020)","DOI":"10.1007\/978-1-4842-6576-5_6"},{"key":"5_CR19","unstructured":"Pomik\u00e1lek, J.: Removing Boilerplate and Duplicate Content from Web Corpora. Ph.D. thesis, Masaryk University, Faculty of Informatics, Brno, Czech Republic (2011)"},{"key":"5_CR20","unstructured":"Rabin, M.O.: Fingerprinting by random polynomials. Technical report (1981)"},{"key":"5_CR21","unstructured":"Rychl\u00fd, P.: Manatee\/Bonito \u2013 A Modular Corpus Manager. In: Recent Advances in Slavonic Natural Language Processing (RASLAN 2007), pp. 65\u201370 (2007)"},{"key":"5_CR22","unstructured":"Sch\u00e4fer, R., Bildhauer, F., et\u00a0al.: Building large corpora from the web using a new efficient tool chain. In: Lrec, pp. 486\u2013493 (2012)"},{"key":"5_CR23","unstructured":"Schmid, H.: Probabilistic part-of-speech tagging using decision trees. In: New Methods in Language Processing, pp. 154\u2013164. Routledge (2013)"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Spoustov\u00e1, D., Haji\u010d, J., Raab, J., Spousta, M.: Semi-supervised training for the averaged perceptron POS tagger. In: Proceedings of the 12th Conference of the European Chapter of the ACL (EACL 2009), pp. 763\u2013771 (2009)","DOI":"10.3115\/1609067.1609152"},{"key":"5_CR25","unstructured":"Straka, M., Haji\u010d, J., Strakov\u00e1, J.: UDPipe: trainable pipeline for processing CoNLL-U files performing tokenization, morphological analysis, pos tagging and parsing. In: Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC\u201916), pp. 4290\u20134297 (2016)"},{"key":"5_CR26","unstructured":"Suchomel, V.: Discriminating between similar languages using large web corpora. In: Recent Advances in Slavonic Natural Language Processing (RASLAN 2019), p.\u00a0129 (2019)"},{"key":"5_CR27","unstructured":"Suchomel, V., Pomik\u00e1lek, J., et\u00a0al.: Efficient web crawling for large text corpora. In: Proceedings of the seventh Web as Corpus Workshop (WAC7), pp. 39\u201343 (2012)"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70563-2_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,31]],"date-time":"2024-08-31T22:33:22Z","timestamp":1725143602000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70563-2_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705625","9783031705632"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70563-2_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"1 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The author has no competing interests to declare relevant to this article\u2019s content.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brno","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Czech Republic","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.tsdconference.org\/tsd2024\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}