{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T14:41:03Z","timestamp":1781880063279,"version":"3.54.5"},"reference-count":59,"publisher":"MIT Press","license":[{"start":{"date-parts":[[2022,2,8]],"date-time":"2022-02-08T00:00:00Z","timestamp":1644278400000},"content-version":"vor","delay-in-days":38,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["direct.mit.edu"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,1,31]]},"abstract":"<jats:title>Abstract<\/jats:title><jats:p>With the success of large-scale pre-training and multilingual modeling in Natural Language Processing (NLP), recent years have seen a proliferation of large, Web-mined text datasets covering hundreds of languages. We manually audit the quality of 205 language-specific corpora released with five major public datasets (CCAligned, ParaCrawl, WikiMatrix, OSCAR, mC4). Lower-resource corpora have systematic issues: At least 15 corpora have no usable text, and a significant fraction contains less than 50% sentences of acceptable quality. In addition, many are mislabeled or use nonstandard\/ambiguous language codes. We demonstrate that these issues are easy to detect even for non-proficient speakers, and supplement the human audit with automatic analyses. Finally, we recommend techniques to evaluate and improve multilingual corpora and discuss potential risks that come with low-quality data releases.<\/jats:p>","DOI":"10.1162\/tacl_a_00447","type":"journal-article","created":{"date-parts":[[2022,2,8]],"date-time":"2022-02-08T14:56:38Z","timestamp":1644332198000},"page":"50-72","update-policy":"https:\/\/doi.org\/10.1162\/mitpressjournals.corrections.policy","source":"Crossref","is-referenced-by-count":67,"title":["Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets"],"prefix":"10.1162","volume":"10","author":[{"given":"Julia","family":"Kreutzer","sequence":"first","affiliation":[{"name":"Google Research, Canada"},{"name":"Masakhane NLP, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Isaac","family":"Caswell","sequence":"additional","affiliation":[{"name":"Google Research, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lisa","family":"Wang","sequence":"additional","affiliation":[{"name":"Google Research, USA"},{"name":"Google Research, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ahsan","family":"Wahab","sequence":"additional","affiliation":[{"name":"Turkic Interlingua"},{"name":"University of South Florida, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Daan","family":"van Esch","sequence":"additional","affiliation":[{"name":"Google Research, The Netherlands"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nasanbayar","family":"Ulzii-Orshikh","sequence":"additional","affiliation":[{"name":"Haverford College, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Allahsera","family":"Tapo","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Mali"},{"name":"RobotsMali, Mali"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nishant","family":"Subramani","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"},{"name":"Allen Institute for Artificial Intelligence, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Artem","family":"Sokolov","sequence":"additional","affiliation":[{"name":"Google Research, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Claytone","family":"Sikasote","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Zambia"},{"name":"University of Zambia, Zambia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Monang","family":"Setyawan","sequence":"additional","affiliation":[{"name":"Google, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Supheakmungkol","family":"Sarin","sequence":"additional","affiliation":[{"name":"Google, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sokhar","family":"Samb","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Senegal"},{"name":"AIMS-AMMI, Senegal"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Beno\u00eet","family":"Sagot","sequence":"additional","affiliation":[{"name":"Inria, France"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Clara","family":"Rivera","sequence":"additional","affiliation":[{"name":"Google Research, UK"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Annette","family":"Rios","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Isabel","family":"Papadimitriou","sequence":"additional","affiliation":[{"name":"Stanford University, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Salomey","family":"Osei","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Ghana"},{"name":"Kwame Nkrumah University of Science and Technology, Ghana"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pedro Ortiz","family":"Suarez","sequence":"additional","affiliation":[{"name":"Inria, France"},{"name":"Sorbonne Universit\u00e9, France"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Iroro","family":"Orife","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"},{"name":"Niger-Volta LTI, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kelechi","family":"Ogueji","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"},{"name":"University of Waterloo, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andre Niyongabo","family":"Rubungo","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Spain"},{"name":"Universitat Polit\u00e8cnica de Catalunya, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Toan Q.","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Notre Dame, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mathias","family":"M\u00fcller","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andr\u00e9","family":"M\u00fcller","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shamsuddeen Hassan","family":"Muhammad","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Nigeria"},{"name":"Bayero University Kano, Nigeria"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nanda","family":"Muhammad","sequence":"additional","affiliation":[{"name":"Bayero University Kano, Nigeria"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ayanda","family":"Mnyakeni","sequence":"additional","affiliation":[{"name":"Google, South Africa"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jamshidbek","family":"Mirzakhalov","sequence":"additional","affiliation":[{"name":"Turkic Interlingua"},{"name":"University of South Florida, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tapiwanashe","family":"Matangira","sequence":"additional","affiliation":[{"name":"Google, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Colin","family":"Leong","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nze","family":"Lawson","sequence":"additional","affiliation":[{"name":"Google, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sneha","family":"Kudugunta","sequence":"additional","affiliation":[{"name":"Google Research, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yacine","family":"Jernite","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"},{"name":"Hugging Face, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mathias","family":"Jenny","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Orhan","family":"Firat","sequence":"additional","affiliation":[{"name":"Google Research, USA"},{"name":"Turkic Interlingua"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bonaventure F. P.","family":"Dossou","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Germany"},{"name":"Jacobs University Bremen, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sakhile","family":"Dlamini","sequence":"additional","affiliation":[{"name":"Google, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nisansa","family":"de Silva","sequence":"additional","affiliation":[{"name":"University of Moratuwa, Sri Lanka"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sakine","family":"\u00c7abuk Ball\u0131","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Stella","family":"Biderman","sequence":"additional","affiliation":[{"name":"EleutherAI, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Alessia","family":"Battisti","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ahmed","family":"Baruwa","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"},{"name":"Obafemi Awolowo University, Nigeria"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ankur","family":"Bapna","sequence":"additional","affiliation":[{"name":"Google Research, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pallavi","family":"Baljekar","sequence":"additional","affiliation":[{"name":"Google Research, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Israel Abebe","family":"Azime","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Ethiopia"},{"name":"AIMS-AMMI, Ethiopia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ayodele","family":"Awokoya","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Nigeria"},{"name":"University of Ibadan, Nigeria"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Duygu","family":"Ataman","sequence":"additional","affiliation":[{"name":"University of Zurich, Switzerland"},{"name":"Turkic Interlingua, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Orevaoghene","family":"Ahia","sequence":"additional","affiliation":[{"name":"Masakhane NLP, USA"},{"name":"Instadeep, Nigeria"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Oghenefego","family":"Ahia","sequence":"additional","affiliation":[{"name":"Google, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sweta","family":"Agrawal","sequence":"additional","affiliation":[{"name":"University of Maryland, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mofetoluwa","family":"Adeyemi","sequence":"additional","affiliation":[{"name":"Masakhane NLP, Nigeria"},{"name":"Defence Space Administration Abuja, Nigeria"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"281","published-online":{"date-parts":[[2022,1,31]]},"reference":[{"key":"2022033118510166500_bib1","doi-asserted-by":"publisher","first-page":"3204","DOI":"10.18653\/v1\/P19-1310","article-title":"JW300: A wide-coverage parallel corpus for low-resource languages","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","author":"Agi\u0107","year":"2019"},{"key":"2022033118510166500_bib2","first-page":"191","article-title":"AraELECTRA: Pre-training text discriminators for Arabic language understanding","volume-title":"Proceedings of the Sixth Arabic Natural Language Processing Workshop","author":"Antoun","year":"2021"},{"key":"2022033118510166500_bib3","article-title":"Massively multilingual neural machine translation in the wild: Findings and challenges","author":"Arivazhagan","year":"2019","journal-title":"arXiv preprint arXiv:1907.05019"},{"key":"2022033118510166500_bib4","doi-asserted-by":"publisher","first-page":"597","DOI":"10.1162\/tacl_a_00288","article-title":"Massively multilingual sentence embeddings for zero-shot cross-lingual transfer and beyond","volume":"7","author":"Artetxe","year":"2019","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2022033118510166500_bib5","first-page":"355","article-title":"Domain adaptation via pseudo in-domain data selection","volume-title":"Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing","author":"Axelrod","year":"2011"},{"key":"2022033118510166500_bib6","doi-asserted-by":"publisher","first-page":"4555","DOI":"10.18653\/v1\/2020.acl-main.417","article-title":"ParaCrawl: Web-scale acquisition of parallel corpora","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","author":"Ba\u00f1\u00f3n","year":"2020"},{"key":"2022033118510166500_bib7","doi-asserted-by":"publisher","first-page":"587","DOI":"10.1162\/tacl_a_00041","article-title":"Data statements for natural language processing: Toward mitigating system bias and enabling better science","volume":"6","author":"Bender","year":"2018","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2022033118510166500_bib8","doi-asserted-by":"publisher","first-page":"610","DOI":"10.1145\/3442188.3445922","article-title":"On the dangers of stochastic parrots: Can language models be too big?","volume-title":"Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency","author":"Bender","year":"2021"},{"key":"2022033118510166500_bib9","article-title":"Pitfalls in machine learning research: Reexamining the development cycle","author":"Biderman","year":"2020","journal-title":"arXiv preprint arXiv:2011.02832"},{"key":"2022033118510166500_bib10","doi-asserted-by":"publisher","first-page":"1536","DOI":"10.1109\/WACV48630.2021.00158","article-title":"Large image datasets: A pyrrhic win for computer vision?","volume-title":"2021 IEEE Winter Conference on Applications of Computer Vision (WACV)","author":"Birhane","year":"2021"},{"key":"2022033118510166500_bib11","doi-asserted-by":"publisher","first-page":"2879","DOI":"10.18653\/v1\/D17-1309","article-title":"Natural language processing with small feed-forward networks","volume-title":"Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing","author":"Botha","year":"2017"},{"key":"2022033118510166500_bib12","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in Neural Information Processing Systems","author":"Brown","year":"2020"},{"key":"2022033118510166500_bib13","doi-asserted-by":"publisher","first-page":"6588","DOI":"10.18653\/v1\/2020.coling-main.579","article-title":"Language ID in the wild: Unexpected challenges on the path to a thousand-language web text corpus","volume-title":"Proceedings of the 28th International Conference on Computational Linguistics","author":"Caswell","year":"2020"},{"key":"2022033118510166500_bib14","doi-asserted-by":"publisher","first-page":"6788","DOI":"10.18653\/v1\/2020.coling-main.598","article-title":"German\u2019s next language model","volume-title":"Proceedings of the 28th International Conference on Computational Linguistics","author":"Chan","year":"2020"},{"key":"2022033118510166500_bib15","article-title":"HeBERT & HebEMO: A Hebrew BERT Model and a Tool for Polarity Analysis and Emotion Recognition","author":"Chriqui","year":"2021","journal-title":"arXiv preprint arXiv:2102.01909"},{"key":"2022033118510166500_bib16","doi-asserted-by":"publisher","first-page":"8440","DOI":"10.18653\/v1\/2020.acl-main.747","article-title":"Unsupervised cross-lingual representation learning at scale","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","author":"Conneau","year":"2020"},{"key":"2022033118510166500_bib17","doi-asserted-by":"publisher","first-page":"2475","DOI":"10.18653\/v1\/D18-1269","article-title":"XNLI: Evaluating cross-lingual sentence representations","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","author":"Conneau","year":"2018"},{"key":"2022033118510166500_bib18","doi-asserted-by":"publisher","first-page":"3255","DOI":"10.18653\/v1\/2020.findings-emnlp.292","article-title":"RobBERT: a Dutch RoBERTa-based Language Model","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","author":"Delobelle","year":"2020"},{"key":"2022033118510166500_bib19","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin","year":"2019"},{"key":"2022033118510166500_bib20","article-title":"Documenting the english colossal clean crawled corpus","author":"Dodge","year":"2021","journal-title":"arXiv preprint arXiv:2104.08758"},{"key":"2022033118510166500_bib21","doi-asserted-by":"publisher","first-page":"4324","DOI":"10.18653\/v1\/2020.findings-emnlp.387","article-title":"The birth of Romanian BERT","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","author":"Dumitrescu","year":"2020"},{"key":"2022033118510166500_bib22","doi-asserted-by":"publisher","first-page":"5960","DOI":"10.18653\/v1\/2020.emnlp-main.480","article-title":"CCAligned: A massive collection of cross-lingual web-document pairs","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"El-Kishky","year":"2020"},{"key":"2022033118510166500_bib23","first-page":"118","article-title":"ParaCrawl: Web-scale parallel corpora for the languages of the EU","volume-title":"Proceedings of Machine Translation Summit XVII: Translator, Project and User Tracks","author":"Espl\u00e0","year":"2019"},{"key":"2022033118510166500_bib24","article-title":"Beyond English-centric multilingual machine translation","author":"Fan","year":"2020","journal-title":"arXiv preprint arXiv:2010.11125"},{"key":"2022033118510166500_bib25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.195","article-title":"Participatory research for low-resourced machine translation: A case study in African languages","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","author":"Nekoto","year":"2020"},{"key":"2022033118510166500_bib26","article-title":"The pile: An 800gb dataset of diverse text for language modeling","author":"Gao","year":"2020","journal-title":"arXiv preprint arXiv:2101.00027"},{"key":"2022033118510166500_bib27","article-title":"Datasheets for datasets","author":"Gebru","year":"2018","journal-title":"arXiv preprint arXiv:1803.09010"},{"key":"2022033118510166500_bib28","article-title":"The FLORES-101 evaluation benchmark for low-resource and multilingual machine translation","author":"Goyal","year":"2021","journal-title":"arXiv preprint arXiv:2106.03193"},{"key":"2022033118510166500_bib29","article-title":"Learning word vectors for 157 languages","volume-title":"Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)","author":"Grave","year":"2018"},{"key":"2022033118510166500_bib30","article-title":"The dataset nutrition label: A framework to drive higher data quality standards","author":"Holland","year":"2018","journal-title":"arXiv preprint arXiv:1805.03677"},{"key":"2022033118510166500_bib31","first-page":"4411","article-title":"XTREME: A massively multilingual multi-task benchmark for evaluating cross-lingual generalisation","volume-title":"Proceedings of the 37th International Conference on Machine Learning","author":"Junjie","year":"2020"},{"key":"2022033118510166500_bib32","article-title":"Fasttext.zip: Compressing text classification models","author":"Joulin","year":"2016","journal-title":"arXiv preprint arXiv:1612.03651"},{"key":"2022033118510166500_bib33","doi-asserted-by":"publisher","first-page":"427","DOI":"10.18653\/v1\/E17-2068","article-title":"Bag of tricks for efficient text classification","volume-title":"Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers","author":"Joulin","year":"2017"},{"key":"2022033118510166500_bib34","doi-asserted-by":"publisher","first-page":"888","DOI":"10.18653\/v1\/W18-6478","article-title":"Dual conditional cross-entropy filtering of noisy parallel corpora","volume-title":"Proceedings of the Third Conference on Machine Translation: Shared Task Papers","author":"Junczys-Dowmunt","year":"2018"},{"key":"2022033118510166500_bib35","doi-asserted-by":"publisher","first-page":"225","DOI":"10.18653\/v1\/W19-5321","article-title":"Microsoft translator at WMT 2019: Towards large-scale document-level neural machine translation","volume-title":"Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)","author":"Junczys-Dowmunt","year":"2019"},{"key":"2022033118510166500_bib36","doi-asserted-by":"publisher","first-page":"4948","DOI":"10.18653\/v1\/2020.findings-emnlp.445","article-title":"IndicNLPSuite: Monolingual corpora, evaluation benchmarks and pre-trained multilingual language models for Indian languages","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","author":"Kakwani","year":"2020"},{"key":"2022033118510166500_bib37","first-page":"3145","article-title":"PanLex: Building a resource for panlingual lexical translation","volume-title":"Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC\u201914)","author":"Kamholz","year":"2014"},{"key":"2022033118510166500_bib38","doi-asserted-by":"publisher","first-page":"28","DOI":"10.18653\/v1\/W18-5505","article-title":"Information nutrition labels: A plugin for online news evaluation","volume-title":"Proceedings of the First Workshop on Fact Extraction and VERification (FEVER)","author":"Kevin","year":"2018"},{"key":"2022033118510166500_bib39","doi-asserted-by":"publisher","first-page":"74","DOI":"10.18653\/v1\/W18-2709","article-title":"On the impact of various types of noise on neural machine translation","volume-title":"Proceedings of the 2nd Workshop on Neural Machine Translation and Generation","author":"Khayrallah","year":"2018"},{"key":"2022033118510166500_bib40","first-page":"726","article-title":"Findings of the WMT 2020 shared task on parallel corpus filtering and alignment","volume-title":"Proceedings of the Fifth Conference on Machine Translation","author":"Koehn","year":"2020"},{"key":"2022033118510166500_bib41","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1145\/3411408.3411440","article-title":"Greek-bert: The greeks visiting sesame street","volume-title":"11th Hellenic Conference on Artificial Intelligence","author":"Koutsikakis","year":"2020"},{"key":"2022033118510166500_bib42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.24","article-title":"What\u2019s in the box? an analysis of undesirable content in the common crawl corpus","author":"Luccioni","year":"2021","journal-title":"arXiv preprint arXiv:2105.02732"},{"key":"2022033118510166500_bib43","doi-asserted-by":"publisher","first-page":"7203","DOI":"10.18653\/v1\/2020.acl-main.645","article-title":"CamemBERT: A tasty French language model","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","author":"Martin","year":"2020"},{"key":"2022033118510166500_bib44","doi-asserted-by":"publisher","first-page":"6626","DOI":"10.18653\/v1\/2020.coling-main.581","article-title":"RoBERT\u2014a Romanian BERT model","volume-title":"Proceedings of the 28th International Conference on Computational Linguistics","author":"Masala","year":"2020"},{"key":"2022033118510166500_bib45","first-page":"220","article-title":"Intelligent selection of language model training data","volume-title":"Proceedings of the ACL 2010 Conference Short Papers","author":"Moore","year":"2010"},{"key":"2022033118510166500_bib46","doi-asserted-by":"publisher","first-page":"1703","DOI":"10.18653\/v1\/2020.acl-main.156","article-title":"A monolingual approach to contextualized word embeddings for mid-resource languages","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","author":"Su\u00e1rez","year":"2020"},{"key":"2022033118510166500_bib47","first-page":"9","article-title":"Asynchronous pipelines for processing huge corpora on medium to low resource infrastructures","volume-title":"Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-7) 2019. Cardiff, 22nd July 2019","author":"Su\u00e1rez","year":"2019"},{"key":"2022033118510166500_bib48","doi-asserted-by":"crossref","unstructured":"Addison Phillips and MarkDavis. 2005. Tags for Identifying Languages. Internet Engineering Task Force. Work in Progress.","DOI":"10.17487\/rfc4646"},{"key":"2022033118510166500_bib49","doi-asserted-by":"publisher","first-page":"529","DOI":"10.18653\/v1\/N18-2084","article-title":"When and why are pre-trained word embeddings useful for neural machine translation?","volume-title":"Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)","author":"Ye","year":"2018"},{"key":"2022033118510166500_bib50","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"2022033118510166500_bib51","article-title":"MT detection in Web-scraped parallel corpora","volume-title":"Proceedings of MT Summit XIII","author":"Rarrick","year":"2011"},{"key":"2022033118510166500_bib52","doi-asserted-by":"publisher","first-page":"1351","DOI":"10.18653\/v1\/2021.eacl-main.115","article-title":"WikiMatrix: Mining 135M parallel sentences in 1620 language pairs from Wikipedia","volume-title":"Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume","author":"Schwenk","year":"2021"},{"key":"2022033118510166500_bib53","article-title":"AlephBERT:A Hebrew large pre-trained language model to start-off your Hebrew NLP application with","author":"Seker","year":"2021","journal-title":"arXiv preprint arXiv:2104.04052"},{"issue":"5","key":"2022033118510166500_bib54","doi-asserted-by":"publisher","first-page":"991","DOI":"10.1006\/ijhc.1999.0252","article-title":"Does automation bias decision-making?","volume":"51","author":"Skitka","year":"1999","journal-title":"International Journal of Human-Computer Studies"},{"key":"2022033118510166500_bib55","doi-asserted-by":"crossref","first-page":"2893","DOI":"10.1145\/3357384.3357853","article-title":"Mithralabel: Flexible dataset nutritional labels for responsible data science","volume-title":"Proceedings of the 28th ACM International Conference on Information and Knowledge Management","author":"Sun","year":"2019"},{"key":"2022033118510166500_bib56","doi-asserted-by":"publisher","first-page":"133","DOI":"10.18653\/v1\/W18-6314","article-title":"Denoising neural machine translation training with trusted data and online data selection","volume-title":"Proceedings of the Third Conference on Machine Translation: Research Papers","author":"Wang","year":"2018"},{"key":"2022033118510166500_bib57","first-page":"843","article-title":"IndoNLU: Benchmark and resources for evaluating Indonesian natural language understanding","volume-title":"Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing","author":"Wilie","year":"2020"},{"key":"2022033118510166500_bib58","first-page":"2945","article-title":"Zipporah: A fast and scalable data cleaning system for noisy Web-crawled parallel corpora","volume-title":"Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing","author":"Hainan","year":"2017"},{"key":"2022033118510166500_bib59","first-page":"483","article-title":"mT5: A massively multilingual pre-trained text-to-text transformer","volume-title":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Xue","year":"2021"}],"container-title":["Transactions of the Association for Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/direct.mit.edu\/tacl\/article-pdf\/doi\/10.1162\/tacl_a_00447\/1986585\/tacl_a_00447.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/direct.mit.edu\/tacl\/article-pdf\/doi\/10.1162\/tacl_a_00447\/1986585\/tacl_a_00447.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,26]],"date-time":"2023-01-26T08:15:21Z","timestamp":1674720921000},"score":1,"resource":{"primary":{"URL":"https:\/\/direct.mit.edu\/tacl\/article\/doi\/10.1162\/tacl_a_00447\/109285\/Quality-at-a-Glance-An-Audit-of-Web-Crawled"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1162\/tacl_a_00447","relation":{},"ISSN":["2307-387X"],"issn-type":[{"value":"2307-387X","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2022]]},"published":{"date-parts":[[2022]]}}}