{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T11:50:55Z","timestamp":1769773855863,"version":"3.49.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032159830","type":"print"},{"value":"9783032159847","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-15984-7_8","type":"book-chapter","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T20:34:09Z","timestamp":1769718849000},"page":"107-120","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Is Machine-Translation Enough? Understanding Impacts in\u00a0LLM Benchmarking"],"prefix":"10.1007","author":[{"given":"Ta\u00edgo \u00cdtalo","family":"de Moraes Pedrosa","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Evandro","family":"de Barros Costa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rob\u00e9rio Jos\u00e9 Rog\u00e9rio","family":"dos Santos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,30]]},"reference":[{"key":"8_CR1","unstructured":"Grattafiori, A., et. al.: The llama 3 herd of models (2024). https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"8_CR2","unstructured":"Abonizio, H., et al.: Sabi\u00e1-3 Technical Report (2025). https:\/\/arxiv.org\/abs\/2410.12049"},{"key":"8_CR3","doi-asserted-by":"publisher","unstructured":"Almeida, T.S., Laitz, T., Bon\u00e1s, G.K., Nogueira, R.: Bluex: A benchmark based onnbsp; brazilian leading universities entrance exams. In: Intelligent Systems: 12th Brazilian Conference, BRACIS 2023, Belo Horizonte, Brazil, September 25\u201329, 2023, Proceedings, Part I. p. 337\u2013347. Springer-Verlag, Berlin, Heidelberg (2023). https:\/\/doi.org\/10.1007\/978-3-031-45368-7_22","DOI":"10.1007\/978-3-031-45368-7_22"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Balloccu, S., Schmidtov\u00e1, P., Lango, M., Dusek, O.: Leak, cheat, repeat: data contamination and evaluation malpractices in closed-source LLMs. In: Graham, Y., Purver, M. (eds.) Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers). pp. 67\u201393. Association for Computational Linguistics, St. Julian\u2019s, Malta (2024). https:\/\/aclanthology.org\/2024.eacl-long.5\/","DOI":"10.18653\/v1\/2024.eacl-long.5"},{"key":"8_CR5","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems. vol.\u00a033, pp. 1877\u20131901. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"8_CR6","unstructured":"Brum, H., das Gra\u00e7as Volpe\u00a0Nunes, M.: Building a Sentiment Corpus of Tweets in Brazilian Portuguese. In: chair), N.C.C., et al., (eds.) Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018). European Language Resources Association (ELRA), Miyazaki, Japan (2018)"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Chang, Y., et al.: A survey on evaluation of large language models. ACM Trans. Intell. Syst. Technol. 15(3) (2024). DOIurlhttps:\/\/doi.org\/10.1145\/3641289","DOI":"10.1145\/3641289"},{"key":"8_CR8","unstructured":"Dac\u00a0Lai, V., Van\u00a0Nguyen, C., Ngo, N.T., Nguyen, T., Dernoncourt, F., Rossi, R.A., Nguyen, T.H.: Evaluation framework for multilingual large language models. https:\/\/github.com\/nlp-uoregon\/mlmm-evaluation (2023)"},{"key":"8_CR9","unstructured":"Delfino, P., Cuconato, B., Haeusler, E.H., Rademaker, A.: In: Passing the Brazilian Oab Exam: Data Preparation and Some Experiments, vol. 302, p. 89. IOS Press (2017)"},{"key":"8_CR10","doi-asserted-by":"publisher","unstructured":"Fenogenova, A., et al.: MERA: A comprehensive LLM evaluation in Russian. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 9920\u20139948. Association for Computational Linguistics, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.534","DOI":"10.18653\/v1\/2024.acl-long.534"},{"key":"8_CR11","doi-asserted-by":"publisher","unstructured":"Fortuna, P., Rocha\u00a0da Silva, J., Soler-Company, J., Wanner, L., Nunes, S.: A hierarchically-labeled Portuguese hate speech dataset. In: Proceedings of the 3rd Workshop on Abusive Language Online (ALW3), pp. 94\u2013104. Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/v1\/W19-3510","DOI":"10.18653\/v1\/W19-3510"},{"key":"8_CR12","unstructured":"Garcia, E.A.S.: Open Portuguese LLM leaderboard (2024). https:\/\/huggingface.co\/spaces\/eduagarcia\/open_pt_llm_leaderboard"},{"key":"8_CR13","unstructured":"Gemma Team et. al.: Gemma 3 Technical Report (2025). https:\/\/arxiv.org\/abs\/2503.19786"},{"key":"8_CR14","unstructured":"He, J., Rungta, M., Koleczek, D., Sekhon, A., Wang, F.X., Hasan, S.: Does prompt formatting have any impact on LLM performance? (2024). https:\/\/arxiv.org\/abs\/2411.10541"},{"key":"8_CR15","unstructured":"Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., Steinhardt, J.: Measuring massive multitask language understanding. Presented at the (2021)"},{"key":"8_CR16","doi-asserted-by":"publisher","unstructured":"Huang, Y., Li, B., Feng, X., Huo, W., Fu, C., Liu, T., Qin, B.: Aligning translation-specific understanding to general understanding in large language models. In: Al-Onaizan, Y., Bansal, M., Chen, Y.N. (eds.) Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing. pp. 5028\u20135041. Association for Computational Linguistics, Miami, Florida, USA (2024). https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.289","DOI":"10.18653\/v1\/2024.emnlp-main.289"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Kapoor, S., Narayanan, A.: Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9), 100804 (2023)","DOI":"10.1016\/j.patter.2023.100804"},{"key":"8_CR18","doi-asserted-by":"publisher","unstructured":"Lai, V., et al.: Okapi: Instruction-tuned large language models in multiple languages with reinforcement learning from human feedback. In: Feng, Y., Lefever, E. (eds.) Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 318\u2013327. Association for Computational Linguistics, Singapore (2023). https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-demo.28","DOI":"10.18653\/v1\/2023.emnlp-demo.28"},{"key":"8_CR19","doi-asserted-by":"publisher","unstructured":"Li, H., et al.: CMMLU: Measuring massive multitask language understanding in Chinese. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Findings of the Association for Computational Linguistics: ACL 2024, pp. 11260\u201311285. Association for Computational Linguistics, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.findings-acl.671","DOI":"10.18653\/v1\/2024.findings-acl.671"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Liu, C., Zhang, W., Zhao, Y., Luu, A.T., Bing, L.: Is translation all you need? a study on solving multilingual tasks with large language models. In: Chiruzzo, L., Ritter, A., Wang, L. (eds.) Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 9594\u20139614. Association for Computational Linguistics, Albuquerque, New Mexico (2025). https:\/\/aclanthology.org\/2025.naacl-long.485\/","DOI":"10.18653\/v1\/2025.naacl-long.485"},{"key":"8_CR21","unstructured":"Llama 3 Team: The llama 3.1 EVALS collection (2024). https:\/\/huggingface.co\/collections\/meta-llama\/llama-31-evals-66a2c5a14c2093e58298ac7f"},{"key":"8_CR22","unstructured":"Nguyen, T., et al.: CulturaX: A cleaned, enormous, and multilingual dataset for large language models in 167 languages. In: Calzolari, N., Kan, M.Y., Hoste, V., Lenci, A., Sakti, S., Xue, N. (eds.) Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). pp. 4226\u20134237. ELRA and ICCL, Torino, Italia (2024). https:\/\/aclanthology.org\/2024.lrec-main.377\/"},{"key":"8_CR23","unstructured":"OpenAI: Multilingual massive multitask language understanding (mmmlu) (2024). https:\/\/huggingface.co\/datasets\/openai\/MMMLU"},{"key":"8_CR24","unstructured":"Pires, R., Almeida, T.S., Abonizio, H., Nogueira, R.: Evaluating GPT-4\u2019s vision capabilities on Brazilian university admission exams (2023)"},{"key":"8_CR25","unstructured":"Qwen et.. al.: Qwen2.5 Technical Report (2025). https:\/\/arxiv.org\/abs\/2412.15115"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Razavi, A., Soltangheis, M., Arabzadeh, N., Salamat, S., Zihayat, M., Bagheri, E.: Benchmarking prompt sensitivity in large language models. In: Hauff, C., et al., (eds.) Advances in Information Retrieval, pp. 303\u2013313. Springer Nature Switzerland, Cham (2025)","DOI":"10.1007\/978-3-031-88714-7_29"},{"key":"8_CR27","unstructured":"Rein, D., Hou, B.L., Stickland, A.C., Petty, J., Pang, R.Y., Dirani, J., Michael, J., Bowman, S.R.: GPQA: A graduate-level google-proof q &a benchmark. Presented at the (2024)"},{"key":"8_CR28","doi-asserted-by":"publisher","unstructured":"Sayama, H.F., Araujo, A.V., Fernandes, E.R.: Faquad,: Reading comprehension dataset in the domain of Brazilian higher education. Presented at the (2019). https:\/\/doi.org\/10.1109\/BRACIS.2019.00084","DOI":"10.1109\/BRACIS.2019.00084"},{"key":"8_CR29","unstructured":"Singh, S., et al.: Global mmlu: Understanding and addressing cultural and linguistic biases in multilingual evaluation (2025). https:\/\/arxiv.org\/abs\/2412.03304"},{"key":"8_CR30","unstructured":"Vargas, F., Carvalho, I., Rodrigues\u00a0de G\u00f3es, F., Pardo, T., Benevenuto, F.: HateBR: A large expert annotated corpus of Brazilian Instagram comments for offensive language and hate speech detection. In: Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 7174\u20137183. European Language Resources Association, Marseille, France (2022). https:\/\/aclanthology.org\/2022.lrec-1.777"},{"key":"8_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al., (eds.) Advances in Neural Information Processing Systems. vol.\u00a030. Curran Associates, Inc. (2017). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Mmlu-pro: A more robust and challenging multi-task language understanding benchmark. In: Globerson, A., et al.,. (eds.) Advances in Neural Information Processing Systems. vol.\u00a037, pp. 95266\u201395290. Curran Associates, Inc. (2024). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2024\/file\/ad236edc564f3e3156e1b2feafb99a24-Paper-Datasets_and_Benchmarks_Track.pdf","DOI":"10.52202\/079017-3018"},{"key":"8_CR33","unstructured":"Xu, H., Murray, K., Koehn, P., Hoang, H., Eriguchi, A., Khayrallah, H.: X-ALMA: Plug & play modules and adaptive rejection for quality translation at scale. In: The Thirteenth International Conference on Learning Representations (2025). https:\/\/openreview.net\/forum?id=csbf1p8xUq"},{"key":"8_CR34","unstructured":"Zhou, W., Wang, Q., Xu, M., Chen, M., Duan, X.: Revisiting the self-consistency challenges in multi-choice question formats for large language model evaluation. In: Calzolari, N., Kan, M.Y., Hoste, V., Lenci, A., Sakti, S., Xue, N. (eds.) Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pp. 14103\u201314110. ELRA and ICCL, Torino, Italia (2024). https:\/\/aclanthology.org\/2024.lrec-main.1229\/"}],"container-title":["Lecture Notes in Computer Science","Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-15984-7_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T20:34:18Z","timestamp":1769718858000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-15984-7_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032159830","9783032159847"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-15984-7_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"30 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"BRACIS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brazilian Conference on Intelligent Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Fortaleza-CE","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brazil","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"bracis2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/bracis.sbc.org.br\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}