{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T23:31:48Z","timestamp":1774481508131,"version":"3.50.1"},"reference-count":43,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10825576","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"5342-5351","source":"Crossref","is-referenced-by-count":6,"title":["Evaluation and Comparison of Open-Source LLMs Using Natural Language Generation Quality Metrics"],"prefix":"10.1109","author":[{"given":"Dzenan","family":"Hamzic","sequence":"first","affiliation":[{"name":"AIT Austrian Institute of Technology,Vienna,Austria"}]},{"given":"Markus","family":"Wurzenberger","sequence":"additional","affiliation":[{"name":"AIT Austrian Institute of Technology,Vienna,Austria"}]},{"given":"Florian","family":"Skopik","sequence":"additional","affiliation":[{"name":"AIT Austrian Institute of Technology,Vienna,Austria"}]},{"given":"Max","family":"Landauer","sequence":"additional","affiliation":[{"name":"AIT Austrian Institute of Technology,Vienna,Austria"}]},{"given":"Andreas","family":"Rauber","sequence":"additional","affiliation":[{"name":"TU Wien,Vienna,Austria"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A comprehensive overview of large language models","author":"Naveed","year":"2023"},{"key":"ref2","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"ref3","first-page":"140:1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2019","journal-title":"J. Mach. Learn. Res."},{"key":"ref4","first-page":"240:1","article-title":"PaLM: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref5","article-title":"Beyond the answers: Reviewing the rationality of multiple choice question answering for the evaluation of large language models","author":"Wang","year":"2024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.13715"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445922"},{"key":"ref9","article-title":"Ethical and social risks of harm from language models","author":"Weidinger","year":"2021"},{"key":"ref10","first-page":"447","article-title":"A survey of the state of explainable AI for natural language processing","volume-title":"Proc. 1st Conf. Asia-Pacific Chapter Assoc. Comput. Linguist. and 10th Int. Joint Conf. Natural Language Process","author":"Danilevsky"},{"key":"ref11","first-page":"9332","article-title":"Evaluating the factual consistency of abstractive text summarization","volume-title":"Proc. 2020 Conf. Empirical Methods in Natural Language Process. (EMNLP)","author":"Kryscinski"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.171"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.740"},{"key":"ref14","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","author":"Lewis","year":"2020"},{"key":"ref15","article-title":"Improving language models by retrieving from trillions of tokens","volume-title":"Proc. Int. Conf. Mach. Learn","author":"Borgeaud"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2021.naacl-main.200","article-title":"KILT: A benchmark for knowledge intensive language tasks","volume-title":"Proc. North Amer. Chapter Assoc. Comput. Linguist","author":"Petroni"},{"key":"ref17","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","volume-title":"Proc. 40th Annu. Meet. Assoc. Comput. Linguist","author":"Papineni"},{"key":"ref18","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Proc. ACL-04 Workshop","author":"Lin"},{"key":"ref19","article-title":"BERTScore: Evaluating text generation with BERT","volume-title":"Proc. Int. Conf. Learn. Representations (ICLR)","author":"Zhang"},{"key":"ref20","article-title":"RAGAS: Automated evaluation of retrieval augmented generation","author":"Es","year":"2023"},{"key":"ref21","article-title":"On the opportunities and risks of foundation models","author":"Bommasani","year":"2021"},{"key":"ref22","volume-title":"Replicate"},{"key":"ref23","article-title":"Introducing LLaMA 3"},{"key":"ref24","article-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"ref25","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.nlppower-1.6"},{"key":"ref27","article-title":"Judging LLM-as-a-judge with MT-Bench and Chatbot Arena","author":"Zheng","year":"2023"},{"key":"ref28","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v38i16.29728","article-title":"Benchmarking large language models in retrieval-augmented generation","volume-title":"Proc. AAAI Conf. Artif. Intell","author":"Chen"},{"key":"ref29","article-title":"Evaluation of retrieval-augmented generation: A survey","author":"Yu","year":"2024"},{"key":"ref30","article-title":"RAGBench: Explainable Benchmark for Retrieval-Augmented Generation Systems","author":"Friel","year":"2024"},{"key":"ref31","article-title":"Language model evaluation harness"},{"key":"ref32","article-title":"Instruction-following evaluation for large language models","author":"Zhou","year":"2023"},{"key":"ref33","article-title":"Open LLM leaderboard","volume-title":"Hugging Face, n.d","author":"Face"},{"key":"ref34","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.findings-acl.824","article-title":"Challenging BIG-Bench tasks and whether chain-of-thought can solve them","volume-title":"Proc. Annu. Meet. Assoc. Comput. Linguist","author":"Suzgun"},{"key":"ref35","article-title":"Measuring mathematical problem solving with the MATH dataset","author":"Hendrycks","year":"2021"},{"key":"ref36","article-title":"GPQA: A graduate-level google-proof Q&A benchmark","author":"Rein","year":"2023"},{"key":"ref37","article-title":"MuSR: Testing the limits of chain-of-thought with multistep soft reasoning","author":"Sprague","year":"2023"},{"key":"ref38","article-title":"MMLU-Pro: A more robust and challenging multi-task language understanding benchmark","author":"Wang","year":"2024"},{"key":"ref39","article-title":"ARAGOG: Advanced RAG output grading","author":"Eibich","year":"2024"},{"key":"ref40","article-title":"AI arXiv dataset","author":"Calam"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref42","article-title":"RAGAS: Metrics","author":"Es"},{"key":"ref43","doi-asserted-by":"crossref","DOI":"10.3115\/1218955.1219032","article-title":"Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics","volume-title":"Proc. 43rd Annu. Meet. Assoc. Comput. Linguist. (ACL \u201905)","author":"Lin"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","location":"Washington, DC, USA","start":{"date-parts":[[2024,12,15]]},"end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10825576.pdf?arnumber=10825576","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:56:25Z","timestamp":1737100585000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10825576\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10825576","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}