{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T13:33:02Z","timestamp":1769002382016,"version":"3.49.0"},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,26]]},"DOI":"10.1109\/fllm63129.2024.10852484","type":"proceedings-article","created":{"date-parts":[[2025,1,28]],"date-time":"2025-01-28T18:35:23Z","timestamp":1738089323000},"page":"272-279","source":"Crossref","is-referenced-by-count":1,"title":["Adaptive Composite Accuracy Scoring for Domainspecific LLM Evaluation"],"prefix":"10.1109","author":[{"given":"Jocelyn Shuang","family":"Ru Teh","sequence":"first","affiliation":[{"name":"Intel Corporation"}]},{"given":"Eng Keong","family":"Koay","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]},{"given":"Shin Wei","family":"Lim","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]},{"given":"Kuan","family":"Heng Lee","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]},{"given":"Mee Sim","family":"Lai","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]},{"given":"Meng","family":"Siong Lee","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]},{"given":"Yuan Kuok","family":"Nee","sequence":"additional","affiliation":[{"name":"Intel Corporation"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Introducing ChatGPT"},{"key":"ref2","article-title":"GPT-4 Technical Report","year":"2023"},{"key":"ref3","article-title":"M3KE: a massive Multi-Level Multi-Subject knowledge evaluation benchmark for Chinese large language models","author":"Liu","year":"2023"},{"key":"ref4","article-title":"SuperGLUE: a stickier benchmark for General-Purpose Language Understanding Systems","author":"Wang","year":"2019"},{"key":"ref5","article-title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","author":"Clark","year":"2018"},{"key":"ref6","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2020"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"ref9","article-title":"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena","author":"Zheng","year":"2023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00266"},{"key":"ref11","article-title":"Large language models struggle to learn long-tail knowledge","author":"Kandpal","year":"2023"},{"key":"ref12","article-title":"Open LLM Leaderboard v2","volume-title":"Hugging Face","author":"Fourrier"},{"key":"ref13","article-title":"Training language models to follow instructions with human feedback","author":"Ouyang","year":"2022"},{"key":"ref14","article-title":"Retrieval-Augmented Generation for Knowledge-Intensive NLP tasks","author":"Lewis","year":"2020"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"ref16","article-title":"Inadequacies of large language model benchmarks in the era of generative artificial intelligence","author":"McIntosh","year":"2024"},{"key":"ref17","article-title":"Survey on factuality in large language models: Knowledge, retrieval and domain-specificity","author":"Wang","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.899"},{"key":"ref19","article-title":"Chain-of-Thought prompting elicits reasoning in large language models","author":"Wei","year":"2022"},{"key":"ref20","article-title":"Mistral 7B","author":"Jiang","year":"2023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.348"},{"key":"ref22","article-title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","author":"Touvron","year":"2023"},{"key":"ref23","article-title":"The Llama 3 Herd of Models","author":"Dubey","year":"2024"},{"key":"ref24","article-title":"A framework for few-shot language model evaluation","author":"Leo","year":"2021","journal-title":"Zenodo"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1260"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4413"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.149"}],"event":{"name":"2024 2nd International Conference on Foundation and Large Language Models (FLLM)","location":"Dubai, United Arab Emirates","start":{"date-parts":[[2024,11,26]]},"end":{"date-parts":[[2024,11,29]]}},"container-title":["2024 2nd International Conference on Foundation and Large Language Models (FLLM)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10852419\/10852420\/10852484.pdf?arnumber=10852484","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,29]],"date-time":"2025-01-29T06:47:08Z","timestamp":1738133228000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10852484\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,26]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/fllm63129.2024.10852484","relation":{},"subject":[],"published":{"date-parts":[[2024,11,26]]}}}