{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,22]],"date-time":"2026-07-22T04:25:10Z","timestamp":1784694310928,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T00:00:00Z","timestamp":1742774400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"USDA National Institute of Food and Agriculture","award":["2021-67022-33447"],"award-info":[{"award-number":["2021-67022-33447"]}]},{"DOI":"10.13039\/100000006","name":"Office of Naval Research","doi-asserted-by":"publisher","award":["N00014-22-1-2507"],"award-info":[{"award-number":["N00014-22-1-2507"]}],"id":[{"id":"10.13039\/100000006","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,24]]},"DOI":"10.1145\/3708359.3712091","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T12:50:34Z","timestamp":1742388634000},"page":"952-966","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":63,"title":["Limitations of the LLM-as-a-Judge Approach for Evaluating LLM Outputs in Expert Knowledge Tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5472-282X","authenticated-orcid":false,"given":"Annalisa","family":"Szymanski","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1002-1959","authenticated-orcid":false,"given":"Noah","family":"Ziems","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1261-4291","authenticated-orcid":false,"given":"Heather A.","family":"Eicher-Miller","sequence":"additional","affiliation":[{"name":"Nutrition Science, Purdue University, West Lafayette, Indiana, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7902-7625","authenticated-orcid":false,"given":"Toby Jia-Jun","family":"Li","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3009-519X","authenticated-orcid":false,"given":"Meng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2206-1720","authenticated-orcid":false,"given":"Ronald A.","family":"Metoyer","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3,24]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"2024. Prolific. https:\/\/www.prolific.com\/. Accessed: 2024-10-02."},{"key":"e_1_3_3_3_3_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_3_4_2","unstructured":"Jaewoo Ahn Taehyun Lee Junyoung Lim Jin-Hwa Kim Sangdoo Yun Hwaran Lee and Gunhee Kim. 2024. TimeChara: Evaluating Point-in-Time Character Hallucination of Role-Playing Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.18027 (2024)."},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586182.3616660"},{"key":"e_1_3_3_3_6_2","unstructured":"Rishi Bommasani Drew\u00a0A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael\u00a0S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill et\u00a0al. 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.07258 (2021)."},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"crossref","unstructured":"Yupeng Chang Xu Wang Jindong Wang Yuan Wu Linyi Yang Kaijie Zhu Hao Chen Xiaoyuan Yi Cunxiang Wang Yidong Wang et\u00a0al. 2024. A survey on evaluation of large language models. ACM Transactions on Intelligent Systems and Technology 15 3 (2024) 1\u201345.","DOI":"10.1145\/3641289"},{"key":"e_1_3_3_3_8_2","unstructured":"Guiming\u00a0Hardy Chen Shunian Chen Ziche Liu Feng Jiang and Benyou Wang. 2024. Humans or llms as the judge? a study on judgement biases. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.10669 (2024)."},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"crossref","unstructured":"Szu-Wei Cheng Chung-Wen Chang Wan-Jung Chang Hao-Wei Wang Chih-Sung Liang Taishiro Kishimoto Jane Pei-Chen Chang John\u00a0S Kuo and Kuan-Pin Su. 2023. The now and future of ChatGPT and GPT in psychiatry. Psychiatry and clinical neurosciences 77 11 (2023) 592\u2013596.","DOI":"10.1111\/pcn.13588"},{"key":"e_1_3_3_3_10_2","unstructured":"Wei-Lin Chiang Lianmin Zheng Ying Sheng Anastasios\u00a0Nikolas Angelopoulos Tianle Li Dacheng Li Hao Zhang Banghua Zhu Michael Jordan Joseph\u00a0E Gonzalez et\u00a0al. 2024. Chatbot arena: An open platform for evaluating llms by human preference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.04132 (2024)."},{"key":"e_1_3_3_3_11_2","unstructured":"Paul\u00a0F Christiano Jan Leike Tom Brown Miljan Martic Shane Legg and Dario Amodei. 2017. Deep reinforcement learning from human preferences. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640544.3645216"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"crossref","unstructured":"Gavin Doherty David Coyle and Mark Matthews. 2010. Design and evaluation guidelines for mental health technologies. Interacting with computers 22 4 (2010) 243\u2013252.","DOI":"10.1016\/j.intcom.2010.02.006"},{"key":"e_1_3_3_3_14_2","unstructured":"Yann Dubois Bal\u00e1zs Galambosi Percy Liang and Tatsunori\u00a0B Hashimoto. 2024. Length-Controlled AlpacaEval: A Simple Way to Debias Automatic Evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.04475 (2024)."},{"key":"e_1_3_3_3_15_2","unstructured":"Yann Dubois Chen\u00a0Xuechen Li Rohan Taori Tianyi Zhang Ishaan Gulrajani Jimmy Ba Carlos Guestrin Percy\u00a0S Liang and Tatsunori\u00a0B Hashimoto. 2024. Alpacafarm: A simulation framework for methods that learn from human feedback. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_16_2","unstructured":"Arpad\u00a0E Elo and Sam Sloan. 1978. The rating of chessplayers: Past and present. (No Title) (1978)."},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642002"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Manuel\u00a0B Garcia. 2023. ChatGPT as a virtual dietitian: Exploring its potential as a tool for improving nutrition knowledge. Applied System Innovation 6 5 (2023) 96.","DOI":"10.3390\/asi6050096"},{"key":"e_1_3_3_3_19_2","unstructured":"Yingqiang Ge Wenyue Hua Kai Mei Juntao Tan Shuyuan Xu Zelong Li Yongfeng Zhang et\u00a0al. 2024. Openagi: When llm meets domain experts. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_20_2","unstructured":"Simret\u00a0Araya Gebreegziabher Kuangshi Ai Zheng Zhang Elena\u00a0L Glassman and Toby Jia-Jun Li. 2024. Leveraging Variation Theory in Counterfactual Data Augmentation for Optimized Active Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03819 (2024)."},{"key":"e_1_3_3_3_21_2","unstructured":"Simret\u00a0Araya Gebreegziabher Yukun Yang Elena\u00a0L Glassman and Toby Jia-Jun Li. 2024. Supporting Co-Adaptive Machine Teaching through Human Concept Learning and Cognitive Theories. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.16561 (2024)."},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581352"},{"key":"e_1_3_3_3_23_2","volume-title":"Overview of Gemini: Large Multimodal Models","author":"Research Google","year":"2024","unstructured":"Google Research. 2024. Overview of Gemini: Large Multimodal Models. Technical Report. Google Research. https:\/\/gemini.google\/overview-gemini-app.pdf Accessed: 2024-10-08."},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"crossref","unstructured":"Yining Hua Fenglin Liu Kailai Yang Zehan Li Yi-han Sheu Peilin Zhou Lauren\u00a0V Moran Sophia Ananiadou and Andrew Beam. 2024. Large language models in mental health care: a scoping review. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.02984 (2024).","DOI":"10.2196\/preprints.64088"},{"key":"e_1_3_3_3_25_2","unstructured":"Shaoxiong Ji Tianlin Zhang Kailai Yang Sophia Ananiadou and Erik Cambria. 2023. Rethinking large language models in mental health applications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.11267 (2023)."},{"key":"e_1_3_3_3_26_2","unstructured":"Shivani Kapania Ruiyi Wang Toby Jia-Jun Li Tianshi Li and Hong Shen. 2024. \" I\u2019m categorizing LLM as a productivity tool\": Examining ethics of LLM use in HCI research practices. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.19876 (2024)."},{"key":"e_1_3_3_3_27_2","unstructured":"Tae\u00a0Soo Kim Yoonjoo Lee Jamin Shin Young-Ho Kim and Juho Kim. 2023. Evallm: Interactive evaluation of large language model prompts on user-defined criteria. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.13633 (2023)."},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"crossref","unstructured":"Darlene\u00a0R King Guransh Nanda Joel Stoddard Allison Dempsey Sarah Hergert Jay\u00a0H Shore and John Torous. 2023. An introduction to generative artificial intelligence in mental health care: considerations and guidance. Current psychiatry reports 25 12 (2023) 839\u2013846.","DOI":"10.1007\/s11920-023-01477-x"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"crossref","unstructured":"Daniel Kirk Cagatay Catal and Bedir Tekinerdogan. 2021. Precision nutrition: A systematic literature review. Computers in Biology and Medicine 133 (2021) 104365.","DOI":"10.1016\/j.compbiomed.2021.104365"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"crossref","unstructured":"Daniel Kirk Elise van Eijnatten and Guido Camps. 2023. Comparison of answers between ChatGPT and human dieticians to common nutrition questions. Journal of Nutrition and Metabolism 2023 1 (2023) 5548684.","DOI":"10.1155\/2023\/5548684"},{"key":"e_1_3_3_3_31_2","unstructured":"Cheng Li Ziang Leng Chenxi Yan Junyi Shen Hao Wang Weishi Mi Yaying Fei Xiaoyang Feng Song Yan HaoSheng Wang et\u00a0al. 2023. Chatharuhi: Reviving anime character in reality via large language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.09597 (2023)."},{"key":"e_1_3_3_3_32_2","unstructured":"Xuechen Li Tianyi Zhang Yann Dubois Rohan Taori Ishaan Gulrajani Carlos Guestrin Percy Liang and Tatsunori\u00a0B. Hashimoto. 2023. AlpacaEval: An Automatic Evaluator of Instruction-following Models. https:\/\/github.com\/tatsu-lab\/alpaca_eval."},{"key":"e_1_3_3_3_33_2","unstructured":"Bill\u00a0Yuchen Lin Yuntian Deng Khyathi Chandu Faeze Brahman Abhilasha Ravichander Valentina Pyatkin Nouha Dziri Ronan\u00a0Le Bras and Yejin Choi. 2024. WILDBENCH: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.04770 (2024)."},{"key":"e_1_3_3_3_34_2","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74\u201381. https:\/\/aclanthology.org\/W04-1013"},{"key":"e_1_3_3_3_35_2","unstructured":"June\u00a0M Liu Donghao Li He Cao Tianhe Ren Zeyi Liao and Jiamin Wu. 2023. Chatcounselor: A large language models for mental health support. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.15461 (2023)."},{"key":"e_1_3_3_3_36_2","unstructured":"Yiren Liu Pranav Sharma Mehul\u00a0Jitendra Oswal Haijun Xia and Yun Huang. 2024. PersonaFlow: Boosting Research Ideation with LLM-Simulated Expert Personas. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.12538 (2024)."},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"crossref","unstructured":"Ryan Louie Ananjan Nandi William Fang Cheng Chang Emma Brunskill and Diyi Yang. 2024. Roleplay-doh: Enabling Domain-Experts to Create LLM-simulated Patients via Eliciting and Adhering to Principles. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.00870 (2024).","DOI":"10.18653\/v1\/2024.emnlp-main.591"},{"key":"e_1_3_3_3_38_2","unstructured":"Yuwen Lu Ziang Tong Qinyi Zhao Yewon Oh Bryan Wang and Toby Jia-Jun Li. 2024. Flowy: Supporting UX Design Decisions Through AI-Driven Pattern Annotation in Multi-Screen User Flows. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16177 (2024)."},{"key":"e_1_3_3_3_39_2","unstructured":"Yuwen Lu Yuewen Yang Qinyi Zhao Chengzhi Zhang and Toby Jia-Jun Li. 2024. AI Assistance for UX: A Literature Review Through Human-Centered AI. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.06089 (2024)."},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3519809"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"crossref","unstructured":"Peter\u00a0J Martin. 1999. Influences on clinical judgement in mental health nursing. NT Research 4 4 (1999) 273\u2013281.","DOI":"10.1177\/136140969900400405"},{"key":"e_1_3_3_3_42_2","unstructured":"Benjamin\u00a0Paul Michael. 2019. Clinical Judgement: an investigation of clinical decision-making. Ph.\u00a0D. Dissertation. University of Sheffield."},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"crossref","unstructured":"Pawe\u0142 Niszczota and Iga Rybicka. 2023. The credibility of dietary advice formulated by ChatGPT: robo-diets for people with food allergies. Nutrition 112 (2023) 112076.","DOI":"10.1016\/j.nut.2023.112076"},{"key":"e_1_3_3_3_44_2","unstructured":"OpenAI. 2024. GPT-4 Research Overview. https:\/\/openai.com\/index\/gpt-4-research\/ Accessed: 2024-10-08."},{"key":"e_1_3_3_3_45_2","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et\u00a0al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems 35 (2022) 27730\u201327744."},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei jing Zhu. 2002. BLEU: a Method for Automatic Evaluation of Machine Translation. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"crossref","unstructured":"Valentina Ponzo Ilaria Goitre Enrica Favaro Fabio\u00a0Dario Merlo Maria\u00a0Vittoria Mancino Sergio Riso and Simona Bo. 2024. Is ChatGPT an Effective Tool for Providing Dietary Advice? Nutrients 16 4 (2024) 469.","DOI":"10.3390\/nu16040469"},{"key":"e_1_3_3_3_48_2","unstructured":"Sumedh Rasal and EJ Hauer. 2024. Navigating Complexity: Orchestrated Problem Solving with Multi-Agent LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.16713 (2024)."},{"key":"e_1_3_3_3_49_2","unstructured":"Alireza Salemi Sheshera Mysore Michael Bendersky and Hamed Zamani. 2023. Lamp: When large language models meet personalization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.11406 (2023)."},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"crossref","unstructured":"Shreya Shankar JD Zamfirescu-Pereira Bj\u00f6rn Hartmann Aditya\u00a0G Parameswaran and Ian Arawjo. 2024. Who Validates the Validators? Aligning LLM-Assisted Evaluation of LLM Outputs with Human Preferences. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.12272 (2024).","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642400"},{"key":"e_1_3_3_3_52_2","unstructured":"Annalisa Szymanski Simret\u00a0Araya Gebreegziabher Oghenemaro Anuyah Ronald\u00a0A Metoyer and Toby Jia-Jun Li. 2024. Comparing Criteria Development Across Domain Experts Lay Users and Models in Large Language Model Evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02054 (2024)."},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3641924"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1421"},{"key":"e_1_3_3_3_55_2","unstructured":"Rohan Taori Ishaan Gulrajani Tianyi Zhang Yann Dubois Xuechen Li Carlos Guestrin Percy Liang and Tatsunori\u00a0B Hashimoto. 2023. Stanford alpaca: An instruction-following llama model."},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"crossref","unstructured":"Yu-Min Tseng Yu-Chao Huang Teng-Yun Hsiao Yu-Ching Hsu Jia-Yin Foo Chao-Wei Huang and Yun-Nung Chen. 2024. Two tales of persona in llms: A survey of role-playing and personalization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.01171 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.969"},{"key":"e_1_3_3_3_57_2","doi-asserted-by":"crossref","unstructured":"Ruth Vo M Smith and N Patton. 2021. Journal of Human Nutrition and Dietetics 34 1 (2021) 124\u2013133.","DOI":"10.1111\/jhn.12820"},{"key":"e_1_3_3_3_58_2","unstructured":"Yufei Wang Wanjun Zhong Liangyou Li Fei Mi Xingshan Zeng Wenyong Huang Lifeng Shang Xin Jiang and Qun Liu. 2023. Aligning large language models with human: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.12966 (2023)."},{"key":"e_1_3_3_3_59_2","unstructured":"Zhichao Wang Bin Bi Shiva\u00a0Kumar Pentyala Kiran Ramnath Sougata Chaudhuri Shubham Mehrotra Xiang-Bo Mao Sitaram Asur et\u00a0al. 2024. A Comprehensive Survey of LLM Alignment Techniques: RLHF RLAIF PPO DPO and More. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.16216 (2024)."},{"key":"e_1_3_3_3_60_2","unstructured":"Michael Williams and Tami Moser. 2019. The art of coding and thematic exploration in qualitative research. International management review 15 1 (2019) 45\u201355."},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"crossref","unstructured":"Xuhai Xu Bingsheng Yao Yuanzhe Dong Saadia Gabriel Hong Yu James Hendler Marzyeh Ghassemi Anind\u00a0K Dey and Dakuo Wang. 2024. Mental-llm: Leveraging large language models for mental health prediction via online text data. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 8 1 (2024) 1\u201332.","DOI":"10.1145\/3643540"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"crossref","unstructured":"Zhongqi Yang Elahe Khatibi Nitish Nagesh Mahyar Abbasian Iman Azimi Ramesh Jain and Amir\u00a0M Rahmani. 2024. ChatDiet: Empowering personalized nutrition-oriented food recommender chatbots through an LLM-augmented framework. Smart Health 32 (2024) 100465.","DOI":"10.1016\/j.smhl.2024.100465"},{"key":"e_1_3_3_3_63_2","volume-title":"International Conference on Learning Representations","author":"Zhang Tianyi","year":"2020","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian\u00a0Q Weinberger, and Yoav Artzi. 2020. BERTScore: Evaluating Text Generation with BERT. In International Conference on Learning Representations."},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606800"},{"key":"e_1_3_3_3_65_2","unstructured":"Wayne\u00a0Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et\u00a0al. 2023. A survey of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.18223 (2023)."},{"key":"e_1_3_3_3_66_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et\u00a0al. 2024. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_67_2","unstructured":"Lianghui Zhu Xinggang Wang and Xinlong Wang. 2023. Judgelm: Fine-tuned large language models are scalable judges. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.17631 (2023)."}],"event":{"name":"IUI '25: 30th International Conference on Intelligent User Interfaces","location":"Cagliari Italy","acronym":"IUI '25","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 30th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712091","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712091","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712091","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:09:46Z","timestamp":1750295386000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712091"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,24]]},"references-count":66,"alternative-id":["10.1145\/3708359.3712091","10.1145\/3708359"],"URL":"https:\/\/doi.org\/10.1145\/3708359.3712091","relation":{},"subject":[],"published":{"date-parts":[[2025,3,24]]},"assertion":[{"value":"2025-03-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}