{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,8]],"date-time":"2026-07-08T23:43:50Z","timestamp":1783554230458,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":9,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T00:00:00Z","timestamp":1710720000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3,18]]},"DOI":"10.1145\/3640544.3645216","type":"proceedings-article","created":{"date-parts":[[2024,4,5]],"date-time":"2024-04-05T18:21:08Z","timestamp":1712341268000},"page":"30-32","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":35,"title":["EvaluLLM: LLM assisted evaluation of generative outputs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1796-1161","authenticated-orcid":false,"given":"Michael","family":"Desmond","sequence":"first","affiliation":[{"name":"IBM Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0686-7911","authenticated-orcid":false,"given":"Zahra","family":"Ashktorab","sequence":"additional","affiliation":[{"name":"IBM Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0437-1736","authenticated-orcid":false,"given":"Qian","family":"Pan","sequence":"additional","affiliation":[{"name":"IBM Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1508-2091","authenticated-orcid":false,"given":"Casey","family":"Dugan","sequence":"additional","affiliation":[{"name":"IBM Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7199-5493","authenticated-orcid":false,"given":"James M.","family":"Johnson","sequence":"additional","affiliation":[{"name":"IBM Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,4,5]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"A survey on evaluation of large language models. arXiv preprint arXiv:2307.03109","author":"Chang Yupeng","year":"2023","unstructured":"Yupeng Chang, Xu Wang, Jindong Wang, Yuan Wu, Kaijie Zhu, Hao Chen, Linyi Yang, Xiaoyuan Yi, Cunxiang Wang, Yidong Wang, 2023. A survey on evaluation of large language models. arXiv preprint arXiv:2307.03109 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Gptscore: Evaluate as you desire. arXiv preprint arXiv:2302.04166","author":"Fu Jinlan","year":"2023","unstructured":"Jinlan Fu, See-Kiong Ng, Zhengbao Jiang, and Pengfei Liu. 2023. Gptscore: Evaluate as you desire. arXiv preprint arXiv:2302.04166 (2023)."},{"key":"e_1_3_2_2_3_1","volume-title":"Prometheus: Inducing fine-grained evaluation capability in language models. arXiv preprint arXiv:2310.08491","author":"Kim Seungone","year":"2023","unstructured":"Seungone Kim, Jamin Shin, Yejin Cho, Joel Jang, Shayne Longpre, Hwaran Lee, Sangdoo Yun, Seongjin Shin, Sungdong Kim, James Thorne, 2023. Prometheus: Inducing fine-grained evaluation capability in language models. arXiv preprint arXiv:2310.08491 (2023)."},{"key":"e_1_3_2_2_4_1","volume-title":"Alpacaeval: An automatic evaluator of instruction-following models.","author":"Li Xuechen","year":"2023","unstructured":"Xuechen Li, Tianyi Zhang, Yann Dubois, Rohan Taori, Ishaan Gulrajani, Carlos Guestrin, Percy Liang, and Tatsunori\u00a0B Hashimoto. 2023. Alpacaeval: An automatic evaluator of instruction-following models."},{"key":"e_1_3_2_2_5_1","volume-title":"Holistic evaluation of language models. arXiv preprint arXiv:2211.09110","author":"Liang Percy","year":"2022","unstructured":"Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan, Yuhuai Wu, Ananya Kumar, 2022. Holistic evaluation of language models. arXiv preprint arXiv:2211.09110 (2022)."},{"key":"e_1_3_2_2_6_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_2_2_7_1","volume-title":"May","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, and Chenguang Zhu. 2023. G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment, May 2023. arXiv preprint arXiv:2303.16634 (2023)."},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_2_2_9_1","volume-title":"Is chatgpt a good nlg evaluator? a preliminary study. arXiv preprint arXiv:2303.04048","author":"Wang Jiaan","year":"2023","unstructured":"Jiaan Wang, Yunlong Liang, Fandong Meng, Haoxiang Shi, Zhixu Li, Jinan Xu, Jianfeng Qu, and Jie Zhou. 2023. Is chatgpt a good nlg evaluator? a preliminary study. arXiv preprint arXiv:2303.04048 (2023)."}],"event":{"name":"IUI '24: 29th International Conference on Intelligent User Interfaces","location":"Greenville SC USA","acronym":"IUI '24","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Companion Proceedings of the 29th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640544.3645216","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3640544.3645216","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:51:36Z","timestamp":1764550296000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640544.3645216"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,18]]},"references-count":9,"alternative-id":["10.1145\/3640544.3645216","10.1145\/3640544"],"URL":"https:\/\/doi.org\/10.1145\/3640544.3645216","relation":{},"subject":[],"published":{"date-parts":[[2024,3,18]]},"assertion":[{"value":"2024-04-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}