{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T12:08:54Z","timestamp":1773490134460,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":17,"publisher":"ACM","license":[{"start":{"date-parts":[[2027,3,22]],"date-time":"2027-03-22T00:00:00Z","timestamp":1805673600000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000879","name":"Alfred P. Sloan Foundation","doi-asserted-by":"publisher","award":["G-2024-22427"],"award-info":[{"award-number":["G-2024-22427"]}],"id":[{"id":"10.13039\/100000879","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000879","name":"Notre Dame-IBM Technology Ethics Lab","doi-asserted-by":"publisher","award":["2024 Research Award"],"award-info":[{"award-number":["2024 Research Award"]}],"id":[{"id":"10.13039\/100000879","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004316","name":"International Business Machines Corporation","doi-asserted-by":"publisher","award":["Ph.D. Fellowship"],"award-info":[{"award-number":["Ph.D. Fellowship"]}],"id":[{"id":"10.13039\/100004316","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Amazon Inc.","award":["Research Award"],"award-info":[{"award-number":["Research Award"]}]},{"DOI":"10.13039\/100006785","name":"Google","doi-asserted-by":"publisher","award":["Research Award"],"award-info":[{"award-number":["Research Award"]}],"id":[{"id":"10.13039\/100006785","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2426395"],"award-info":[{"award-number":["CNS-2426395"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004344","name":"Adobe Systems","doi-asserted-by":"publisher","award":["Research Award"],"award-info":[{"award-number":["Research Award"]}],"id":[{"id":"10.13039\/100004344","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007065","name":"Nvidia","doi-asserted-by":"publisher","award":["Academic Hardware Grant"],"award-info":[{"award-number":["Academic Hardware Grant"]}],"id":[{"id":"10.13039\/100007065","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,23]]},"DOI":"10.1145\/3742414.3795100","type":"proceedings-article","created":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T11:03:52Z","timestamp":1773054232000},"page":"180-184","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MultEval: Collaboratively Creating Criteria for LLM-as-a-Judge Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6079-4355","authenticated-orcid":false,"given":"Charles","family":"Chiang","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1772-6065","authenticated-orcid":false,"given":"Simret Araya","family":"Gebreegziabher","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5472-282X","authenticated-orcid":false,"given":"Annalisa","family":"Szymanski","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1971-4468","authenticated-orcid":false,"given":"Yukun","family":"Yang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8297-6792","authenticated-orcid":false,"given":"Hyo Jin","family":"Do","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0686-7911","authenticated-orcid":false,"given":"Zahra","family":"Ashktorab","sequence":"additional","affiliation":[{"name":"Thomas J. Watson Center, IBM Research, Yorktown Heights, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4699-5026","authenticated-orcid":false,"given":"Werner","family":"Geyer","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7902-7625","authenticated-orcid":false,"given":"Toby Jia-Jun","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4609-6293","authenticated-orcid":false,"given":"Diego","family":"G\u00f3mez-Zar\u00e1","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"[n. d.]. Secure & reliable LLMs | Promptfoo. https:\/\/www.promptfoo.dev\/"},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642016"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","unstructured":"Zahra Ashktorab Michael Desmond Qian Pan James\u00a0M. Johnson Martin\u00a0Santillan Cooper Elizabeth\u00a0M. Daly Rahul Nair Tejaswini Pedapati Hyo\u00a0Jin Do and Werner Geyer. 2025. Aligning Human and LLM Judgments: Insights from EvalAssist on Task-Specific Evaluations and AI-assisted Assessment Strategy Preferences. arXiv:2410.00873 (Aug. 2025). 10.48550\/arXiv.2410.00873","DOI":"10.48550\/arXiv.2410.00873"},{"key":"e_1_3_3_3_5_2","unstructured":"Robert\u00a0O Briggs Gwendolyn\u00a0L Kolfschoten and Gert-Jan\u00a0de Vreede. 2005. Toward a theoretical model of consensus building. AMCIS 2005 Proceedings (2005) 12."},{"key":"e_1_3_3_3_6_2","volume-title":"Proceedings of the Workshop on Text Summarization Branches Out, 2004","author":"Chin-Yew Lin","year":"2004","unstructured":"Lin Chin-Yew. 2004. Rouge: A package for automatic evaluation of summaries. In Proceedings of the Workshop on Text Summarization Branches Out, 2004."},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640544.3645216"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3729176.3729199"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","unstructured":"Jiawei Gu Xuhui Jiang Zhichao Shi Hexiang Tan Xuehao Zhai Chengjin Xu Wei Li Yinghan Shen Shengjie Ma Honghao Liu Saizhuo Wang Kun Zhang Yuanzhuo Wang Wen Gao Lionel Ni and Jian Guo. 2025. A Survey on LLM-as-a-Judge. arXiv:2411.15594 (March 2025). 10.48550\/arXiv.2411.15594","DOI":"10.48550\/arXiv.2411.15594"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","unstructured":"Tae\u00a0Soo Kim Heechan Lee Yoonjoo Lee Joseph Seering and Juho Kim. 2025. Evalet: Evaluating Large Language Models by Fragmenting Outputs into Functions. arXiv:2509.11206 (2025). 10.48550\/arXiv.2509.11206","DOI":"10.48550\/arXiv.2509.11206"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642216"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.138"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1230"},{"key":"e_1_3_3_3_14_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"crossref","unstructured":"Ehud Reiter. 2018. A structured review of the validity of BLEU. Computational Linguistics 44 3 (2018) 393\u2013401.","DOI":"10.1162\/coli_a_00322"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_3_17_2","unstructured":"Stax. [n. d.]. Stax - The complete toolkit for AI evaluation. https:\/\/stax.withgoogle.com\/landing"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et\u00a0al. 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in neural information processing systems 36 (2023) 46595\u201346623.","DOI":"10.52202\/075280-2020"}],"event":{"name":"IUI '26: 31st International Conference on Intelligent User Interfaces","location":"Paphos Cyprus","acronym":"IUI '26 Companion","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Companion Proceedings of the 31st International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3742414.3795100","content-type":"text\/html","content-version":"vor","intended-application":"syndication"}],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T11:04:14Z","timestamp":1773486254000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3742414.3795100"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":17,"alternative-id":["10.1145\/3742414.3795100","10.1145\/3742414"],"URL":"https:\/\/doi.org\/10.1145\/3742414.3795100","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}