{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T06:12:57Z","timestamp":1782799977109,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,28]]},"DOI":"10.1145\/3746059.3747740","type":"proceedings-article","created":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T07:49:12Z","timestamp":1758959352000},"page":"1-23","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["EvalAssist: Insights on Task-Specific Evaluations and AI-Assisted Judgment Strategy Preferences"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0686-7911","authenticated-orcid":false,"given":"Zahra","family":"Ashktorab","sequence":"first","affiliation":[{"name":"IBM Research, Yorktown Heights, New York, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1796-1161","authenticated-orcid":false,"given":"Michael","family":"Desmond","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, New York, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0437-1736","authenticated-orcid":false,"given":"Qian","family":"Pan","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7199-5493","authenticated-orcid":false,"given":"James M.","family":"Johnson","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3808-0509","authenticated-orcid":false,"given":"Martin","family":"Santill\u00e1n Cooper","sequence":"additional","affiliation":[{"name":"IBM Research, Capital Federal, Argentina"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0162-3683","authenticated-orcid":false,"given":"Elizabeth M.","family":"Daly","sequence":"additional","affiliation":[{"name":"IBM Research, Dublin, Ireland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1875-2409","authenticated-orcid":false,"given":"Rahul","family":"Nair","sequence":"additional","affiliation":[{"name":"IBM Research, Dublin, Ireland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5260-0951","authenticated-orcid":false,"given":"Tejaswini","family":"Pedapati","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown, New York, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8297-6792","authenticated-orcid":false,"given":"Hyo Jin","family":"Do","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4699-5026","authenticated-orcid":false,"given":"Werner","family":"Geyer","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,9,27]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Anthropic. 2024. Claude 3.5 sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet Accessed: 2024-09-09."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Ian Arawjo Chelse Swoopes Priyan Vaithilingam Martin Wattenberg and Elena\u00a0L Glassman. 2024. ChainForge: A Visual Toolkit for Prompt Engineering and LLM Hypothesis Testing. 18\u00a0pages.","DOI":"10.1145\/3613904.3642016"},{"key":"e_1_3_3_2_4_2","unstructured":"Yushi Bai Jiahao Ying Yixin Cao Xin Lv Yuze He Xiaozhi Wang Jifan Yu Kaisheng Zeng Yijia Xiao Haozhe Lyu Jiayin Zhang Juanzi Li and Lei Hou. 2024. Benchmarking foundation models with language-model-as-an-examiner. 26\u00a0pages."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Anna Bavaresco Raffaella Bernardi Leonardo Bertolazzi Desmond Elliott Raquel Fern\u00e1ndez Albert Gatt E. Ghaleb Mario Giulianelli Michael Hanna Alexander Koller Andr\u00e9 F.\u00a0T. Martins Philipp Mondorf Vera Neplenbroek Sandro Pezzelle Barbara Plank David Schlangen Alessandro Suglia Aditya\u00a0K Surikuchi Ece Takmaz and Alberto Testoni. 2024. LLMs instead of Human Judges? A Large Scale Empirical Study across 20 NLP Evaluation Tasks. https:\/\/arxiv.org\/abs\/2406.18403","DOI":"10.18653\/v1\/2025.acl-short.20"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Michelle Brachman Amina El-Ashry Casey Dugan and Werner Geyer. 2024. How Knowledge Workers Use and Want to Use LLMs in an Enterprise Context. 8\u00a0pages.","DOI":"10.1145\/3613905.3650841"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Zana Bu\u00e7inca Phoebe Lin Krzysztof\u00a0Z Gajos and Elena\u00a0L Glassman. 2020. Proxy tasks and subjective measures can be misleading in evaluating explainable AI systems. 454\u2013464\u00a0pages.","DOI":"10.1145\/3377325.3377498"},{"key":"e_1_3_3_2_8_2","unstructured":"Guiming\u00a0Hardy Chen Shunian Chen Ziche Liu Feng Jiang and Benyou Wang. 2024. Humans or llms as the judge? a study on judgement biases."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.599"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.870"},{"key":"e_1_3_3_2_11_2","unstructured":"Hyung\u00a0Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma Albert Webson Shixiang\u00a0Shane Gu Zhuyun Dai Mirac Suzgun Xinyun Chen Aakanksha Chowdhery Sharan Narang Gaurav Mishra Adams Yu Vincent Zhao Yanping Huang Andrew Dai Hongkun Yu Slav Petrov Ed\u00a0H. Chi Jeff Dean Jacob Devlin Adam Roberts Denny Zhou Quoc\u00a0V. Le and Jason Wei. 2022. Scaling Instruction-Finetuned Language Models. doi:10.48550\/ARXIV.2210.11416"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Sasha Costanza-Chock. 2020. Design justice: Community-led practices to build the worlds we need.","DOI":"10.7551\/mitpress\/12255.001.0001"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640544.3645216"},{"key":"e_1_3_3_2_14_2","unstructured":"Michael Desmond and Michelle Brachman. 2024. Exploring Prompt Engineering Practices in the Enterprise."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Michael Desmond Michelle Brachman Evelyn Duesterwald Casey Dugan Narendra\u00a0Nath Joshi Qian Pan and Carolina Spina. 2022. AI Assisted Data Labeling with Interactive Auto Label. 13161\u201313163\u00a0pages.","DOI":"10.1609\/aaai.v36i11.21714"},{"key":"e_1_3_3_2_16_2","unstructured":"Yann Dubois Chen\u00a0Xuechen Li Rohan Taori Tianyi Zhang Ishaan Gulrajani Jimmy Ba Carlos Guestrin Percy\u00a0S Liang and Tatsunori\u00a0B Hashimoto. 2024. Alpacafarm: A simulation framework for methods that learn from human feedback."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Alexander\u00a0R Fabbri Wojciech Kry\u015bci\u0144ski Bryan McCann Caiming Xiong Richard Socher and Dragomir Radev. 2021. Summeval: Re-evaluating summarization evaluation. 391\u2013409\u00a0pages.","DOI":"10.1162\/tacl_a_00373"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Luciano Floridi and Massimo Chiriatti. 2020. GPT-3: Its nature scope limits and consequences. 681\u2013694\u00a0pages.","DOI":"10.1007\/s11023-020-09548-1"},{"key":"e_1_3_3_2_19_2","unstructured":"Center for Research\u00a0on Foundation\u00a0Models. 2024. HELM: Holistic Evaluation of Language Models. https:\/\/crfm.stanford.edu\/helm\/. Accessed: 2024-11-20."},{"key":"e_1_3_3_2_20_2","unstructured":"Yunfan Gao Yun Xiong Xinyu Gao Kangxiang Jia Jinliu Pan Yuxi Bi Yi Dai Jiawei Sun and Haofen Wang. 2023. Retrieval-augmented generation for large language models: A survey."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Sebastian Gehrmann Elizabeth Clark and Thibault Sellam. 2023. Repairing the cracked foundation: A survey of obstacles in evaluation practices for generated text. 103\u2013166\u00a0pages.","DOI":"10.1613\/jair.1.13715"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Ahmad Ghazal Tilmann Rabl Minqing Hu Francois Raab Meikel Poess Alain Crolotte and Hans-Arno Jacobsen. 2013. Bigbench: Towards an industry standard benchmark for big data analytics. 1197\u20131208\u00a0pages.","DOI":"10.1145\/2463676.2463712"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Ben Greiner Philipp Gr\u00fcnwald Thomas Lindner Georg Lintner and Martin Wiernsperger. 2024. Incentives Framing and Reliance on Algorithmic Advice: An Experimental Study.","DOI":"10.1287\/mnsc.2022.02777"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Sandra\u00a0G Hart. 2006. NASA-task load index (NASA-TLX); 20 years later. 904\u2013908\u00a0pages.","DOI":"10.1177\/154193120605000909"},{"key":"e_1_3_3_2_25_2","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andy Zou Mantas Mazeika Dawn Song and Jacob Steinhardt. 2020. Measuring massive multitask language understanding."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Hui Huang Yingqi Qu Jing Liu Muyun Yang and Tiejun Zhao. 2024. An empirical study of llm-as-a-judge for llm evaluation: Fine-tuned judge models are task-specific classifiers.","DOI":"10.18653\/v1\/2025.findings-acl.306"},{"key":"e_1_3_3_2_27_2","unstructured":"Albert\u00a0Q Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier et\u00a0al. 2023. Mistral 7B."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Minsuk Kahng Ian Tenney Mahima Pushkarna Michael\u00a0Xieyang Liu James Wexler Emily Reif Krystal Kallarackal Minsuk Chang Michael Terry and Lucas Dixon. 2024. Llm comparator: Visual analytics for side-by-side evaluation of large language models. 7\u00a0pages.","DOI":"10.1145\/3613905.3650755"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Evangelos Karapanos Jean-Bernard Martens and Marc Hassenzahl. 2009. Accounting for diversity in subjective judgments. 639\u2013648\u00a0pages.","DOI":"10.1145\/1518701.1518801"},{"key":"e_1_3_3_2_30_2","unstructured":"Seungone Kim Jamin Shin Yejin Cho Joel Jang Shayne Longpre Hwaran Lee Sangdoo Yun Seongjin Shin Sungdong Kim James Thorne et\u00a0al. 2023. Prometheus: Inducing fine-grained evaluation capability in language models."},{"key":"e_1_3_3_2_31_2","unstructured":"Seungone Kim Juyoung Suk Shayne Longpre Bill\u00a0Yuchen Lin Jamin Shin Sean Welleck Graham Neubig Moontae Lee Kyungjae Lee and Minjoon Seo. 2024. Prometheus 2: An open source language model specialized in evaluating other language models."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642216"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Aniket Kittur Jeffrey\u00a0V Nickerson Michael Bernstein Elizabeth Gerber Aaron Shaw John Zimmerman Matt Lease and John Horton. 2013. The future of crowd work. 1301\u20131318\u00a0pages.","DOI":"10.1145\/2441776.2441923"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.750"},{"key":"e_1_3_3_2_35_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. 9459\u20139474\u00a0pages."},{"key":"e_1_3_3_2_36_2","unstructured":"Margaret Li Jason Weston and Stephen Roller. 2019. ACUTE-EVAL: Improved Dialogue Evaluation with Optimized Questions and Multi-turn Comparisons. arxiv:https:\/\/arXiv.org\/abs\/1909.03087\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1909.03087"},{"key":"e_1_3_3_2_37_2","unstructured":"Xuechen Li Tianyi Zhang Yann Dubois Rohan Taori Ishaan Gulrajani Carlos Guestrin Percy Liang and Tatsunori\u00a0B Hashimoto. 2023. Alpacaeval: An automatic evaluator of instruction-following models."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Zongjie Li Chaozheng Wang Pingchuan Ma Daoyuan Wu Shuai Wang Cuiyun Gao and Yang Liu. 2024. Split and Merge: Aligning Position Biases in LLM-based Evaluators. 11084\u201311108\u00a0pages.","DOI":"10.18653\/v1\/2024.emnlp-main.621"},{"key":"e_1_3_3_2_39_2","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga Yian Zhang Deepak Narayanan Yuhuai Wu Ananya Kumar et\u00a0al. 2022. Holistic evaluation of language models."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Dan Iter Yichong Xu Shuohang Wang Ruochen Xu and Chenguang Zhu. 2023. G-eval: Nlg evaluation using gpt-4 with better human alignment.","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_3_2_41_2","unstructured":"Yinhong Liu Han Zhou Zhijiang Guo Ehsan Shareghi Ivan Vuli\u0107 Anna Korhonen and Nigel Collier. 2024. Aligning with human judgement: The role of pairwise preference in large language model evaluators."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Andreas Madsen Sarath Chandar and Siva Reddy. 2024. Are self-explanations from Large Language Models faithful?295\u2013337\u00a0pages.","DOI":"10.18653\/v1\/2024.findings-acl.19"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Qian Pan Zahra Ashktorab Michael Desmond Martin\u00a0Santillan Cooper James Johnson Rahul Nair Elizabeth Daly and Werner Geyer. 2024. Human-Centered Design Recommendations for LLM-as-a-Judge.","DOI":"10.18653\/v1\/2024.hucllm-1.2"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Kayur Patel Naomi Bancroft Steven\u00a0M Drucker James Fogarty Amy\u00a0J Ko and James Landay. 2010. Gestalt: integrated support for implementation and analysis in machine learning. 37\u201346\u00a0pages.","DOI":"10.1145\/1866029.1866038"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Forough Poursabzi-Sangdeh Daniel\u00a0G Goldstein Jake\u00a0M Hofman Jennifer\u00a0Wortman Wortman\u00a0Vaughan and Hanna Wallach. 2021. Manipulating and measuring model interpretability. 52\u00a0pages.","DOI":"10.1145\/3411764.3445315"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-86623-4_13"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Ravi Raju Swayambhoo Jain Bo Li Jonathan Li and Urmish Thakkar. 2024. Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge.","DOI":"10.18653\/v1\/2024.customnlp4u-1.14"},{"key":"e_1_3_3_2_48_2","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy Lillicrap Jean-baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser et\u00a0al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context."},{"key":"e_1_3_3_2_49_2","unstructured":"Keita Saito Akifumi Wachi Koki Wataoka and Youhei Akimoto. 2023. Verbosity bias in preference labeling by large language models."},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Shreya Shankar JD Zamfirescu-Pereira Bj\u00f6rn Hartmann Aditya\u00a0G Parameswaran and Ian Arawjo. 2024. Who Validates the Validators? Aligning LLM-Assisted Evaluation of LLM Outputs with Human Preferences.","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_2_51_2","unstructured":"Lin Shi Chiyu Ma Wenhua Liang Weicheng Ma and Soroush Vosoughi. 2024. Judging the Judges: A Systematic Investigation of Position Bias in Pairwise Comparative Assessments by LLMs."},{"key":"e_1_3_3_2_52_2","unstructured":"Patrice Simard David Chickering Aparna Lakshmiratan Denis Charles L\u00e9on Bottou Carlos Garcia\u00a0Jurado Suarez David Grangier Saleema Amershi Johan Verwey and Jina Suh. 2014. Ice: enabling non-experts to build models interactively for large-scale lopsided problems."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"David\u00a0R Thomas. 2006. A general inductive approach for analyzing qualitative evaluation data. 237\u2013246\u00a0pages.","DOI":"10.1177\/1098214005283748"},{"key":"e_1_3_3_2_54_2","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Fei Xia Ed Chi Quoc\u00a0V Le Denny Zhou et\u00a0al. 2022. Chain-of-thought prompting elicits reasoning in large language models. 24824\u201324837\u00a0pages."},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Wenda Xu Guanglei Zhu Xuandong Zhao Liangming Pan Lei Li and William Wang. 2024. Pride and prejudice: LLM amplifies self-bias in self-refinement. 15474\u201315492\u00a0pages.","DOI":"10.18653\/v1\/2024.acl-long.826"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Hao Yu Aoran Gan Kai Zhang Shiwei Tong Qi Liu and Zhaofeng Liu. 2024. Evaluation of Retrieval-Augmented Generation: A Survey.","DOI":"10.1007\/978-981-96-1024-2_8"},{"key":"e_1_3_3_2_57_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric\u00a0P. Xing Hao Zhang Joseph\u00a0E. Gonzalez and Ion Stoica. 2024. Judging LLM-as-a-judge with MT-bench and Chatbot Arena. 29\u00a0pages."}],"event":{"name":"UIST '25: The 38th Annual ACM Symposium on User Interface Software and Technology","location":"Busan Republic of Korea","acronym":"UIST '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the 38th Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746059.3747740","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T22:11:18Z","timestamp":1759011078000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746059.3747740"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,27]]},"references-count":56,"alternative-id":["10.1145\/3746059.3747740","10.1145\/3746059"],"URL":"https:\/\/doi.org\/10.1145\/3746059.3747740","relation":{},"subject":[],"published":{"date-parts":[[2025,9,27]]},"assertion":[{"value":"2025-09-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}