{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T12:10:07Z","timestamp":1773490207909,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2027,3,22]],"date-time":"2027-03-22T00:00:00Z","timestamp":1805673600000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Notre Dame\u2013IBM Technology Ethics Lab Award"},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2426395"],"award-info":[{"award-number":["CNS-2426395"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Google Research Scholar Award"},{"name":"Adobe Inc."},{"name":"NVIDIA Academic Hardware Grant"},{"name":"IBM Ph.D. Fellowship"},{"name":"Amazon Science Award"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,23]]},"DOI":"10.1145\/3742414.3795096","type":"proceedings-article","created":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T11:03:52Z","timestamp":1773054232000},"page":"171-175","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["EvalAgent: Interactive Comparative Evaluation of Computer-Using GUI Agents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1971-4468","authenticated-orcid":false,"given":"Yukun","family":"Yang","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1772-6065","authenticated-orcid":false,"given":"Simret Araya","family":"Gebreegziabher","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6979-1072","authenticated-orcid":false,"given":"Hojun","family":"Yoo","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6079-4355","authenticated-orcid":false,"given":"Charles","family":"Chiang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9161-4088","authenticated-orcid":false,"given":"Chaoran","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5472-282X","authenticated-orcid":false,"given":"Annalisa","family":"Szymanski","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8297-6792","authenticated-orcid":false,"given":"Hyo Jin","family":"Do","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0686-7911","authenticated-orcid":false,"given":"Zahra","family":"Ashktorab","sequence":"additional","affiliation":[{"name":"Thomas J. Watson Center, IBM Research, Yorktown Heights, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4699-5026","authenticated-orcid":false,"given":"Werner","family":"Geyer","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, Massachusetts, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4609-6293","authenticated-orcid":false,"given":"Diego","family":"G\u00f3mez-Zar\u00e1","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7902-7625","authenticated-orcid":false,"given":"Toby Jia-Jun","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, Indiana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Dario Amodei Chris Olah Jacob Steinhardt Paul Christiano John Schulman and Dan Man\u00e9. 2016. Concrete problems in AI safety. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1606.06565 (2016)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747740"},{"key":"e_1_3_3_2_4_2","unstructured":"Harrison Chase. 2023. LangChain. https:\/\/www.langchain.com\/. Accessed: 2025-11-25."},{"key":"e_1_3_3_2_5_2","unstructured":"Chaoran Chen Zhiping Zhang Bingcan Guo Shang Ma Ibrahim Khalilov Simret\u00a0A Gebreegziabher Yanfang Ye Ziang Xiao Yaxing Yao Tianshi Li et\u00a0al. 2025. The Obvious Invisible Threat: LLM-Powered GUI Agents\u2019 Vulnerability to Fine-Print Injections. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.11281 (2025)."},{"key":"e_1_3_3_2_6_2","unstructured":"Wei-Lin Chiang Anastasios Angelopoulos L Zheng Y Sheng L Dunlap C Chou T Li E Frick N Jain D Li et\u00a0al. 2024. Chatbot arena. LMArena https:\/\/lmarena.ai (2024)."},{"key":"e_1_3_3_2_7_2","unstructured":"Paul\u00a0F Christiano Jan Leike Tom Brown Miljan Martic Shane Legg and Dario Amodei. 2017. Deep reinforcement learning from human preferences. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_8_2","unstructured":"Devin Ersoy Brandon Lee Ananth Shreekumar Arjun Arunasalam Muhammad Ibrahim Antonio Bianchi and Z\u00a0Berkay Celik. 2025. Investigating the Impact of Dark Patterns on LLM-Based Web Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.18113 (2025)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.391"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3729176.3729199"},{"key":"e_1_3_3_2_11_2","unstructured":"Evan Hubinger Chris van Merwijk Vladimir Mikulik Joar Skalse and Scott Garrabrant. 2019. Risks from learned optimization in advanced machine learning systems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.01820 (2019)."},{"key":"e_1_3_3_2_12_2","unstructured":"Seungone Kim Jamin Shin Yejin Cho Joel Jang Shayne Longpre Hwaran Lee Sangdoo Yun Seongjin Shin Sungdong Kim James Thorne and Minjoon Seo. 2023. Prometheus: Inducing Fine-grained Evaluation Capability in Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.08491 (2023)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642216"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.138"},{"key":"e_1_3_3_2_15_2","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga Yian Zhang Deepak Narayanan Yuhuai Wu Ananya Kumar et\u00a0al. 2022. Holistic evaluation of language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.09110 (2022)."},{"key":"e_1_3_3_2_16_2","unstructured":"Stephanie Lin Jacob Hilton and Owain Evans. 2021. Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.07958 (2021)."},{"key":"e_1_3_3_2_17_2","unstructured":"Xiao Liu Hao Yu Hanchen Zhang Yifan Xu Xuanyu Lei Hanyu Lai Yu Gu Hangliang Ding Kaiwen Men Kejuan Yang et\u00a0al. 2023. AgentBench: Evaluating LLMs as Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.03688 (2023)."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Dan Iter Yichong Xu Shuohang Wang Ruochen Xu and Chenguang Zhu. 2023. G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.16634 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.63317\/3posqwchnb6g"},{"key":"e_1_3_3_2_20_2","unstructured":"Qingyu Lu Liang Ding Siyi Cao Xuebo Liu Kanjian Zhang Jinxia Zhang and Dacheng Tao. 2025. Runaway is Ashamed But Helpful: On the Early-Exit Behavior of Large Language Model-based Agents in Embodied Environments. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.17616 (2025)."},{"key":"e_1_3_3_2_21_2","unstructured":"Xiaoya Lu Zeren Chen Xuhao Hu Yijin Zhou Weichen Zhang Dongrui Liu Lu Sheng and Jing Shao. 2025. IS-Bench: Evaluating Interactive Safety of VLM-Driven Embodied Agents in Daily Household Tasks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.16402 (2025)."},{"key":"e_1_3_3_2_22_2","unstructured":"Yuxuan Lu Bingsheng Yao Hansu Gu Jing Huang Jessie Wang Yang Li Jiri Gesi Qi He Toby Jia-Jun Li and Dakuo Wang. 2025. UXAgent: A System for Simulating Usability Testing of Web Design with LLM Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.09407 (2025)."},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3736555"},{"key":"e_1_3_3_2_24_2","unstructured":"Tapio Pitk\u00e4ranta and Leena Pitk\u00e4ranta. 2025. HADA: Human-AI Agent Decision Alignment Architecture. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.04253 (2025)."},{"key":"e_1_3_3_2_25_2","volume-title":"The Thirty-Ninth Annual Conference on Neural Information Processing Systems Position Paper Track","author":"Shen Hua","year":"2025","unstructured":"Hua Shen, Tiffany Knearem, Reshmi Ghosh, Kenan Alkiek, Kundan Krishna, Yachuan Liu, Savvas Petridis, Yi-Hao Peng, Li Qiwei, Chenglei Si, Yutong Xie, Jeffrey\u00a0P. Bigham, Frank Bentley, Joyce Chai, Zachary\u00a0Chase Lipton, Qiaozhu Mei, Michael Terry, Diyi Yang, Meredith\u00a0Ringel Morris, Paul Resnick, and David Jurgens. 2025. Position: Towards Bidirectional Human-AI Alignment. In The Thirty-Ninth Annual Conference on Neural Information Processing Systems Position Paper Track. https:\/\/openreview.net\/forum?id=PgA9rZoMY8"},{"key":"e_1_3_3_2_26_2","unstructured":"Hang Su Jun Luo Chang Liu Xiao Yang Yichi Zhang Yinpeng Dong and Jun Zhu. 2025. A Survey on Autonomy-Induced Security Risks in Large Model-Based Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.23844 (2025)."},{"key":"e_1_3_3_2_27_2","unstructured":"Annalisa Szymanski Simret\u00a0Araya Gebreegziabher Oghenemaro Anuyah Ronald\u00a0A Metoyer and Toby Jia-Jun Li. 2024. Comparing criteria development across domain experts lay users and models in large language model evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02054 (2024)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708359.3712091"},{"key":"e_1_3_3_2_29_2","unstructured":"Jingyu Tang Chaoran Chen Jiawen Li Zhiping Zhang Bingcan Guo Ibrahim Khalilov Simret\u00a0Araya Gebreegziabher Bingsheng Yao Dakuo Wang Yanfang Ye et\u00a0al. 2025. Dark patterns meet gui agents: Llm agent susceptibility to manipulative interfaces and the role of human oversight. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.10723 (2025)."},{"key":"e_1_3_3_2_30_2","unstructured":"Ada\u00a0Defne Tur Nicholas Meade Xing\u00a0Han L\u00f9 Alejandra Zambrano Arkil Patel Esin Durmus Spandana Gella Karolina Sta\u0144czak and Siva Reddy. 2025. Safearena: Evaluating the safety of autonomous web agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.04957 (2025)."},{"key":"e_1_3_3_2_31_2","unstructured":"Jonathan Uesato Nate Kushman Ramana Kumar Francis Song Noah Siegel Lisa Wang Antonia Creswell Geoffrey Irving and Irina Higgins. 2022. Solving math word problems with process-and outcome-based feedback. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.14275 (2022)."},{"key":"e_1_3_3_2_32_2","unstructured":"Jialin Wang and Zhihua Duan. 2024. Agent ai with langgraph: A modular framework for enhancing machine translation using large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03801 (2024)."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_3_2_34_2","unstructured":"Colin White Samuel Dooley Manley Roberts Arka Pal Ben Feuer Siddhartha Jain Ravid Shwartz-Ziv Neel Jain Khalid Saifullah Siddartha Naidu et\u00a0al. 2024. Livebench: A challenging contamination-free llm benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.19314 4 (2024)."},{"key":"e_1_3_3_2_35_2","unstructured":"Fangzhou Wu Shutong Wu Yulong Cao and Chaowei Xiao. 2024. Wipi: A new web threat for llm-driven web agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.16965 (2024)."},{"key":"e_1_3_3_2_36_2","unstructured":"Yilong Xu Xiang Long Zhi Zheng and Jinhua Gao. 2025. Ravine: Reality-aligned evaluation for agentic search. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.16725 (2025)."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Wenkai Yang Xiaohan Bi Yankai Lin Sishuo Chen Jie Zhou and Xu Sun. 2024. Watch out for your agents! investigating backdoor threats to llm-based agents. Advances in Neural Information Processing Systems 37 (2024) 100938\u2013100964.","DOI":"10.52202\/079017-3201"},{"key":"e_1_3_3_2_38_2","unstructured":"Asaf Yehudai Lilach Eden Alan Li Guy Uziel Yilun Zhao Roy Bar-Haim Arman Cohan and Michal Shmueli-Scheuer. 2025. Survey on evaluation of llm-based agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.16416 (2025)."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713356"},{"key":"e_1_3_3_2_40_2","unstructured":"Fangyi Yu. 2025. When AIs Judge AIs: The Rise of Agent-as-a-Judge Evaluation for LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.02994 (2025)."},{"key":"e_1_3_3_2_41_2","first-page":"1931","volume-title":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track","author":"Yu Fangyi","year":"2025","unstructured":"Fangyi Yu, Nabeel Seedat, Drahomira Herrmannova, Frank Schilder, and Jonathan\u00a0Richard Schwarz. 2025. Beyond Pointwise Scores: Decomposed Criteria-Based Evaluation of LLM Responses. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track. 1931\u20131954."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Tongxin Yuan Zhiwei He Lingzhong Dong Yiming Wang Ruijie Zhao Tian Xia Lizhen Xu Binglin Zhou Fangqi Li Zhuosheng Zhang et\u00a0al. 2024. R-judge: Benchmarking safety risk awareness for llm agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.10019 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.79"},{"key":"e_1_3_3_2_43_2","unstructured":"Jinchuan Zhang Lu Yin Yan Zhou and Songlin Hu. 2025. AgentAlign: Navigating Safety Alignment in the Shift from Informative to Agentic Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.23020 (2025)."},{"key":"e_1_3_3_2_44_2","unstructured":"Weizhi Zhang Xinyang Zhang Chenwei Zhang Liangwei Yang Jingbo Shang Zhepei Wei Henry\u00a0Peng Zou Zijie Huang Zhengyang Wang Yifan Gao et\u00a0al. 2025. Personaagent: When large language model agents meet personalization at test time. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.06254 (2025)."},{"key":"e_1_3_3_2_45_2","unstructured":"Boyuan Zheng Boyu Gou Jihyung Kil Huan Sun and Yu Su. 2024. Gpt-4v (ision) is a generalist web agent if grounded. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.01614 (2024)."},{"key":"e_1_3_3_2_46_2","unstructured":"Shuyan Zhou Frank\u00a0F Xu Hao Zhu Xuhui Zhou Robert Lo Abishek Sridhar Xianyi Cheng Tianyue Ou Yonatan Bisk Daniel Fried et\u00a0al. 2023. Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.13854 (2023)."}],"event":{"name":"IUI '26: 31st International Conference on Intelligent User Interfaces","location":"Paphos Cyprus","acronym":"IUI '26 Companion","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Companion Proceedings of the 31st International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3742414.3795096","content-type":"text\/html","content-version":"vor","intended-application":"syndication"}],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T11:05:15Z","timestamp":1773486315000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3742414.3795096"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":45,"alternative-id":["10.1145\/3742414.3795096","10.1145\/3742414"],"URL":"https:\/\/doi.org\/10.1145\/3742414.3795096","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}