{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T13:48:45Z","timestamp":1776088125677,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2027,4,13]],"date-time":"2027-04-13T00:00:00Z","timestamp":1807574400000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Danish Novo Nordisk Foundation","award":["NNF20OC0066119"],"award-info":[{"award-number":["NNF20OC0066119"]}]},{"name":"National Science Foundation","award":["IIS-2040942"],"award-info":[{"award-number":["IIS-2040942"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772363.3778709","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T01:55:28Z","timestamp":1776045328000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Third Workshop on Human-Centered Evaluation and Auditing of Language Models: AI Agents-in-the-Loop"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0245-1633","authenticated-orcid":false,"given":"Willem","family":"van der Maden","sequence":"first","affiliation":[{"name":"IT University of Copenhagen, Copenhagen, Denmark"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3375-5285","authenticated-orcid":false,"given":"Wesley","family":"Hanwen Deng","sequence":"additional","affiliation":[{"name":"Human-Computer Interaction Institution, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4869-8460","authenticated-orcid":false,"given":"Yu Lu","family":"Liu","sequence":"additional","affiliation":[{"name":"Computer Science, Johns Hopkins University, Baltimore, Maryland, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5670-5427","authenticated-orcid":false,"given":"Han","family":"Jiang","sequence":"additional","affiliation":[{"name":"Johns Hopkins University, Baltimore, Maryland, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2783-0265","authenticated-orcid":false,"given":"Valerie","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9547-3449","authenticated-orcid":false,"given":"Haotian","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6348-4127","authenticated-orcid":false,"given":"Juho","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Computing, KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4543-7196","authenticated-orcid":false,"given":"Q. Vera","family":"Liao","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Michigan, Ann Arbor, Ann Arbor, Michigan, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7044-3232","authenticated-orcid":false,"given":"Wei","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Interactive Computing, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1499-3045","authenticated-orcid":false,"given":"Motahhare","family":"Eslami","sequence":"additional","affiliation":[{"name":"School of Computer Science, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3368-0180","authenticated-orcid":false,"given":"Ziang","family":"Xiao","sequence":"additional","affiliation":[{"name":"Computer Science, Johns Hopkins University, Baltimore, Maryland, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Open AI. 2022. ChatGPT Feedback Contest: Official Rules. https:\/\/cdn.openai.com\/chatgpt\/ChatGPT_Feedback_Contest_Rules.pdf"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747740"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Jack Bandy. 2021. Problematic machine behavior: A systematic literature review of algorithm audits. Proceedings of the acm on human-computer interaction 5 CSCW1 (2021) 1\u201334.","DOI":"10.1145\/3449148"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/SaTML59370.2024.00037"},{"key":"e_1_3_3_2_6_2","unstructured":"Rishi Bommasani Drew\u00a0A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael\u00a0S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill et\u00a0al. 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.07258 (2021)."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581268"},{"key":"e_1_3_3_2_8_2","unstructured":"Rumman Chowdhury and Jutta Williams. 2021. Introducing Twitter\u2019s first algorithmic bias bounty challenge. URl: https:\/\/blog. twitter. com\/engineering\/en_us\/topics\/insights\/2021\/algorithmic-bias-bountychallenge (2021)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581026"},{"key":"e_1_3_3_2_10_2","unstructured":"Wesley\u00a0Hanwen Deng Sunnie S.\u00a0Y. Kim Akshita Jha Ken Holstein Motahhare Eslami Lauren Wilcox and Leon\u00a0A Gatys. 2025. PersonaTeaming: Exploring How Introducing Personas Can Improve Automated AI Red-Teaming. arxiv:https:\/\/arXiv.org\/abs\/2509.03728\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2509.03728"},{"key":"e_1_3_3_2_11_2","unstructured":"Wesley.\u00a0H. Deng Nikita Mehandru Samantha Robertson and Niloufar Salehi. 2022. Beyond General Purpose Machine Translation: The Need for Context-specific Empirical Research to Design for Appropriate User Trust. Workshop on Trust and Reliance in AI-Human Teaming at CHI \u201922 (2022). https:\/\/arxiv.org\/abs\/2205.06920"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533113"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594037"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517441"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.741"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702556"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Sebastian Gehrmann Elizabeth Clark and Thibault Sellam. 2023. Repairing the cracked foundation: A survey of obstacles in evaluation practices for generated text. Journal of Artificial Intelligence Research 77 (2023) 103\u2013166.","DOI":"10.1613\/jair.1.13715"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.211"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.30"},{"key":"e_1_3_3_2_20_2","unstructured":"Alina Hyk Kiera McCormick Mian Zhong Ioana Ciuc\u0103 Sanjib Sharma John\u00a0F Wu JEG Peek Kartheik\u00a0G Iyer Ziang Xiao and Anjalie Field. 2025. From queries to criteria: Understanding how astronomers evaluate LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.15715 (2025)."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Douwe Kiela Max Bartolo Yixin Nie Divyansh Kaushik Atticus Geiger Zhengxuan Wu Bertie Vidgen Grusha Prasad Amanpreet Singh Pratik Ringshia et\u00a0al. 2021. Dynabench: Rethinking benchmarking in NLP. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.14337 (2021).","DOI":"10.18653\/v1\/2021.naacl-main.324"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606833"},{"key":"e_1_3_3_2_23_2","unstructured":"Tae\u00a0Soo Kim Yoonjoo Lee Jamin Shin Young-Ho Kim and Juho Kim. 2023. EvalLM: Interactive Evaluation of Large Language Model Prompts on User-Defined Criteria. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.13633 (2023)."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","unstructured":"Michelle\u00a0S. Lam Mitchell\u00a0L. Gordon Dana\u00eb Metaxa Jeffrey\u00a0T. Hancock James\u00a0A. Landay and Michael\u00a0S. Bernstein. 2022. End-User Audits: A System Empowering Communities to Lead Large-Scale Investigations of Harmful Algorithmic Behavior. Proc. ACM Hum.-Comput. Interact. 6 CSCW2 Article 512 (Nov 2022) 34\u00a0pages. 10.1145\/3555625","DOI":"10.1145\/3555625"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Michelle\u00a0S. Lam Ayush Pandit Colin\u00a0H. Kalicki Rachit Gupta Poonam Sahoo and Dana\"e Metaxa. 2023. Sociotechnical Audits: Broadening the Algorithm Auditing Lens to Investigate Targeted Advertising. Proc. ACM Hum.-Comput. Interact. 7 CSCW2 Article 360 (Oct 2023) 37\u00a0pages. 10.1145\/3610209","DOI":"10.1145\/3610209"},{"key":"e_1_3_3_2_26_2","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga Yian Zhang Deepak Narayanan Yuhuai Wu Ananya Kumar et\u00a0al. 2022. Holistic evaluation of language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.09110 (2022)."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376590"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533182"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Q\u00a0Vera Liao and Jennifer\u00a0Wortman Vaughan. 2023. AI Transparency in the Age of LLMs: A Human-Centered Research Roadmap. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.01941 (2023).","DOI":"10.1162\/99608f92.8036d03b"},{"key":"e_1_3_3_2_30_2","unstructured":"Q\u00a0Vera Liao and Ziang Xiao. 2023. Rethinking Model Evaluation as Narrowing the Socio-Technical Gap. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.03100 (2023)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.17"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.861"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.413"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3706729"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"J.\u00a0Derek Lomas Willem van der Maden Sohhom Bandyopadhyay Giovanni Lion Nirmal Patel Gyanesh Jain Yanna Litowsky Haian Xue and Pieter Desmet. 2024. Evaluating the alignment of AI with human emotions. Advanced Design Research 2 2 (2024) 88\u201397. 10.1016\/j.ijadr.2024.10.002","DOI":"10.1016\/j.ijadr.2024.10.002"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.905"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.881"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Dana\u00eb Metaxa Joon\u00a0Sung Park Ronald\u00a0E Robertson Karrie Karahalios Christo Wilson Jeff Hancock Christian Sandvig et\u00a0al. 2021. Auditing algorithms: Understanding algorithmic systems from the outside in. Foundations and Trends\u00ae in Human\u2013Computer Interaction 14 4 (2021) 272\u2013344.","DOI":"10.1561\/1100000083"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v6i1.13337"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3461778.3462032"},{"key":"e_1_3_3_2_41_2","unstructured":"Giada Pistilli. 2022. HuggingFace announcedthe new feature to flag any Model Dataset or Space on the Hub. https:\/\/twitter.com\/GiadaPistilli\/status\/1571865167092396033?s=20&t=LRhhEu63s6ftPmtZdfz8Cw"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Charvi Rastogi Marco\u00a0Tulio Ribeiro Nicholas King and Saleema Amershi. 2023. Supporting Human-AI Collaboration in Auditing LLMs with LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.09991 (2023).","DOI":"10.1145\/3600211.3604712"},{"key":"e_1_3_3_2_43_2","unstructured":"Christian Sandvig Kevin Hamilton Karrie Karahalios and Cedric Langbort. 2014. Auditing algorithms: Research methods for detecting discrimination on internet platforms. Data and discrimination: converting critical concerns into productive inquiry 22 2014 (2014) 4349\u20134357."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445971"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533110"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544549.3583178"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Willem Van Der\u00a0Maden Derek Lomas and Paul Hekkert. 2023. A framework for designing AI systems that support community wellbeing. Frontiers in Psychology 13 (2023) 1011883.","DOI":"10.3389\/fpsyg.2022.1011883"},{"key":"e_1_3_3_2_48_2","unstructured":"Jason Wei Zhiqing Sun Spencer Papay Scott McKinney Jeffrey Han Isa Fulford Hyung\u00a0Won Chung Alex\u00a0Tachard Passos William Fedus and Amelia Glaese. 2025. Browsecomp: A simple yet challenging benchmark for browsing agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.12516 (2025)."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3636302"},{"key":"e_1_3_3_2_50_2","unstructured":"Ziang Xiao Susu Zhang Vivian Lai and Q\u00a0Vera Liao. 2023. Evaluating NLG Evaluation Metrics: A Measurement Theory Perspective. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.14889 (2023)."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Ziang Xiao Michelle\u00a0X Zhou Q\u00a0Vera Liao Gloria Mark Changyan Chi Wenxi Chen and Huahai Yang. 2020. Tell me about yourself: Using an AI-powered chatbot to conduct conversational surveys with open-ended questions. ACM Transactions on Computer-Human Interaction (TOCHI) 27 3 (2020) 1\u201337.","DOI":"10.1145\/3381804"},{"key":"e_1_3_3_2_52_2","unstructured":"Seonghyeon Ye Doyoung Kim Sungdong Kim Hyeonbin Hwang Seungone Kim Yongrae Jo James Thorne Juho Kim and Minjoon Seo. 2023. Flask: Fine-grained language model evaluation based on alignment skill sets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.10928 (2023)."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372852"},{"key":"e_1_3_3_2_54_2","first-page":"3810","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics","author":"Zong Shi","year":"2022","unstructured":"Shi Zong, Ashutosh Baheti, Wei Xu, and Alan Ritter. 2022. Extracting a Knowledge Base of COVID-19 Events from Social Media. In Proceedings of the 29th International Conference on Computational Linguistics. International Committee on Computational Linguistics, Gyeongju, Republic of Korea, 3810\u20133823. https:\/\/aclanthology.org\/2022.coling-1.335"}],"event":{"name":"CHI EA '26: Extended Abstracts of the 2026 CHI Conference on Human Factors in Computing Systems","location":"Barcelona , Spain","acronym":"CHI EA '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the Extended Abstracts of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772363.3778709","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772363.3778709","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T12:53:21Z","timestamp":1776084801000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772363.3778709"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":53,"alternative-id":["10.1145\/3772363.3778709","10.1145\/3772363"],"URL":"https:\/\/doi.org\/10.1145\/3772363.3778709","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}