{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,16]],"date-time":"2026-07-16T05:19:43Z","timestamp":1784179183930,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706599.3706729","type":"proceedings-article","created":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T20:28:23Z","timestamp":1745440103000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Human-Centered Evaluation and Auditing of Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-4869-8460","authenticated-orcid":false,"given":"Yu Lu","family":"Liu","sequence":"first","affiliation":[{"name":"Computer Science, Johns Hopkins University, Baltimore, Maryland, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3375-5285","authenticated-orcid":false,"given":"Wesley Hanwen","family":"Deng","sequence":"additional","affiliation":[{"name":"Human-Computer Interaction Institution, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3448-5961","authenticated-orcid":false,"given":"Michelle S.","family":"Lam","sequence":"additional","affiliation":[{"name":"Dept. of Computer Science, Stanford University, Stanford, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1499-3045","authenticated-orcid":false,"given":"Motahhare","family":"Eslami","sequence":"additional","affiliation":[{"name":"School of Computer Science, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6348-4127","authenticated-orcid":false,"given":"Juho","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Computing, KAIST, Daejeon, Democratic People's Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4543-7196","authenticated-orcid":false,"given":"Q. Vera","family":"Liao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Montreal, Quebec, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7044-3232","authenticated-orcid":false,"given":"Wei","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Interactive Computing, Georgia Institute of Technology, Atlanta, Georgia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4754-6126","authenticated-orcid":false,"given":"Jekaterina","family":"Novikova","sequence":"additional","affiliation":[{"name":"AI Risk and Vulnerability Alliance, Seattle, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3368-0180","authenticated-orcid":false,"given":"Ziang","family":"Xiao","sequence":"additional","affiliation":[{"name":"Computer Science, Johns Hopkins University, Baltimore, Maryland, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Open AI. 2022. ChatGPT Feedback Contest: Official Rules. https:\/\/cdn.openai.com\/chatgpt\/ChatGPT_Feedback_Contest_Rules.pdf"},{"key":"e_1_3_3_2_3_2","unstructured":"Rishi Bommasani Drew\u00a0A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael\u00a0S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill et\u00a0al. 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.07258 (2021)."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"\u00c1ngel\u00a0Alexander Cabrera Abraham\u00a0J Druck Jason\u00a0I Hong and Adam Perer. 2021. Discovering and validating ai errors with crowdsourced failure reports. Proceedings of the ACM on Human-Computer Interaction 5 CSCW2 (2021) 1\u201322.","DOI":"10.1145\/3479569"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581268"},{"key":"e_1_3_3_2_6_2","unstructured":"Rumman Chowdhury and Jutta Williams. 2021. Introducing Twitter\u2019s first algorithmic bias bounty challenge. URl: https:\/\/blog. twitter. com\/engineering\/en_us\/topics\/insights\/2021\/algorithmic-bias-bountychallenge (2021)."},{"key":"e_1_3_3_2_7_2","first-page":"7282","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)","author":"Clark Elizabeth","year":"2021","unstructured":"Elizabeth Clark, Tal August, Sofia Serrano, Nikita Haduong, Suchin Gururangan, and Noah\u00a0A Smith. 2021. All That\u2019s \u2018Human\u2019Is Not Gold: Evaluating Human Evaluation of Generated Text. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 7282\u20137296."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581026"},{"key":"e_1_3_3_2_9_2","unstructured":"Wesley.\u00a0H. Deng Nikita Mehandru Samantha Robertson and Niloufar Salehi. 2022. Beyond General Purpose Machine Translation: The Need for Context-specific Empirical Research to Design for Appropriate User Trust. Workshop on Trust and Reliance in AI-Human Teaming at CHI \u201922 (2022). https:\/\/arxiv.org\/abs\/2205.06920"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533113"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594037"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517441"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Alicia DeVos Aditi Dhabalia Hong Shen Kenneth Holstein and Motahhare Eslami. 2022. Toward User-Driven Algorithm Auditing: Investigating users\u2019 strategies for uncovering harmful algorithmic behavior. CHI Conference on Human Factors in Computing Systems (2022).","DOI":"10.1145\/3491102.3517441"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.741"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702556"},{"key":"e_1_3_3_2_16_2","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics","author":"Fleisig Eve","year":"2023","unstructured":"Eve Fleisig, Aubrie Amstutz, Chad Atalla, Su\u00a0Lin Blodgett, Hal Daum\u00e9\u00a0III, Alexandra Olteanu, Emily Sheng, Dan Vann, and Hanna Wallach. 2023. Fair-Prism: Evaluating fairness-related harms in text generation. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics."},{"key":"e_1_3_3_2_17_2","unstructured":"Deep Ganguli Liane Lovitt Jackson Kernion Amanda Askell Yuntao Bai Saurav Kadavath Ben Mann Ethan Perez Nicholas Schiefer Kamal Ndousse et\u00a0al. 2022. Red teaming language models to reduce harms: Methods scaling behaviors and lessons learned. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.07858 (2022)."},{"key":"e_1_3_3_2_18_2","unstructured":"Timnit Gebru. 2021. Hierarchy of Knowledge in Machine Learning and Related Fields and Its Consequences. https:\/\/www.youtube.com\/watch?v=OL3DowBM9uc"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Sebastian Gehrmann Elizabeth Clark and Thibault Sellam. 2023. Repairing the cracked foundation: A survey of obstacles in evaluation practices for generated text. Journal of Artificial Intelligence Research 77 (2023) 103\u2013166.","DOI":"10.1613\/jair.1.13715"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/2463676.2463712"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.211"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.30"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300830"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Douwe Kiela Max Bartolo Yixin Nie Divyansh Kaushik Atticus Geiger Zhengxuan Wu Bertie Vidgen Grusha Prasad Amanpreet Singh Pratik Ringshia et\u00a0al. 2021. Dynabench: Rethinking benchmarking in NLP. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.14337 (2021).","DOI":"10.18653\/v1\/2021.naacl-main.324"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606833"},{"key":"e_1_3_3_2_26_2","unstructured":"Tae\u00a0Soo Kim Yoonjoo Lee Jamin Shin Young-Ho Kim and Juho Kim. 2023. EvalLM: Interactive Evaluation of Large Language Model Prompts on User-Defined Criteria. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.13633 (2023)."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"Michelle\u00a0S. Lam Mitchell\u00a0L. Gordon Dana\u00eb Metaxa Jeffrey\u00a0T. Hancock James\u00a0A. Landay and Michael\u00a0S. Bernstein. 2022. End-User Audits: A System Empowering Communities to Lead Large-Scale Investigations of Harmful Algorithmic Behavior. Proc. ACM Hum.-Comput. Interact. 6 CSCW2 Article 512 (Nov 2022) 34\u00a0pages. 10.1145\/3555625","DOI":"10.1145\/3555625"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","unstructured":"Michelle\u00a0S. Lam Ayush Pandit Colin\u00a0H. Kalicki Rachit Gupta Poonam Sahoo and Dana\"e Metaxa. 2023. Sociotechnical Audits: Broadening the Algorithm Auditing Lens to Investigate Targeted Advertising. Proc. ACM Hum.-Comput. Interact. 7 CSCW2 Article 360 (Oct 2023) 37\u00a0pages. 10.1145\/3610209","DOI":"10.1145\/3610209"},{"key":"e_1_3_3_2_29_2","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga Yian Zhang Deepak Narayanan Yuhuai Wu Ananya Kumar et\u00a0al. 2022. Holistic evaluation of language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.09110 (2022)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376590"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533182"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Q\u00a0Vera Liao and Jennifer\u00a0Wortman Vaughan. 2023. AI Transparency in the Age of LLMs: A Human-Centered Research Roadmap. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.01941 (2023).","DOI":"10.1162\/99608f92.8036d03b"},{"key":"e_1_3_3_2_33_2","unstructured":"Q\u00a0Vera Liao and Ziang Xiao. 2023. Rethinking Model Evaluation as Narrowing the Socio-Technical Gap. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.03100 (2023)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.17"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.861"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.413"},{"key":"e_1_3_3_2_37_2","unstructured":"Michael Madaio Lisa Egede Hariharan Subramonyam Jennifer\u00a0Wortman Vaughan and Hanna Wallach. 2021. Assessing the Fairness of AI Systems: AI Practitioners\u2019 Processes Challenges and Needs for Support. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2112.05675 (2021)."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.905"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.881"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Dana\u00eb Metaxa Joon\u00a0Sung Park Ronald\u00a0E Robertson Karrie Karahalios Christo Wilson Jeff Hancock Christian Sandvig et\u00a0al. 2021. Auditing algorithms: Understanding algorithmic systems from the outside in. Foundations and Trends\u00ae in Human\u2013Computer Interaction 14 4 (2021) 272\u2013344.","DOI":"10.1561\/1100000083"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.862"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v6i1.13337"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3461778.3462032"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372873"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3514094.3534181"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Charvi Rastogi Marco\u00a0Tulio Ribeiro Nicholas King and Saleema Amershi. 2023. Supporting Human-AI Collaboration in Auditing LLMs with LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.09991 (2023).","DOI":"10.1145\/3600211.3604712"},{"key":"e_1_3_3_2_47_2","unstructured":"Christian Sandvig Kevin Hamilton Karrie Karahalios and Cedric Langbort. 2014. Auditing algorithms: Research methods for detecting discrimination on internet platforms. Data and Discrimination: Converting Critical Concerns into Productive Inquiry (2014)."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287598"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445971"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533110"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544549.3583178"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533088"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3636302"},{"key":"e_1_3_3_2_54_2","unstructured":"Ziang Xiao Susu Zhang Vivian Lai and Q\u00a0Vera Liao. 2023. Evaluating NLG Evaluation Metrics: A Measurement Theory Perspective. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.14889 (2023)."},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Ziang Xiao Michelle\u00a0X Zhou Q\u00a0Vera Liao Gloria Mark Changyan Chi Wenxi Chen and Huahai Yang. 2020. Tell me about yourself: Using an AI-powered chatbot to conduct conversational surveys with open-ended questions. ACM Transactions on Computer-Human Interaction (TOCHI) 27 3 (2020) 1\u201337.","DOI":"10.1145\/3381804"},{"key":"e_1_3_3_2_56_2","unstructured":"Seonghyeon Ye Doyoung Kim Sungdong Kim Hyeonbin Hwang Seungone Kim Yongrae Jo James Thorne Juho Kim and Minjoon Seo. 2023. Flask: Fine-grained language model evaluation based on alignment skill sets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.10928 (2023)."},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"crossref","unstructured":"Meg Young Lassana Magassa and Batya Friedman. 2019. Toward inclusive tech policy design: a method for underrepresented voices to strengthen tech policy documents. Ethics and Information Technology 21 2 (2019) 89\u2013103.","DOI":"10.1007\/s10676-019-09497-z"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581161"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372852"},{"key":"e_1_3_3_2_60_2","first-page":"3810","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics","author":"Zong Shi","year":"2022","unstructured":"Shi Zong, Ashutosh Baheti, Wei Xu, and Alan Ritter. 2022. Extracting a Knowledge Base of COVID-19 Events from Social Media. In Proceedings of the 29th International Conference on Computational Linguistics. International Committee on Computational Linguistics, Gyeongju, Republic of Korea, 3810\u20133823. https:\/\/aclanthology.org\/2022.coling-1.335"}],"event":{"name":"CHI EA '25: Extended Abstracts of the CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI EA '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the Extended Abstracts of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706599.3706729","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706599.3706729","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:10Z","timestamp":1750298230000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706599.3706729"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":59,"alternative-id":["10.1145\/3706599.3706729","10.1145\/3706599"],"URL":"https:\/\/doi.org\/10.1145\/3706599.3706729","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}