{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T08:40:34Z","timestamp":1781599234931,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T00:00:00Z","timestamp":1715385600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,11]]},"DOI":"10.1145\/3613905.3636302","type":"proceedings-article","created":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T08:15:21Z","timestamp":1715415321000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Human-Centered Evaluation and Auditing of Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3368-0180","authenticated-orcid":false,"given":"Ziang","family":"Xiao","sequence":"first","affiliation":[{"name":"Johns Hopkins University, United States and Microsoft Research, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3375-5285","authenticated-orcid":false,"given":"Wesley Hanwen","family":"Deng","sequence":"additional","affiliation":[{"name":"Human-Computer Interaction Institution, Carnegie Mellon University, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3448-5961","authenticated-orcid":false,"given":"Michelle S.","family":"Lam","sequence":"additional","affiliation":[{"name":"Dept. of Computer Science, Stanford University, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1499-3045","authenticated-orcid":false,"given":"Motahhare","family":"Eslami","sequence":"additional","affiliation":[{"name":"School of Computer Science, Carnegie Mellon University, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6348-4127","authenticated-orcid":false,"given":"Juho","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Computing, KAIST, Korea, Republic of"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0428-4720","authenticated-orcid":false,"given":"Mina","family":"Lee","sequence":"additional","affiliation":[{"name":"Microsoft Research, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4543-7196","authenticated-orcid":false,"given":"Q. Vera","family":"Liao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,5,11]]},"reference":[{"key":"e_1_3_3_2_1_1","unstructured":"Open AI. 2022. ChatGPT Feedback Contest: Official Rules. https:\/\/cdn.openai.com\/chatgpt\/ChatGPT_Feedback_Contest_Rules.pdf"},{"key":"e_1_3_3_2_2_1","volume-title":"On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258","author":"Bommasani Rishi","year":"2021","unstructured":"Rishi Bommasani, Drew\u00a0A Hudson, Ehsan Adeli, Russ Altman, Simran Arora, Sydney von Arx, Michael\u00a0S Bernstein, Jeannette Bohg, Antoine Bosselut, Emma Brunskill, 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)."},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3479569"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581268"},{"key":"e_1_3_3_2_5_1","volume-title":"Introducing Twitter\u2019s first algorithmic bias bounty challenge. URl: https:\/\/blog. twitter. com\/engineering\/en_us\/topics\/insights\/2021\/algorithmic-bias-bountychallenge","author":"Chowdhury Rumman","year":"2021","unstructured":"Rumman Chowdhury and Jutta Williams. 2021. Introducing Twitter\u2019s first algorithmic bias bounty challenge. URl: https:\/\/blog. twitter. com\/engineering\/en_us\/topics\/insights\/2021\/algorithmic-bias-bountychallenge (2021)."},{"key":"e_1_3_3_2_6_1","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 7282\u20137296","author":"Clark Elizabeth","year":"2021","unstructured":"Elizabeth Clark, Tal August, Sofia Serrano, Nikita Haduong, Suchin Gururangan, and Noah\u00a0A Smith. 2021. All That\u2019s \u2018Human\u2019Is Not Gold: Evaluating Human Evaluation of Generated Text. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 7282\u20137296."},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581026"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533113"},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594037"},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517441"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517441"},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702556"},{"key":"e_1_3_3_2_13_1","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics.","author":"Fleisig Eve","year":"2023","unstructured":"Eve Fleisig, Aubrie Amstutz, Chad Atalla, Su\u00a0Lin Blodgett, Hal Daum\u00e9\u00a0III, Alexandra Olteanu, Emily Sheng, Dan Vann, and Hanna Wallach. 2023. Fair-Prism: Evaluating fairness-related harms in text generation. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics."},{"key":"e_1_3_3_2_14_1","volume-title":"Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858","author":"Ganguli Deep","year":"2022","unstructured":"Deep Ganguli, Liane Lovitt, Jackson Kernion, Amanda Askell, Yuntao Bai, Saurav Kadavath, Ben Mann, Ethan Perez, Nicholas Schiefer, Kamal Ndousse, 2022. Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858 (2022)."},{"key":"e_1_3_3_2_15_1","unstructured":"Timnit Gebru. 2021. Hierarchy of Knowledge in Machine Learning and Related Fields and Its Consequences. https:\/\/www.youtube.com\/watch?v=OL3DowBM9uc"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.13715"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2463676.2463712"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300830"},{"key":"e_1_3_3_2_19_1","volume-title":"Dynabench: Rethinking benchmarking in NLP. arXiv preprint arXiv:2104.14337","author":"Kiela Douwe","year":"2021","unstructured":"Douwe Kiela, Max Bartolo, Yixin Nie, Divyansh Kaushik, Atticus Geiger, Zhengxuan Wu, Bertie Vidgen, Grusha Prasad, Amanpreet Singh, Pratik Ringshia, 2021. Dynabench: Rethinking benchmarking in NLP. arXiv preprint arXiv:2104.14337 (2021)."},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606833"},{"key":"e_1_3_3_2_21_1","volume-title":"EvalLM: Interactive Evaluation of Large Language Model Prompts on User-Defined Criteria. arXiv preprint arXiv:2309.13633","author":"Kim Tae\u00a0Soo","year":"2023","unstructured":"Tae\u00a0Soo Kim, Yoonjoo Lee, Jamin Shin, Young-Ho Kim, and Juho Kim. 2023. EvalLM: Interactive Evaluation of Large Language Model Prompts on User-Defined Criteria. arXiv preprint arXiv:2309.13633 (2023)."},{"key":"e_1_3_3_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3555625"},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610209"},{"key":"e_1_3_3_2_24_1","volume-title":"Swords: A benchmark for lexical substitution with improved data coverage and quality. arXiv preprint arXiv:2106.04102","author":"Lee Mina","year":"2021","unstructured":"Mina Lee, Chris Donahue, Robin Jia, Alexander Iyabor, and Percy Liang. 2021. Swords: A benchmark for lexical substitution with improved data coverage and quality. arXiv preprint arXiv:2106.04102 (2021)."},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502030"},{"key":"e_1_3_3_2_26_1","volume-title":"Evaluating human-language model interaction. arXiv preprint arXiv:2212.09746","author":"Lee Mina","year":"2022","unstructured":"Mina Lee, Megha Srivastava, Amelia Hardy, John Thickstun, Esin Durmus, Ashwin Paranjape, Ines Gerard-Ursin, Xiang\u00a0Lisa Li, Faisal Ladhak, Frieda Rong, 2022. Evaluating human-language model interaction. arXiv preprint arXiv:2212.09746 (2022)."},{"key":"e_1_3_3_2_27_1","volume-title":"Holistic evaluation of language models. arXiv preprint arXiv:2211.09110","author":"Liang Percy","year":"2022","unstructured":"Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan, Yuhuai Wu, Ananya Kumar, 2022. Holistic evaluation of language models. arXiv preprint arXiv:2211.09110 (2022)."},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376590"},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533182"},{"key":"e_1_3_3_2_30_1","volume-title":"AI Transparency in the Age of LLMs: A Human-Centered Research Roadmap. arXiv preprint arXiv:2306.01941","author":"Liao Q\u00a0Vera","year":"2023","unstructured":"Q\u00a0Vera Liao and Jennifer\u00a0Wortman Vaughan. 2023. AI Transparency in the Age of LLMs: A Human-Centered Research Roadmap. arXiv preprint arXiv:2306.01941 (2023)."},{"key":"e_1_3_3_2_31_1","volume-title":"Rethinking Model Evaluation as Narrowing the Socio-Technical Gap. arXiv preprint arXiv:2306.03100","author":"Liao Q\u00a0Vera","year":"2023","unstructured":"Q\u00a0Vera Liao and Ziang Xiao. 2023. Rethinking Model Evaluation as Narrowing the Socio-Technical Gap. arXiv preprint arXiv:2306.03100 (2023)."},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.17"},{"key":"e_1_3_3_2_33_1","volume-title":"Processes, Challenges, and Needs for Support. arXiv preprint arXiv:2112.05675","author":"Madaio Michael","year":"2021","unstructured":"Michael Madaio, Lisa Egede, Hariharan Subramonyam, Jennifer\u00a0Wortman Vaughan, and Hanna Wallach. 2021. Assessing the Fairness of AI Systems: AI Practitioners\u2019 Processes, Challenges, and Needs for Support. arXiv preprint arXiv:2112.05675 (2021)."},{"key":"e_1_3_3_2_34_1","volume-title":"Auditing algorithms: Understanding algorithmic systems from the outside in. Foundations and Trends\u00ae in Human\u2013Computer Interaction 14, 4","author":"Metaxa Dana\u00eb","year":"2021","unstructured":"Dana\u00eb Metaxa, Joon\u00a0Sung Park, Ronald\u00a0E Robertson, Karrie Karahalios, Christo Wilson, Jeff Hancock, Christian Sandvig, 2021. Auditing algorithms: Understanding algorithmic systems from the outside in. Foundations and Trends\u00ae in Human\u2013Computer Interaction 14, 4 (2021), 272\u2013344."},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v6i1.13337"},{"key":"e_1_3_3_2_36_1","volume-title":"Search Atlas: Visualizing Divergent Search Results Across Geopolitical Borders. In Designing Interactive Systems Conference 2021. 1970","author":"Ochigame Rodrigo","year":"2021","unstructured":"Rodrigo Ochigame and Katherine Ye. 2021. Search Atlas: Visualizing Divergent Search Results Across Geopolitical Borders. In Designing Interactive Systems Conference 2021. 1970\u20131983."},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372873"},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514094.3534181"},{"key":"e_1_3_3_2_39_1","volume-title":"Supporting Human-AI Collaboration in Auditing LLMs with LLMs. arXiv preprint arXiv:2304.09991","author":"Rastogi Charvi","year":"2023","unstructured":"Charvi Rastogi, Marco\u00a0Tulio Ribeiro, Nicholas King, and Saleema Amershi. 2023. Supporting Human-AI Collaboration in Auditing LLMs with LLMs. arXiv preprint arXiv:2304.09991 (2023)."},{"key":"e_1_3_3_2_40_1","volume-title":"Auditing algorithms: Research methods for detecting discrimination on internet platforms. Data and Discrimination: Converting Critical Concerns into Productive Inquiry","author":"Sandvig Christian","year":"2014","unstructured":"Christian Sandvig, Kevin Hamilton, Karrie Karahalios, and Cedric Langbort. 2014. Auditing algorithms: Research methods for detecting discrimination on internet platforms. Data and Discrimination: Converting Critical Concerns into Productive Inquiry (2014)."},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287598"},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445971"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533110"},{"key":"e_1_3_3_2_44_1","volume-title":"Human-Centered Responsible Artificial Intelligence: Current & Future Trends. In Extended Abstracts of the 2023 CHI Conference on Human Factors in Computing Systems. 1\u20134.","author":"Tahaei Mohammad","year":"2023","unstructured":"Mohammad Tahaei, Marios Constantinides, Daniele Quercia, Sean Kennedy, Michael Muller, Simone Stumpf, Q\u00a0Vera Liao, Ricardo Baeza-Yates, Lora Aroyo, Jess Holbrook, 2023. Human-Centered Responsible Artificial Intelligence: Current & Future Trends. In Extended Abstracts of the 2023 CHI Conference on Human Factors in Computing Systems. 1\u20134."},{"key":"e_1_3_3_2_45_1","volume-title":"Workshop on Trust and Reliance in AI-Human Teaming at CHI \u201922","author":"Deng H.","year":"2022","unstructured":"Wesley. H. Deng, Nikita Mehandru, Samantha Robertson, and Niloufar Salehi. 2022. Beyond General Purpose Machine Translation: The Need for Context-specific Empirical Research to Design for Appropriate User Trust. Workshop on Trust and Reliance in AI-Human Teaming at CHI \u201922 (2022). https:\/\/arxiv.org\/abs\/2205.06920"},{"key":"e_1_3_3_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533088"},{"key":"e_1_3_3_2_47_1","volume-title":"Evaluating NLG Evaluation Metrics: A Measurement Theory Perspective. arXiv preprint arXiv:2305.14889","author":"Xiao Ziang","year":"2023","unstructured":"Ziang Xiao, Susu Zhang, Vivian Lai, and Q\u00a0Vera Liao. 2023. Evaluating NLG Evaluation Metrics: A Measurement Theory Perspective. arXiv preprint arXiv:2305.14889 (2023)."},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3381804"},{"key":"e_1_3_3_2_49_1","volume-title":"Flask: Fine-grained language model evaluation based on alignment skill sets. arXiv preprint arXiv:2307.10928","author":"Ye Seonghyeon","year":"2023","unstructured":"Seonghyeon Ye, Doyoung Kim, Sungdong Kim, Hyeonbin Hwang, Seungone Kim, Yongrae Jo, James Thorne, Juho Kim, and Minjoon Seo. 2023. Flask: Fine-grained language model evaluation based on alignment skill sets. arXiv preprint arXiv:2307.10928 (2023)."},{"key":"e_1_3_3_2_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10676-019-09497-z"},{"key":"e_1_3_3_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581161"},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372852"}],"event":{"name":"CHI '24: CHI Conference on Human Factors in Computing Systems","location":"Honolulu HI USA","acronym":"CHI '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGACCESS ACM Special Interest Group on Accessible Computing"]},"container-title":["Extended Abstracts of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613905.3636302","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613905.3636302","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:57:18Z","timestamp":1750291038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613905.3636302"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,11]]},"references-count":52,"alternative-id":["10.1145\/3613905.3636302","10.1145\/3613905"],"URL":"https:\/\/doi.org\/10.1145\/3613905.3636302","relation":{},"subject":[],"published":{"date-parts":[[2024,5,11]]},"assertion":[{"value":"2024-05-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}