{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:00:24Z","timestamp":1776931224130,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":106,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2025-00557726"],"award-info":[{"award-number":["RS-2025-00557726"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Institute of Information & communications Technology Planning & Evaluation, South Korea","award":["RS-2024-00443251"],"award-info":[{"award-number":["RS-2024-00443251"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772318.3790285","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T07:34:33Z","timestamp":1776065673000},"page":"1-27","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Evalet: Evaluating Large Language Models through Functional Fragmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9078-6032","authenticated-orcid":false,"given":"Tae Soo","family":"Kim","sequence":"first","affiliation":[{"name":"School of Computing, KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2740-1178","authenticated-orcid":false,"given":"Heechan","family":"Lee","sequence":"additional","affiliation":[{"name":"School of Computing, KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7491-986X","authenticated-orcid":false,"given":"Yoonjoo","family":"Lee","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, University of Michigan, Ann Arbor, Michigan, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7606-4711","authenticated-orcid":false,"given":"Joseph","family":"Seering","sequence":"additional","affiliation":[{"name":"School of Computing, KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6348-4127","authenticated-orcid":false,"given":"Juho","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Computing, KAIST, Daejeon, Republic of Korea and SkillBench, Santa Barbara, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Genspark AI. 2024. Genspark: Agents That Write Code and Explain It. https:\/\/genspark.ai\/. Accessed: 2025-04-10."},{"key":"e_1_3_3_3_3_2","unstructured":"Meta AI. 2025. The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/. Accessed: 2026-02-02."},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/2531602.2531653"},{"key":"e_1_3_3_3_5_2","unstructured":"Anthropic. 2025. Claude 3.7 Sonnet and Claude Code. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet Accessed: 2025-03-19."},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642016"},{"key":"e_1_3_3_3_7_2","unstructured":"Zahra Ashktorab Michael Desmond Qian Pan James\u00a0M Johnson Martin\u00a0Santillan Cooper Elizabeth\u00a0M Daly Rahul Nair Tejaswini Pedapati Swapnaja Achintalwar and Werner Geyer. 2024. Aligning Human and LLM Judgments: Insights from EvalAssist on Task-Specific Evaluations and AI-assisted Assessment Strategy Preferences. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.00873 (2024)."},{"key":"e_1_3_3_3_8_2","unstructured":"Xuechunzi Bai Angelina Wang Ilia Sucholutsky and Thomas\u00a0L. Griffiths. 2024. Measuring Implicit Bias in Explicitly Unbiased Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2402.04105\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2402.04105"},{"key":"e_1_3_3_3_9_2","unstructured":"Yuntao Bai Saurav Kadavath Sandipan Kundu Amanda Askell Jackson Kernion Andy Jones Anna Chen Anna Goldie Azalia Mirhoseini Cameron McKinnon et\u00a0al. 2022. Constitutional ai: Harmlessness from ai feedback. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.08073 (2022)."},{"key":"e_1_3_3_3_10_2","unstructured":"Rishabh Bhardwaj and Soujanya Poria. 2023. Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment. arxiv:https:\/\/arXiv.org\/abs\/2308.09662\u00a0[cs.CL]"},{"key":"e_1_3_3_3_11_2","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems , H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.), Vol.\u00a033. Curran Associates, Inc., 1877\u20131901. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581268"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"crossref","unstructured":"\u00c1ngel\u00a0Alexander Cabrera Marco Tulio\u00a0Ribeiro Bongshin Lee Robert Deline Adam Perer and Steven\u00a0M Drucker. 2023. What did my AI learn? How data scientists make sense of model behavior. ACM Transactions on Computer-Human Interaction 30 1 (2023) 1\u201327.","DOI":"10.1145\/3542921"},{"key":"e_1_3_3_3_14_2","unstructured":"Yapei Chang Kyle Lo Tanya Goyal and Mohit Iyyer. 2023. Booookscore: A systematic exploration of book-length summarization in the era of llms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.00785 (2023)."},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/1978942.1978967"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714002"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501819"},{"key":"e_1_3_3_3_18_2","unstructured":"Yiran Ding Li\u00a0Lyna Zhang Chengruidong Zhang Yuanyuan Xu Ning Shang Jiahang Xu Fan Yang and Mao Yang. 2024. Longrope: Extending llm context window beyond 2 million tokens. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13753 (2024)."},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"crossref","unstructured":"Yao Dou Maxwell Forbes Rik Koncel-Kedziorski Noah\u00a0A Smith and Yejin Choi. 2021. Is GPT-3 text indistinguishable from human text? scarecrow: A framework for scrutinizing machine text. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.01294 (2021).","DOI":"10.18653\/v1\/2022.acl-long.501"},{"key":"e_1_3_3_3_20_2","unstructured":"Lisa Dunlap Krishna Mandal Trevor Darrell Jacob Steinhardt and Joseph\u00a0E Gonzalez. 2024. VibeCheck: Discover and Quantify Qualitative Differences in Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.12851 (2024)."},{"key":"e_1_3_3_3_21_2","unstructured":"Jinlan Fu See-Kiong Ng Zhengbao Jiang and Pengfei Liu. 2023. Gptscore: Evaluate as you desire. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.04166 (2023)."},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"crossref","unstructured":"Simret\u00a0Araya Gebreegziabher Charles Chiang Zichu Wang Zahra Ashktorab Michelle Brachman Werner Geyer Toby Jia-Jun Li and Diego G\u00f3mez-Zar\u00e1. 2025. MetricMate: An Interactive Tool for Generating Evaluation Criteria for LLM-as-a-Judge Workflow. (2025).","DOI":"10.1145\/3729176.3729199"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"crossref","unstructured":"Dedre Gentner. 1983. Structure-mapping: A theoretical framework for analogy. Cognitive science 7 2 (1983) 155\u2013170.","DOI":"10.1016\/S0364-0213(83)80009-3"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642139"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"crossref","unstructured":"Elena\u00a0L Glassman Jeremy Scott Rishabh Singh Philip\u00a0J Guo and Robert\u00a0C Miller. 2015. OverCode: Visualizing variation in student solutions to programming problems at scale. ACM Transactions on Computer-Human Interaction (TOCHI) 22 2 (2015) 1\u201335.","DOI":"10.1145\/2699751"},{"key":"e_1_3_3_3_26_2","unstructured":"Google. 2024. Gemini Deep Research - your personal research assistant. https:\/\/gemini.google\/overview\/deep-research\/. Accessed: 2025-04-10."},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747773"},{"key":"e_1_3_3_3_28_2","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et\u00a0al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.12948 (2025)."},{"key":"e_1_3_3_3_29_2","unstructured":"Cheng-Ping Hsieh Simeng Sun Samuel Kriman Shantanu Acharya Dima Rekesh Fei Jia Yang Zhang and Boris Ginsburg. 2024. RULER: What\u2019s the Real Context Size of Your Long-Context Language Models? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.06654 (2024)."},{"key":"e_1_3_3_3_30_2","unstructured":"Aaron Jaech Adam Kalai Adam Lerer Adam Richardson Ahmed El-Kishky Aiden Low Alec Helyar Aleksander Madry Alex Beutel Alex Carney et\u00a0al. 2024. Openai o1 system card. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.16720 (2024)."},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606737"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"crossref","unstructured":"Minsuk Kahng Ian Tenney Mahima Pushkarna Michael\u00a0Xieyang Liu James Wexler Emily Reif Krystal Kallarackal Minsuk Chang Michael Terry and Lucas Dixon. 2024. LLM Comparator: Interactive Analysis of Side-by-Side Evaluation of Large Language Models. IEEE Transactions on Visualization and Computer Graphics (2024).","DOI":"10.1145\/3613905.3650755"},{"key":"e_1_3_3_3_33_2","unstructured":"Andrej\u00a0[@karpathy] Karpathy. 2025. My reaction is that there is an evaluation crisis. I don\u2019t really know what metrics to look at right now. [...] In absence of great comprehensive evals I tried to turn to vibe checks instead but I now fear they are misleading and there is too much opportunity for confirmation bias too low sample size etc. it\u2019s just not great. TLDR my reaction is I don\u2019t really know how good these models are right now. https:\/\/x.com\/karpathy\/status\/1896266683301659068. Accessed: 2025-04-01."},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533135"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"crossref","unstructured":"Minjeong Kim Kyeongpil Kang Deokgun Park Jaegul Choo and Niklas Elmqvist. 2016. Topiclens: Efficient multi-level visual topic exploration of large-scale document collections. IEEE transactions on visualization and computer graphics 23 1 (2016) 151\u2013160.","DOI":"10.1109\/TVCG.2016.2598445"},{"key":"e_1_3_3_3_36_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Kim Seungone","year":"2023","unstructured":"Seungone Kim, Jamin Shin, Yejin Cho, Joel Jang, Shayne Longpre, Hwaran Lee, Sangdoo Yun, Seongjin Shin, Sungdong Kim, James Thorne, et\u00a0al. 2023. Prometheus: Inducing fine-grained evaluation capability in language models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_3_37_2","unstructured":"Seungone Kim Juyoung Suk Ji\u00a0Yong Cho Shayne Longpre Chaeeun Kim Dongkeun Yoon Guijin Son Yejin Cho Sheikh Shafayat Jinheon Baek et\u00a0al. 2024. The biggen bench: A principled benchmark for fine-grained evaluation of language models with language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.05761 (2024)."},{"key":"e_1_3_3_3_38_2","unstructured":"Seungone Kim Ian Wu Jinu Lee Xiang Yue Seongyun Lee Mingyeong Moon Kiril Gashteovski Carolin Lawrence Julia Hockenmaier Graham Neubig et\u00a0al. 2025. Scaling Evaluation-time Compute with Reasoning Models as Process Evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.19877 (2025)."},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606833"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642216"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501999"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287590"},{"key":"e_1_3_3_3_43_2","unstructured":"Michelle\u00a0S Lam Fred Hohman Dominik Moritz Jeffrey\u00a0P Bigham Kenneth Holstein and Mary\u00a0Beth Kery. 2024. AI Policy Projector: Grounding LLM Policy Design in Iterative Mapmaking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.18203 (2024)."},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642830"},{"key":"e_1_3_3_3_45_2","unstructured":"Nathan Lambert Valentina Pyatkin Jacob Morrison LJ Miranda Bill\u00a0Yuchen Lin Khyathi Chandu Nouha Dziri Sachin Kumar Tom Zick Yejin Choi et\u00a0al. 2024. Rewardbench: Evaluating reward models for language modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.13787 (2024)."},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376590"},{"key":"e_1_3_3_3_47_2","unstructured":"Q\u00a0Vera Liao and Kush\u00a0R Varshney. 2021. Human-centered explainable ai (xai): From algorithms to user experiences. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2110.10790 (2021)."},{"key":"e_1_3_3_3_48_2","unstructured":"Bill\u00a0Yuchen Lin Yuntian Deng Khyathi Chandu Faeze Brahman Abhilasha Ravichander Valentina Pyatkin Nouha Dziri Ronan\u00a0Le Bras and Yejin Choi. 2024. Wildbench: Benchmarking llms with challenging tasks from real users in the wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.04770 (2024)."},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580817"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642149"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Dan Iter Yichong Xu Shuohang Wang Ruochen Xu and Chenguang Zhu. 2023. G-eval: NLG evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.16634 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"crossref","unstructured":"Stuart Lloyd. 1982. Least squares quantization in PCM. IEEE transactions on information theory 28 2 (1982) 129\u2013137.","DOI":"10.1109\/TIT.1982.1056489"},{"key":"e_1_3_3_3_53_2","unstructured":"Chris Lu Cong Lu Robert\u00a0Tjarko Lange Jakob Foerster Jeff Clune and David Ha. 2024. The ai scientist: Towards fully automated open-ended scientific discovery. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06292 (2024)."},{"key":"e_1_3_3_3_54_2","unstructured":"Scott\u00a0M Lundberg and Su-In Lee. 2017. A unified approach to interpreting model predictions. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_3_55_2","first-page":"281","volume-title":"Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1: Statistics","volume":"5","author":"MacQueen James","year":"1967","unstructured":"James MacQueen. 1967. Some methods for classification and analysis of multivariate observations. In Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1: Statistics , Vol.\u00a05. University of California press, 281\u2013298."},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"crossref","unstructured":"Leland McInnes John Healy Steve Astels et\u00a0al. 2017. hdbscan: Hierarchical density based clustering. J. Open Source Softw. 2 11 (2017) 205.","DOI":"10.21105\/joss.00205"},{"key":"e_1_3_3_3_57_2","unstructured":"Leland McInnes John Healy and James Melville. 2018. Umap: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1802.03426 (2018)."},{"key":"e_1_3_3_3_58_2","unstructured":"Meta. 2026. Manus: Hands On AI. https:\/\/manus.im. Accessed: 2026-02-02."},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.741"},{"key":"e_1_3_3_3_60_2","unstructured":"Aditi Mishra Utkarsh Soni Anjana Arunkumar Jinbin Huang Bum\u00a0Chul Kwon and Chris Bryan. 2023. PromptAid: Prompt Exploration Perturbation Testing and Iteration using Visual Analytics for Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.01964 (2023)."},{"key":"e_1_3_3_3_61_2","unstructured":"Vishvak Murahari Ameet Deshpande Peter Clark Tanmay Rajpurohit Ashish Sabharwal Karthik Narasimhan and Ashwin Kalyan. 2023. Qualeval: Qualitative evaluation for model improvement. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.02807 (2023)."},{"key":"e_1_3_3_3_62_2","first-page":"145","volume-title":"Proceedings of the human language technology conference of the north american chapter of the association for computational linguistics: Hlt-naacl 2004","author":"Nenkova Ani","year":"2004","unstructured":"Ani Nenkova and Rebecca\u00a0J Passonneau. 2004. Evaluating content selection in summarization: The pyramid method. In Proceedings of the human language technology conference of the north american chapter of the association for computational linguistics: Hlt-naacl 2004. 145\u2013152."},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445618"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545696"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"e_1_3_3_3_66_2","unstructured":"Joon\u00a0Sung Park Carolyn\u00a0Q Zou Aaron Shaw Benjamin\u00a0Mako Hill Carrie Cai Meredith\u00a0Ringel Morris Robb Willer Percy Liang and Michael\u00a0S Bernstein. 2024. Generative agent simulations of 1 000 people. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.10109 (2024)."},{"key":"e_1_3_3_3_67_2","unstructured":"Carole\u00a0C Perlman. 2003. Performance Assessment: Designing Appropriate Performance Tasks and Scoring Rubrics. (2003)."},{"key":"e_1_3_3_3_68_2","first-page":"2","volume-title":"Proceedings of international conference on intelligence analysis","volume":"5","author":"Pirolli Peter","year":"2005","unstructured":"Peter Pirolli and Stuart Card. 2005. The sensemaking process and leverage points for analyst technology as identified through cognitive task analysis. In Proceedings of international conference on intelligence analysis , Vol.\u00a05. McLean, VA, USA, 2\u20134."},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3715579"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3713357"},{"key":"e_1_3_3_3_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3397481.3450649"},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.230"},{"key":"e_1_3_3_3_73_2","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939778"},{"key":"e_1_3_3_3_74_2","unstructured":"Marco\u00a0Tulio Ribeiro Tongshuang Wu Carlos Guestrin and Sameer Singh. 2020. Beyond accuracy: Behavioral testing of NLP models with CheckList. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.04118 (2020)."},{"key":"e_1_3_3_3_75_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580790"},{"key":"e_1_3_3_3_76_2","unstructured":"Jon Saad-Falcon Rajan Vivek William Berrios Nandita\u00a0Shankar Naik Matija Franklin Bertie Vidgen Amanpreet Singh Douwe Kiela and Shikib Mehri. 2024. Lmunit: Fine-grained evaluation with natural language unit tests. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.13091 (2024)."},{"key":"e_1_3_3_3_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_3_78_2","unstructured":"Hua Shen Tiffany Knearem Reshmi Ghosh Kenan Alkiek Kundan Krishna Yachuan Liu Ziqiao Ma Savvas Petridis Yi-Hao Peng Li Qiwei et\u00a0al. 2024. Towards bidirectional human-ai alignment: A systematic review for clarifications framework and future directions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.09264 (2024)."},{"key":"e_1_3_3_3_79_2","doi-asserted-by":"crossref","unstructured":"Venkatesh Sivaraman Zexuan Li and Adam Perer. 2025. Divisi: Interactive Search and Visualization for Scalable Exploratory Subgroup Analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.10537 (2025).","DOI":"10.1145\/3706598.3713103"},{"key":"e_1_3_3_3_80_2","unstructured":"Charlie Snell Jaehoon Lee Kelvin Xu and Aviral Kumar. 2024. Scaling llm test-time compute optimally can be more effective than scaling model parameters. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03314 (2024)."},{"key":"e_1_3_3_3_81_2","unstructured":"Giulio Starace Oliver Jaffe Dane Sherburn James Aung Jun\u00a0Shern Chan Leon Maksin Rachel Dias Evan Mays Benjamin Kinsella Wyatt Thompson et\u00a0al. 2025. PaperBench: Evaluating AI\u2019s Ability to Replicate AI Research. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.01848 (2025)."},{"key":"e_1_3_3_3_82_2","doi-asserted-by":"publisher","unstructured":"Hendrik Strobelt Albert Webson Victor Sanh Benjamin Hoover Johanna Beyer Hanspeter Pfister and Alexander\u00a0M. Rush. 2023. Interactive and Visual Prompt Engineering for Ad-hoc Task Adaptation with Large Language Models. IEEE Transactions on Visualization and Computer Graphics 29 1 (2023) 1146\u20131156. 10.1109\/TVCG.2022.3209479","DOI":"10.1109\/TVCG.2022.3209479"},{"key":"e_1_3_3_3_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642400"},{"key":"e_1_3_3_3_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606756"},{"key":"e_1_3_3_3_85_2","unstructured":"Annalisa Szymanski Simret\u00a0Araya Gebreegziabher Oghenemaro Anuyah Ronald\u00a0A Metoyer and Toby Jia-Jun Li. 2024. Comparing Criteria Development Across Domain Experts Lay Users and Models in Large Language Model Evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02054 (2024)."},{"key":"e_1_3_3_3_86_2","unstructured":"Alex Tamkin Miles McCain Kunal Handa Esin Durmus Liane Lovitt Ankur Rathi Saffron Huang Alfred Mountfield Jerry Hong Stuart Ritchie et\u00a0al. 2024. Clio: Privacy-Preserving Insights into Real-World AI Use. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.13678 (2024)."},{"key":"e_1_3_3_3_87_2","unstructured":"OpenThoughts Team. 2025. Open Thoughts. https:\/\/open-thoughts.ai."},{"key":"e_1_3_3_3_88_2","doi-asserted-by":"crossref","unstructured":"David\u00a0R Thomas. 2006. A general inductive approach for analyzing qualitative evaluation data. American journal of evaluation 27 2 (2006) 237\u2013246.","DOI":"10.1177\/1098214005283748"},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300831"},{"key":"e_1_3_3_3_90_2","unstructured":"Ruiyi Wang Haofei Yu Wenxin Zhang Zhengyang Qi Maarten Sap Graham Neubig Yonatan Bisk and Hao Zhu. 2024. SOTOPIA-\u03c0 : Interactive Learning of Socially Intelligent Language Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08715 (2024)."},{"key":"e_1_3_3_3_91_2","unstructured":"Xinyuan Wang Bowen Wang Dunjie Lu Junlin Yang Tianbao Xie Junli Wang Jiaqi Deng Xiaole Guo Yiheng Xu Chen\u00a0Henry Wu et\u00a0al. 2025. Opencua: Open foundations for computer-use agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.09123 (2025)."},{"key":"e_1_3_3_3_92_2","unstructured":"Zijie\u00a0J Wang Fred Hohman and Duen\u00a0Horng Chau. 2023. Wizmap: Scalable interactive visualization for exploring large machine learning embeddings. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.09328 (2023)."},{"key":"e_1_3_3_3_93_2","doi-asserted-by":"publisher","unstructured":"James Wexler Mahima Pushkarna Tolga Bolukbasi Martin Wattenberg Fernanda Vi\u00e9gas and Jimbo Wilson. 2020. The What-If Tool: Interactive Probing of Machine Learning Models. IEEE Transactions on Visualization and Computer Graphics 26 1 (2020) 56\u201365. 10.1109\/TVCG.2019.2934619","DOI":"10.1109\/TVCG.2019.2934619"},{"key":"e_1_3_3_3_94_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584059"},{"key":"e_1_3_3_3_95_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1073"},{"key":"e_1_3_3_3_96_2","unstructured":"Tongshuang Wu Marco\u00a0Tulio Ribeiro Jeffrey Heer and Daniel\u00a0S Weld. 2021. Polyjuice: Generating counterfactuals for explaining evaluating and improving models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2101.00288 (2021)."},{"key":"e_1_3_3_3_97_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517582"},{"key":"e_1_3_3_3_98_2","unstructured":"Zeqiu Wu Yushi Hu Weijia Shi Nouha Dziri Alane Suhr Prithviraj Ammanabrolu Noah\u00a0A Smith Mari Ostendorf and Hannaneh Hajishirzi. 2023. Fine-Grained Human Feedback Gives Better Rewards for Language Model Training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.01693 (2023)."},{"key":"e_1_3_3_3_99_2","doi-asserted-by":"publisher","DOI":"10.1145\/1978942.1979167"},{"key":"e_1_3_3_3_100_2","unstructured":"Seonghyeon Ye Doyoung Kim Sungdong Kim Hyeonbin Hwang Seungone Kim Yongrae Jo James Thorne Juho Kim and Minjoon Seo. 2023. Flask: Fine-grained language model evaluation based on alignment skill sets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.10928 (2023)."},{"key":"e_1_3_3_3_101_2","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511105"},{"key":"e_1_3_3_3_102_2","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3596138"},{"key":"e_1_3_3_3_103_2","unstructured":"Zhiyuan Zeng Yizhong Wang Hannaneh Hajishirzi and Pang\u00a0Wei Koh. 2025. EvalTree: Profiling Language Model Weaknesses via Hierarchical Capability Trees. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.08893 (2025)."},{"key":"e_1_3_3_3_104_2","unstructured":"Jingyue Zhang and Ian Arawjo. 2024. ChainBuddy: An AI Agent System for Generating LLM Pipelines. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.13588 (2024)."},{"key":"e_1_3_3_3_105_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric.\u00a0P Xing Hao Zhang Joseph\u00a0E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arxiv:https:\/\/arXiv.org\/abs\/2306.05685\u00a0[cs.CL]"},{"key":"e_1_3_3_3_106_2","doi-asserted-by":"crossref","unstructured":"Ming Zhong Yang Liu Da Yin Yuning Mao Yizhu Jiao Pengfei Liu Chenguang Zhu Heng Ji and Jiawei Han. 2022. Towards a unified multi-dimensional evaluator for text generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.07197 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.131"},{"key":"e_1_3_3_3_107_2","unstructured":"Xuhui Zhou Hao Zhu Leena Mathur Ruohong Zhang Haofei Yu Zhengyang Qi Louis-Philippe Morency Yonatan Bisk Daniel Fried Graham Neubig et\u00a0al. 2023. Sotopia: Interactive evaluation for social intelligence in language agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.11667 (2023)."}],"event":{"name":"CHI 2026: CHI Conference on Human Factors in Computing Systems","location":"Barcelona Spain","acronym":"CHI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772318.3790285","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T09:14:15Z","timestamp":1776417255000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772318.3790285"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":106,"alternative-id":["10.1145\/3772318.3790285","10.1145\/3772318"],"URL":"https:\/\/doi.org\/10.1145\/3772318.3790285","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}