{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,8]],"date-time":"2026-07-08T01:26:25Z","timestamp":1783473985980,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T00:00:00Z","timestamp":1742774400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,24]]},"DOI":"10.1145\/3708359.3712152","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T12:50:34Z","timestamp":1742388634000},"page":"1032-1047","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["More than Marketing? On the Information Value of AI Benchmarks for Practitioners"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6741-0159","authenticated-orcid":false,"given":"Amelia","family":"Hardy","sequence":"first","affiliation":[{"name":"Stanford University, Stanford, California, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7913-9296","authenticated-orcid":false,"given":"Anka","family":"Reuel","sequence":"additional","affiliation":[{"name":"Computer Science, Stanford University, Stanford, California, USA and Belfer Center, Harvard University, Cambridge, Massachusetts, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2001-3488","authenticated-orcid":false,"given":"Kiana","family":"Jafari Meimandi","sequence":"additional","affiliation":[{"name":"Stanford University, Palo Alto, California, 
USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6972-4491","authenticated-orcid":false,"given":"Lisa","family":"Soder","sequence":"additional","affiliation":[{"name":"Interface, London, United Kingdom,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9371-201X","authenticated-orcid":false,"given":"Allie","family":"Griffith","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, California, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6849-0999","authenticated-orcid":false,"given":"Dylan M","family":"Asmar","sequence":"additional","affiliation":[{"name":"Stanford Intelligent Systems Laboratory, Stanford University, Stanford, California, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4023-419X","authenticated-orcid":false,"given":"Sanmi","family":"Koyejo","sequence":"additional","affiliation":[{"name":"Computer Science, Stanford University, Stanford, California, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8020-9434","authenticated-orcid":false,"given":"Michael S.","family":"Bernstein","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, California, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7238-9663","authenticated-orcid":false,"given":"Mykel John","family":"Kochenderfer","sequence":"additional","affiliation":[{"name":"Aeronautics and Astronautics, Stanford University, Stanford, California, USA,"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3,24]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1002\/9781119171386.ch19"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Karen Anderson and 
Rodney McAdam. 2005. An empirical analysis of lead benchmarking and performance measurement: Guidance for qualitative research. International Journal of Quality & Reliability Management 22 4 (2005) 354\u2013375.","DOI":"10.1108\/02656710510591200"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"Mohamed\u00a0Radhouene Aniba Olivier Poch and Julie\u00a0D. Thompson. 2010. Issues in bioinformatics benchmarking: the case study of multiple sequence alignment. Nucleic Acids Research 38 21 (07 2010) 7353\u20137363. 10.1093\/nar\/gkq625 arXiv:https:\/\/academic.oup.com\/nar\/article-pdf\/38\/21\/7353\/7186841\/gkq625.pdf","DOI":"10.1093\/nar\/gkq625"},{"key":"e_1_3_3_2_5_2","unstructured":"Anthropic. 2024. Introducing the Next Generation of Claude. https:\/\/www.anthropic.com\/news\/claude-3-family. Accessed: 2024-10-10."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376718"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445922"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Nick Bostrom Allan Dafoe and Carrick Flynn. 2020. Public policy and superintelligent AI: a vector field approach. Ethics of artificial intelligence (2020) 293\u2013326.","DOI":"10.1093\/oso\/9780190905033.003.0011"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-90403-0_17"},{"key":"e_1_3_3_2_10_2","unstructured":"Margarita Boyarskaya Alexandra Olteanu and Kate Crawford. 2020. Overcoming failures of imagination in AI infused system development and deployment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2011.13416 (2020)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580014"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Peter\u00a0M Chapman. 2018. Environmental Quality Benchmarks \u2014 The Good The Bad and The Ugly. 
Environmental Science and Pollution Research 25 4 (2018) 3043\u20133046.","DOI":"10.1007\/s11356-016-7924-2"},{"key":"e_1_3_3_2_13_2","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Ponde De\u00a0Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman et\u00a0al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.03374 (2021)."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Zhihui Cheng Chin-Sheng Pang Peiqi Wang Son\u00a0T Le Yanqing Wu Davood Shahrjerdi Iuliana Radu Max\u00a0C Lemme Lian-Mao Peng Xiangfeng Duan et\u00a0al. 2022. How to Report And Benchmark Emerging Field-Effect Transistors. Nature Electronics 5 7 (2022) 416\u2013423.","DOI":"10.1038\/s41928-022-00798-8"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Kenneth\u00a0Ward Church. 2018. Emerging trends: A tribute to Charles Wayne. Natural Language Engineering 24 1 (2018) 155\u2013160. 10.1017\/S1351324917000389","DOI":"10.1017\/S1351324917000389"},{"key":"e_1_3_3_2_16_2","volume-title":"Basics of qualitative research","author":"Corbin Juliet","year":"2015","unstructured":"Juliet Corbin and Anselm Strauss. 2015. Basics of qualitative research. Vol.\u00a014. sage."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Lee\u00a0J Cronbach and Paul\u00a0E Meehl. 1955. Construct validity in psychological tests. Psychological bulletin 52 4 (1955) 281.","DOI":"10.1037\/h0040957"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Fred\u00a0D. Davis. 1989. Perceived Usefulness Perceived Ease of Use and User Acceptance of Information Technology. MIS Quarterly 13 3 (1989) 319\u2013340. http:\/\/www.jstor.org\/stable\/249008","DOI":"10.2307\/249008"},{"key":"e_1_3_3_2_19_2","unstructured":"Luciano Floridi. 2024. 
Three tensions in understanding AI\u2013Comment on Pope Francis\u2019 message Artificial Intelligence and the Wisdom of the Heart: Towards a Fully Human Communication. Available at SSRN (2024)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Timnit Gebru Jamie Morgenstern Briana Vecchione Jennifer\u00a0Wortman Vaughan Hanna Wallach Hal\u00a0Daum\u00e9 Iii and Kate Crawford. 2021. Datasheets for datasets. Commun. ACM 64 12 (2021) 86\u201392.","DOI":"10.1145\/3458723"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445423"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1101\/2024.04.07.24305462"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Helen Heath and Sarah Cowley. 2004. Developing a grounded theory approach: a comparison of Glaser and Strauss. International journal of nursing studies 41 2 (2004) 141\u2013150.","DOI":"10.1016\/S0020-7489(03)00113-5"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581503"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Abhishek Kadian Joanne Truong Aaron Gokaslan Alexander Clegg Erik Wijmans Stefan Lee Manolis Savva Sonia Chernova and Dhruv Batra. 2020. Sim2real predictivity: Does evaluation in simulation predict real-world performance? IEEE Robotics and Automation Letters 5 4 (2020) 6670\u20136677.","DOI":"10.1109\/LRA.2020.3013848"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Shahid\u00a0N Khan. 2014. Qualitative research method: Grounded theory. International journal of business and management 9 11 (2014) 224\u2013233.","DOI":"10.5539\/ijbm.v9n11p224"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"Sarah Lebovitz Natalia Levina and Hila Lifshitz-Assaf. 2021. Is AI Ground Truth Really True? The Dangers of Training and Evaluating AI Tools Based on Experts\u2019 Know-What. MIS Quarterly 45 (09 2021) 1501\u20131526. 
10.25300\/MISQ\/2021\/16564","DOI":"10.25300\/MISQ\/2021\/16564"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Byron\u00a0C. Lewis and Albert\u00a0E. Crews. 1985. The Evolution of Benchmarking as a Computer Performance Evaluation Technique. MIS Quarterly 9 1 (1985) 7\u201316. http:\/\/www.jstor.org\/stable\/249270","DOI":"10.2307\/249270"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Mark Liberman. 2010. Fred Jelinek. Computational Linguistics 36 (2010) 595\u2013599. Issue 4.","DOI":"10.1162\/coli_a_00032"},{"key":"e_1_3_3_2_30_2","unstructured":"Yu\u00a0Lu Liu Su\u00a0Lin Blodgett Jackie Chi\u00a0Kit Cheung Q\u00a0Vera Liao Alexandra Olteanu and Ziang Xiao. 2024. ECBD: Evidence-Centered Benchmark Design for NLP. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08723 (2024)."},{"key":"e_1_3_3_2_31_2","volume-title":"Analyzing social settings: A guide to qualitative observation and analysis","author":"Lofland John","year":"2022","unstructured":"John Lofland, David Snow, Leon Anderson, and Lyn\u00a0H Lofland. 2022. Analyzing social settings: A guide to qualitative observation and analysis. Waveland Press."},{"key":"e_1_3_3_2_32_2","unstructured":"Nestor Maslej Loredana Fattorini Raymond Perrault Vanessa Parli Anka Reuel Erik Brynjolfsson John Etchemendy Katrina Ligett Terah Lyons James Manyika Juan\u00a0Carlos Niebles Yoav Shoham Russell Wald and Jack Clark. 2024. The AI Index 2024 Annual Report. https:\/\/aiindex.stanford.edu\/. Institute for Human-Centered AI (2024)."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-08-051574-8.50019-4"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4939-0378-8_2"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300356"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Anton\u00a0J Nederhof. 1985. Methods of coping with social desirability bias: A review. 
European journal of social psychology 15 3 (1985) 263\u2013280.","DOI":"10.1002\/ejsp.2420150303"},{"key":"e_1_3_3_2_37_2","unstructured":"OpenAI. 2023. GPT-4 Research and Insights. https:\/\/openai.com\/index\/gpt-4-research\/. Accessed: 2024-10-10."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.5555\/1620270.1620333"},{"key":"e_1_3_3_2_39_2","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)","author":"Raji Inioluwa\u00a0Deborah","year":"2021","unstructured":"Inioluwa\u00a0Deborah Raji, Emily Denton, Emily\u00a0M. Bender, Alex Hanna, and Amandalynne Paullada. 2021. AI and the Everything in the Whole Wide World Benchmark. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2). https:\/\/openreview.net\/forum?id=j6NxpQbREA1"},{"key":"e_1_3_3_2_40_2","unstructured":"Anka Reuel* Amelia Hardy* Chandler Smith Max Lamparth and Mykel\u00a0J. Kochenderfer. 2024. BetterBench: Assessing AI Benchmarks Uncovering Issues and Establishing Best Practices. https:\/\/betterbench.stanford.edu"},{"key":"e_1_3_3_2_41_2","unstructured":"Michael Saxon Ari Holtzman Peter West William\u00a0Yang Wang and Naomi Saphra. 2024. Benchmarks as Microscopes: A Call for Model Metrology. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.16711 (2024)."},{"key":"e_1_3_3_2_42_2","unstructured":"Doga Tascilar. 2023. A Quest through Interconnected Datasets: Research on Annotation Practices in Highly Cited Audio Machine Learning Work and Their Utilized Datasets. (2023)."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Viswanath Venkatesh Michael\u00a0G. Morris Gordon\u00a0B. Davis and Fred\u00a0D. Davis. 2003. User Acceptance of Information Technology: Toward a Unified View. MIS Quarterly 27 3 (2003) 425\u2013478. 
http:\/\/www.jstor.org\/stable\/30036540","DOI":"10.2307\/30036540"}],"event":{"name":"IUI '25: 30th International Conference on Intelligent User Interfaces","location":"Cagliari Italy","acronym":"IUI '25","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 30th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712152","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712152","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:06Z","timestamp":1750298226000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712152"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,24]]},"references-count":42,"alternative-id":["10.1145\/3708359.3712152","10.1145\/3708359"],"URL":"https:\/\/doi.org\/10.1145\/3708359.3712152","relation":{},"subject":[],"published":{"date-parts":[[2025,3,24]]},"assertion":[{"value":"2025-03-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}