{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T20:16:56Z","timestamp":1762460216245,"version":"3.44.0"},"reference-count":17,"publisher":"Association for Computing Machinery (ACM)","issue":"2","funder":[{"DOI":"10.13039\/100025178","name":"OpenAI","doi-asserted-by":"crossref","id":[{"id":"10.13039\/100025178","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Stanford Accelerator for Learning \/ Institute for Human-Centered Artificial Intelligence"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":["ACM Inroads"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1145\/3734876","type":"journal-article","created":{"date-parts":[[2025,6,9]],"date-time":"2025-06-09T13:47:14Z","timestamp":1749476834000},"page":"46-51","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Automated Benchmarking Infrastructure: Moving Toward Robust Investigations of Gen AI in Computing Education"],"prefix":"10.1145","volume":"16","author":[{"given":"Murtaza","family":"Ali","sequence":"first","affiliation":[{"name":"University of Washington"}]},{"given":"Benjamin","family":"Xie","sequence":"additional","affiliation":[{"name":"Stanford University"}]}],"member":"320","published-online":{"date-parts":[[2025,6,9]]},"reference":[{"key":"e_1_2_1_1_1","first-page":"452","article-title":"Using benchmarking infrastructure to evaluate LLM performance on CS concept inventories: Challenges, opportunities, and critiques. In Proceedings of the 2024 ACM Conf. on Intern","volume":"1","author":"Ali M.","year":"2024","unstructured":"Ali, M., Rao, P., Mai, Y., and Xie, B. Using benchmarking infrastructure to evaluate LLM performance on CS concept inventories: Challenges, opportunities, and critiques. In Proceedings of the 2024 ACM Conf. on Intern. Computing Education Research 1 (2024), 452--468.","journal-title":"Computing Education Research"},{"key":"e_1_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1111\/nyas.15007"},{"key":"e_1_2_1_3_1","unstructured":"Center for Research on Foundation Models. Adding New Scenarios---HELM Documentation. CRFM HELM. https:\/\/crfm-helm.readthedocs.io\/en\/latest\/adding_new_scenarios"},{"key":"e_1_2_1_4_1","unstructured":"Center for Research on Foundation Models. CRFM Ecosystem Graphs.; https:\/\/crfm.stanford.edu\/ecosystem-graphs\/index.html?mode=table."},{"key":"e_1_2_1_5_1","first-page":"3","article-title":"A survey on evaluation of large language models","volume":"15","author":"Chang Y.","year":"2024","unstructured":"Chang, Y. et al. A survey on evaluation of large language models. In Proceedings of ACM Trans. on Intelligent Systems and Technology 15, 3 (2024), 39.","journal-title":"Proceedings of ACM Trans. on Intelligent Systems and Technology"},{"key":"e_1_2_1_6_1","volume-title":"The Theory and Practice of Item Response Theory","author":"de Ayala R.J.","year":"2022","unstructured":"de Ayala, R.J. The Theory and Practice of Item Response Theory. 2nd ed. (New York: The Guilford Press, 2022).","edition":"2"},{"key":"e_1_2_1_7_1","volume-title":"King's College London. Concept inventories for evaluating teaching. Active Learning at King's (Dec. 17","author":"King's Academy","year":"2019","unstructured":"King's Academy, King's College London. Concept inventories for evaluating teaching. Active Learning at King's (Dec. 17, 2019); https:\/\/blogs.kcl.ac.uk\/activelearning\/2019\/12\/17\/concept-inventories-for-evaluating-teaching."},{"key":"e_1_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3695765"},{"volume-title":"Proceedings of the 2023 Conf. on Innovation and Technology in Computer Science Education. ACM (2023)","author":"Leinonen J.","key":"e_1_2_1_9_1","unstructured":"Leinonen, J. et al. Comparing code explanations created by students and large language models. In Proceedings of the 2023 Conf. on Innovation and Technology in Computer Science Education. ACM (2023); 124--130."},{"key":"e_1_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569770"},{"key":"e_1_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569785"},{"key":"e_1_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587102.3588794"},{"key":"e_1_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2960310.2960316"},{"volume-title":"Proceedings of the 2019 ACM Conf. on Intern. Computing Education Research, 111--119","author":"Porter L.","key":"e_1_2_1_14_1","unstructured":"Porter, L. et al. BDSI: A validated concept inventory for basic data structures. In Proceedings of the 2019 ACM Conf. on Intern. Computing Education Research, 111--119."},{"key":"e_1_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623762.3633499"},{"key":"e_1_2_1_16_1","volume-title":"Proceedings of the 8th ACM Conf. on Learning @ Scale. ACM","author":"Xie B.","year":"2021","unstructured":"Xie, B. et al. Domain experts' interpretations of assessment bias in a scaled, online computer science curriculum. In Proceedings of the 8th ACM Conf. on Learning @ Scale. ACM (2021), 77--89."},{"key":"e_1_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287324.3287370"}],"container-title":["ACM Inroads"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3734876","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T20:30:38Z","timestamp":1755981038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3734876"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":17,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["10.1145\/3734876"],"URL":"https:\/\/doi.org\/10.1145\/3734876","relation":{},"ISSN":["2153-2184","2153-2192"],"issn-type":[{"type":"print","value":"2153-2184"},{"type":"electronic","value":"2153-2192"}],"subject":[],"published":{"date-parts":[[2025,6]]},"assertion":[{"value":"2025-06-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}