{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,14]],"date-time":"2026-07-14T03:16:09Z","timestamp":1783998969234,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T00:00:00Z","timestamp":1739318400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Research Council of Finland","award":["356114"],"award-info":[{"award-number":["356114"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,12]]},"DOI":"10.1145\/3641554.3701791","type":"proceedings-article","created":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T18:33:41Z","timestamp":1739903621000},"page":"624-630","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["Evaluating Language Models for Generating and Judging Programming Feedback"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2272-2763","authenticated-orcid":false,"given":"Charles","family":"Koutcheme","sequence":"first","affiliation":[{"name":"Aalto University, Espoo, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9806-419X","authenticated-orcid":false,"given":"Nicola","family":"Dainese","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7277-9282","authenticated-orcid":false,"given":"Sami","family":"Sarsa","sequence":"additional","affiliation":[{"name":"University of Jyv\u00e4skyl\u00e4, Jyv\u00e4skyl\u00e4, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6502-209X","authenticated-orcid":false,"given":"Arto","family":"Hellas","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6829-9449","authenticated-orcid":false,"given":"Juho","family":"Leinonen","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0513-4613","authenticated-orcid":false,"given":"Syed","family":"Ashraf","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5150-9806","authenticated-orcid":false,"given":"Paul","family":"Denny","sequence":"additional","affiliation":[{"name":"The University of Auckland, Auckland, New Zealand"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,2,18]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, et al.","author":"Abdin Marah","year":"2024","unstructured":"Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, et al. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arxiv: 2404.14219"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626252.3630799"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587102.3588852"},{"key":"e_1_3_2_1_4_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur et al. 2024. The Llama 3 Herd of Models. arxiv: 2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_5_1","volume-title":"On the Opportunities of Large Language Models for Programming Process Data. arXiv preprint arXiv:2411.00414","author":"Edwards John","year":"2024","unstructured":"John Edwards, Arto Hellas, and Juho Leinonen. 2024. On the Opportunities of Large Language Models for Programming Process Data. arXiv preprint arXiv:2411.00414 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Gemma: Our open-source models for machine learning fairness. https:\/\/blog.google\/technology\/developers\/gemma-open-models\/","year":"2024","unstructured":"Google. 2024. Gemma: Our open-source models for machine learning fairness. https:\/\/blog.google\/technology\/developers\/gemma-open-models\/"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1080\/08993408.2020.1860408"},{"key":"e_1_3_2_1_8_1","volume-title":"Experiences from Integrating Large Language Model Chatbots into the Classroom. arXiv preprint arXiv:2406.04817","author":"Hellas Arto","year":"2024","unstructured":"Arto Hellas, Juho Leinonen, and Leo Lepp\u00e4nen. 2024. Experiences from Integrating Large Language Model Chatbots into the Classroom. arXiv preprint arXiv:2406.04817 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3568813.3600139"},{"key":"e_1_3_2_1_10_1","unstructured":"Yann Hicke Anmol Agarwal Qianou Ma and Paul Denny. 2023. AI-TA: Towards an Intelligent Question-Answer Teaching Assistant using Open-Source LLMs. arxiv: 2311.02775 [cs.LG]"},{"key":"e_1_3_2_1_11_1","volume-title":"Devendra Singh Chaplot, et al","author":"Jiang Albert Q.","year":"2023","unstructured":"Albert Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, et al. 2023. Mistral 7B. arxiv: 2310.06825 [cs.CL]"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580919"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2899415.2899422"},{"key":"e_1_3_2_1_14_1","article-title":"A Systematic Literature Review of Automated Feedback Generation for Programming Exercises","volume":"19","author":"Keuning Hieke","year":"2018","unstructured":"Hieke Keuning, Johan Jeuring, and Bastiaan Heeren. 2018. A Systematic Literature Review of Automated Feedback Generation for Programming Exercises. ACM Trans. Comput. Educ., Vol. 19, 1, Article 3 (2018), 43 pages.","journal-title":"ACM Trans. Comput. Educ."},{"key":"e_1_3_2_1_15_1","volume-title":"Exploring the Potential of Large Language Models to Generate Formative Programming Feedback. arXiv preprint arXiv:2309.00029","author":"Kiesler Natalie","year":"2023","unstructured":"Natalie Kiesler, Dominic Lohr, and Hieke Keuning. 2023. Exploring the Potential of Large Language Models to Generate Formative Programming Feedback. arXiv preprint arXiv:2309.00029 (2023)."},{"key":"e_1_3_2_1_16_1","unstructured":"Nachiket Kotalwar Alkis Gotovos and Adish Singla. 2024. Hints-In-Browser: Benchmarking Language Models for Programming Feedback Generation. arxiv: 2406.05053 [cs.LG] https:\/\/arxiv.org\/abs\/2406.05053"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA","author":"Koutcheme Charles","year":"2024","unstructured":"Charles Koutcheme, Nicola Dainese, and Arto Hellas. 2024a. Using Program Repair as a Proxy for Language Models' Feedback Ability in Programming Education. In Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024), Ekaterina Kochmar, Marie Bexte, Jill Burstein, Andrea Horbach, Ronja Laarmann-Quante, Ana\u00efs Tack, Victoria Yaneva, and Zheng Yuan (Eds.). Association for Computational Linguistics, Mexico City, Mexico, 165--181."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3649217.3653612"},{"key":"e_1_3_2_1_19_1","unstructured":"Charles Koutcheme Nicola Dainese Sami Sarsa Juho Leinonen Arto Hellas and Paul Denny. 2024c. Benchmarking Educational Program Repair. arxiv: 2405.05347 [cs.SE] https:\/\/arxiv.org\/abs\/2405.05347"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3647782.3647803"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587102.3588785"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569770"},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. of the 23rd Koli Calling Int. Conf. on Computing Education Research. ACM","author":"Liffiton Mark","year":"2024","unstructured":"Mark Liffiton, Brad E Sheese, Jaromir Savelka, and Paul Denny. 2024. CodeHelp: Using Large Language Models with Guardrails for Scalable Support in Programming Classes. In Proc. of the 23rd Koli Calling Int. Conf. on Computing Education Research. ACM, New York, NY, USA, Article 8, 11 pages."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626253.3635427"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3649217.3653554"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636515"},{"key":"e_1_3_2_1_28_1","article-title":"Automated Assessment in Computer Science Education","volume":"22","author":"Paiva Jos\u00e9 Carlos","year":"2022","unstructured":"Jos\u00e9 Carlos Paiva, Jos\u00e9 Paulo Leal, and \u00c1lvaro Figueira. 2022. Automated Assessment in Computer Science Education: A State-of-the-Art Review. ACM Trans. Comput. Educ., Vol. 22, 3, Article 34 (2022), 40 pages.","journal-title":"A State-of-the-Art Review. ACM Trans. Comput. Educ."},{"key":"e_1_3_2_1_29_1","volume-title":"Baker","author":"Pankiewicz Maciej","year":"2023","unstructured":"Maciej Pankiewicz and Ryan S. Baker. 2023. Large Language Models (GPT) for automating feedback on programming assignments. arxiv: 2307.00150 [cs.HC]"},{"key":"e_1_3_2_1_30_1","unstructured":"Tung Phung Jos\u00e9 Cambronero Sumit Gulwani Tobias Kohn Rupak Majumdar et al. 2023a. Generating High-Precision Feedback for Programming Syntax Errors using language Models. arxiv: 2302.04662 [cs.PL]"},{"key":"e_1_3_2_1_31_1","first-page":"100790","article-title":"Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human","volume":"21","author":"Phung Tung","year":"2023","unstructured":"Tung Phung, Victor-Alexandru P\u0103durean, Jos\u00e9 Cambronero, Sumit Gulwani, Tobias Kohn, et al. 2023b. Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human Tutors. Int. J. of Management, Vol. 21, 2 (2023), 100790.","journal-title":"Tutors. Int. J. of Management"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Tung Phung Victor-Alexandru P?durean Anjali Singh Christopher Brooks Jos\u00e9 Cambronero et al. 2023c. Automating Human Tutor-Style Programming Feedback: Leveraging GPT-4 Tutor Model for Hint Generation and GPT-3.5 Student Model for Hint Validation. arxiv: 2310.03780 [cs.AI]","DOI":"10.1145\/3636555.3636846"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3649405.3659534"},{"key":"e_1_3_2_1_34_1","unstructured":"Nazneen Rajani Nathan Lambert Sheon Han Jean Wang Osvald Nitski et al. 2023. Can foundation models label data like humans? https:\/\/huggingface.co\/blog\/llm-v-human-data."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636243.3636259"},{"key":"e_1_3_2_1_36_1","volume-title":"Artificial Intelligence in Education, Andrew M. Olney, Irene-Angelica Chounta","author":"Scarlatos Alexander","unstructured":"Alexander Scarlatos, Digory Smith, Simon Woodhead, and Andrew Lan. 2024. Improving the Validity of Automatically Generated Feedback via Reinforcement Learning. In Artificial Intelligence in Education, Andrew M. Olney, Irene-Angelica Chounta, Zitao Liu, Olga C. Santos, and Ig Ibert Bittencourt (Eds.). Springer Nature Switzerland, Cham, 280--294."},{"key":"e_1_3_2_1_37_1","unstructured":"Pat Verga Sebastian Hofstatter Sophia Althammer Yixuan Su Aleksandra Piktus Arkady Arkhangorodsky Minjie Xu Naomi White and Patrick Lewis. 2024. Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models. arxiv: 2404.18796 [cs.CL] https:\/\/arxiv.org\/abs\/2404.18796"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626252.3630764"},{"key":"e_1_3_2_1_39_1","volume-title":"Education: A Systematic Scoping Review. British Journal of Educational Technology","author":"Yan Lixiang","year":"2023","unstructured":"Lixiang Yan, Lele Sha, Linxuan Zhao, Yuheng Li, Roberto Martinez-Maldonado, et al. 2023. Practical and Ethical Challenges of Large Language Models in Education: A Systematic Scoping Review. British Journal of Educational Technology (2023)."},{"key":"e_1_3_2_1_40_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu et al. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arxiv: 2306.05685 [cs.CL]"}],"event":{"name":"SIGCSE TS 2025: The 56th ACM Technical Symposium on Computer Science Education","location":"Pittsburgh PA USA","acronym":"SIGCSE TS 2025","sponsor":["SIGCSE ACM Special Interest Group on Computer Science Education"]},"container-title":["Proceedings of the 56th ACM Technical Symposium on Computer Science Education V. 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641554.3701791","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3641554.3701791","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T06:26:35Z","timestamp":1755757595000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641554.3701791"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,12]]},"references-count":40,"alternative-id":["10.1145\/3641554.3701791","10.1145\/3641554"],"URL":"https:\/\/doi.org\/10.1145\/3641554.3701791","relation":{},"subject":[],"published":{"date-parts":[[2025,2,12]]},"assertion":[{"value":"2025-02-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}