{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T17:54:14Z","timestamp":1776102854270,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,3]],"date-time":"2024-07-03T00:00:00Z","timestamp":1719964800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Research Council of Finland","award":["356114"],"award-info":[{"award-number":["356114"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,3]]},"DOI":"10.1145\/3649217.3653612","type":"proceedings-article","created":{"date-parts":[[2024,7,3]],"date-time":"2024-07-03T18:30:20Z","timestamp":1720031420000},"page":"52-58","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":36,"title":["Open Source Language Models Can Provide Feedback: Evaluating LLMs' Ability to Help Students Using GPT-4-As-A-Judge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2272-2763","authenticated-orcid":false,"given":"Charles","family":"Koutcheme","sequence":"first","affiliation":[{"name":"Aalto University, Espoo, Finland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9806-419X","authenticated-orcid":false,"given":"Nicola","family":"Dainese","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7277-9282","authenticated-orcid":false,"given":"Sami","family":"Sarsa","sequence":"additional","affiliation":[{"name":"University of Jyv\u00e4skyl\u00e4 &amp; Aalto University, Jyv\u00e4skyl\u00e4, 
Finland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6502-209X","authenticated-orcid":false,"given":"Arto","family":"Hellas","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6829-9449","authenticated-orcid":false,"given":"Juho","family":"Leinonen","sequence":"additional","affiliation":[{"name":"Aalto University, Espoo, Finland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5150-9806","authenticated-orcid":false,"given":"Paul","family":"Denny","sequence":"additional","affiliation":[{"name":"The University of Auckland, Auckland, New Zealand"}]}],"member":"320","published-online":{"date-parts":[[2024,7,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587102.3588852"},{"key":"e_1_3_2_1_2_1","volume-title":"Henrique Ponde de Oliveira Pinto, et al","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, et al. 2021. Evaluating language Models Trained on Code. arxiv: 2107.03374 [cs.LG]"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569823"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626252.3630909"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3624720"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3568813.3600139"},{"key":"e_1_3_2_1_7_1","unstructured":"Dan Hendrycks Steven Basart Saurav Kadavath Mantas Mazeika Akul Arora et al. 2021. Measuring Coding Challenge Competence With APPS. arxiv: 2105.09938 [cs.SE]"},{"key":"e_1_3_2_1_8_1","unstructured":"Yann Hicke Anmol Agarwal Qianou Ma and Paul Denny. 2023. AI-TA: Towards an Intelligent Question-Answer Teaching Assistant using Open-Source LLMs. arxiv: 2311.02775 [cs.LG]"},{"key":"e_1_3_2_1_9_1","volume-title":"Potential and Limitations of ChatGPT in Explaining Implicit Hate Speech. 
In Companion Proceedings of the ACM Web Conference","author":"Huang Fan","year":"2023","unstructured":"Fan Huang, Haewoon Kwak, and Jisun An. 2023. Is ChatGPT better than Human Annotators? Potential and Limitations of ChatGPT in Explaining Implicit Hate Speech. In Companion Proceedings of the ACM Web Conference 2023. ACM."},{"key":"e_1_3_2_1_10_1","volume-title":"Devendra Singh Chaplot, et al","author":"Jiang Albert Q.","year":"2023","unstructured":"Albert Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, et al. 2023. Mistral 7B. arxiv: 2310.06825 [cs.CL]"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580919"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2899415.2899422"},{"key":"e_1_3_2_1_13_1","article-title":"A Systematic Literature Review of Automated Feedback Generation for Programming Exercises","volume":"19","author":"Keuning Hieke","year":"2018","unstructured":"Hieke Keuning, Johan Jeuring, and Bastiaan Heeren. 2018. A Systematic Literature Review of Automated Feedback Generation for Programming Exercises. ACM Trans. Comput. Educ. , Vol. 19, 1, Article 3 (2018), bibinfonumpages43 pages.","journal-title":"ACM Trans. Comput. Educ."},{"key":"e_1_3_2_1_14_1","volume-title":"Exploring the Potential of Large Language Models to Generate Formative Programming Feedback. arXiv preprint arXiv:2309.00029","author":"Kiesler Natalie","year":"2023","unstructured":"Natalie Kiesler, Dominic Lohr, and Hieke Keuning. 2023. Exploring the Potential of Large Language Models to Generate Formative Programming Feedback. arXiv preprint arXiv:2309.00029 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Prometheus: Inducing Fine-grained Evaluation Capability in Language Models. arxiv: 2310.08491 [cs.CL]","author":"Kim Seungone","year":"2023","unstructured":"Seungone Kim, Jamin Shin, Yejin Cho, Joel Jang, Shayne Longpre, et al. 2023. 
Prometheus: Inducing Fine-grained Evaluation Capability in Language Models. arxiv: 2310.08491 [cs.CL]"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3564721.3565955"},{"key":"e_1_3_2_1_17_1","volume-title":"Artificial Intelligence in Education","author":"Koutcheme Charles","unstructured":"Charles Koutcheme. 2023. Training Language Models for Programming Feedback Using Automated Repair Tools. In Artificial Intelligence in Education. Springer Nature Switzerland, 830--835."},{"key":"e_1_3_2_1_18_1","volume-title":"Artificial Intelligence in Education","author":"Koutcheme Charles","unstructured":"Charles Koutcheme, Sami Sarsa, Juho Leinonen, Arto Hellas, and Paul Denny. 2023. Automated Program Repair Using Generative Models for Code Infilling. In Artificial Intelligence in Education. Springer Nature Switzerland, 798--803."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569770"},{"key":"e_1_3_2_1_20_1","volume-title":"Proc. of the 23rd Koli Calling Int. Conf. on Computing Education Research. ACM","author":"Liffiton Mark","year":"2024","unstructured":"Mark Liffiton, Brad E Sheese, Jaromir Savelka, and Paul Denny. 2024. CodeHelp: Using Large Language Models with Guardrails for Scalable Support in Programming Classes. In Proc. of the 23rd Koli Calling Int. Conf. on Computing Education Research. ACM, New York, NY, USA, Article 8, bibinfonumpages11 pages."},{"key":"e_1_3_2_1_21_1","unstructured":"Hunter McNichols Wanyong Feng Jaewook Lee Alexander Scarlatos Digory Smith et al. 2024. Automated Distractor and Feedback Generation for Math Multiple-choice Questions via In-context Learning. arxiv: 2308.03234 [cs.CL]"},{"key":"e_1_3_2_1_22_1","volume-title":"Responsive and Sustainable Educational Futures","author":"Moore Steven","unstructured":"Steven Moore, Huy A. Nguyen, Tianying Chen, and John Stamper. 2023. Assessing the Quality of Multiple-Choice Questions Using GPT-4 and Rule-Based Methods. 
In Responsive and Sustainable Educational Futures. Springer Nature Switzerland, 229--245."},{"key":"e_1_3_2_1_23_1","volume-title":"Engineering, and Medicine","author":"National Academies of Sciences","year":"2018","unstructured":"National Academies of Sciences, Engineering, and Medicine. 2018. Assessing and responding to the growth of computer science undergraduate enrollments. National Academies Press."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/2566486.2568023"},{"key":"e_1_3_2_1_25_1","article-title":"Automated Assessment in Computer Science Education","volume":"22","author":"Paiva Jos\u00e9 Carlos","year":"2022","unstructured":"Jos\u00e9 Carlos Paiva, Jos\u00e9 Paulo Leal, and \u00c1lvaro Figueira. 2022. Automated Assessment in Computer Science Education: A State-of-the-Art Review. ACM Trans. Comput. Educ. , Vol. 22, 3, Article 34 (2022), bibinfonumpages40 pages.","journal-title":"A State-of-the-Art Review. ACM Trans. Comput. Educ."},{"key":"e_1_3_2_1_26_1","volume-title":"Baker","author":"Pankiewicz Maciej","year":"2023","unstructured":"Maciej Pankiewicz and Ryan S. Baker. 2023. Large Language Models (GPT) for automating feedback on programming assignments. arxiv: 2307.00150 [cs.HC]"},{"key":"e_1_3_2_1_27_1","volume-title":"Examining Zero-Shot Vulnerability Repair with Large Language Models. In 2023 IEEE Symposium on Security and Privacy. 2339--2356","author":"Pearce Hammond","year":"2023","unstructured":"Hammond Pearce, Benjamin Tan, Baleegh Ahmad, Ramesh Karri, and Brendan Dolan-Gavitt. 2023. Examining Zero-Shot Vulnerability Repair with Large Language Models. In 2023 IEEE Symposium on Security and Privacy. 2339--2356."},{"key":"e_1_3_2_1_28_1","unstructured":"Tung Phung Jos\u00e9 Cambronero Sumit Gulwani Tobias Kohn Rupak Majumdar et al. 2023 a. Generating High-Precision Feedback for Programming Syntax Errors using language Models. arxiv: 2302.04662 [cs.PL]"},{"key":"e_1_3_2_1_29_1","first-page":"100790","article-title":"b. 
Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human","volume":"21","author":"Phung Tung","year":"2023","unstructured":"Tung Phung, Victor-Alexandru P\u0103durean, Jos\u00e9 Cambronero, Sumit Gulwani, Tobias Kohn, et al. 2023 b. Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human Tutors. Int. J. of Management, Vol. 21, 2 (2023), 100790.","journal-title":"Tutors. Int. J. of Management"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Tung Phung Victor-Alexandru P\u0103durean Anjali Singh Christopher Brooks Jos\u00e9 Cambronero et al. 2023 c. Automating Human Tutor-Style Programming Feedback: Leveraging GPT-4 Tutor Model for Hint Generation and GPT-3.5 Student Model for Hint Validation. arxiv: 2310.03780 [cs.AI]","DOI":"10.1145\/3636555.3636846"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623762.3633499"},{"key":"e_1_3_2_1_32_1","unstructured":"Rafael Rafailov Archit Sharma Eric Mitchell Stefano Ermon Christopher D. Manning and Chelsea Finn. 2023. Direct Preference Optimization: Your Language Model is Secretly a Reward Model. arxiv: 2305.18290 [cs.LG]"},{"key":"e_1_3_2_1_33_1","unstructured":"Nazneen Rajani Nathan Lambert Sheon Han Jean Wang Osvald Nitski et al. 2023. Can foundation models label data like humans? https:\/\/huggingface.co\/blog\/llm-v-human-data."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623476.3623522"},{"key":"e_1_3_2_1_35_1","volume-title":"Code Llama: Open Foundation Models for Code. arxiv: 2308.12950 [cs.CL]","author":"Rozi\u00e8re Baptiste","year":"2023","unstructured":"Baptiste Rozi\u00e8re, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, et al. 2023. Code Llama: Open Foundation Models for Code. arxiv: 2308.12950 [cs.CL]"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3568813.3600142"},{"key":"e_1_3_2_1_37_1","unstructured":"Jaromir Savelka Paul Denny Mark Liffiton and Brad Sheese. 2023 b. 
Efficient Classification of Student Help Requests in Programming Courses Using Large Language Models. arxiv: 2310.20105 [cs.CY]"},{"key":"e_1_3_2_1_38_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux et al. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv: 2302.13971 [cs.CL]"},{"key":"e_1_3_2_1_39_1","volume-title":"Zephyr: Direct Distillation of LM Alignment. arxiv: 2310.16944 [cs.LG]","author":"Tunstall Lewis","year":"2023","unstructured":"Lewis Tunstall, Edward Beeching, Nathan Lambert, Nazneen Rajani, Kashif Rasul, et al. 2023. Zephyr: Direct Distillation of LM Alignment. arxiv: 2310.16944 [cs.LG]"},{"key":"e_1_3_2_1_40_1","unstructured":"Yidong Wang Zhuohao Yu Zhengran Zeng Linyi Yang Cunxiang Wang et al. 2023. PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization. arxiv: 2306.05087 [cs.CL]"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue et al. 2020. HuggingFace's Transformers: State-of-the-art Natural Language Processing. arxiv: 1910.03771 [cs.CL]","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","unstructured":"Mike Wu M. Mosse Noah D. Goodman and C. Piech. 2019. Zero Shot Learning for Code Education: Rubric Sampling with Deep Learning Inference. In AAAI. https:\/\/doi.org\/10.1609\/aaai.v33i01.3301782","DOI":"10.1609\/aaai.v33i01.3301782"},{"key":"e_1_3_2_1_43_1","volume-title":"Education: A Systematic Scoping Review. British Journal of Educational Technology","author":"Yan Lixiang","year":"2023","unstructured":"Lixiang Yan, Lele Sha, Linxuan Zhao, Yuheng Li, Roberto Martinez-Maldonado, et al. 2023. Practical and Ethical Challenges of Large Language Models in Education: A Systematic Scoping Review. 
British Journal of Educational Technology (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu et al. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arxiv: 2306.05685 [cs.CL]"},{"key":"e_1_3_2_1_45_1","volume-title":"LIMA: Less Is More for Alignment. arxiv: 2305.11206 [cs.CL]","author":"Zhou Chunting","year":"2023","unstructured":"Chunting Zhou, Pengfei Liu, Puxin Xu, Srini Iyer, Jiao Sun, et al. 2023. LIMA: Less Is More for Alignment. arxiv: 2305.11206 [cs.CL]"},{"key":"e_1_3_2_1_46_1","unstructured":"Lianghui Zhu Xinggang Wang and Xinlong Wang. 2023. JudgeLM: Fine-tuned Large Language Models are Scalable Judges. arxiv: 2310.17631 [cs.CL]"}],"event":{"name":"ITiCSE 2024: Innovation and Technology in Computer Science Education","location":"Milan Italy","acronym":"ITiCSE 2024","sponsor":["SIGCSE ACM Special Interest Group on Computer Science Education"]},"container-title":["Proceedings of the 2024 on Innovation and Technology in Computer Science Education V. 
1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649217.3653612","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649217.3653612","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:49:32Z","timestamp":1755787772000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649217.3653612"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,3]]},"references-count":46,"alternative-id":["10.1145\/3649217.3653612","10.1145\/3649217"],"URL":"https:\/\/doi.org\/10.1145\/3649217.3653612","relation":{},"subject":[],"published":{"date-parts":[[2024,7,3]]},"assertion":[{"value":"2024-07-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}