{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T20:16:44Z","timestamp":1762460204057,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,5]]},"DOI":"10.1145\/3649165.3690123","type":"proceedings-article","created":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T18:23:41Z","timestamp":1733163821000},"page":"193-199","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["A Benchmark for Testing the Capabilities of LLMs in Assessing the Quality of Multiple-choice Questions in Introductory Programming Education"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3695-2780","authenticated-orcid":false,"given":"Aninditha","family":"Ramesh","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9848-1663","authenticated-orcid":false,"given":"Arav","family":"Agarwal","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5430-7282","authenticated-orcid":false,"given":"Jacob Arthur","family":"Doughty","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3836-278X","authenticated-orcid":false,"given":"Ketan","family":"Ramaneti","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3674-5456","authenticated-orcid":false,"given":"Jaromir","family":"Savelka","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5150-8259","authenticated-orcid":false,"given":"Majd","family":"Sakr","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,12,5]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AI@Meta. 2024. Llama 3 Model Card. (2024). https:\/\/github.com\/meta-llama\/ llama3\/blob\/main\/MODEL_CARD.md"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Jonathan Brown Gwen A. Frishkoff and Maxine Esk\u00e9nazi. 2005. Automatic Question Generation for Vocabulary Assessment. In Human Language Technology - The Baltic Perspectiv. https:\/\/api.semanticscholar.org\/CorpusID:1973351","DOI":"10.3115\/1220575.1220678"},{"key":"e_1_3_2_1_3_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_4_1","volume-title":"Scalable Educational Question Generation with Pre-trained Language Models. In International Conference on Artificial Intelligence in Education. https:\/\/api.semanticscholar.org\/ CorpusID:258685703","author":"Bulathwela Sahan","year":"2023","unstructured":"Sahan Bulathwela, Hamze Muse, and Emine Yilmaz. 2023. Scalable Educational Question Generation with Pre-trained Language Models. In International Conference on Artificial Intelligence in Education. https:\/\/api.semanticscholar.org\/ CorpusID:258685703"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jarmac.2018.07.002"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636243.3636256"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1219"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30353"},{"key":"e_1_3_2_1_9_1","volume-title":"Smith","author":"Heilman Michael","year":"2009","unstructured":"Michael Heilman and Noah A. Smith. 2009. Question Generation via Overgenerating Transformations and Ranking. https:\/\/api.semanticscholar.org\/CorpusID: 14631309"},{"key":"e_1_3_2_1_10_1","volume-title":"Smith","author":"Heilman Michael","year":"2010","unstructured":"Michael Heilman and Noah A. Smith. 2010. Good Question! Statistical Ranking for Question Generation. In North American Chapter of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:1809816"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICALT.2014.105"},{"key":"e_1_3_2_1_12_1","unstructured":"Kevin Hwang Sai Challagundla Maryam Alomair Lujie Karen Chen and F. S. Choa. [n. d.]. Towards AI-Assisted Multiple Choice Question Generation and Quality Evaluation at Scale: Aligning with Bloom's Taxonomy. https:\/\/api. semanticscholar.org\/CorpusID:266876687"},{"key":"e_1_3_2_1_13_1","volume-title":"QAScore'An Unsupervised Unreferenced Metric for the Question Generation Evaluation. Entropy 24","author":"Ji Tianbo","year":"2022","unstructured":"Tianbo Ji, Chenyang Lyu, Gareth J. F. Jones, Liting Zhou, and Yvette Graham. 2022. QAScore'An Unsupervised Unreferenced Metric for the Question Generation Evaluation. Entropy 24 (2022). https:\/\/api.semanticscholar.org\/CorpusID: 252781032"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580919"},{"key":"e_1_3_2_1_15_1","volume-title":"Learning to Answer by Learning to Ask: Getting the Best of GPT-2 and BERT Worlds. ArXiv abs\/1911.02365","author":"Klein Tassilo","year":"2019","unstructured":"Tassilo Klein and Moin Nabi. 2019. Learning to Answer by Learning to Ask: Getting the Best of GPT-2 and BERT Worlds. ArXiv abs\/1911.02365 (2019). https: \/\/api.semanticscholar.org\/CorpusID:207880647"},{"key":"e_1_3_2_1_16_1","unstructured":"Hidenobu Kunichika Tomoki Katayama Tsukasa Hirashima and Akira Takeuchi. 2001. Automated Question Generation Methods for Intelligent English Learning Systems and its Evaluation. https:\/\/api.semanticscholar.org\/CorpusID:1536529"},{"key":"e_1_3_2_1_17_1","volume-title":"Ashley Ricker Gyllen, and Mutlu Cukurova","author":"Leiker Daniel","year":"2023","unstructured":"Daniel Leiker, Sara Finnigan, Ashley Ricker Gyllen, and Mutlu Cukurova. 2023. Prototyping the use of Large Language Models (LLMs) for adult learning content creation at scale. In LLM@AIED. https:\/\/api.semanticscholar.org\/CorpusID: 259076210"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587102.3588785"},{"key":"e_1_3_2_1_19_1","volume-title":"Codehelp: Using large language models with guardrails for scalable support in programming classes. arXiv preprint arXiv:2308.06921","author":"Liffiton Mark","year":"2023","unstructured":"Mark Liffiton, Brad Sheese, Jaromir Savelka, and Paul Denny. 2023. Codehelp: Using large language models with guardrails for scalable support in programming classes. arXiv preprint arXiv:2308.06921 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"LLM-Eval: Unified MultiDimensional Automatic Evaluation for Open-Domain Conversations with Large Language Models. ArXiv abs\/2305.13711","author":"Lin Yen-Ting","year":"2023","unstructured":"Yen-Ting Lin and Yun-Nung (Vivian) Chen. 2023. LLM-Eval: Unified MultiDimensional Automatic Evaluation for Open-Domain Conversations with Large Language Models. ArXiv abs\/2305.13711 (2023). https:\/\/api.semanticscholar.org\/ CorpusID:258841681"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3501709.3544280"},{"key":"e_1_3_2_1_22_1","unstructured":"Steven Moore Eamon Costello and John Stamper. [n. d.]. An Automatic Question Usability Evaluation Toolkit. ([n. d.])."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3573051.3593396"},{"volume-title":"European Conference on Technology Enhanced Learning. https: \/\/api.semanticscholar.org\/CorpusID:259937876","author":"Moore Steven","key":"e_1_3_2_1_24_1","unstructured":"Steven Moore, Huy Anh Nguyen, Tianying Chen, and John C. Stamper. 2023. Assessing the Quality of Multiple-Choice Questions Using GPT-4 and Rule-Based Methods. In European Conference on Technology Enhanced Learning. https: \/\/api.semanticscholar.org\/CorpusID:259937876"},{"key":"e_1_3_2_1_25_1","volume-title":"Pre-Training With Scientific Text Improves Educational Question Generation. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:254409013","author":"Muse Hamze","year":"2022","unstructured":"Hamze Muse, Sahan Bulathwela, and Emine Yilmaz. 2022. Pre-Training With Scientific Text Improves Educational Question Generation. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:254409013"},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. ArXiv abs\/2303.08774 (2023)."},{"key":"e_1_3_2_1_27_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Gray et al. 2022. Training language models to follow instructions with human feedback. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.8115653"},{"key":"e_1_3_2_1_29_1","unstructured":"Alec Radford and Karthik Narasimhan. 2018. Improving Language Understanding by Generative Pre-Training. https:\/\/s3-us-west-2.amazonaws.com\/openai-assets\/ research-covers\/language-unsupervised\/language_understanding_paper.pdf. Accessed: 2023-09--30."},{"key":"e_1_3_2_1_30_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. https:\/\/d4mucfpksywv.cloudfront.net\/better-language-models\/language_ models_are_unsupervised_multitask_learners.pdf. Accessed: 2023-09--30."},{"key":"e_1_3_2_1_31_1","volume-title":"Multiple-Choice Question Generation: Towards an Automated Assessment Framework. ArXiv abs\/2209.11830","author":"Raina Vatsal","year":"2022","unstructured":"Vatsal Raina and Mark John Francis Gales. 2022. Multiple-Choice Question Generation: Towards an Automated Assessment Framework. ArXiv abs\/2209.11830 (2022). https:\/\/api.semanticscholar.org\/CorpusID:252532175"},{"key":"e_1_3_2_1_32_1","volume-title":"White","author":"Rush Bonnie R","year":"2016","unstructured":"Bonnie R Rush, David C. Rankin, and Brad J. White. 2016. The impact of itemwriting flaws and item complexity on examination item difficulty and discrimination value. BMC Medical Education 16 (2016). https:\/\/api.semanticscholar.org\/ CorpusID:21949891"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3568813.3600142"},{"key":"e_1_3_2_1_34_1","volume-title":"Harnessing LLMs in Curricular Design: Using GPT4 to Support Authoring of Learning Objectives. arXiv preprint arXiv:2306.17459","author":"Sridhar Pragnya","year":"2023","unstructured":"Pragnya Sridhar, Aidan Doyle, Arav Agarwal, Christopher Bogart, Jaromir Savelka, and Majd Sakr. 2023. Harnessing LLMs in Curricular Design: Using GPT4 to Support Authoring of Learning Objectives. arXiv preprint arXiv:2306.17459 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_36_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1021\/ed500076x"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/FIE58773.2023.10342898"},{"key":"e_1_3_2_1_39_1","volume-title":"Leaf: Multiple-Choice Question Generation. ArXiv abs\/2201.09012","author":"Vachev Kristiyan","year":"2022","unstructured":"Kristiyan Vachev, Momchil Hardalov, Georgi Karadzhov, Georgi Georgiev, Ivan Koychev, and Preslav Nakov. 2022. Leaf: Multiple-Choice Question Generation. ArXiv abs\/2201.09012 (2022). https:\/\/api.semanticscholar.org\/CorpusID: 246240856"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Hao Wang Xiaodong Zhang and Houfeng Wang. 2018. A Neural Question Generation System Based on Knowledge Base. In Natural Language Processing and Chinese Computing. https:\/\/api.semanticscholar.org\/CorpusID:52002625","DOI":"10.1007\/978-3-319-99495-6_12"},{"key":"e_1_3_2_1_41_1","volume-title":"Towards Human-Like Educational Question Generation with Large Language Models. In International Conference on Artificial Intelligence in Education. https: \/\/api.semanticscholar.org\/CorpusID:251137828","author":"Wang Zichao","year":"2022","unstructured":"Zichao Wang, Jakob Valdez, Debshila Basu Mallick, and Richard Baraniuk. 2022. Towards Human-Like Educational Question Generation with Large Language Models. In International Conference on Artificial Intelligence in Education. https: \/\/api.semanticscholar.org\/CorpusID:251137828"},{"key":"e_1_3_2_1_42_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824--24837."}],"event":{"name":"SIGCSE Virtual 2024: 1st ACM Virtual Global Computing Education Conference","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"],"location":"Virtual Event NC USA","acronym":"SIGCSE Virtual 2024"},"container-title":["Proceedings of the 2024 on ACM Virtual Global Computing Education Conference V. 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649165.3690123","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649165.3690123","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:53:56Z","timestamp":1755892436000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649165.3690123"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,5]]},"references-count":42,"alternative-id":["10.1145\/3649165.3690123","10.1145\/3649165"],"URL":"https:\/\/doi.org\/10.1145\/3649165.3690123","relation":{},"subject":[],"published":{"date-parts":[[2024,12,5]]},"assertion":[{"value":"2024-12-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}