{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T15:30:41Z","timestamp":1781019041396,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T00:00:00Z","timestamp":1774224000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,23]]},"DOI":"10.1145\/3748522.3779926","type":"proceedings-article","created":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T14:17:49Z","timestamp":1781014669000},"page":"95-102","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Robust LLM-as-a-Judge Validators for Assessing the Quality of Educational Exams"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1611-3969","authenticated-orcid":false,"given":"Renzo","family":"Degiovanni","sequence":"first","affiliation":[{"name":"Luxembourg Institute of Science and Technology, Belval, Luxembourg"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5921-9440","authenticated-orcid":false,"given":"Sergio","family":"Morales","sequence":"additional","affiliation":[{"name":"SOM Research Lab (Internet Interdisciplinary Institute - IN3), Universitat Oberta de Catalunya, Barcelona, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9386-3986","authenticated-orcid":false,"given":"Miriam","family":"Coccia","sequence":"additional","affiliation":[{"name":"Luxembourg Institute of Science and Technology, Belval, Luxembourg"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9639-0186","authenticated-orcid":false,"given":"Robert","family":"Claris\u00f3","sequence":"additional","affiliation":[{"name":"Universitat Oberta de Catalunya, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2418-2489","authenticated-orcid":false,"given":"Jordi","family":"Cabot","sequence":"additional","affiliation":[{"name":"Luxembourg Institute of Science and Technology, Belval, Luxembourg"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Montse Maritxalar, Edurne Martinez, and Larraitz Uria.","author":"Aldabe Itziar","year":"2006","unstructured":"Itziar Aldabe, Maddalen Lopez de Lacalle, Montse Maritxalar, Edurne Martinez, and Larraitz Uria. 2006. ArikIturri: an automatic question generator based on corpora and NLP techniques. In Intelligent Tutoring Systems. Springer Berlin Heidelberg, 584\u2013594."},{"key":"e_1_3_2_1_2_1","volume-title":"1st Workshop on Automated Evaluation of Learning and Assessment Content (CEUR Workshop Proceedings).","volume":"3772","author":"AlKhuzaey Samah","year":"2024","unstructured":"Samah AlKhuzaey, Floriana Grasso, Terry R. Payne, and Valentina Tamma. 2024. Towards automatic evaluation of questions generated from ontologies. In 1st Workshop on Automated Evaluation of Learning and Assessment Content (CEUR Workshop Proceedings). Vol. 3772. CEUR-WS.org."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"David Baidoo-Anu and Leticia Owusu Ansah. 2023. Education in the era of generative artificial intelligence (AI): understanding the potential benefits of ChatGPT in promoting teaching and learning. SSRN Electronic Journal.","DOI":"10.2139\/ssrn.4337484"},{"key":"e_1_3_2_1_4_1","volume-title":"TIMSS 2023 Encyclopedia: Education Policy and Curriculum in Mathematics and Science. Tech. rep.","author":"Boston College","year":"2024","unstructured":"Boston College, TIMSS & PIRLS International Study Center. 2024. TIMSS 2023 Encyclopedia: Education Policy and Curriculum in Mathematics and Science. Tech. rep."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TLT.2018.2889100"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.caeai.2024.100344"},{"key":"e_1_3_2_1_7_1","volume-title":"Seth Aycock, Zain Muhammad Mujahid, Vladana Perli\u0107, Ekaterina Borisova, and Markarit Vartampetian.","author":"Chirkova Nadezhda","year":"2025","unstructured":"Nadezhda Chirkova, Tunde Oluwaseyi Ajayi, Seth Aycock, Zain Muhammad Mujahid, Vladana Perli\u0107, Ekaterina Borisova, and Markarit Vartampetian. 2025. LLM-as-a-qualitative-judge: automating error analysis in natural language generation. (2025). arXiv: 2506.09147."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.3389\/feduc.2023.858273"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1186\/s41039-021-00151-1"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE63991.2025.00313"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10459-023-10225-y"},{"key":"e_1_3_2_1_12_1","volume-title":"Sungchul Kim","author":"Gallegos Isabel O.","year":"2024","unstructured":"Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. 2024. Bias and fairness in large language models: a survey. Computational Linguistics, 50, 3, (Sept. 2024), 1097\u20131179."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Mark J Gierl Okan Bulut Qi Guo and Xinxin Zhang. 2017. Developing analyzing and using distractors for multiple-choice tests in education: a comprehensive review. Review of educational research 87 6 1082\u20131116.","DOI":"10.3102\/0034654317726529"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Mark J Gierl and Hollis Lai. 2013. Evaluating the quality of medical multiple-choice items created with automated processes. Medical education 47 7 726\u2013733.","DOI":"10.1111\/medu.12202"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10639-024-12771-3"},{"key":"e_1_3_2_1_16_1","unstructured":"Jiawei Gu et al. 2025. A survey on LLM-as-a-judge. (2025). arXiv: 2411.15594."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Thomas M Haladyna Steven M Downing and Michael C Rodriguez. 2002. A review of multiple-choice item-writing guidelines for classroom assessment. Applied measurement in education 15 3 309\u2013333.","DOI":"10.1207\/S15324818AME1503_5"},{"key":"e_1_3_2_1_18_1","article-title":"A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions","volume":"43","author":"Lei Huang","year":"2025","unstructured":"Lei Huang et al. 2025. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Trans. Inf. Syst., 43, 2, Article 42, (Jan. 2025).","journal-title":"ACM Trans. Inf. Syst."},{"key":"e_1_3_2_1_19_1","volume-title":"2nd Workshop on Automated Evaluation of Learning and Assessment Content.","volume":"4006","author":"Kim Euigyum","year":"2025","unstructured":"Euigyum Kim, Salah Khalil, and Hyo Jeong Shin. 2025. Comparing human and LLM evaluations on AI-generated critical thinking items: implications for valid applications of automatic item generation. In 2nd Workshop on Automated Evaluation of Learning and Assessment Content. Vol. 4006. CEUR-WS.org."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Ghader Kurdi Jared Leo Bijan Parsia Uli Sattler and Salam Al-Emari. 2020. A systematic review of automatic question generation for educational purposes. International journal of artificial intelligence in education 30 1 121\u2013204.","DOI":"10.1007\/s40593-019-00186-y"},{"key":"e_1_3_2_1_21_1","unstructured":"Patrick Lewis et al. 2021. Retrieval-augmented generation for knowledge-intensive nlp tasks. (2021). arXiv: 2005.11401."},{"key":"e_1_3_2_1_22_1","volume-title":"The 36th Conference on Neural Information Processing Systems (NeurIPS).","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tony Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to explain: multimodal reasoning via thought chains for science question answering. In The 36th Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1111\/ijsa.70021"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.3115\/1705415.1705422"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640310.3674093"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/bs.adcom.2018.03.015"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1186\/s41039-020-00134-8"},{"key":"e_1_3_2_1_28_1","volume-title":"Sriparna Saha, Vinija Jain, Samrat Mondal, and Aman Chadha.","author":"Sahoo Pranab","year":"2024","unstructured":"Pranab Sahoo, Ayush Kumar Singh, Sriparna Saha, Vinija Jain, Samrat Mondal, and Aman Chadha. 2024. A systematic survey of prompt engineering in large language models: techniques and applications. (2024). arXiv: 2402.07927."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3501385.3543957"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2016.2532875"},{"key":"e_1_3_2_1_31_1","volume-title":"International conference on pattern recognition and machine intelligence. Springer, 733\u2013738","author":"Bhatia Arjun Singh","year":"2013","unstructured":"Arjun Singh Bhatia, Manas Kirti, and Sujan Kumar Saha. 2013. Automatic generation of multiple choice questions using Wikipedia. In International conference on pattern recognition and machine intelligence. Springer, 733\u2013738."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Yishen Song Junlei Du and Qinhua Zheng. 2025. Automatic item generation for educational assessments: a systematic literature review. Interactive Learning Environments 1\u201320.","DOI":"10.1080\/10494820.2025.2482588"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.21449\/ijate.1602294"},{"key":"e_1_3_2_1_34_1","volume-title":"Sankaran Vaidyanathan, and Dieuwke Hupkes.","author":"Thakur Aman Singh","year":"2025","unstructured":"Aman Singh Thakur, Kartik Choudhary, Venkat Srinik Ramayapally, Sankaran Vaidyanathan, and Dieuwke Hupkes. 2025. Judging the judges: evaluating alignment and vulnerabilities in LLMs-as-judges. (2025). arXiv: 2406.12624."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. Curran Associates Inc.","author":"Lianmin","unstructured":"Lianmin Zheng et al. 2023. Judging LLM-as-a-judge with MT-bench and Chatbot Arena. In Proceedings of the 37th International Conference on Neural Information Processing Systems. Curran Associates Inc., New Orleans, LA, USA."}],"event":{"name":"SAC '26: 41st ACM\/SIGAPP Symposium on Applied Computing","location":"Grand Hotel Palace Thessaloniki Greece","acronym":"SAC '26","sponsor":["SIGAPP ACM Special Interest Group on Applied Computing"]},"container-title":["Proceedings of the 41st ACM\/SIGAPP Symposium on Applied Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3748522.3779926","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T14:51:21Z","timestamp":1781016681000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3748522.3779926"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,23]]},"references-count":35,"alternative-id":["10.1145\/3748522.3779926","10.1145\/3748522"],"URL":"https:\/\/doi.org\/10.1145\/3748522.3779926","relation":{},"subject":[],"published":{"date-parts":[[2026,3,23]]},"assertion":[{"value":"2026-06-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}