{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T17:06:23Z","timestamp":1762448783342,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,11]]},"DOI":"10.1145\/3769994.3770017","type":"proceedings-article","created":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T16:35:41Z","timestamp":1762446941000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Failures in Reliably Assessing Program Code Readability"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6086-2479","authenticated-orcid":false,"given":"Neil C. C.","family":"Brown","sequence":"first","affiliation":[{"name":"King's College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5915-9153","authenticated-orcid":false,"given":"Marcus","family":"Messer","sequence":"additional","affiliation":[{"name":"King's College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0227-0401","authenticated-orcid":false,"given":"Jennifer","family":"Ikin","sequence":"additional","affiliation":[{"name":"King's College London, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Umar Alkafaween Ibrahim Albluwi and Paul Denny. 2025. Automating Autograding: Large Language Models as Test Suite Generators for Introductory Programming. Journal of Computer Assisted Learning 41 1 (2025) e13100.","DOI":"10.1111\/jcal.13100"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/FIE.2018.8658503"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"John Biggs. 1996. Enhancing teaching through constructive alignment. Higher education 32 3 (1996) 347\u2013364.","DOI":"10.1007\/BF00138871"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3478431.3499294"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3174781.3174785"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/1390630.1390647"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","unstructured":"Daisy\u00a0Christodoulou Christopher\u00a0Wheadon Patrick\u00a0Barmby and Brian Henderson. 2020. A comparative judgement approach to the large-scale assessment of primary writing in England. Assessment in Education: Principles Policy & Practice 27 1 (2020) 46\u201364. 10.1080\/0969594X.2019.1700212","DOI":"10.1080\/0969594X.2019.1700212"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","unstructured":"Liliya\u00a0A. Demidova Elena\u00a0G. Andrianova Peter\u00a0N. Sovietov and Artyom\u00a0V. Gorchakov. 2023. Dataset of Program Source Codes Solving Unique Programming Exercises Generated by Digital Teaching Assistant. Data 8 6 (2023). 10.3390\/data8060109","DOI":"10.3390\/data8060109"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","unstructured":"Christof Ebert James Cain Giuliano Antoniol Steve Counsell and Phillip Laplante. 2016. Cyclomatic Complexity. IEEE Software 33 6 (2016) 27\u201329. 10.1109\/MS.2016.147","DOI":"10.1109\/MS.2016.147"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPC.2019.00014"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643795.3648375"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.18260\/1-2--28416"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.18260\/1-2--30366"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLHCC.2013.6645259"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-SEET58685.2023.00024"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689187.3709615"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","unstructured":"Anders J\u00f6nsson and Gunilla Svingby. 2007. The use of scoring rubrics: Reliability validity and educational consequences. Educational Research Review 2 2 (2007) 130\u2013144. 10.1016\/j.edurev.2007.05.002","DOI":"10.1016\/j.edurev.2007.05.002"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","unstructured":"Marcin Jukiewicz. 2024. The future of grading programming assignments in education: The role of ChatGPT in automating the assessment and feedback process. Thinking Skills and Creativity 52 (2024) 101522. 10.1016\/j.tsc.2024.101522","DOI":"10.1016\/j.tsc.2024.101522"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3059009.3059061"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3304221.3319780"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3587102.3588777"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"George Kinnear Ian Jones and Ben Davies. 2025. Comparative judgement as a research tool: a meta-analysis of application and reliability. 10.31219\/osf.io\/c9q3bv1","DOI":"10.31219\/osf.io\/c9q3bv1"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3636243.3636258"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","unstructured":"Diana Kirk Andrew Luxton-Reilly and Ewan Tempero. 2025. CSM: A Code Style Model for Computing Educators. ACM Trans. Comput. Educ. 25 1 Article 6 (April 2025). 10.1145\/3716861","DOI":"10.1145\/3716861"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569770"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Katherine\u00a0R Luking Brady\u00a0D Nelson Zachary\u00a0P Infantolino Colin\u00a0L Sauder and Greg Hajcak. 2017. Internal consistency of functional magnetic resonance imaging and electroencephalography measures of reward in late childhood and early adolescence. Biological Psychiatry: Cognitive Neuroscience and Neuroimaging 2 3 (2017) 289\u2013297.","DOI":"10.1016\/j.bpsc.2016.12.004"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3160489.3160498"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","unstructured":"Anders Lysne. 1984. Grading of Student\u2019s Attainment: Purposes and Functions. Scandinavian Journal of Educational Research 28 3 (1984) 149\u2013165. 10.1080\/0031383840280303","DOI":"10.1080\/0031383840280303"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3283812.3283820"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3649217.3653621"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626252.3630761"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","unstructured":"IC McManus M. Thompson and J. Mollon. 2006. Assessment of examiner leniency and stringency (\u2019hawk-dove effect\u2019) in the MRCP(UK) clinical examination (PACES) using multi-facet Rasch modelling. BMC Medical Education 6 1 (2006) 42. 10.1186\/1472-6920-6-42","DOI":"10.1186\/1472-6920-6-42"},{"key":"e_1_3_3_2_34_2","unstructured":"Marcus Messer Neil Brown Michael K\u00f6lling and Miaojing Shi. 2024. Menagerie: A Dataset of Graded Programming Assignments. https:\/\/osf.io\/q8jbt\/"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"Marcus Messer Neil C.\u00a0C. Brown Michael K\u00f6lling and Miaojing Shi. 2024. Automated Grading and Feedback Tools for Programming Education: A Systematic Review. ACM Trans. Comput. Educ. 24 1 Article 10 (Feb. 2024). 10.1145\/3636515","DOI":"10.1145\/3636515"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","unstructured":"Marcus Messer Neil C.\u00a0C. Brown Michael K\u00f6lling and Miaojing Shi. 2025. How Consistent Are Humans When Grading Programming Assignments? ACM Trans. Comput. Educ. 25 4 Article 49 (Sept. 2025). 10.1145\/3759256","DOI":"10.1145\/3759256"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626252.3630960"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","unstructured":"Jos\u00e9\u00a0Carlos Paiva Jos\u00e9\u00a0Paulo Leal and \u00c1lvaro Figueira. 2022. Automated Assessment in Computer Science Education: A State-of-the-Art Review. ACM Trans. Comput. Educ. 22 3 Article 34 (June 2022). 10.1145\/3513140","DOI":"10.1145\/3513140"},{"key":"e_1_3_3_2_39_2","unstructured":"European Parliament. 2024. EU AI Act: first regulation on artificial intelligence. https:\/\/commission.europa.eu\/news\/ai-act-enters-force-2024-08-01_en."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","unstructured":"Rebecca\u00a0J. Passonneau Kathleen Koenig Zhaohui Li and Josephine Soddano. 2023. The Ideal versus the Real Deal in Assessment of Physics Lab Report Writing. European Journal of Applied Sciences 11 2 (Apr. 2023) 626\u2013644. 10.14738\/aivp.112.14406","DOI":"10.14738\/aivp.112.14406"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.18260\/1-2--32906"},{"key":"e_1_3_3_2_42_2","first-page":"956","volume-title":"Proceedings of the 13th Language Resources and Evaluation Conference","author":"Petersen-Frey Fynn","year":"2022","unstructured":"Fynn Petersen-Frey, Marcus Soll, Louis Kobras, Melf Johannsen, Peter Kling, and Chris Biemann. 2022. Dataset of Student Solutions to Algorithm and Data Structure Programming Assignments. In Proceedings of the 13th Language Resources and Evaluation Conference. 956\u2013962. https:\/\/aclanthology.org\/2022.lrec-1.101\/"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Alastair Pollitt. 2012. The method of adaptive comparative judgement. Assessment in Education: principles policy & practice 19 3 (2012) 281\u2013300.","DOI":"10.1080\/0969594X.2012.665354"},{"key":"e_1_3_3_2_44_2","unstructured":"Lutz Prechelt Guido Malpohl Michael Philippsen et\u00a0al. 2002. Finding plagiarisms among a set of programs with JPlag. J. Univers. Comput. Sci. 8 11 (2002) 1016."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626252.3630889"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","unstructured":"Vincent\u00a0Donche San\u00a0Verhavert Renske\u00a0Bouwer and Sven\u00a0De Maeyer. 2019. A meta-analysis on the reliability of comparative judgement. Assessment in Education: Principles Policy & Practice 26 5 (2019) 541\u2013562. 10.1080\/0969594X.2019.1602027","DOI":"10.1080\/0969594X.2019.1602027"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","unstructured":"Simone Scalabrino Mario Linares-V\u00e1squez Rocco Oliveto and Denys Poshyvanyk. 2018. A comprehensive model for code readability. Journal of Software: Evolution and Process 30 6 (2018) e1958. 10.1002\/smr.1958e1958 smr.1958.","DOI":"10.1002\/smr.1958"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Stefanie Schuch Andrea\u00a0M Philipp Luisa Maulitz and Iring Koch. 2022. On the reliability of behavioral measures of cognitive control: retest reliability of task-inhibition effect task-preparation effect Stroop-like interference and conflict adaptation effect. Psychological research 86 7 (2022) 2158\u20132184.","DOI":"10.1007\/s00426-021-01627-x"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643916.3644435"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","unstructured":"D.\u00a0L. Shell. 1959. A high-speed sorting procedure. Commun. ACM 2 7 (July 1959) 30\u201332. 10.1145\/368370.368387","DOI":"10.1145\/368370.368387"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCSE51940.2021.9569444"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CSEET.2016.48"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","unstructured":"Mary Spratt. 2005. Washback and the classroom: the implications for teaching and learning of studies of washback from exams. Language Teaching Research 9 1 (2005) 5\u201329. 10.1191\/1362168805lr152oa","DOI":"10.1191\/1362168805lr152oa"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/2999541.2999555"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","unstructured":"Zahid Ullah Adidah Lajis Mona Jamjoom Abdulrahman Altalhi Abdullah Al-Ghamdi and Farrukh Saleem. 2018. The effect of automatic assessment on novice programming: Strengths and limitations of existing systems. Computer Applications in Engineering Education 26 6 (2018) 2328\u20132341. 10.1002\/cae.21974","DOI":"10.1002\/cae.21974"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/2361276.2361278"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-SEET.2019.00017"}],"event":{"name":"Koli Calling '25: 25th Koli Calling International Conference on Computing Education Research","acronym":"Koli Calling '25","location":"Koli Finland"},"container-title":["Proceedings of the 25th Koli Calling International Conference on Computing Education Research"],"original-title":[],"deposited":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T17:03:17Z","timestamp":1762448597000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3769994.3770017"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":56,"alternative-id":["10.1145\/3769994.3770017","10.1145\/3769994"],"URL":"https:\/\/doi.org\/10.1145\/3769994.3770017","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}