{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T19:31:31Z","timestamp":1772652691781,"version":"3.50.1"},"reference-count":52,"publisher":"Elsevier BV","issue":"2","license":[{"start":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T00:00:00Z","timestamp":1729209600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T00:00:00Z","timestamp":1729209600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Artif Intell Educ"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s40593-024-00431-z","type":"journal-article","created":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T16:29:33Z","timestamp":1729268973000},"page":"651-676","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Can LLMs Grade Open Response Reading Comprehension Questions? An Empirical Study Using the ROARs\u00a0Dataset"],"prefix":"10.1016","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8850-067X","authenticated-orcid":false,"given":"Owen","family":"Henkel","sequence":"first","affiliation":[]},{"given":"Libby","family":"Hills","sequence":"additional","affiliation":[]},{"given":"Bill","family":"Roberts","sequence":"additional","affiliation":[]},{"given":"Joshua","family":"McGrane","sequence":"additional","affiliation":[]}],"member":"78","published-online":{"date-parts":[[2024,10,18]]},"reference":[{"key":"431_CR1","doi-asserted-by":"publisher","unstructured":"Alikaniotis, D., Yannakoudakis, H., & Rei, M. (2016). Automatic Text Scoring Using Neural Networks. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 715\u2013725. https:\/\/doi.org\/10.18653\/v1\/P16-1068","DOI":"10.18653\/v1\/P16-1068"},{"issue":"4","key":"431_CR2","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s40593-021-00285-9","volume":"32","author":"RS Baker","year":"2022","unstructured":"Baker, R. S., & Hawn, A. (2022). Algorithmic Bias in Education. International Journal of Artificial Intelligence in Education, 32(4), 4. https:\/\/doi.org\/10.1007\/s40593-021-00285-9","journal-title":"International Journal of Artificial Intelligence in Education"},{"key":"431_CR3","doi-asserted-by":"publisher","first-page":"3","DOI":"10.2307\/3315487","volume":"27","author":"M Banerjee","year":"2008","unstructured":"Banerjee, M., Capozzoli, M., Mcsweeney, L., & Sinha, D. (2008). Beyond Kappa: A Review of Interrater Agreement Measures. Canadian Journal of Statistics, 27, 3\u201323. https:\/\/doi.org\/10.2307\/3315487","journal-title":"Canadian Journal of Statistics"},{"issue":"4","key":"431_CR4","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1002\/pits.20563","volume":"48","author":"JM Bellinger","year":"2011","unstructured":"Bellinger, J. M., & DiPerna, J. C. (2011). Is fluency-based story retell a good indicator of reading comprehension? Psychology in the Schools, 48(4), 4. https:\/\/doi.org\/10.1002\/pits.20563","journal-title":"Psychology in the Schools"},{"key":"431_CR5","doi-asserted-by":"publisher","first-page":"004912411879937","DOI":"10.1177\/0049124118799372","volume":"50","author":"J Belur","year":"2018","unstructured":"Belur, J., Tompson, L., Thornton, A., & Simon, M. (2018). Interrater Reliability in Systematic Review Methodology: Exploring Variation in Coder Decision-Making. Sociological Methods & Research, 50, 004912411879937. https:\/\/doi.org\/10.1177\/0049124118799372","journal-title":"Sociological Methods & Research"},{"issue":"1","key":"431_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11092-008-9068-5","volume":"21","author":"P Black","year":"2009","unstructured":"Black, P., & Wiliam, D. (2009). Developing the theory of formative assessment. Educational Assessment, Evaluation and Accountability, 21(1), 1. https:\/\/doi.org\/10.1007\/s11092-008-9068-5","journal-title":"Educational Assessment, Evaluation and Accountability"},{"key":"431_CR7","unstructured":"Bommasani, R., Hudson, D. A., Adeli, E., Altman, R., Arora, S., von Arx, S., Bernstein, M. S., Bohg, J., Bosselut, A., Brunskill, E., Brynjolfsson, E., Buch, S., Card, D., Castellon, R., Chatterji, N., Chen, A., Creel, K., Davis, J. Q., Demszky, D., \u2026 Liang, P. (2022). On the Opportunities and Risks of Foundation Models https:\/\/arxiv.org\/abs\/2108.07258"},{"key":"431_CR8","unstructured":"Brown, T. B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., Agarwal, S., Herbert-Voss, A., Krueger, G., Henighan, T., Child, R., Ramesh, A., Ziegler, D. M., Wu, J., Winter, C., \u2026 Amodei, D. (2020). Language Models are Few-Shot Learners. arXiv:2005.14165 [Cs]. http:\/\/arxiv.org\/abs\/2005.14165"},{"issue":"1","key":"431_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s40593-014-0026-8","volume":"25","author":"S Burrows","year":"2015","unstructured":"Burrows, S., Gurevych, I., & Stein, B. (2015). The Eras and Trends of Automatic Short Answer Grading. International Journal of Artificial Intelligence in Education, 25(1), 1. https:\/\/doi.org\/10.1007\/s40593-014-0026-8","journal-title":"International Journal of Artificial Intelligence in Education"},{"key":"431_CR10","volume-title":"Children\u2019s comprehension problems in oral and written language a cognitive perspective","author":"K Cain","year":"2007","unstructured":"Cain, K., & Oakhill, J. (2007). Children\u2019s comprehension problems in oral and written language a cognitive perspective. Guilford Press."},{"key":"431_CR11","unstructured":"Caines, A., Benedetto, L., Taslimipoor, S., Davis, C., Gao, Y., Andersen, O., Yuan, Z., Elliott, M., Moore, R., Bryant, C., Rei, M., Yannakoudakis, H., Mullooly, A., Nicholls, D., & Buttery, P. (2023). On the application of Large Language Models for language teaching and assessment technology http:\/\/arxiv.org\/abs\/2307.08393"},{"key":"431_CR12","doi-asserted-by":"publisher","unstructured":"Camus, L., & Filighera, A. (2020). Investigating Transformers for Automatic Short Answer Grading. In I. I. Bittencourt, M. Cukurova, K. Muldner, R. Luckin, & E. Mill\u00e1n (Eds.), Artificial Intelligence in Education (Vol. 12164, pp. 43\u201348). Springer International Publishing. https:\/\/doi.org\/10.1007\/978-3-030-52240-7_8","DOI":"10.1007\/978-3-030-52240-7_8"},{"issue":"240","key":"431_CR13","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., Narang, S., Devlin, J., Bosma, M., Mishra, G., Roberts, A., Barham, P., Chung, H. W., Sutton, C., Gehrmann, S., Schuh, P., Shi, K., Tsvyashchenko, S., Maynez, J., Rao, A., Barnes, P., Tay, Y., Shazeer, N., Prabhakaran, V., & Salakhutdinov, R. (2023). PaLM: Scaling Language Modeling with Pathways. Journal of Machine Learning Research, 24(240), 1\u201313.","journal-title":"Journal of Machine Learning Research"},{"key":"431_CR14","unstructured":"Cohn, C., Hutchins, N., Le, T., & Biswas, G. (2024). A Chain-of-Thought Prompting Approach with LLMs for Evaluating Students\u2019 Formative Assessment Responses in Science (arXiv:2403.14565). arXiv. http:\/\/arxiv.org\/abs\/2403.14565"},{"issue":"3","key":"431_CR15","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/s00357-021-09386-5","volume":"38","author":"A De Raadt","year":"2021","unstructured":"De Raadt, A., Warrens, M. J., Bosker, R. J., & Kiers, H. A. L. (2021). A Comparison of Reliability Coefficients for Ordinal Rating Scales. Journal of Classification, 38(3), 3. https:\/\/doi.org\/10.1007\/s00357-021-09386-5","journal-title":"Journal of Classification"},{"key":"431_CR16","unstructured":"Fernandez, N., Ghosh, A., Liu, N., Wang, Z., Choffin, B., Baraniuk, R., & Lan, A. (2023). Automated Scoring for Reading Comprehension via In-context BERT Tuning (arXiv:2205.09864; Issue arXiv:2205.09864). arXiv. http:\/\/arxiv.org\/abs\/2205.09864"},{"issue":"30","key":"431_CR17","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1073\/pnas.2305016120","volume":"120","author":"F Gilardi","year":"2023","unstructured":"Gilardi, F., Alizadeh, M., & Kubli, M. (2023). ChatGPT outperforms crowd workers for text-annotation tasks. Proceedings of the National Academy of Sciences, 120(30), 30. https:\/\/doi.org\/10.1073\/pnas.2305016120","journal-title":"Proceedings of the National Academy of Sciences"},{"key":"431_CR18","doi-asserted-by":"publisher","unstructured":"Haller, S., Aldea, A., Seifert, C., & Strisciuglio, N. (2022). Survey on automated short answer grading with deep learning: From word embeddings to transformers. https:\/\/doi.org\/10.48550\/arXiv.2204.03503","DOI":"10.48550\/arXiv.2204.03503"},{"issue":"3","key":"431_CR19","doi-asserted-by":"publisher","first-page":"Article 3","DOI":"10.1080\/10888430802132279","volume":"12","author":"JM Keenan","year":"2008","unstructured":"Keenan, J. M., Betjemann, R. S., & Olson, R. K. (2008). Reading comprehension tests vary in the skills they assess: Differential dependence on decoding and oral comprehension. Scientific Studies of Reading, 12(3), Article 3. https:\/\/doi.org\/10.1080\/10888430802132279","journal-title":"Scientific Studies of Reading"},{"issue":"4","key":"431_CR20","doi-asserted-by":"publisher","first-page":"Article 4","DOI":"10.2307\/747509","volume":"13","author":"I Kirsch","year":"1977","unstructured":"Kirsch, I., & Guthrie, J. T. (1977). The concept and measurement of functional literacy. Reading Research Quarterly, 13(4), Article 4. https:\/\/doi.org\/10.2307\/747509","journal-title":"Reading Research Quarterly"},{"issue":"1","key":"431_CR21","doi-asserted-by":"publisher","first-page":"Article 1","DOI":"10.1080\/10228195.2011.569739","volume":"42","author":"A Klaas","year":"2011","unstructured":"Klaas, A., & Trudell, B. (2011). Effective literacy programmes and independent reading in African contexts. Language Matters, 42(1), Article 1. https:\/\/doi.org\/10.1080\/10228195.2011.569739","journal-title":"Language Matters"},{"key":"431_CR22","unstructured":"Kojima, T., Gu, S. S., Reid, M., Matsuo, Y., & Iwasawa, Y. (2022). Large Language Models are Zero-Shot Reasoners"},{"key":"431_CR23","doi-asserted-by":"crossref","unstructured":"Kortemeyer, G. (2023). Performance of the Pre-Trained Large Language Model GPT-4 on Automated Short Answer Grading (arXiv:2309.09338). arXiv. http:\/\/arxiv.org\/abs\/2309.09338","DOI":"10.1007\/s44163-024-00147-y"},{"key":"431_CR24","unstructured":"Kuzman, T., Mozeti\u010d, I., & Ljube\u0161i\u0107, N. (2023). ChatGPT: Beginning of an End of Manual Linguistic Data Annotation? Use Case of Automatic Genre Identification (arXiv:2303.03953; Issue arXiv:2303.03953). arXiv. http:\/\/arxiv.org\/abs\/2303.03953"},{"issue":"1","key":"431_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1080\/00405840802577593","volume":"48","author":"TK Landauer","year":"2009","unstructured":"Landauer, T. K., Lochbaum, K. E., & Dooley, S. (2009). A New Formative Assessment Technology for Reading and Writing. Theory Into Practice, 48(1), 1. https:\/\/doi.org\/10.1080\/00405840802577593","journal-title":"Theory Into Practice"},{"issue":"1","key":"431_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.2307\/2529310","volume":"33","author":"JR Landis","year":"1977","unstructured":"Landis, J. R., & Koch, G. G. (1977). The Measurement of Observer Agreement for Categorical Data. Biometrics, 33(1), 1. https:\/\/doi.org\/10.2307\/2529310","journal-title":"Biometrics"},{"key":"431_CR27","doi-asserted-by":"publisher","unstructured":"Lockheed, M. (2008). Measuring progress with tests of learning: Pros and cons for \u201ccash on delivery aid\u201d in education. SSRN Electronic Journal. https:\/\/doi.org\/10.2139\/ssrn.1213162","DOI":"10.2139\/ssrn.1213162"},{"issue":"3","key":"431_CR28","doi-asserted-by":"publisher","first-page":"3","DOI":"10.3758\/s13428-012-0211-3","volume":"44","author":"JP Magliano","year":"2012","unstructured":"Magliano, J. P., & Graesser, A. C. (2012). Computer-based assessment of student-constructed responses. Behavior Research Methods, 44(3), 3. https:\/\/doi.org\/10.3758\/s13428-012-0211-3","journal-title":"Behavior Research Methods"},{"key":"431_CR29","unstructured":"Matelsky, J. K., Parodi, F., Liu, T., Lange, R. D., & Kording, K. P. (2023). A large language model-assisted education tool to provide feedback on open-ended responses (arXiv:2308.02439). arXiv. http:\/\/arxiv.org\/abs\/2308.02439"},{"key":"431_CR30","doi-asserted-by":"crossref","unstructured":"Mishra, S., Khashabi, D., Baral, C., & Hajishirzi, H. (2022). Cross-Task Generalization via Natural Language Crowdsourcing Instructions (arXiv:2104.08773). arXiv. http:\/\/arxiv.org\/abs\/2104.08773","DOI":"10.18653\/v1\/2022.acl-long.244"},{"issue":"2","key":"431_CR31","doi-asserted-by":"publisher","first-page":"100050","DOI":"10.1016\/j.rmal.2023.100050","volume":"2","author":"A Mizumoto","year":"2023","unstructured":"Mizumoto, A., & Eguchi, M. (2023). Exploring the potential of using an AI language model for automated essay scoring. Research Methods in Applied Linguistics, 2(2), 100050. https:\/\/doi.org\/10.1016\/j.rmal.2023.100050","journal-title":"Research Methods in Applied Linguistics"},{"issue":"1","key":"431_CR32","doi-asserted-by":"publisher","first-page":"32","DOI":"10.3390\/ime3010004","volume":"3","author":"L Morjaria","year":"2024","unstructured":"Morjaria, L., Burns, L., Bracken, K., Levinson, A. J., Ngo, Q. N., Lee, M., & Sibbald, M. (2024). Examining the Efficacy of ChatGPT in Marking Short answer Assessments in an Undergraduate Medical Program. International Medical Education, 3(1), 32\u201343. https:\/\/doi.org\/10.3390\/ime3010004","journal-title":"International Medical Education"},{"issue":"9","key":"431_CR33","doi-asserted-by":"publisher","first-page":"Article 9","DOI":"10.1111\/j.1469-7610.2010.02254.x","volume":"51","author":"K Nation","year":"2010","unstructured":"Nation, K., Cocksey, J., Taylor, J. S. H., & Bishop, D. V. M. (2010). A longitudinal investigation of early reading and language skills in children with poor reading comprehension: A longitudinal investigation of early reading and language skills. Journal of Child Psychology and Psychiatry, 51(9), Article 9. https:\/\/doi.org\/10.1111\/j.1469-7610.2010.02254.x","journal-title":"Journal of Child Psychology and Psychiatry"},{"key":"431_CR34","doi-asserted-by":"publisher","unstructured":"Ouyang, L., Wu, J., Jiang, X., Almeida, D., Wainwright, C. L., Mishkin, P., Zhang, C., Agarwal, S., Slama, K., Ray, A., Schulman, J., Hilton, J., Kelton, F., Miller, L., Simens, M., Askell, A., Welinder, P., Christiano, P., Leike, J., & Lowe, R. (2022). Training language models to follow instructions with human feedback (arXiv:2203.02155; Issue arXiv:2203.02155). arXiv. https:\/\/doi.org\/10.48550\/arXiv.2203.02155","DOI":"10.48550\/arXiv.2203.02155"},{"key":"431_CR35","volume-title":"Children\u2019s reading comprehension and assessment","author":"PD Pearson","year":"2006","unstructured":"Pearson, P. D., & Hamm, D. N. (2006). The assessment of reading comprehension: A review of practices\u2014past, present, and future. In Children\u2019s reading comprehension and assessment. Lawrence Erlbaum Associates."},{"key":"431_CR36","unstructured":"Perez, E., Kiela, D., & Cho, K. (2021). True Few-Shot Learning with Language Models."},{"key":"431_CR37","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 [Cs, Stat]. http:\/\/arxiv.org\/abs\/1910.10683"},{"issue":"3","key":"431_CR38","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1080\/10888438.2010.538780","volume":"16","author":"DK Reed","year":"2012","unstructured":"Reed, D. K., & Vaughn, S. (2012). Retell as an indicator of reading comprehension. Scientific Studies of Reading, 16(3), 187\u2013217. https:\/\/doi.org\/10.1080\/10888438.2010.538780","journal-title":"Scientific Studies of Reading"},{"key":"431_CR39","unstructured":"Ridley, R., He, L., Dai, X., Huang, S., & Chen, J. (2020). Prompt Agnostic Essay Scorer: A Domain Generalization Approach to Cross-prompt Automated Essay Scoring (arXiv:2008.01441). arXiv. http:\/\/arxiv.org\/abs\/2008.01441"},{"key":"431_CR40","doi-asserted-by":"crossref","unstructured":"Schneider, J., Schenk, B., Niklaus, C., & Vlachos, M. (2023). Towards LLM-based Autograding for Short Textual Answers.","DOI":"10.5220\/0012552200003693"},{"issue":"7","key":"431_CR41","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1080\/02702711.2013.790328","volume":"35","author":"ES Shapiro","year":"2014","unstructured":"Shapiro, E. S., Fritschmann, N. S., Thomas, L. B., Hughes, C. L., & McDougal, J. (2014). Concurrent and Predictive Validity of Reading Retell as a Brief Measure of Reading Comprehension for Narrative Text. Reading Psychology, 35(7), 7. https:\/\/doi.org\/10.1080\/02702711.2013.790328","journal-title":"Reading Psychology"},{"issue":"1","key":"431_CR42","doi-asserted-by":"publisher","first-page":"153","DOI":"10.3102\/0034654307313795","volume":"78","author":"VJ Shute","year":"2008","unstructured":"Shute, V. J. (2008). Focus on Formative Feedback. Review of Educational Research, 78(1), 153\u2013189. https:\/\/doi.org\/10.3102\/0034654307313795","journal-title":"Review of Educational Research"},{"key":"431_CR43","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1080\/02702711.2018.1555361","volume":"40","author":"G Smith","year":"2019","unstructured":"Smith, G., & Paige, D. (2019). A Study of Reliability Across Multiple Raters When Using the NAEP and MDFS Rubrics to Measure Oral Reading Fluency. Reading Psychology, 40, 34.","journal-title":"Reading Psychology"},{"key":"431_CR44","doi-asserted-by":"publisher","unstructured":"Spaull, N., Pretorius, E., & Mohohlwane, N. (2020). Investigating the comprehension iceberg: Developing empirical benchmarks for early-grade reading in agglutinating African languages. South African Journal of Childhood Education, 10(1). https:\/\/doi.org\/10.4102\/sajce.v10i1.773","DOI":"10.4102\/sajce.v10i1.773"},{"key":"431_CR45","doi-asserted-by":"publisher","unstructured":"Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2022). Learning to summarize from human feedback (arXiv:2009.01325; Issue arXiv:2009.01325). arXiv. https:\/\/doi.org\/10.48550\/arXiv.2009.01325","DOI":"10.48550\/arXiv.2009.01325"},{"key":"431_CR46","doi-asserted-by":"publisher","unstructured":"Sultan, M. A., Salazar, C., & Sumner, T. (2016). Fast and easy short answer grading with high accuracy (pp. 1070\u20131075). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/v1\/N16-1123","DOI":"10.18653\/v1\/N16-1123"},{"key":"431_CR47","doi-asserted-by":"publisher","unstructured":"Sultan, M. A., Sil, A., & Florian, R. (2022). Not to Overfit or Underfit the Source Domains? An Empirical Study of Domain Generalization in Question Answering. Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, 3752\u20133761. https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.247","DOI":"10.18653\/v1\/2022.emnlp-main.247"},{"key":"431_CR48","doi-asserted-by":"publisher","unstructured":"Sung, C., Dhamecha, T., Saha, S., Ma, T., Reddy, V., & Arora, R. (2019). Pre-Training BERT on Domain Resources for Short Answer Grading. Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 6070\u20136074. https:\/\/doi.org\/10.18653\/v1\/D19-1628","DOI":"10.18653\/v1\/D19-1628"},{"issue":"1","key":"431_CR49","doi-asserted-by":"publisher","first-page":"Article 1","DOI":"10.1177\/014662169001400101","volume":"14","author":"H van den Bergh","year":"1990","unstructured":"van den Bergh, H. (1990). On the construct validity of multiple-choice items for reading comprehension. Applied Psychological Measurement, 14(1), Article 1. https:\/\/doi.org\/10.1177\/014662169001400101","journal-title":"Applied Psychological Measurement"},{"key":"431_CR50","unstructured":"Wei, J., Tay, Y., Bommasani, R., Raffel, C., Zoph, B., Borgeaud, S., Yogatama, D., Bosma, M., Zhou, D., Metzler, D., Chi, E. H., Hashimoto, T., Vinyals, O., Liang, P., Dean, J., & Fedus, W. (2022). Emergent Abilities of Large Language Models (arXiv:2206.07682; Issue arXiv:2206.07682). arXiv. http:\/\/arxiv.org\/abs\/2206.07682"},{"key":"431_CR51","doi-asserted-by":"publisher","unstructured":"Weidinger, L., Uesato, J., Rauh, M., Griffin, C., Huang, P.-S., Mellor, J., Glaese, A., Cheng, M., Balle, B., Kasirzadeh, A., Biles, C., Brown, S., Kenton, Z., Hawkins, W., Stepleton, T., Birhane, A., Hendricks, L. A., Rimell, L., Isaac, W., \u2026 Gabriel, I. (2022). Taxonomy of Risks posed by Language Models. Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, 214\u2013229. https:\/\/doi.org\/10.1145\/3531146.3533088","DOI":"10.1145\/3531146.3533088"},{"key":"431_CR52","doi-asserted-by":"crossref","unstructured":"Ye, Q., Lin, B. Y., & Ren, X. (2021). CrossFit: A Few-shot Learning Challenge for Cross-task Generalization in NLP (arXiv:2104.08835). arXiv. http:\/\/arxiv.org\/abs\/2104.08835","DOI":"10.18653\/v1\/2021.emnlp-main.572"}],"container-title":["International Journal of Artificial Intelligence in Education"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40593-024-00431-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40593-024-00431-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40593-024-00431-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T18:12:52Z","timestamp":1772647972000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40593-024-00431-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,18]]},"references-count":52,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["431"],"URL":"https:\/\/doi.org\/10.1007\/s40593-024-00431-z","relation":{},"ISSN":["1560-4292","1560-4306"],"issn-type":[{"value":"1560-4292","type":"print"},{"value":"1560-4306","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,18]]},"assertion":[{"value":"19 September 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 October 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The first author has an ongoing research partnership with Rising Academies and works as a consultant on a project related to developing a conversational agent to support students\u2019 math skills. This study is an extension of his doctoral dissertation and predates the consulting relationship.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}}]}}