{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T10:35:18Z","timestamp":1777113318072,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3785022.3785070","type":"proceedings-article","created":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T09:39:01Z","timestamp":1777109941000},"page":"325-335","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing LLM-Based Data Annotation with Error Decomposition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3131-910X","authenticated-orcid":false,"given":"Zhen","family":"Xu","sequence":"first","affiliation":[{"name":"Columbia University, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4275-0745","authenticated-orcid":false,"given":"Vedant","family":"Khatri","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7258-8374","authenticated-orcid":false,"given":"Yijun","family":"Dai","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3796-2251","authenticated-orcid":false,"given":"Xiner","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1465-6337","authenticated-orcid":false,"given":"Siyan","family":"Li","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2776-0344","authenticated-orcid":false,"given":"Xuanming","family":"Zhang","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2375-3537","authenticated-orcid":false,"given":"Renzhe","family":"Yu","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-69359-9_94"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.63317\/4oh42vhxfvci"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA58977.2023.00089"},{"key":"e_1_3_3_2_5_2","unstructured":"Enrique Amig\u00f3 Julio Gonzalo Stefano Mizzaro and Jorge Carrillo-de Albornoz. 2020. An effectiveness metric for ordinal classification: Formal properties and experimental results. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.01245 (2020)."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Julian Ashwin Aditya Chhabra and Vijayendra Rao. 2023. Using large language models for qualitative analysis can introduce serious bias. Sociological Methods & Research (2023) 00491241251338246.","DOI":"10.1596\/1813-9450-10597"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISDA.2009.230"},{"key":"e_1_3_3_2_8_2","unstructured":"Joachim Baumann Paul R\u00f6ttger Aleksandra Urman Albert Wendsj\u00f6 Flor\u00a0Miriam Plaza-del Arco Johannes\u00a0B Gruber and Dirk Hovy. 2025. Large Language Model Hacking: Quantifying the Hidden Risks of Using LLMs for Text Annotation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.08825 (2025)."},{"key":"e_1_3_3_2_9_2","first-page":"60","volume-title":"Proceedings of the 14th Linguistic Annotation Workshop","author":"Beck Christin","year":"2020","unstructured":"Christin Beck, Hannah Booth, Mennatallah El-Assady, and Miriam Butt. 2020. Representation problems in linguistic annotations: Ambiguity, variation, uncertainty, error and bias. In Proceedings of the 14th Linguistic Annotation Workshop. 60\u201373."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Francisco B\u00e9rchez-Moreno Rafael Ayllon-Gavilan Victor\u00a0M Vargas David Guijo-Rubio C\u00e9sar Herv\u00e1s-Mart\u00ednez Juan\u00a0C Fern\u00e1ndez and Pedro\u00a0A Guti\u00e9rrez. 2025. dlordinal: A Python package for deep ordinal classification. Neurocomputing 622 (2025) 129305.","DOI":"10.1016\/j.neucom.2024.129305"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Arne Bewersdorff Kathrin Se\u00dfler Armin Baur Enkelejda Kasneci and Claudia Nerdel. 2023. Assessing student errors in experimentation using artificial intelligence and large language models: A comparative study with human raters. Computers and Education: Artificial Intelligence 5 (2023) 100177.","DOI":"10.1016\/j.caeai.2023.100177"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Jaime\u00a0S Cardoso and Ricardo Sousa. 2011. Measuring the performance of ordinal classification. International Journal of Pattern Recognition and Artificial Intelligence 25 08 (2011) 1173\u20131195.","DOI":"10.1142\/S0218001411009093"},{"key":"e_1_3_3_2_13_2","unstructured":"Robert Chew John Bollenbacher Michael Wenger Jessica Speer and Annice Kim. 2023. LLM-assisted content analysis: Using large language models to support deductive coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.14924 (2023)."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Jacob Cohen. 1968. Weighted kappa: Nominal scale agreement with provision for scaled disagreement or partial credit. Psychological bulletin 70 4 (1968) 213.","DOI":"10.1037\/h0026256"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-64312-5_2"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Marie-Catherine De\u00a0Marneffe Christopher\u00a0D Manning and Christopher Potts. 2012. Did it happen? The pragmatic complexity of veridicality assessment. Computational linguistics 38 2 (2012) 301\u2013333.","DOI":"10.1162\/COLI_a_00097"},{"key":"e_1_3_3_2_17_2","unstructured":"Dorottya Demszky Jing Liu Zid Mancenido Julie Cohen Heather Hill Dan Jurafsky and Tatsunori Hashimoto. 2021. Measuring conversational uptake: A case study on student-teacher interactions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.03873 (2021)."},{"key":"e_1_3_3_2_18_2","unstructured":"Ricardo Dominguez-Olmedo Vedant Nanda Rediet Abebe Stefan Bechtold Christoph Engel Jens Frankenreiter Krishna Gummadi Moritz Hardt and Michael Livermore. 2024. Lawma: The power of specialization for legal annotation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.16615 (2024)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Naoki Egami Musashi Hinck Brandon Stewart and Hanying Wei. 2023. Using imperfect surrogates for downstream inference: Design-based supervised learning for social science applications of large language models. Advances in Neural Information Processing Systems 36 (2023) 68589\u201368601.","DOI":"10.52202\/075280-3000"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Fabrizio Gilardi Meysam Alizadeh and Ma\u00ebl Kubli. 2023. ChatGPT outperforms crowd workers for text-annotation tasks. Proceedings of the National Academy of Sciences 120 30 (2023) e2305016120.","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.179"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Igor Grossmann Matthew Feinberg Dawn\u00a0C Parker Nicholas\u00a0A Christakis Philip\u00a0E Tetlock and William\u00a0A Cunningham. 2023. AI and the transformation of social science research. Science 380 6650 (2023) 1108\u20131109.","DOI":"10.1126\/science.adi1778"},{"key":"e_1_3_3_2_23_2","unstructured":"Megan Gu Chloe\u00a0Qianhui Zhao Claire Liu Nikhil Patel Jahnvi Shah Jionghao Lin and Kenneth\u00a0R Koedinger. 2025. Toward Automated Qualitative Analysis: Leveraging Large Language Models for Tutoring Dialogue Evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.13882 (2025)."},{"key":"e_1_3_3_2_24_2","unstructured":"Luke Guerdan Solon Barocas Kenneth Holstein Hanna Wallach Zhiwei\u00a0Steven Wu and Alexandra Chouldechova. 2025. Validating LLM-as-a-Judge Systems under Rating Indeterminacy. arxiv:https:\/\/arXiv.org\/abs\/2503.05965\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2503.05965"},{"key":"e_1_3_3_2_25_2","unstructured":"Kunal Handa Drew Bent Alex Tamkin Miles McCain Esin Durmus Michael Stern Mike Schiraldi Saffron Huang Stuart Ritchie Steven Syverud et\u00a0al. 2025. Anthropic Education Report: How University Students Use Claude."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Deniu He. 2022. Active learning for ordinal classification based on expected cost minimization. Scientific Reports 12 1 (2022) 22468.","DOI":"10.1038\/s41598-022-26844-1"},{"key":"e_1_3_3_2_27_2","unstructured":"Xingwei He Zhenghao Lin Yeyun Gong A-Long Jin Hang Zhang Chen Lin Jian Jiao Siu\u00a0Ming Yiu Nan Duan and Weizhu Chen. 2023. AnnoLLM: making large language models to be better crowdsourced annotators (2023). CoRR abs\/2303.16854 (2023)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-2029"},{"key":"e_1_3_3_2_29_2","first-page":"195","volume-title":"LAK Workshops","author":"Hu Yuanyuan","year":"2024","unstructured":"Yuanyuan Hu, Nasser Giacaman, and Claire Donald. 2024. Enhancing Trust in Generative AI: Investigating Explainability of LLMs to Analyse Confusion in MOOC Discussions.. In LAK Workshops. 195\u2013204."},{"key":"e_1_3_3_2_30_2","unstructured":"Zden\u011bk Kasner Vil\u00e9m Zouhar Patr\u00edcia Schmidtov\u00e1 Ivan Kart\u00e1\u010d Krist\u1ef3na Onderkov\u00e1 Ond\u0159ej Pl\u00e1tek Dimitra Gkatzia Saad Mahamood Ond\u0159ej Du\u0161ek and Simone Balloccu. 2025. Large Language Models as Span Annotators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.08697 (2025)."},{"key":"e_1_3_3_2_31_2","unstructured":"Omar Khattab Arnav Singhvi Paridhi Maheshwari Zhiyuan Zhang Keshav Santhanam Sri Vardhamanan Saiful Haq Ashutosh Sharma Thomas\u00a0T Joshi Hanna Moazam et\u00a0al. 2023. Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.03714 (2023)."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Jan-Christoph Klie Bonnie Webber and Iryna Gurevych. 2023. Annotation error detection: Analyzing the past and present for a more coherent future. Computational Linguistics 49 1 (2023) 157\u2013198.","DOI":"10.1162\/coli_a_00464"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Klaus Krippendorff. 2004. Measuring the reliability of qualitative text analysis data. Quality and quantity 38 6 (2004) 787\u2013800.","DOI":"10.1007\/s11135-004-8107-7"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Klaus Krippendorff. 2008. Systematic and random disagreement and the reliability of nominal data. Communication Methods and Measures 2 4 (2008) 323\u2013338.","DOI":"10.1080\/19312450802467134"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"Klaus Krippendorff. 2011. Agreement and information in the reliability of coding. Communication methods and measures 5 2 (2011) 93\u2013112.","DOI":"10.1080\/19312458.2011.568376"},{"key":"e_1_3_3_2_36_2","volume-title":"Content analysis: An introduction to its methodology","author":"Krippendorff Klaus","year":"2018","unstructured":"Klaus Krippendorff. 2018. Content analysis: An introduction to its methodology. Sage publications."},{"key":"e_1_3_3_2_37_2","unstructured":"Aakriti Kumar Nalin Poungpeth Diyi Yang Erina Farrell Bruce Lambert and Matthew Groh. 2025. When Large Language Models are Reliable for Judging Empathic Communication. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.10150 (2025)."},{"key":"e_1_3_3_2_38_2","first-page":"530","volume-title":"Educational data mining 2022","author":"Li Yuheng","year":"2022","unstructured":"Yuheng Li, Mladen Rakovic, Boon\u00a0Xin Poh, Dragan Ga\u0161evic, and Guanliang Chen. 2022. Automatic classification of learning objectives based on bloom\u2019s taxonomy. In Educational data mining 2022. International Educational Data Mining Society, 530\u2013537."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Xiner Liu Andres\u00a0Felipe Zambrano Ryan\u00a0S Baker Amanda Barany Jaclyn Ocumpaugh Jiayi Zhang Maciej Pankiewicz Nidhi Nasiar and Zhanlan Wei. 2025. Qualitative Coding with GPT-4: Where It Works Better. Journal of Learning Analytics 12 1 (2025) 169\u2013185.","DOI":"10.18608\/jla.2025.8575"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-76335-9_7"},{"key":"e_1_3_3_2_41_2","unstructured":"Zhenyi Lu Jie Tian Wei Wei Xiaoye Qu Yu Cheng Dangyang Chen et\u00a0al. 2024. Mitigating boundary ambiguity and inherent bias for text classification in the era of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.07001 (2024)."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Jakub Macina Nico Daheim Sankalan\u00a0Pal Chowdhury Tanmay Sinha Manu Kapur Iryna Gurevych and Mrinmaya Sachan. 2023. Mathdial: A dialogue tutoring dataset with rich pedagogical properties grounded in math reasoning problems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.14536 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.372"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-19400-9_14"},{"key":"e_1_3_3_2_44_2","first-page":"809","volume-title":"Proceedings of the 17th International Conference on Educational Data Mining","author":"McClure Jeanne","year":"2024","unstructured":"Jeanne McClure, Daria Smyslova, Amanda Hall, and Shiyan Jiang. 2024. Deductive Coding\u2019s Role in AI vs. Human Performance. In Proceedings of the 17th International Conference on Educational Data Mining. 809\u2013813."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0D McKelvey and William Zavoina. 1975. A statistical model for the analysis of ordinal level dependent variables. Journal of mathematical sociology 4 1 (1975) 103\u2013120.","DOI":"10.1080\/0022250X.1975.9989847"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Arbi\u00a0Haza Nasution and Aytu\u011f Onan. 2024. Chatgpt label: Comparing the quality of human-generated and llm-generated annotations in low-resource language nlp tasks. Ieee Access 12 (2024) 71876\u201371900.","DOI":"10.1109\/ACCESS.2024.3402809"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Cliodhna O\u2019Connor and Helene Joffe. 2020. Intercoder reliability in qualitative research: Debates and practical guidelines. International journal of qualitative methods 19 (2020) 1609406919899220.","DOI":"10.1177\/1609406919899220"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Austin Pack Alex Barrett and Juan Escalante. 2024. Large language models and automated essay scoring of English language learner writing: Insights into validity and reliability. Computers and Education: Artificial Intelligence 6 (2024) 100234.","DOI":"10.1016\/j.caeai.2024.100234"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-2083"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Soujanya Poria Devamanyu Hazarika Navonil Majumder and Rada Mihalcea. 2020. Beneath the tip of the iceberg: Current challenges and new directions in sentiment analysis research. IEEE transactions on affective computing 14 1 (2020) 108\u2013132.","DOI":"10.1109\/TAFFC.2020.3038167"},{"key":"e_1_3_3_2_51_2","unstructured":"Keita Saito Akifumi Wachi Koki Wataoka and Youhei Akimoto. 2023. Verbosity bias in preference labeling by large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.10076 (2023)."},{"key":"e_1_3_3_2_52_2","unstructured":"Maarten Sap Swabha Swayamdipta Laura Vianna Xuhui Zhou Yejin Choi and Noah\u00a0A Smith. 2021. Annotators with attitudes: How annotator beliefs and identities bias toxic language detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.07997 (2021)."},{"key":"e_1_3_3_2_53_2","first-page":"60","volume-title":"International Conference on Artificial Intelligence in Education","author":"Simon Sebastian","year":"2025","unstructured":"Sebastian Simon, Sreecharan Sankaranarayanan, Elham Tajik, Conrad Borchers, Bahar Shahrokhian, Francesco Balzan, Sebastian Strau\u00df, Sree\u00a0Aurovindh Viswanathan, Amine\u00a0Hatun Ata\u015f, Mia \u010carapina, et\u00a0al. 2025. Comparing a Human\u2019s and a Multi-Agent System\u2019s Thematic Analysis: Assessing Qualitative Coding Consistency. In International Conference on Artificial Intelligence in Education. Springer, 60\u201373."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"crossref","unstructured":"Li Siyan Zhen Xu Vethavikashini\u00a0Chithrra Raghuram Xuanming Zhang Renzhe Yu and Zhou Yu. 2025. Bringing Pedagogy into Focus: Evaluating Virtual Teaching Assistants\u2019 Question-Answering in Asynchronous Learning Environments. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.17961 (2025).","DOI":"10.18653\/v1\/2025.findings-emnlp.518"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Stanley\u00a0Smith Stevens. 1946. On the theory of scales of measurement. Science 103 2684 (1946) 677\u2013680.","DOI":"10.1126\/science.103.2684.677"},{"key":"e_1_3_3_2_56_2","first-page":"138","volume-title":"Proceedings of the first workshop on NLP and computational social science","author":"Talat Zeerak","year":"2016","unstructured":"Zeerak Talat. 2016. Are you a racist or am i seeing things? annotator influence on hate speech detection on twitter. In Proceedings of the first workshop on NLP and computational social science. 138\u2013142."},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"crossref","unstructured":"Zhen Tan Dawei Li Song Wang Alimohammad Beigi Bohan Jiang Amrita Bhattacharjee Mansooreh Karami Jundong Li Lu Cheng and Huan Liu. 2024. Large language models for data annotation and synthesis: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13446 (2024).","DOI":"10.18653\/v1\/2024.emnlp-main.54"},{"key":"e_1_3_3_2_58_2","unstructured":"Kaito Tanaka Benjamin Tan and Brian Wong. 2024. Leveraging language models for emotion and behavior analysis in education. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06874 (2024)."},{"key":"e_1_3_3_2_59_2","unstructured":"Danielle\u00a0R Thomas Conrad Borchers and Kenneth\u00a0R Koedinger. 2025. Beyond Agreement: Rethinking Ground Truth in Educational AI Annotation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.00143 (2025)."},{"key":"e_1_3_3_2_60_2","unstructured":"Petter T\u00f6rnberg. 2023. Chatgpt-4 outperforms experts and crowd workers in annotating political twitter messages with zero-shot learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.06588 (2023)."},{"key":"e_1_3_3_2_61_2","unstructured":"Nhat Tran Benjamin Pierce Diane Litman Richard Correnti Lindsay\u00a0Clare Matsumura et\u00a0al. 2024. Multi-dimensional performance analysis of large language models for classroom discussion assessment. Journal of Educational Data Mining 16 2 (2024) 304\u2013335."},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"crossref","unstructured":"Isaac Triguero and Celine Vens. 2016. Labelling strategies for hierarchical multi-label classification techniques. Pattern Recognition 56 (2016) 170\u2013183.","DOI":"10.1016\/j.patcog.2016.02.017"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"crossref","unstructured":"Sophie Vanbelle Christina\u00a0Hernandez Engelhart and Ellen Blix. 2024. A comprehensive guide to study the agreement and reliability of multi-observer ordinal data. BMC medical research methodology 24 1 (2024) 310.","DOI":"10.1186\/s12874-024-02431-y"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.868"},{"key":"e_1_3_3_2_65_2","unstructured":"Shuhe Wang Xiaofei Sun Xiaoya Li Rongbin Ouyang Fei Wu Tianwei Zhang Jiwei Li and Guoyin Wang. 2023. Gpt-ner: Named entity recognition via large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.10428 (2023)."},{"key":"e_1_3_3_2_66_2","unstructured":"Leon Weber-Genzel Siyao Peng Marie-Catherine De\u00a0Marneffe and Barbara Plank. 2024. VariErr NLI: Separating annotation error from human label variation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.01931 (2024)."},{"key":"e_1_3_3_2_67_2","unstructured":"Xuansheng Wu Padmaja\u00a0Pravin Saraf Gyeonggeon Lee Ehsan Latif Ninghao Liu and Xiaoming Zhai. 2025. Unveiling scoring processes: Dissecting the differences between llms and human graders in automatic scoring. Technology Knowledge and Learning (2025) 1\u201316."},{"key":"e_1_3_3_2_68_2","unstructured":"Xinyi Wu Yifei Wang Stefanie Jegelka and Ali Jadbabaie. 2025. On the emergence of position bias in transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.01951 (2025)."},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-98465-5_26"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-47014-1_32"},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"crossref","unstructured":"Caleb Ziems William Held Omar Shaikh Jiaao Chen Zhehao Zhang and Diyi Yang. 2024. Can large language models transform computational social science? Computational Linguistics 50 1 (2024) 237\u2013291.","DOI":"10.1162\/coli_a_00502"}],"event":{"name":"LAK 2026: LAK26: 16th International Learning Analytics and Knowledge Conference","location":"Bergen Norway","acronym":"LAK 2026"},"container-title":["Proceedings of the LAK26: 16th International Learning Analytics and Knowledge Conference"],"original-title":[],"deposited":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T09:44:06Z","timestamp":1777110246000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3785022.3785070"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":70,"alternative-id":["10.1145\/3785022.3785070","10.1145\/3785022"],"URL":"https:\/\/doi.org\/10.1145\/3785022.3785070","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}