{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:04:51Z","timestamp":1780671891093,"version":"3.54.1"},"reference-count":61,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109199","type":"journal-article","created":{"date-parts":[[2026,5,30]],"date-time":"2026-05-30T05:28:50Z","timestamp":1780118930000},"page":"109199","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["SAS-bench: A fine-grained benchmark for evaluating short answer scoring with large language models"],"prefix":"10.1016","volume":"203","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6936-5687","authenticated-orcid":false,"given":"Peichao","family":"Lai","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6509-7591","authenticated-orcid":false,"given":"Kexuan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3380-7044","authenticated-orcid":false,"given":"Yi","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linyihan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3368-1514","authenticated-orcid":false,"given":"Feiyang","family":"Ye","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinhao","family":"Yan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yanwei","family":"Xu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Conghui","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wentao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1826-6707","authenticated-orcid":false,"given":"Yilei","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-4677","authenticated-orcid":false,"given":"Bin","family":"Cui","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109199_bib0001","series-title":"Proceedings of the 17th international conference on natural language processing, ICON 2020, Indian institute of technology Patna, Patna, India, December 18-21, 2020","first-page":"430","article-title":"ScAA: A dataset for automated short answer grading of children\u2019s free-text answers in Hindi and Marathi","author":"Agarwal","year":"2020"},{"key":"10.1016\/j.neunet.2026.109199_bib0002","series-title":"Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers), ACL 2024, Bangkok, Thailand, August 11\u201316, 2024","first-page":"14388","article-title":"L-eval: Instituting standardized evaluation for long context language models","author":"An","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0003","unstructured":"Barbara, Hamner, B., Morgan, J., lynnvandev, & Shermis, M. (2012). The Hewlett foundation: Short answer scoring. https:\/\/kaggle.com\/competitions\/asap-sas. Kaggle."},{"issue":"Part A","key":"10.1016\/j.neunet.2026.109199_bib0004","article-title":"Improving the performance of automatic short answer grading using transfer learning and augmentation","volume":"123","author":"Bonthu","year":"2023","journal-title":"Engineering Applications of Artificial Intelligence"},{"issue":"1","key":"10.1016\/j.neunet.2026.109199_bib0005","first-page":"1","article-title":"Improving your test-taking skills","volume":"1","author":"Boyd","year":"1989","journal-title":"Practical Assessment, Research, and Evaluation"},{"key":"10.1016\/j.neunet.2026.109199_bib0006","series-title":"Proceedings of the 19th workshop on innovative use of NLP for building educational applications, BEA 2024, Mexico city, Mexico, June 20, 2024","first-page":"309","article-title":"LLMs in short answer scoring: limitations and promise of zero-shot and few-shot approaches","author":"Chamieh","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0007","series-title":"Proceedings of the 2022 conference of the North American chapter of the association for computational linguistics: Human language technologies, NAACL 2022, Seattle, WA, United States, July 10\u201315, 2022","first-page":"4207","article-title":"DiffCSE: Difference-based contrastive learning for sentence embeddings","author":"Chuang","year":"2022"},{"key":"10.1016\/j.neunet.2026.109199_bib0008","series-title":"A short guide to writing effective test questions","author":"Clay","year":"2001"},{"key":"10.1016\/j.neunet.2026.109199_bib0009","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1016\/j.cogsys.2021.11.002","article-title":"Automatic question generation and answer assessment for subjective examination","volume":"72","author":"Das","year":"2022","journal-title":"Cogn. Syst. Res."},{"key":"10.1016\/j.neunet.2026.109199_bib0010","unstructured":"DeepSeek-AI (2024). Deepseek-v3 technical report. https:\/\/arxiv.org\/abs\/2412.19437."},{"key":"10.1016\/j.neunet.2026.109199_bib0011","unstructured":"DeepSeek-AI, Guo, D., Yang, D., Zhang, H., Song, J., Zhang, R., Xu, R., Zhu, Q., Ma, S., Wang, P., Bi, X., Zhang, X., Yu, X., Wu, Y., Wu, Z. F., Gou, Z., Shao, Z., Li, Z., Gao, Z., et al. (2025). Deepseek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning. https:\/\/arxiv.org\/abs\/2501.12948."},{"key":"10.1016\/j.neunet.2026.109199_bib0012","article-title":"GLIDER: Grading LLM interactions and decisions using explainable ranking","volume":"abs\/2412.14140","author":"Deshpande","year":"2024","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0013","unstructured":"Deshpande, D., Ravi, S. S., CH-Wang, S., Mielczarek, B., Kannappan, A., & Qian, R. (2024b). Glider: Grading llm interactions and decisions using explainable ranking. https:\/\/arxiv.org\/abs\/2412.14140."},{"key":"10.1016\/j.neunet.2026.109199_bib0014","series-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: Human language technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2\u20137, 2019, volume 1 (Long and short papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.neunet.2026.109199_bib0015","article-title":"The LLAMA 3 herd of models","volume":"abs\/2407.21783","author":"Dubey","year":"2024","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0016","series-title":"Proceedings of the 7th international workshop on semantic evaluation, semeval@NAACL-HLT 2013, Atlanta, Georgia, USA, June 14-15, 2013","first-page":"263","article-title":"Semeval-2013 task 7: The joint student response analysis and 8th recognizing textual entailment challenge","author":"Dzikovska","year":"2013"},{"key":"10.1016\/j.neunet.2026.109199_bib0017","series-title":"Proceedings of the 2023\u202fACM SIGIR international conference on theory of information retrieval, ICTIR 2023, Taipei, Taiwan, 23 July 2023","first-page":"39","article-title":"Perspectives on large language models for relevance judgment","author":"Faggioli","year":"2023"},{"key":"10.1016\/j.neunet.2026.109199_bib0018","unstructured":"Freja, S. A., Catak, F. O., Yurdem, B., & Rong, C. (2026). EvalQReason: A framework for step-level reasoning evaluation in large language models. https:\/\/arxiv.org\/abs\/2602.02295."},{"key":"10.1016\/j.neunet.2026.109199_bib0019","unstructured":"GLM, T., Zeng, A., Xu, B., Wang, B., Zhang, C., Yin, D., Zhang, D., Rojas, D., Feng, G., Zhao, H., Lai, H., Yu, H., Wang, H., Sun, J., Zhang, J., Cheng, J., Gui, J., Tang, J., Zhang, J., Sun, J., Li, J., Zhao, L., Wu, L., Zhong, L., Liu, M., Huang, M., Zhang, P., Zheng, Q., Lu, R., Duan, S., Zhang, S., Cao, S., Yang, S., Tam, W. L., Zhao, W., Liu, X., Xia, X., Zhang, X., Gu, X., Lv, X., Liu, X., Liu, X., Yang, X., Song, X., Zhang, X., An, Y., Xu, Y., Niu, Y., Yang, Y., Li, Y., Bai, Y., Dong, Y., Qi, Z., Wang, Z., Yang, Z., Du, Z., Hou, Z., & Wang, Z. (2024). ChatGLM: A family of large language models from GLM-130B to GLM-4 all tools. https:\/\/arxiv.org\/abs\/2406.12793."},{"key":"10.1016\/j.neunet.2026.109199_bib0020","unstructured":"Hamner, B., Morgan, J., lynnvandev, Shermis, M., & Ark, T. V. (2012). The Hewlett foundation: Automated essay scoring. https:\/\/kaggle.com\/competitions\/asap-aes. Kaggle."},{"key":"10.1016\/j.neunet.2026.109199_bib0021","unstructured":"Jiang, A. Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D. S., de las Casas, D., Hanna, E. B., Bressand, F., Lengyel, G., Bour, G., Lample, G., Lavaud, L. R., Saulnier, L., Lachaux, M.-A., Stock, P., Subramanian, S., Yang, S., Antoniak, S., Scao, T. L., Gervet, T., Lavril, T., Wang, T., Lacroix, T., & Sayed, W. E. (2024). Mixtral of experts. https:\/\/arxiv.org\/abs\/2401.04088."},{"key":"10.1016\/j.neunet.2026.109199_bib0022","series-title":"International conference on machine learning and applications, ICMLA 2023, Jacksonville, FL, USA, December 15\u201317, 2023","first-page":"1687","article-title":"Enhancing transfer learning of LLMs through fine- tuning on task - related corpora for automated short-answer grading","author":"Kazi","year":"2023"},{"key":"10.1016\/j.neunet.2026.109199_bib0023","series-title":"Proceedings of the thirty-third international joint conference on artificial intelligence, IJCAI 2024, Jeju, South Korea, August 3\u20139, 2024","first-page":"7154","article-title":"Automated essay scoring using discourse external knowledge","author":"Khayi","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0024","series-title":"Proceedings of the eleventh ACM conference on learning @ scale, l@s 2024, Atlanta, GA, USA, July 18\u201320, 2024","first-page":"315","article-title":"Is GPT-4 alone sufficient for automated essay scoring?: A comparative judgment approach based on rater cognition","author":"Kim","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0025","doi-asserted-by":"crossref","DOI":"10.1016\/j.csl.2023.101575","article-title":"M-sim: Multi-level semantic inference model for Chinese short answer scoring in low-resource scenarios","volume":"84","author":"Lai","year":"2024","journal-title":"Computer Speech & Language"},{"key":"10.1016\/j.neunet.2026.109199_bib0026","first-page":"166","article-title":"Knowledge distillation of LLMs for automatic scoring of science assessments","volume":"vol. 2151","author":"Latif","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0027","series-title":"Findings of the association for computational linguistics: EMNLP 2024, Miami, Florida, USA, November 12\u201316, 2024","first-page":"181","article-title":"Unleashing large language models\u2019 proficiency in zero-shot essay scoring","author":"Lee","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0028","unstructured":"Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., & Cobbe, K. (2023). Let\u2019s verify step by step. https:\/\/arxiv.org\/abs\/2305.20050."},{"key":"10.1016\/j.neunet.2026.109199_bib0029","series-title":"Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: Long papers), ACL 2023, Toronto, Canada, July 9\u201314, 2023","first-page":"13785","article-title":"RankCSE: Unsupervised sentence representations learning via learning to rank","author":"Liu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109199_bib0030","series-title":"Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation, LREC\/COLING 2024, 20\u201325 May, 2024, Torino, Italy","first-page":"2638","article-title":"Calibrating LLM-based evaluator","author":"Liu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0031","series-title":"Proceedings of the 2024 conference on empirical methods in natural language processing, EMNLP 2024, Miami, FL, USA, November 12\u201316, 2024","first-page":"6835","article-title":"Efficient LLM comparative assessment: A product of experts framework for pairwise comparisons","author":"Liusie","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0032","series-title":"Proceedings of the sixth Italian conference on computational linguistics, Bari, Italy, November 13\u201315, 2019","article-title":"Automated short answer grading: A simple solution for a difficult task","volume":"vol. 2481","author":"Menini","year":"2019"},{"key":"10.1016\/j.neunet.2026.109199_bib0033","series-title":"Proceedings of the 2024 on ACM virtual global computing education conference v. 2","first-page":"322","article-title":"Asag2024: A combined benchmark for short answer grading","author":"Meyer","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0034","series-title":"The 49th annual meeting of the association for computational linguistics: Human language technologies, proceedings of the conference, 19\u201324 June, 2011, Portland, Oregon, USA","first-page":"752","article-title":"Learning to grade short answer questions using semantic similarity measures and dependency graph alignments","author":"Mohler","year":"2011"},{"key":"10.1016\/j.neunet.2026.109199_bib0035","series-title":"EACL 2009, 12th conference of the European chapter of the association for computational linguistics, proceedings of the conference, Athens, Greece, March 30, - April 3, 2009","first-page":"567","article-title":"Text-to-text semantic similarity for automatic short answer grading","author":"Mohler","year":"2009"},{"key":"10.1016\/j.neunet.2026.109199_bib0036","first-page":"1","article-title":"GPT-4 Technical Report","volume":"abs\/2303.08774","author":"OpenAI","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0037","unstructured":"Potamitis, N., Klein, L., & Arora, A. (2025). ReasonBENCH: Benchmarking the (in)stability of LLM reasoning. https:\/\/arxiv.org\/abs\/2512.07795."},{"key":"10.1016\/j.neunet.2026.109199_bib0038","series-title":"Proceedings of the 2024 conference on empirical methods in natural language processing, EMNLP 2024, Miami, FL, USA, November 12\u201316, 2024","first-page":"7499","article-title":"Is llm-as-a-judge robust? Investigating universal adversarial attacks on zero-shot LLM assessment","author":"Raina","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0039","unstructured":"Ren, Z. Z., Shao, Z., Song, J., Xin, H., Wang, H., Zhao, W., Zhang, L., Fu, Z., Zhu, Q., Yang, D., Wu, Z. F., Gou, Z., Ma, S., Tang, H., Liu, Y., Gao, W., Guo, D., & Ruan, C. (2025). Deepseek-prover-v2: Advancing formal mathematical reasoning via reinforcement learning for subgoal decomposition. https:\/\/arxiv.org\/abs\/2504.21801."},{"key":"10.1016\/j.neunet.2026.109199_bib0040","series-title":"Proceedings of the 16th international conference on computational processing of Portuguese - vol. 1","first-page":"228","article-title":"A new benchmark for automatic essay scoring in Portuguese","author":"Silveira","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0041","series-title":"Artificial intelligence in education - 25th international conference, AIED 2024, Recife, Brazil, July 8\u201312, 2024, proceedings, Part I","first-page":"163","article-title":"Automated long answer grading with ricechem dataset","volume":"vol. 14829","author":"Sonkar","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0042","series-title":"Proceedings of the 19th workshop on innovative use of NLP for building educational applications, BEA 2024, Mexico City, Mexico, June 20, 2024","first-page":"283","article-title":"Exploring LLM prompting strategies for joint essay scoring and feedback generation","author":"Stahl","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0043","unstructured":"Sun, Y., Wang, S., Feng, S., Ding, S., Pang, C., Shang, J., Liu, J., Chen, X., Zhao, Y., Lu, Y. et al. (2021). Ernie 3.0: Large-scale knowledge enhanced pre-training for language understanding and generation. arXiv: 2107.02137."},{"key":"10.1016\/j.neunet.2026.109199_bib0044","unstructured":"Q. Team (2025a). Qwen3. https:\/\/qwenlm.github.io\/blog\/qwen3\/."},{"key":"10.1016\/j.neunet.2026.109199_bib0045","unstructured":"Q. Team (2025b). Qwq-32b: Embracing the power of reinforcement learning. https:\/\/qwenlm.github.io\/blog\/qwq-32b\/."},{"key":"10.1016\/j.neunet.2026.109199_bib0046","unstructured":"T. Team (2025c). Superdistillation achieves near-r1 performance with just 5% of parameters. https:\/\/huggingface.co\/qihoo360\/TinyR1-32B-Preview."},{"key":"10.1016\/j.neunet.2026.109199_bib0047","series-title":"Proceedings of the 47th international ACM SIGIR conference on research and development in information retrieval, SIGIR 2024, Washington DC, USA, July 14\u201318, 2024","first-page":"1930","article-title":"Large language models can accurately predict searcher preferences","author":"Thomas","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0048","article-title":"Is chatgpt a good NLG evaluator? A preliminary study","volume":"abs\/2303.04048","author":"Wang","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0049","series-title":"Proceedings of the 5th workshop on natural language processing techniques for educational applications","first-page":"125","article-title":"A short answer grading system in Chinese by support vector approach","author":"Wu","year":"2018"},{"key":"10.1016\/j.neunet.2026.109199_bib0050","series-title":"2019\u202fIEEE 10th international conference on awareness science and technology (iCAST)","first-page":"1","article-title":"A short answer grading system in Chinese by CNN","author":"Wu","year":"2019"},{"key":"10.1016\/j.neunet.2026.109199_bib0051","unstructured":"Xiaomi LLM-Core Team (2025). Mimo: Unlocking the reasoning potential of language model - from pretraining to posttraining. https:\/\/github.com\/XiaomiMiMo\/MiMo."},{"key":"10.1016\/j.neunet.2026.109199_bib0052","unstructured":"M. Xu, text2vec: A tool for text to vector, (2023), https:\/\/github.com\/shibing624\/text2vec."},{"key":"10.1016\/j.neunet.2026.109199_bib0053","unstructured":"Yang, A., Yang, B., Hui, B., Zheng, B., Yu, B., Zhou, C., Li, C., Li, C., Liu, D., Huang, F., Dong, G., Wei, H., Lin, H., Tang, J., Wang, J., Yang, J., Tu, J., Zhang, J., Ma, J., Xu, J., Zhou, J., Bai, J., He, J., Lin, J., Dang, K., Lu, K., Chen, K., Yang, K., Li, M., Xue, M., Ni, N., Zhang, P., Wang, P., Peng, R., Men, R., Gao, R., Lin, R., Wang, S., Bai, S., Tan, S., Zhu, T., Li, T., Liu, T., Ge, W., Deng, X., Zhou, X., Ren, X., Zhang, X., Wei, X., Ren, X., Fan, Y., Yao, Y., Zhang, Y., Wan, Y., Chu, Y., Liu, Y., Cui, Z., Zhang, Z., & Fan, Z. (2024a). Qwen2 technical report. arXiv: 2407.10671."},{"key":"10.1016\/j.neunet.2026.109199_bib0054","unstructured":"Yang, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Li, C., Liu, D., Huang, F., Wei, H., Lin, H., Yang, J., Tu, J., Zhang, J., Yang, J., Yang, J., Zhou, J., Lin, J., Dang, K., Lu, K., Bao, K., Yang, K., Yu, L., Li, M., Xue, M., Zhang, P., Zhu, Q., Men, R., Lin, R., Li, T., Xia, T., Ren, X., Ren, X., Fan, Y., Su, Y., Zhang, Y., Wan, Y., Liu, Y., Cui, Z., Zhang, Z., & Qiu, Z. (2024b). Qwen2.5 technical report. arXiv: 2412.15115."},{"key":"10.1016\/j.neunet.2026.109199_bib0055","unstructured":"Yang, S.-X., Wang, C., Wang, Y., Gu, X., Huang, M., & Tang, J. (2025). StepMathAgent: A step-wise agent for evaluating mathematical processes through tree-of-error. arXiv: 2503.10105."},{"key":"10.1016\/j.neunet.2026.109199_bib0056","article-title":"Short answer grading using one-shot prompting and text similarity scoring model","volume":"abs\/2305.18638","author":"Yoon","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0057","series-title":"The twelfth international conference on learning representations","article-title":"Metamath: Bootstrap your own mathematical questions for large language models","author":"Yu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109199_bib0058","article-title":"Evaluating the performance of large language models on GAOKAO benchmark","volume":"abs\/2305.12474","author":"Zhang","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0059","series-title":"Advances in neural information processing systems 36: Annual conference on neural information processing systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10, - 16, 2023","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","author":"Zheng","year":"2023"},{"key":"10.1016\/j.neunet.2026.109199_bib0060","article-title":"JudgeLM: Fine-tuned large language models are scalable judges","volume":"abs\/2310.17631","author":"Zhu","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.neunet.2026.109199_bib0061","series-title":"Proceedings of the 2024 conference of the North American chapter of the association for computational linguistics: Human language technologies: Short papers, NAACL 2024, Mexico city, Mexico, June 16-21, 2024","first-page":"358","article-title":"Beyond yes and no: Improving zero-shot LLM rankers via scoring fine-grained relevance labels","author":"Zhuang","year":"2024"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S089360802600660X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S089360802600660X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T14:07:20Z","timestamp":1780668440000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S089360802600660X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":61,"alternative-id":["S089360802600660X"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109199","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SAS-bench: A fine-grained benchmark for evaluating short answer scoring with large language models","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109199","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109199"}}