{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T04:30:25Z","timestamp":1763699425037,"version":"3.45.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. Comput. Sci. Technol."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s11390-025-5514-9","type":"journal-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T04:25:32Z","timestamp":1763699132000},"page":"1220-1233","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CodeRankEval: Benchmarking and Analyzing LLM Performance for Code Ranking"],"prefix":"10.1007","volume":"40","author":[{"given":"Li-Guo","family":"Chen","sequence":"first","affiliation":[]},{"given":"Zheng","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Yi-Jiang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Rui-Chuan","family":"An","sequence":"additional","affiliation":[]},{"given":"Xin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yang-Ning","family":"Li","sequence":"additional","affiliation":[]},{"given":"Ying-Hui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yi-Dong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zheng-Ran","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Qing","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Shi-Kun","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,21]]},"reference":[{"key":"5514_CR1","doi-asserted-by":"publisher","unstructured":"Jiang J, Wang F, Shen J, Kim S, Kim S. A survey on large language models for code generation. arXiv: 2406.00515, 2024. https:\/\/arxiv.org\/abs\/2406.00515, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2406.00515.","DOI":"10.48550\/arXiv.2406.00515"},{"key":"5514_CR2","doi-asserted-by":"publisher","unstructured":"Hui B, Yang J, Cui Z, Yang J, Liu D, Zhang L, Liu T, Zhang J, Yu B, Lu K, Dang K, Fan Y, Zhang Y, Yang A, Men R, Huang F, Zheng B, Miao Y, Quan S, Feng Y, Ren X, Ren X, Zhou J, Lin J. Qwen2.5-coder technical report. arXiv: 2409.12186, 2024. https:\/\/arxiv.org\/abs\/2409.12186, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2409.12186.","DOI":"10.48550\/arXiv.2409.12186"},{"key":"5514_CR3","doi-asserted-by":"publisher","unstructured":"Zhu Q, Guo D, Shao Z, Yang D, Wang P, Xu R, Wu Y, Li Y, Gao H, Ma S, Zeng W, Bi X, Gu Z, Xu H, Dai D, Dong K, Zhang L, Piao Y, Gou Z, Xie Z, Hao Z, Wang B, Song J, Chen D, Xie X, Guan K, You Y, Liu A, Du Q, Gao W, Lu X, Chen Q, Wang Y, Deng C, Li J, Zhao C, Ruan C, Luo F, Liang W. DeepSeek-coder-V2: Breaking the barrier of closed-source models in code intelligence. arXiv: 2406.11931, 2024. https:\/\/arxiv.org\/abs\/2406.11931, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2406.11931.","DOI":"10.48550\/arXiv.2406.11931"},{"key":"5514_CR4","doi-asserted-by":"publisher","unstructured":"Chen L, Guo Q, Jia H, Zeng Z, Wang X, Xu Y, Wu J, Wang Y, Gao Q, Wang J, Ye W, Zhang S. A survey on evaluating large language models in code generation tasks. arXiv: 2408.16498, 2024. https:\/\/arxiv.org\/abs\/2408.16498, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2408.16498.","DOI":"10.48550\/arXiv.2408.16498"},{"key":"5514_CR5","doi-asserted-by":"publisher","unstructured":"Shen B, Zhang J, Chen T, Zan D, Geng B, Fu A, Zeng M, Yu A, Ji J, Zhao J, Guo Y, Wang Q. PanGu-Coder2: Boosting large language models for code with ranking feedback. arXiv: 2307.14936, 2023. https:\/\/arxiv.org\/abs\/2307.14936, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2307.14936.","DOI":"10.48550\/arXiv.2307.14936"},{"key":"5514_CR6","doi-asserted-by":"publisher","first-page":"199","DOI":"10.18653\/v1\/2022.emnlp-main.15","volume-title":"Proc. the 2022 Conference on Empirical Methods in Natural Language Processing","author":"K Krishna","year":"2022","unstructured":"Krishna K, Chang Y, Wieting J, Iyyer M. RankGen: Improving text generation with large ranking models. In Proc. the 2022 Conference on Empirical Methods in Natural Language Processing, Dec. 2022, pp.199\u2013232. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.15."},{"key":"5514_CR7","doi-asserted-by":"publisher","unstructured":"Chen M, Tworek J, Jun H, Yuan Q, de Oliveira Pinto H P, Kaplan J, Edwards H, Burda Y, Joseph N, Brockman G, Ray A, Puri R, Krueger G, Petrov M, Khlaaf H, Sastry G, Mishkin P, Chan B, Gray S, Ryder N, Pavlov M, Power A, Kaiser L, Bavarian M, Winter C, Tillet P, Such F P, Cummings D, Plappert M, Chantzis F, Barnes E, Herbert-Voss A, Guss W H, Nichol A, Paino A, Tezak N, Tang J, Babuschkin I, Balaji S, Jain S, Saunders W, Hesse C, Carr A N, Leike J, Achiam J, Misra V, Morikawa E, Radford A, Knight M, Brundage M, Murati M, Mayer K, Welinder P, McGrew B, Amodei D, McCandlish S, Sutskever I, Zaremba W. Evaluating large language models trained on code. arXiv: 2107.03374, 2021. https:\/\/arxiv.org\/abs\/2107.03374, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2107.03374.","DOI":"10.48550\/arXiv.2107.03374"},{"key":"5514_CR8","volume-title":"Proc. the 12th International Conference on Learning Representations (ICLR)","author":"C E Jimenez","year":"2024","unstructured":"Jimenez C E, Yang J, Wettig A, Yao S, Pei K, Press O, Narasimhan K R. SWE-bench: Can language models resolve real-world GitHub issues? In Proc. the 12th International Conference on Learning Representations (ICLR), May 2024."},{"key":"5514_CR9","doi-asserted-by":"publisher","first-page":"81857","DOI":"10.5555\/3737916.3740517","volume-title":"Proc. the 38th International Conference on Neural Information Processing Systems","author":"N M\u00fcndler","year":"2024","unstructured":"M\u00fcndler N, M\u00fcller M N, He J, Vechev M. SWT-bench: Testing and validating real-world bug-fixes with code agents. In Proc. the 38th International Conference on Neural Information Processing Systems, Dec. 2024, pp.81857\u201381887. DOI: https:\/\/doi.org\/10.5555\/3737916.3740517."},{"key":"5514_CR10","doi-asserted-by":"publisher","first-page":"20032","DOI":"10.18653\/v1\/2024.emnlp-main.1118","volume-title":"Proc. the 2024 Conference on Empirical Methods in Natural Language Processing","author":"W Tong","year":"2024","unstructured":"Tong W, Zhang T. CodeJudge: Evaluating code generation with large language models. In Proc. the 2024 Conference on Empirical Methods in Natural Language Processing, Nov. 2024, pp.20032\u201320051. DOI: https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.1118."},{"key":"5514_CR11","volume-title":"Proc. the 41st International Conference on Machine Learning","author":"M Allamanis","year":"2024","unstructured":"Allamanis M, Panthaplackel S, Yin P. Unsupervised evaluation of code LLMs with round-trip correctness. In Proc. the 41st International Conference on Machine Learning, Jul. 2024."},{"issue":"8","key":"5514_CR12","doi-asserted-by":"publisher","first-page":"198341","DOI":"10.1007\/s11704-024-40415-9","volume":"19","author":"Z Lyu","year":"2025","unstructured":"Lyu Z, Li X, Xie Z, Li M. Top Pass: Improve code generation by pass@k-maximized code ranking. Frontiers of Computer Science, 2025, 19(8): 198341. DOI: https:\/\/doi.org\/10.1007\/s11704-024-40415-9.","journal-title":"Frontiers of Computer Science"},{"key":"5514_CR13","doi-asserted-by":"publisher","first-page":"13818","DOI":"10.18653\/v1\/2023.acl-long.773","volume-title":"Proc. the 61st Annual Meeting of the Association for Computational Linguistics (ACL)","author":"S Wang","year":"2023","unstructured":"Wang S, Li Z, Qian H, Yang C, Wang Z, Shang M, Kumar V, Tan S, Ray B, Bhatia P, Nallapati R, Ramanathan M K, Roth D, Xiang B. ReCode: Robustness evaluation of code generation models. In Proc. the 61st Annual Meeting of the Association for Computational Linguistics (ACL), Jul. 2023, pp.13818\u201313843. DOI: https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.773."},{"issue":"3","key":"5514_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3631973","volume":"33","author":"G Yang","year":"2024","unstructured":"Yang G, Zhou Y, Yang W, Yue T, Chen X, Chen T. How important are good method names in neural code generation? A model robustness perspective. ACM Trans. Software Engineering and Methodology, 2024, 33(3): 1\u201335. DOI: https:\/\/doi.org\/10.1145\/3630010.","journal-title":"ACM Trans. Software Engineering and Methodology"},{"key":"5514_CR15","doi-asserted-by":"publisher","unstructured":"Sun W, Chen Z, Ma X, Yan L, Wang S, Ren P, Chen Z, Yin D, Ren Z. Instruction distillation makes large language models efficient zero-shot rankers. arXiv: 2311.01555, 2023. https:\/\/arxiv.org\/abs\/2311.01555, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2311.01555.","DOI":"10.48550\/arXiv.2311.01555"},{"key":"5514_CR16","doi-asserted-by":"publisher","first-page":"1900","DOI":"10.18653\/v1\/2024.findings-naacl.123","volume-title":"Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL Findings)","author":"J Lee","year":"2024","unstructured":"Lee J, Bernier-Colborne G, Maharaj T, Vajjala S. Methods, applications, and directions of learning-to-rank in NLP research. In Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL Findings), Jun. 2024, pp.1900\u20131917. DOI: https:\/\/doi.org\/10.18653\/v1\/2024.findings-naacl.123."},{"key":"5514_CR17","doi-asserted-by":"publisher","first-page":"944","DOI":"10.1145\/3701551.3703583","volume-title":"Proc. the 18th ACM International Conference on Web Search and Data Mining","author":"F Guo","year":"2025","unstructured":"Guo F, Li W, Zhuang H, Luo Y, Li Y, Yan L, Zhu Q, Zhang Y. MCRanker: Generating diverse criteria on-the-fly to improve pointwise LLM rankers. In Proc. the 18th ACM International Conference on Web Search and Data Mining, Mar. 2025, pp.944\u2013953. DOI: https:\/\/doi.org\/10.1145\/3701551.3703583."},{"key":"5514_CR18","doi-asserted-by":"publisher","first-page":"358","DOI":"10.18653\/v1\/2024.naacl-short.31","volume-title":"Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)","author":"H Zhuang","year":"2024","unstructured":"Zhuang H, Qin Z, Hui K, Wu J, Yan L, Wang X, Bendersky M. Beyond yes and no: Improving zero-shot LLM rankers via scoring fine-grained relevance labels. In Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), Jun. 2024, pp.358\u2013370. DOI: https:\/\/doi.org\/10.18653\/v1\/2024.naacl-short.31."},{"key":"5514_CR19","doi-asserted-by":"publisher","first-page":"14165","DOI":"10.18653\/v1\/2023.acl-long.792","volume-title":"Proc. the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"D Jiang","year":"2023","unstructured":"Jiang D, Ren X, Lin B Y. LLM-Blender: Ensembling large language models with pairwise ranking and generative fusion. In Proc. the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Jul. 2023, pp.14165\u201314178. DOI: https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.792."},{"key":"5514_CR20","doi-asserted-by":"publisher","first-page":"1504","DOI":"10.18653\/v1\/2024.findings-naacl.97","volume-title":"Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Findings)","author":"Z Qin","year":"2024","unstructured":"Qin Z, Jagerman R, Hui K, Zhuang H, Wu J, Yan L, Shen J, Liu T, Liu J, Metzler D, Wang X, Bendersky M. Large language models are effective text rankers with pairwise ranking prompting. In Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Findings), Jun. 2024, pp.1504\u20131518. DOI: https:\/\/doi.org\/10.18653\/v1\/2024.findings-naacl.97."},{"key":"5514_CR21","doi-asserted-by":"publisher","first-page":"6985","DOI":"10.18653\/v1\/2024.findings-emnlp.410","volume-title":"Proc. the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP Findings)","author":"S Yang","year":"2024","unstructured":"Yang S, Bi K, Cui W, Guo J, Cheng X. LINKAGE: Listwise ranking among varied-quality references for non-factoid QA evaluation via LLMs. In Proc. the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP Findings), Nov. 2024, pp.6985\u20137000. DOI: https:\/\/doi.org\/10.18653\/v1\/2024.findings-emnlp.410."},{"key":"5514_CR22","doi-asserted-by":"publisher","first-page":"2327","DOI":"10.18653\/v1\/2024.naacl-long.129","volume-title":"Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)","author":"R Tang","year":"2024","unstructured":"Tang R, Zhang C, Ma X, Lin J, Ture F. Found in the middle: Permutation self-consistency improves listwise ranking in large language models. In Proc. the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), Jun. 2024, pp.2327\u20132340. DOI: https:\/\/doi.org\/10.18653\/v1\/2024.naacl-long.129."},{"key":"5514_CR23","doi-asserted-by":"publisher","first-page":"5067","DOI":"10.18653\/v1\/2023.findings-emnlp.337","volume-title":"Proc. the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP Findings)","author":"W Yan","year":"2023","unstructured":"Yan W, Tian Y, Li Y, Chen Q, Wang W. Code-TransOcean: A comprehensive multilingual benchmark for code translation. In Proc. the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP Findings), Dec. 2023, pp.5067\u20135089. DOI: https:\/\/doi.org\/10.18653\/v1\/2023.findings-emnlp.337."},{"key":"5514_CR24","volume-title":"Proc. the 35th Conference on Neural Information Processing Systems","author":"D Hendrycks","year":"2021","unstructured":"Hendrycks D, Basart S, Kadavath S, Mazeika M, Arora A, Guo E, Burns C, Puranik S, He H, Song D, Steinhardt J. Measuring coding challenge competence with APPS. In Proc. the 35th Conference on Neural Information Processing Systems, Dec. 2021."},{"key":"5514_CR25","doi-asserted-by":"publisher","first-page":"982","DOI":"10.1145\/3597503.3639219","volume-title":"Proc. the 46th IEEE\/ACM International Conference on Software Engineering","author":"X Du","year":"2024","unstructured":"Du X, Liu M, Wang K, Wang H, Liu J, Chen Y, Feng J, Sha C, Peng X, Lou Y. Evaluating large language models in class-level code generation. In Proc. the 46th IEEE\/ACM International Conference on Software Engineering, Apr. 2024, pp.982\u2013994. DOI: https:\/\/doi.org\/10.1145\/3597503.3639219."},{"key":"5514_CR26","doi-asserted-by":"publisher","unstructured":"Dai D, Liu M, Li A, Cao J, Wang Y, Wang C, Peng X, Zheng Z. FeedbackEval: A benchmark for evaluating large language models in feedback-driven code repair tasks. arXiv: 2504.06939, 2025. https:\/\/arxiv.org\/abs\/2504.06939, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2504.06939.","DOI":"10.48550\/arXiv.2504.06939"},{"key":"5514_CR27","doi-asserted-by":"publisher","unstructured":"Rozi\u00e8re B, Gehring J, Gloeckle F, Sootla S, Gat I, Tan X E, Adi Y, Liu J, Sauvestre R, Remez T, Rapin J, Kozhevnikov A, Evtimov I, Bitton J, Bhatt M, Ferrer C C, Grattafiori A, Xiong W, D\u00e9fossez A, Copet J, Azhar F, Touvron H, Martin L, Usunier N, Scialom T, Synnaeve G. Code Llama: Open foundation models for code. arXiv: 2308.12950, 2023. https:\/\/arxiv.org\/abs\/2308.12950, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2308.12950.","DOI":"10.48550\/arXiv.2308.12950"},{"key":"5514_CR28","doi-asserted-by":"publisher","unstructured":"Bai J, Bai S, Chu Y, Cui Z, Dang K, Deng X, Fan Y, Ge W, Han Y, Huang F, Hui B, Ji L, Li M, Lin J, Lin R, Liu D, Liu G, Lu C, Lu K, Ma J, Men R, Ren X, Ren X, Tan C, Tan S, Tu J, Wang P, Wang S, Wang W, Wu S, Xu B, Xu J, Yang A, Yang H, Yang J, Yang S, Yao Y, Yu B, Yuan H, Yuan Z, Zhang J, Zhang X, Zhang Y, Zhang Z, Zhou C, Zhou J, Zhou X, Zhu T. Qwen technical report. arXiv: 2309.16609, 2023. https:\/\/arxiv.org\/abs\/2309.16609, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2309.16609.","DOI":"10.48550\/arXiv.2309.16609"},{"key":"5514_CR29","doi-asserted-by":"publisher","unstructured":"Guo D, Zhu Q, Yang D, Xie Z, Dong K, Zhang W, Chen G, Bi X, Wu Y, Li Y K, Luo F, Xiong Y, Liang W. DeepSeek-Coder: When the large language model meets programming \u2014 The rise of code intelligence. arXiv: 2401.14196, 2024. https:\/\/arxiv.org\/abs\/2401.14196, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2401.14196.","DOI":"10.48550\/arXiv.2401.14196"},{"key":"5514_CR30","doi-asserted-by":"publisher","unstructured":"Hu Z, Zhang J, Xiong Z, Ratner A, Xiong H, Krishna R. Language model preference evaluation with multiple weak evaluators. arXiv: 2410.12869, 2024. https:\/\/arxiv.org\/abs\/2410.12869, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2410.12869.","DOI":"10.48550\/arXiv.2410.12869"},{"key":"5514_CR31","doi-asserted-by":"publisher","unstructured":"Tripathi T, Wadhwa M, Durrett G, Niekum S. Pairwise or pointwise? Evaluating feedback protocols for bias in LLM-based evaluation. arXiv: 2504.14716, 2025. https:\/\/arxiv.org\/abs\/2504.14716, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2504.14716.","DOI":"10.48550\/arXiv.2504.14716"},{"key":"5514_CR32","doi-asserted-by":"publisher","first-page":"138289","DOI":"10.5555\/3737916.3742305","volume-title":"Proc. the 38th International Conference on Neural Information Processing Systems (NeurIPS)","author":"S Olesker-Taylor","year":"2024","unstructured":"Olesker-Taylor S, Zanetti L. An analysis of Elo rating systems via Markov chains. In Proc. the 38th International Conference on Neural Information Processing Systems (NeurIPS), Dec. 2024, pp.138289\u2013138323. DOI: https:\/\/doi.org\/10.5555\/3737916.3742305."},{"key":"5514_CR33","doi-asserted-by":"publisher","first-page":"1772","DOI":"10.1145\/3442381.3450091","volume-title":"Proc. the 2021 Web Conference","author":"A Ebtekar","year":"2021","unstructured":"Ebtekar A, Liu P. Elo-MMR: A rating system for massive multiplayer competitions. In Proc. the 2021 Web Conference, Apr. 2021, pp.1772\u20131784. DOI: https:\/\/doi.org\/10.1145\/3442381.3450091."},{"key":"5514_CR34","doi-asserted-by":"publisher","first-page":"1832","DOI":"10.1109\/ICSE55347.2025.00035","volume-title":"Proc. the 47th IEEE\/ACM International Conference on Software Engineering (ICSE)","author":"G Rong","year":"2025","unstructured":"Rong G, Yu Y, Liu S, Tan X, Zhang T, Shen H, Hu J. Code comment inconsistency detection and rectification using a large language model. In Proc. the 47th IEEE\/ACM International Conference on Software Engineering (ICSE), Apr. 27\u2013May 3, 2025, pp.1832\u20131843. DOI: https:\/\/doi.org\/10.1109\/ICSE55347.2025.00035."},{"issue":"5","key":"5514_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3649594","volume":"33","author":"Y Liu","year":"2024","unstructured":"Liu Y, Le-Cong T, Widyasari R, Tantithamthavorn C, Li L, Le X B D, Lo D. Refining ChatGPT-generated code: Characterizing and mitigating code quality issues. ACM Trans. Software Engineering and Methodology, 2024, 33(5): 1\u201326. DOI: https:\/\/doi.org\/10.1145\/3643674.","journal-title":"ACM Trans. Software Engineering and Methodology"},{"issue":"4","key":"5514_CR36","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1145\/582415.582418","volume":"20","author":"K J\u00e4rvelin","year":"2002","unstructured":"J\u00e4rvelin K, Kek\u00e4l\u00e4inen J. Cumulated gain-based evaluation of IR techniques. ACM Trans. Information Systems, 2002, 20(4): 422\u2013446. DOI: https:\/\/doi.org\/10.1145\/582415.582418.","journal-title":"ACM Trans. Information Systems"},{"key":"5514_CR37","doi-asserted-by":"publisher","unstructured":"Ye J, Chen X, Xu N, Zu C, Shao Z, Liu S, Cui Y, Zhou Z, Gong C, Shen Y, Zhou J, Chen S, Gui T, Zhang Q, Huang X. A comprehensive capability analysis of GPT-3 and GPT-3.5 series models. arXiv: 2303.10420, 2023. https:\/\/arxiv.org\/abs\/2303.10420, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2303.10420.","DOI":"10.48550\/arXiv.2303.10420"},{"key":"5514_CR38","doi-asserted-by":"publisher","unstructured":"DeepSeek-AI. DeepSeek-V3 technical report. arXiv: 2412.19437, 2024. https:\/\/arxiv.org\/abs\/2412.19437, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2412.19437.","DOI":"10.48550\/arXiv.2412.19437"},{"key":"5514_CR39","doi-asserted-by":"publisher","unstructured":"DeepSeek-AI. DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning. arXiv: 2501.12948, 2025. https:\/\/arxiv.org\/abs\/2501.12948, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2501.12948.","DOI":"10.48550\/arXiv.2501.12948"},{"key":"5514_CR40","doi-asserted-by":"publisher","unstructured":"Chen M, Tworek J, Jun H et al. Evaluating large language models trained on code. arXiv: 2107.03374, 2021. https:\/\/arxiv.org\/abs\/2107.03374, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2107.03374.","DOI":"10.48550\/arXiv.2107.03374"},{"key":"5514_CR41","doi-asserted-by":"publisher","unstructured":"Shi L, Ma C, Liang W, Ma W, Vosoughi S. Judging the judges: A systematic investigation of position bias in pairwise comparative assessments by LLMs. arXiv: 2406.07791, 2024. https:\/\/arxiv.org\/abs\/2406.07791v4, Sept. 2025. DOI: https:\/\/doi.org\/10.48550\/arXiv.2406.07791.","DOI":"10.48550\/arXiv.2406.07791"},{"key":"5514_CR42","volume-title":"Proc. the 13th International Conference on Learning Representations","author":"J Ye","year":"2025","unstructured":"Ye J, Wang Y, Huang Y, Chen D, Zhang Q, Moniz N, Gao T, Geyer W, Huang C, Chen P Y, Chawla N V, Zhang X. Justice or prejudice? Quantifying biases in LLM-as-a-judge. In Proc. the 13th International Conference on Learning Representations, Apr. 2025."}],"container-title":["Journal of Computer Science and Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-5514-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11390-025-5514-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-5514-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T04:25:37Z","timestamp":1763699137000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11390-025-5514-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":42,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["5514"],"URL":"https:\/\/doi.org\/10.1007\/s11390-025-5514-9","relation":{},"ISSN":["1000-9000","1860-4749"],"issn-type":[{"value":"1000-9000","type":"print"},{"value":"1860-4749","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]},"assertion":[{"value":"1 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Conflict of Interest\n                      The authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics"}}]}}