{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T21:31:32Z","timestamp":1771018292396,"version":"3.50.1"},"reference-count":253,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. 
Software Eng."],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1109\/tse.2025.3644183","type":"journal-article","created":{"date-parts":[[2025,12,17]],"date-time":"2025-12-17T18:45:59Z","timestamp":1765997159000},"page":"651-674","source":"Crossref","is-referenced-by-count":1,"title":["Benchmarking AI Models in Software Engineering: A Review, Search Tool, and Unified Approach for Elevating Benchmark Quality"],"prefix":"10.1109","volume":"52","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1649-9596","authenticated-orcid":false,"given":"Roham","family":"Koohestani","sequence":"first","affiliation":[{"name":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3722-5428","authenticated-orcid":false,"given":"Philippe de","family":"Bekker","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6686-6008","authenticated-orcid":false,"given":"Beg\u00fcm","family":"Ko\u00e7","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5093-5523","authenticated-orcid":false,"given":"Maliheh","family":"Izadi","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Mathematics, and Computer Science (EEMCS), Delft University of Technology, Delft, XE, The Netherlands"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref2","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"ref3","article-title":"GPT-4 technical 
report","year":"2023"},{"key":"ref4","doi-asserted-by":"crossref","DOI":"10.1145\/3650105.3652289","article-title":"Investigating the performance of language models for completing code in functional programming languages: A Haskell case study","author":"van Dam","year":"2024"},{"key":"ref5","article-title":"Multi-lingual evaluation of code generation models","author":"Athiwaratkun","year":"2022"},{"key":"ref6","article-title":"MultiPL-E: A scalable and extensible approach to benchmarking neural code generation","author":"Cassano","year":"2022"},{"key":"ref7","article-title":"OctoPACK: Instruction tuning code large language models","author":"Muennighoff","year":"2023"},{"key":"ref8","article-title":"CodeGeeX: A pre-trained model for code generation with multilingual evaluations on HumanEval-X","author":"Zheng","year":"2023"},{"key":"ref9","article-title":"CodeScore: Evaluating code generation by learning code execution","author":"Dong","year":"2023"},{"key":"ref10","article-title":"Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation","author":"Liu","year":"2023"},{"key":"ref11","article-title":"AI for code: Predict code complexity using IBM\u2019s CodeNet dataset","year":"2021"},{"key":"ref12","article-title":"HumanEval-XL: A multilingual code generation benchmark for cross-lingual natural language generalization","author":"Peng","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.570"},{"key":"ref14","article-title":"Top leaderboard ranking = Top coding proficiency, always? 
EvoEval: Evolving coding benchmarks via LLM","author":"Xia","year":"2024"},{"key":"ref15","article-title":"Program synthesis with large language models","author":"Austin","year":"2021"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1126\/science.abq1158"},{"key":"ref17","article-title":"Measuring coding challenge competence with APPS","author":"Hendrycks","year":"2021"},{"key":"ref18","article-title":"LiveCodeBench: Holistic and contamination free evaluation of large language models for code","author":"Jain","year":"2024"},{"key":"ref19","article-title":"Is ChatGPT the ultimate programming assistant \u2013 How far is it?","author":"Tian","year":"2023"},{"key":"ref20","article-title":"CodeElo: Benchmarking competition-level code generation of LLMs with human-comparable Elo ratings","author":"Quan","year":"2025"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/LLM4Code66737.2025.00014"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.365"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.449"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.299"},{"key":"ref25","article-title":"LiveCodeBench Pro: How do Olympiad medalists judge LLMs in competitive programming?","author":"Zheng","year":"2025"},{"key":"ref26","article-title":"Learning based methods for code runtime complexity prediction","author":"Sikka","year":"2019"},{"key":"ref27","article-title":"Tasty: A transformer based approach to space and time complexity","author":"Moudgalya","year":"2023"},{"key":"ref28","article-title":"CodeComplex: A time-complexity dataset for bilingual source codes","author":"Baik","year":"2024"},{"key":"ref29","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.findings-emnlp.996","article-title":"PythonSaga: Redefining the benchmark to evaluate code generating LLM","author":"Yadav","year":"2024"},{"key":"ref30","article-title":"EffiBench: Benchmarking 
the efficiency of automatically generated code","author":"Huang","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3736407"},{"key":"ref32","article-title":"Learning performance-improving code edits","author":"Shypula","year":"2024"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3715727"},{"key":"ref34","article-title":"DS-1000: A natural and reliable benchmark for data science code generation","author":"Lai","year":"2022"},{"key":"ref35","doi-asserted-by":"crossref","DOI":"10.24963\/ijcai.2022\/329","article-title":"CERT: Continual pre-training on sketches for library-oriented code generation","author":"Zan","year":"2022"},{"key":"ref36","first-page":"5436","article-title":"JuICe: A large scale distantly supervised dataset for open domain context-based code generation","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process. (EMNLP-IJCNLP)","author":"Agashe","year":"2019"},{"key":"ref37","article-title":"Training and evaluating a jupyter notebook data science assistant","author":"Chandel","year":"2022"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.dash-1.5"},{"key":"ref39","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.acl-long.308","article-title":"Benchmarking data science agents","author":"Zhang","year":"2024"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.21"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btae230"},{"key":"ref42","article-title":"WebApp1K: A practical code-generation benchmark for web app development","author":"Cui","year":"2024"},{"key":"ref43","article-title":"Measuring coding challenge competence with APPS","volume-title":"Proc. 
NeurIPS Datasets Benchmarks","author":"Hendrycks","year":"2021"},{"key":"ref44","article-title":"Let\u2019s verify step by step","author":"Lightman","year":"2023"},{"key":"ref45","article-title":"MathQA: Towards interpretable math word problem solving with operation-based formalisms","author":"Amini","year":"2019"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.392"},{"key":"ref47","first-page":"1743","article-title":"Solving general arithmetic word problems","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Roy","year":"2015"},{"key":"ref48","article-title":"PAL: Program-aided language models","author":"Gao","year":"2022"},{"key":"ref49","first-page":"7889","article-title":"TheoremQA: A theorem-driven question answering dataset","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Chen","year":"2023"},{"key":"ref50","first-page":"12690","article-title":"PECC: Problem extraction and coding challenges","volume-title":"Proc. Int. Conf. Lang. Resour. Eval.","author":"Haller","year":"2024"},{"key":"ref51","article-title":"BRIGHT: A realistic and challenging benchmark for reasoning-intensive retrieval","author":"Su","year":"2024"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3196398.3196408"},{"key":"ref53","article-title":"MCoNaLa: A benchmark for code generation from multiple natural languages","author":"Wang","year":"2022"},{"key":"ref54","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2021.nlp4prog-1.8","article-title":"Reading StackOverflow encourages cheating: Adding question text improves extractive code generation","author":"Orlanski","year":"2021"},{"key":"ref55","article-title":"AixBench: A code generation benchmark dataset","author":"Hao","year":"2022"},{"key":"ref56","first-page":"5690","article-title":"CoSQA: 20,000+ web queries for code search and question answering","volume-title":"Proc. 59th Annu. Meeting Assoc. Comput. Linguistics 11th Int. 
Joint Conf. Natural Lang. Process.","author":"Huang","year":"2021"},{"key":"ref57","article-title":"CodeXGLUE: A machine learning benchmark dataset for code understanding and generation","author":"Lu","year":"2021"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-1192"},{"key":"ref59","article-title":"CodeGen: An open large language model for code with multi-turn program synthesis","author":"Nijkamp","year":"2023"},{"key":"ref60","article-title":"Experimenting a new programming practice with LLMs","author":"Zhang","year":"2024"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/s10515-022-00331-3"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3449639.3459285"},{"key":"ref63","article-title":"TACO: Topics in algorithmic COde generation dataset","author":"Li","year":"2023"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/icst62969.2025.10989005"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3630009"},{"key":"ref66","first-page":"995","article-title":"RMCBench: Benchmarking large language models\u2019 resistance to malicious code","volume-title":"Proc. 39th IEEE\/ACM Int. Conf. Autom. Softw. Eng.","author":"Chen","year":"2024"},{"key":"ref67","first-page":"321","article-title":"EVIL: Exploiting software via natural language","volume-title":"Proc. IEEE 32nd Int. Symp. Softw. Rel. Eng. (ISSRE)","author":"Liguori","year":"2021"},{"key":"ref68","article-title":"CodeBenchGen: Creating scalable execution-based code generation benchmarks","author":"Xie","year":"2024"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-industry.89"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-industry.89"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.9"},{"key":"ref72","first-page":"36976","article-title":"StackEval: Benchmarking LLMs in coding assistance","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"37","author":"Shah","year":"2024"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/Forge66646.2025.00015"},{"key":"ref74","article-title":"CoSQA+: Pioneering the multi-choice code search benchmark with test-driven agents","author":"Gong","year":"2025"},{"key":"ref75","article-title":"InfiCoder-Eval: Systematically evaluating question-answering for code large language models","year":"2023"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3196321.3196334"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/s10664-019-09730-9"},{"key":"ref78","article-title":"Binary code summarization: Benchmarking ChatGPT\/GPT-4 and other large language models","author":"Jin","year":"2023"},{"key":"ref79","article-title":"A convolutional attention network for extreme summarization of source code","author":"Allamanis","year":"2016"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2019.00087"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/314"},{"key":"ref82","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2021.findings-acl.18","article-title":"CoDesc: A large code-description parallel dataset","author":"Hasan","year":"2021"},{"key":"ref83","article-title":"A parallel corpus of Python functions and documentation strings for automated code documentation and code generation","author":"Barone"},{"key":"ref84","doi-asserted-by":"crossref","DOI":"10.1109\/MSR66628.2025.00077","article-title":"CoDocBench: A dataset for code-documentation alignment in software maintenance","author":"Pai","year":"2025"},{"key":"ref85","first-page":"795","article-title":"How effectively do code language models understand poor-readability code?","volume-title":"Proc. 39th IEEE\/ACM Int. Conf. Autom. Softw. 
Eng.","author":"Hu","year":"2024"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2024.112149"},{"key":"ref87","first-page":"42330","article-title":"Can LLM already serve as a database interface? A big bench for large-scale database grounded text-to-SQLs","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Li","year":"2023"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.176"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/3178876.3186081"},{"key":"ref90","article-title":"Spider 2.0: Evaluating language models on real-world enterprise text-to-SQL workflows","author":"Lei","year":"2024"},{"key":"ref91","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2021.acl-long.195","article-title":"Towards robustness of text-to-SQL models against synonym substitution","author":"Gan","year":"2021"},{"key":"ref92","first-page":"1337","article-title":"Structure-grounded pretraining for text-to-SQL","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics, Human Lang. Technol.","author":"Deng","year":"2021"},{"key":"ref93","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2021.emnlp-main.702","article-title":"Exploring underexplored limitations of cross-domain text-to-SQL generalization","author":"Gan","year":"2021"},{"key":"ref94","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/D19-1377","article-title":"A pilot study for Chinese SQL semantic parsing","author":"Min","year":"2019"},{"key":"ref95","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/P19-1443","article-title":"SParC: Cross-domain semantic parsing in context","author":"Yu","year":"2019"},{"key":"ref96","first-page":"4238","article-title":"Lyra: A benchmark for turducken-style code generation","volume-title":"Proc. 31st Int. Joint Conf. Artif. Intell.","author":"Liang","year":"2022"},{"key":"ref97","first-page":"6923","article-title":"DuSQL: A large-scale and pragmatic Chinese text-to-SQL dataset","volume-title":"Proc. Conf. 
Empirical Methods Natural Lang. Process. (EMNLP)","author":"Wang","year":"2020"},{"key":"ref98","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/D19-1204","article-title":"CoSQL: A conversational text-to-SQL Challenge towards cross-domain natural language interfaces to databases","author":"Yu","year":"2019"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.14778\/3749646.3749723"},{"key":"ref100","first-page":"2355","article-title":"PAUQ: Text-to-SQL in Russian","volume-title":"Proc. Findings Assoc. Computat. Linguistics (EMNLP)","author":"Bakshandaeva","year":"2022"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1145\/3605098.3636065"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/UBMK59864.2023.10286686"},{"key":"ref103","article-title":"Instruction-tuning aligns LLMs to the human brain","author":"Aw","year":"2023"},{"key":"ref104","article-title":"DevBench: A comprehensive benchmark for software development","author":"Li","year":"2024"},{"key":"ref105","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.findings-acl.214","article-title":"DevEval: A manually-annotated code generation benchmark aligned with real-world code repositories","author":"Li","year":"2024"},{"key":"ref106","doi-asserted-by":"crossref","DOI":"10.1145\/3650212.3652115","article-title":"CoderUJB: An executable and unified Java benchmark for practical programming scenarios","author":"Zeng","year":"2024"},{"key":"ref107","article-title":"ToolQA: A dataset for LLM question answering with external tools","author":"Zhuang","year":"2023"},{"key":"ref108","article-title":"MINT: Evaluating LLMs in multi-turn interaction with tools and language feedback","author":"Wang","year":"2024"},{"key":"ref109","article-title":"Evaluation of LLMs on syntax-aware code fill-in-the-middle tasks","author":"Gong","year":"2024"},{"key":"ref110","article-title":"AgentBench: Evaluating LLMs as 
agents","author":"Liu","year":"2023"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.633"},{"key":"ref112","article-title":"ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation","author":"Du","year":"2023"},{"key":"ref113","article-title":"BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions","author":"Zhuo","year":"2024"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.808"},{"key":"ref115","article-title":"CodeSense: A real-world benchmark and dataset for code semantic reasoning","author":"Roy","year":"2025"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1145\/3728940"},{"key":"ref117","article-title":"SWE-Bench: Can language models resolve real-world GitHub issues?","author":"Jimenez","year":"2023"},{"key":"ref118","article-title":"CrossCodeEval: A diverse and multilingual benchmark for cross-file code completion","author":"Ding","year":"2023"},{"key":"ref119","article-title":"CoderEval: A benchmark of pragmatic code generation with generative pre-trained models","author":"Yu","year":"2023"},{"key":"ref120","article-title":"Guiding language models of code with global context using monitors","author":"Agrawal","year":"2023"},{"key":"ref121","first-page":"131","article-title":"Evaluating clone detection tools with BigCloneBench","volume-title":"Proc. IEEE Int. Conf. Softw. Maintenance Evol. 
(ICSME)","author":"Svajlenko","year":"2015"},{"key":"ref122","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2025.findings-acl.528","article-title":"DI-BENCH: Benchmarking large language models on dependency inference with testable repositories at scale","author":"Zhang","year":"2025"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1145\/3643742"},{"key":"ref124","article-title":"Multi-SWE-Bench: A multilingual benchmark for issue resolving","author":"Zan","year":"2025"},{"key":"ref125","article-title":"KernelBench: Can LLMs Write Efficient GPU Kernels?","author":"Ouyang","year":"2025"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.140"},{"key":"ref127","article-title":"CodeEditorBench: Evaluating code editing capability of LLMs","volume-title":"Proc. ICLR 3rd Workshop Deep Learn. Code","author":"Guo","year":"2025"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1036"},{"key":"ref129","article-title":"RepoBench: Benchmarking repository-level code auto-completion systems","author":"Liu","year":"2023"},{"key":"ref130","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.emnlp-main.151","article-title":"RepoCoder: Repository-level code completion through iterative retrieval and generation","author":"Zhang","year":"2023"},{"key":"ref131","article-title":"EvoCodeBench: An evolving code generation benchmark aligned with real-world code repositories","author":"Li","year":"2024"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1145\/3768577"},{"key":"ref133","article-title":"ML-Bench: Evaluating large language models and agents for machine learning tasks on repository-level code","author":"Tang","year":"2024"},{"key":"ref134","first-page":"434","article-title":"CodeGen4Libs: A two-stage approach for library-oriented code generation","volume-title":"Proc. 38th IEEE\/ACM Int. Conf. Autom. Softw. Eng. 
(ASE)","author":"Liu","year":"2023"},{"key":"ref135","article-title":"SWE-Rebench: An automated pipeline for task collection and decontaminated evaluation of software engineering agents","author":"Badertdinov","year":"2025"},{"key":"ref136","article-title":"SWE-PolyBench: A multi-language benchmark for repository level evaluation of coding agents","author":"Rashid","year":"2025"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE55347.2025.00228"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1204"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.839"},{"key":"ref140","first-page":"870","article-title":"JavaBench: A benchmark of object-oriented code generation for evaluating large language models","volume-title":"Proc. 39th IEEE\/ACM Int. Conf. Automat. Softw. Eng.","author":"Cao","year":"2024"},{"key":"ref141","article-title":"SWE-bench goes live!","author":"Zhang","year":"2025"},{"key":"ref142","article-title":"SWE-Lancer: Can frontier LLMs earn one million from real-world freelance software engineering?","author":"Miserendino","year":"2025"},{"key":"ref143","article-title":"RestGPT: Connecting large language models with real-world restful APIs","author":"Song","year":"2023"},{"key":"ref144","article-title":"Revisiting, benchmarking and exploring API recommendation: How far are we?","author":"Peng","year":"2021"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1145\/3238147.3238191"},{"key":"ref146","article-title":"Gorilla: Large language model connected with massive APIs","author":"Patil","year":"2023"},{"key":"ref147","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.emnlp-main.187","article-title":"API-bank: A comprehensive benchmark for tool-augmented LLMs","author":"Li","year":"2023"},{"key":"ref148","article-title":"CodeRAG-Bench: Can retrieval augment code 
generation?","author":"Wang","year":"2024"},{"key":"ref149","doi-asserted-by":"crossref","DOI":"10.1109\/MSR52588.2021.00077","article-title":"Search4Code: Code search intent classification using weak supervision","author":"Rao","year":"2021"},{"key":"ref150","article-title":"CoIR: A comprehensive benchmark for code information retrieval models","author":"Li","year":"2024"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3639133"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1145\/3180155.3180260"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i19.30185"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/COMPSAC.2019.00012"},{"key":"ref155","first-page":"11906","article-title":"SPOC: Search-based pseudocode to code","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Kulal","year":"2019"},{"key":"ref156","article-title":"NAPS: Natural program synthesis dataset","author":"Zavershynskyi","year":"2018"},{"key":"ref157","article-title":"Isolating language-coding from problem-solving: Benchmarking LLMs with PseudoEval","author":"Wu","year":"2025"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2015.36"},{"key":"ref159","article-title":"Seq2SQL: Generating structured queries from natural language using reinforcement learning","author":"Zhong","year":"2017"},{"key":"ref160","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/D18-1425","article-title":"Spider: A large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-SQL task","author":"Yu","year":"2018"},{"key":"ref161","article-title":"NL2Bash: A corpus and semantic parser for natural language interface to the Linux operating system","volume-title":"Proc. Computat. 
Lang.","author":"Lin","year":"2018"},{"key":"ref162","article-title":"Leveraging automated unit tests for unsupervised code translation","author":"Roziere","year":"2022"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21434"},{"key":"ref164","first-page":"2268","article-title":"Avatar: A parallel corpus for Java-Python program translation","volume-title":"Proc. Findings Assoc. Computat. Linguistics","author":"Ahmad","year":"2023"},{"key":"ref165","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.findings-emnlp.337","article-title":"CodeTransOcean: A comprehensive multilingual benchmark for code translation","author":"Yan","year":"2023"},{"key":"ref166","first-page":"1529","article-title":"On the evaluation of neural code translation: Taxonomy and benchmark","volume-title":"Proc. 38th IEEE\/ACM Int. Conf. Automated Softw. Eng. (ASE)","author":"Jiao","year":"2023"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1145\/3611643.3616350"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/ase63991.2025.00057"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.3233\/faia240968"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/tse.2025.3645056"},{"key":"ref171","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.acl-long.802","article-title":"IRCoder: Intermediate representations make language models robust multilingual code generators","author":"Paul","year":"2024"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1145\/2610384.2628055"},{"key":"ref173","first-page":"118","article-title":"GitBug-Java: A reproducible benchmark of recent Java bugs","volume-title":"Proc. IEEE\/ACM 21st Int. Conf. Mining Softw. 
Repositories (MSR)","author":"Silva","year":"2024"},{"key":"ref174","article-title":"A critical review of large language model on software engineering: An example from ChatGPT and automated program repair","author":"Zhang","year":"2023"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1145\/3650212.3680328"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/icst60714.2024.00049"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2015.2454513"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.247"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1145\/3135932.3135941"},{"key":"ref180","article-title":"Res-Q: Evaluating code-editing large language model systems at the repository scale","author":"Labash","year":"2024"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.501"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2019.00044"},{"key":"ref183","article-title":"ConDefects: A new dataset to address the data leakage concern for LLM-based fault localization and program repair","author":"Wu","year":"2023"},{"key":"ref184","first-page":"73","article-title":"Cerberus: A program repair framework","volume-title":"Proc. IEEE\/ACM 45th Int. Conf. Softw. Eng.: Companion Proc. 
(ICSE-Companion)","author":"Shariffdeen","year":"2023"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/LLM4Code66737.2025.00006"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/Forge66646.2025.00023"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417943"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1145\/3475960.3475985"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/MSR59073.2023.00084"},{"key":"ref190","doi-asserted-by":"crossref","DOI":"10.1145\/3549035.3561184","article-title":"SecurityEval dataset: Mining vulnerability examples to evaluate machine learning-based code generation techniques","author":"Siddiq","year":"2022"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1145\/3524842.3528482"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1145\/3617555.3617874"},{"key":"ref193","first-page":"1282","article-title":"How effective are neural networks for fixing security vulnerabilities","volume-title":"Proc. 32nd ACM SIGSOFT Int. Symp. Softw. Testing Anal.","author":"Wu","year":"2023"},{"key":"ref194","doi-asserted-by":"crossref","first-page":"530","DOI":"10.1145\/3377811.3380364","article-title":"Empirical review of automated analysis tools on 47,587 Ethereum smart contracts","volume-title":"Proc. ACM\/IEEE 42nd Int. Conf. Softw. 
Eng.","author":"Durieux","year":"2020"},{"key":"ref195","article-title":"Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks","author":"Zhou","year":"2019"},{"key":"ref196","doi-asserted-by":"crossref","DOI":"10.1109\/ICSE-SEIP52600.2021.00020","article-title":"D2A: A Dataset built for AI-Based vulnerability detection methods using differential analysis","author":"Zheng","year":"2021"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1145\/3379597.3387501"},{"key":"ref198","article-title":"ARVO: Atlas of reproducible vulnerabilities for open source software","author":"Mei","year":"2024"},{"key":"ref199","article-title":"VADER: A human-evaluated benchmark for vulnerability assessment, detection, explanation, and remediation","author":"Liu","year":"2025"},{"key":"ref200","article-title":"There are more fish in the sea: Automated vulnerability repair via binary templates","author":"Lin","year":"2024"},{"key":"ref201","article-title":"Predicting code coverage without execution","author":"Tufano","year":"2023"},{"key":"ref202","doi-asserted-by":"crossref","first-page":"1398","DOI":"10.1145\/3377811.3380429","article-title":"On learning meaningful assert statements for unit test cases","volume-title":"Proc. ACM\/IEEE 42nd Int. Conf. Softw. Eng.","author":"Watson","year":"2020"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1145\/3691620.3695501"},{"key":"ref204","article-title":"Code generation tools (almost) for free? 
A study of few-shot, pre-trained language models on code","author":"Barei\u00df","year":"2022"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510068"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1145\/3650212.3652115"},{"key":"ref207","article-title":"Testbench: Evaluating class-level test case generation capability of large language models","author":"Zhang","year":"2024"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.197"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2025.3541166"},{"key":"ref210","article-title":"ProjectTest: A project-level unit test generation benchmark and impact of error fixing mechanisms","author":"Wang","year":"2025"},{"key":"ref211","article-title":"Clover: A test case generation benchmark with coverage, long-context, and verification","author":"Xu","year":"2025"},{"key":"ref212","article-title":"Unit test case generation with transformers and focal context","author":"Tufano","year":"2020"},{"key":"ref213","article-title":"CruxEval: A benchmark for code reasoning, understanding and execution","author":"Gu","year":"2024"},{"key":"ref214","article-title":"CRQBench: A benchmark of code reasoning questions","author":"Dinella","year":"2024"},{"key":"ref215","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.findings-acl.91","article-title":"CriticBench: Benchmarking LLMs for critique-correct reasoning","author":"Lin","year":"2024"},{"key":"ref216","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.acl-long.301","article-title":"CodeScope: An execution-based multilingual multitask multidimensional benchmark for evaluating LLMs on code understanding and generation","author":"Yan","year":"2024"},{"key":"ref217","article-title":"CodeCriticBench: A holistic code critique benchmark for large language 
models","author":"Zhang","year":"2025"},{"key":"ref218","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2025.acl-long.1158","article-title":"CruxEval-X: A benchmark for multilingual code reasoning, understanding and execution","author":"Xu","year":"2025"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2024.112084"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1145\/3639478.3640033"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1145\/3540250.3549081"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME46990.2020.00035"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1007\/s10664-024-10592-z"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1145\/3540250.3549081"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3623306"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.476"},{"key":"ref227","article-title":"Exploring and evaluating hallucinations in LLM-powered code generation","author":"Liu","year":"2024"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34717"},{"key":"ref229","article-title":"Collu-Bench: A benchmark for predicting language model hallucinations in code","author":"Jiang","year":"2024"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00128"},{"key":"ref231","article-title":"CodeMirage: Hallucinations in code generated by large language models","author":"Agarwal","year":"2024"},{"key":"ref232","doi-asserted-by":"crossref","DOI":"10.1145\/3661167.3661216","article-title":"Using large language models to generate JUnit tests: An empirical study","author":"Siddiq","year":"2024"},{"key":"ref233","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","author":"Srivastava","year":"2022"},{"key":"ref234","article-title":"XLCoST: A benchmark dataset for cross-lingual code 
intelligence","author":"Zhu","year":"2022"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00055"},{"key":"ref236","article-title":"Long code arena: A set of benchmarks for long-context code models","author":"Bogomolov","year":"2024"},{"key":"ref237","article-title":"CodeSearchNet challenge: Evaluating the state of semantic code search","author":"Husain","year":"2019"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34811"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3511561"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/tse.2024.3475375"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1145\/3643754"},{"key":"ref242","article-title":"AL-Bench: A benchmark for automatic logging","author":"Tan","year":"2025"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1145\/3728969"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE59848.2023.00071"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1145\/3650212.3652123"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/SANER60148.2024.00077"},{"key":"ref247","article-title":"LogEval: A comprehensive benchmark suite for large language models in log analysis","author":"Cui","year":"2024"},{"key":"ref248","doi-asserted-by":"crossref","DOI":"10.1109\/ICSE-FoSE59343.2023.00008","article-title":"Large language models for software engineering: Survey and open problems","author":"Fan","year":"2023"},{"key":"ref249","doi-asserted-by":"crossref","DOI":"10.1145\/3695988","article-title":"Large language models for software engineering: A systematic literature review","author":"Hou","year":"2024"},{"key":"ref250","doi-asserted-by":"crossref","DOI":"10.1016\/j.jss.2023.111796","article-title":"A systematic literature review on source code similarity measurement and clone detection: Techniques, applications, and 
challenges","author":"Zakeri-Nasrabadi","year":"2023"},{"key":"ref251","article-title":"Software development life cycle perspective: A survey of benchmarks for code large language models and agents","author":"Wang","year":"2025"},{"key":"ref252","article-title":"Hugging Face","year":"2016"},{"key":"ref253","article-title":"Dynamic benchmarking of reasoning capabilities in code large language models under data contamination","author":"Chen","year":"2025"}],"container-title":["IEEE Transactions on Software Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/32\/11395383\/11301810.pdf?arnumber=11301810","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T20:49:43Z","timestamp":1771015783000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11301810\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":253,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tse.2025.3644183","relation":{},"ISSN":["0098-5589","1939-3520","2326-3881"],"issn-type":[{"value":"0098-5589","type":"print"},{"value":"1939-3520","type":"electronic"},{"value":"2326-3881","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]}}}