{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:42:48Z","timestamp":1777866168414,"version":"3.51.4"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T00:00:00Z","timestamp":1775520000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T00:00:00Z","timestamp":1775520000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Empir Software Eng"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1007\/s10664-026-10856-w","type":"journal-article","created":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T13:27:05Z","timestamp":1775568425000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Robustness evaluation and enhancement of LLMs in code generation: an empirical study"],"prefix":"10.1007","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4839-7153","authenticated-orcid":false,"given":"Jincheng","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Senrong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuan","family":"Yao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yibin","family":"Shen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yicong","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaorui","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ping","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feng","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoxing","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,4,7]]},"reference":[{"key":"10856_CR1","unstructured":"Achiam J, Adler S, Agarwal S, Ahmad L, Akkaya I, Aleman FL, Almeida D, Altenschmidt J, Altman S, Anadkat S et\u00a0al (2023) Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"10856_CR2","unstructured":"Alon U, Brody S, Levy O, Yahav E (2018) code2seq: Generating sequences from structured representations of code. arXiv preprint arXiv:1808.01400"},{"issue":"POPL","key":"10856_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3290353","volume":"3","author":"U Alon","year":"2019","unstructured":"Alon U, Zilberstein M, Levy O, Yahav E (2019) code2vec: Learning distributed representations of code. Proc ACM Program Lang 3(POPL):1\u201329","journal-title":"Proc ACM Program Lang"},{"key":"10856_CR4","unstructured":"Athalye A, Carlini N, Wagner D (2018) Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples. In: International conference on machine learning, pp 274\u2013283. PMLR"},{"key":"10856_CR5","unstructured":"Austin J, Odena A, Nye M, Bosma M, Michalewski H, Dohan D, Jiang E, Cai C, Terry M, Le Q et\u00a0al (2021) Program synthesis with large language models. arXiv preprint arXiv:2108.07732"},{"key":"10856_CR6","unstructured":"Ben\u00a0Allal L, Muennighoff N, Kumar\u00a0Umapathi L, Lipkin B, von Werra L (2022) A framework for the evaluation of code generation models. https:\/\/github.com\/bigcode-project\/bigcode-evaluation-harness"},{"key":"10856_CR7","unstructured":"Brockschmidt M (2020) Gnn-film: Graph neural networks with feature-wise linear modulation. In: International conference on machine learning. PMLR, pp 1144\u20131152"},{"key":"10856_CR8","unstructured":"Chaudhary S (2023) Code alpaca: An instruction-following llama model for code generation"},{"key":"10856_CR9","unstructured":"Chen M, Tworek J, Jun H, Yuan Q, Pinto HPDO, Kaplan J, Edwards H, Burda Y, Joseph N, Brockman G et\u00a0al (2021) Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374"},{"key":"10856_CR10","doi-asserted-by":"crossref","unstructured":"Feng Z, Guo D, Tang D, Duan N, Feng X, Gong M, Shou L, Qin B, Liu T, Jiang D et\u00a0al (2020) Codebert: A pre-trained model for programming and natural languages. In: Findings of the association for computational linguistics: EMNLP 2020, pp 1536\u20131547","DOI":"10.18653\/v1\/2020.findings-emnlp.139"},{"key":"10856_CR11","unstructured":"Fried D, Aghajanyan A, Lin J, Wang S, Wallace E, Shi F, Zhong R, Yih S, Zettlemoyer L, Lewis M (2023) Incoder: A generative model for code infilling and synthesis. In: The eleventh international conference on learning representations"},{"issue":"PLDI","key":"10856_CR12","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1145\/3591227","volume":"7","author":"F Gao","year":"2023","unstructured":"Gao F, Wang Y, Wang K (2023) Discrete adversarial attack to models of code. Proc ACM Program Lang 7(PLDI):172\u2013195","journal-title":"Proc ACM Program Lang"},{"key":"10856_CR13","unstructured":"Guo D, Ren S, Lu S, Feng Z, Tang D, Liu S, Zhou L, Duan N, Svyatkovskiy A, Fu S et\u00a0al (2020) Graphcodebert: Pre-training code representations with data flow. arXiv preprint arXiv:2009.08366"},{"key":"10856_CR14","doi-asserted-by":"crossref","unstructured":"Henkel J, Ramakrishnan G, Wang Z, Albarghouthi A, Jha S, Reps T (2022) Semantic robustness of models of source code. In: 2022 IEEE international conference on software analysis, evolution and reengineering (SANER), pp 526\u2013537. IEEE","DOI":"10.1109\/SANER53432.2022.00070"},{"issue":"8","key":"10856_CR15","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"10856_CR16","unstructured":"Hossen MI, Hei X (2024) On the adversarial robustness of instruction-tuned large language models for code. arXiv preprint arXiv:2411.19508"},{"issue":"2","key":"10856_CR17","first-page":"3","volume":"1","author":"EJ Hu","year":"2022","unstructured":"Hu EJ, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W et al (2022) Lora: Low-rank adaptation of large language models. ICLR 1(2):3","journal-title":"ICLR"},{"key":"10856_CR18","unstructured":"Hui B, Yang J, Cui Z, Yang J, Liu D, Zhang L, Liu T, Zhang J, Yu B, Lu K et\u00a0al (2024) Qwen2. 5-coder technical report. arXiv preprint arXiv:2409.12186"},{"key":"10856_CR19","unstructured":"Hurst A, Lerer A, Goucher AP, Perelman A, Ramesh A, Clark A, Ostrow A, Welihinda A, Hayes A, Radford A et\u00a0al (2024) Gpt-4o system card. arXiv preprint arXiv:2410.21276"},{"key":"10856_CR20","unstructured":"Jain N, Han K, Gu A, Li WD, Yan F, Zhang T, Wang S, Solar-Lezama A, Sen K, Stoica I (2024) Livecodebench: Holistic and contamination free evaluation of large language models for code. arXiv preprint arXiv:2403.07974"},{"key":"10856_CR21","unstructured":"Jimenez CE, Yang J, Wettig A, Yao S, Pei K, Press O, Narasimhan KR (2024) Swe-bench: Can language models resolve real-world github issues? In: The twelfth international conference on learning representations"},{"key":"10856_CR22","doi-asserted-by":"crossref","unstructured":"Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu CH, Gonzalez J, Zhang H, Stoica I (2023) Efficient memory management for large language model serving with pagedattention. In: Proceedings of the 29th symposium on operating systems principles, pp 611\u2013626","DOI":"10.1145\/3600006.3613165"},{"key":"10856_CR23","unstructured":"Li Y, Tarlow D, Brockschmidt M, Zemel R (2015) Gated graph sequence neural networks. arXiv preprint arXiv:1511.05493"},{"key":"10856_CR24","doi-asserted-by":"crossref","unstructured":"Li Z, Chen G, Chen C, Zou Y, Xu S (2022) Ropgen: Towards robust code authorship attribution via automatic coding style transformation. In: Proceedings of the 44th international conference on software engineering, pp 1906\u20131918","DOI":"10.1145\/3510003.3510181"},{"key":"10856_CR25","unstructured":"Liu A, Feng B, Wang B, Wang B, Liu B, Zhao C, Dengr C, Ruan C, Dai D, Guo D et\u00a0al (2024) Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434"},{"key":"10856_CR26","unstructured":"Liu T, Xu C, McAuley J (2024) Repobench: Benchmarking repository-level code auto-completion systems. In: The twelfth international conference on learning representations"},{"key":"10856_CR27","unstructured":"Merrill MA, Shaw AG, Carlini N, Li B, Raj H, Bercovich I, Shi L, Shin JY, Walshe T, Buchanan EK et\u00a0al (2026) Terminal-bench: Benchmarking agents on hard, realistic tasks in command line interfaces. arXiv preprint arXiv:2601.11868"},{"key":"10856_CR28","unstructured":"Nijkamp E, Pang B, Hayashi H, Tu L, Wang H, Zhou Y, Savarese S, Xiong C (2022) A conversational paradigm for program synthesis. arXiv preprint arXiv:2203.13474"},{"key":"10856_CR29","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A et al (2022) Training language models to follow instructions with human feedback. Adv Neural Inf Process Syst 35:27730\u201327744","journal-title":"Adv Neural Inf Process Syst"},{"key":"10856_CR30","unstructured":"Quiring E, Maier A, Rieck K (2019) Misleading authorship attribution of source code using adversarial learning. In: 28th USENIX security symposium (USENIX Security 19), pp 479\u2013496"},{"key":"10856_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2021.106552","volume":"135","author":"MRI Rabin","year":"2021","unstructured":"Rabin MRI, Bui ND, Wang K, Yu Y, Jiang L, Alipour MA (2021) On the generalizability of neural program models with respect to semantic-preserving program transformations. Inf Softw Technol 135:106552","journal-title":"Inf Softw Technol"},{"key":"10856_CR32","unstructured":"Ren S, Guo D, Lu S, Zhou L, Liu S, Tang D, Sundaresan N, Zhou M, Blanco A, Ma S (2020) Codebleu: a method for automatic evaluation of code synthesis. arXiv preprint arXiv:2009.10297"},{"key":"10856_CR33","unstructured":"Roziere B, Gehring J, Gloeckle F, Sootla S, Gat I, Tan XE, Adi Y, Liu J, Sauvestre R, Remez T et\u00a0al (2023) Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950"},{"key":"10856_CR34","unstructured":"Sanh V, Webson A, Raffel C, Bach SH, Sutawika L, Alyafeai Z, Chaffin A, Stiegler A, Scao TL, Raja A et\u00a0al (2021) Multitask prompted training enables zero-shot task generalization. arXiv preprint arXiv:2110.08207"},{"key":"10856_CR35","doi-asserted-by":"crossref","unstructured":"Shafahi A, Najibi M, Xu Z, Dickerson J, Davis LS, Goldstein T (2020) Universal adversarial training. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a034, pp 5636\u20135643","DOI":"10.1609\/aaai.v34i04.6017"},{"key":"10856_CR36","unstructured":"Srikant S, Liu S, Mitrovska T, Chang S, Fan Q, Zhang G, O\u2019Reilly UM (2021) Generating adversarial computer programs using optimized obfuscations. In: International conference on learning representations"},{"key":"10856_CR37","unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux MA, Lacroix T, Rozi\u00e8re B, Goyal N, Hambro E, Azhar F et\u00a0al (2023) Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"10856_CR38","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"10856_CR39","unstructured":"Wang B, Komatsuzaki A (2021) Gpt-j-6b: A 6 billion parameter autoregressive language model"},{"key":"10856_CR40","doi-asserted-by":"crossref","unstructured":"Wang S, Li Z, Qian H, Yang C, Wang Z, Shang M, Kumar V, Tan S, Ray B, Bhatia P et\u00a0al (2023) Recode: Robustness evaluation of code generation models. In: The 61st annual meeting of the association for computational linguistics","DOI":"10.18653\/v1\/2023.acl-long.773"},{"key":"10856_CR41","doi-asserted-by":"crossref","unstructured":"Yang Z, Shi J, He J, Lo D (2022) Natural attack for pre-trained models of code. In: Proceedings of the 44th international conference on software engineering, pp 1482\u20131493","DOI":"10.1145\/3510003.3510146"},{"key":"10856_CR42","unstructured":"Yang A, Yang B, Zhang B, Hui B, Zheng B, Yu B, Li C, Liu D, Huang F, Wei H et\u00a0al (2024) Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115"},{"key":"10856_CR43","doi-asserted-by":"crossref","unstructured":"Yefet N, Alon U, Yahav E (2020) Adversarial examples for models of code. Proc ACM Program Lang 4(OOPSLA):1\u201330","DOI":"10.1145\/3428230"},{"issue":"6","key":"10856_CR44","doi-asserted-by":"publisher","first-page":"1091","DOI":"10.1109\/TPAMI.2007.1078","volume":"29","author":"L Yujian","year":"2007","unstructured":"Yujian L, Bo L (2007) A normalized levenshtein distance metric. IEEE Trans Pattern Anal Mach Intell 29(6):1091\u20131095","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10856_CR45","unstructured":"Zhang H, Yu Y, Jiao J, Xing E, El\u00a0Ghaoui L, Jordan M (2019) Theoretically principled trade-off between robustness and accuracy. In: International conference on machine learning. PMLR, pp 7472\u20137482"},{"key":"10856_CR46","doi-asserted-by":"crossref","unstructured":"Zhang S, Zhao H, Liu X, Zheng Q, Qi Z, Gu X, Zhang X, Dong Y, Tang J (2024) Naturalcodebench: Examining coding performance mismatch on humaneval and natural user prompts. arXiv preprint arXiv:2405.04520","DOI":"10.18653\/v1\/2024.findings-acl.471"},{"key":"10856_CR47","doi-asserted-by":"crossref","unstructured":"Zheng Y, Zhang R, Zhang J, Ye Y, Luo Z, Feng Z, Ma Y (2024) Llamafactory: Unified efficient fine-tuning of 100+ language models. In: Proceedings of the 62nd annual meeting of the association for computational linguistics (Volume 3: System Demonstrations). Association for Computational Linguistics, Bangkok, Thailand. arxiv:2403.13372","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"10856_CR48","unstructured":"Zhu Q, Guo D, Shao Z, Yang D, Wang P, Xu R, Wu Y, Li Y, Gao H, Ma S et\u00a0al (2024) Deepseek-coder-v2: Breaking the barrier of closed-source models in code intelligence. arXiv preprint arXiv:2406.11931"},{"key":"10856_CR49","unstructured":"Zhuo TY, Vu MC, Chim J, Hu H, Yu W, Widyasari R, Yusuf INB, Zhan H, He J, Paul I et\u00a0al (2024) Bigcodebench: Benchmarking code generation with diverse function calls and complex instructions. arXiv preprint arXiv:2406.15877"}],"container-title":["Empirical Software Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10664-026-10856-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10664-026-10856-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10664-026-10856-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T08:10:07Z","timestamp":1777536607000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10664-026-10856-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,7]]},"references-count":49,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,7]]}},"alternative-id":["10856"],"URL":"https:\/\/doi.org\/10.1007\/s10664-026-10856-w","relation":{},"ISSN":["1382-3256","1573-7616"],"issn-type":[{"value":"1382-3256","type":"print"},{"value":"1573-7616","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4,7]]},"assertion":[{"value":"29 December 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable. This study does not involve human participants in a biomedical or clinical context, and no formal ethical approval from an institutional review board was required.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"Informed consent was obtained from all individual participants involved in the study.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed Consent"}},{"value":"The authors declare that they have no competing interests relevant to the content of this manuscript.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Clinical Trial Number"}}],"article-number":"112"}}