{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T20:38:32Z","timestamp":1770755912617,"version":"3.50.0"},"reference-count":79,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T00:00:00Z","timestamp":1739836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T00:00:00Z","timestamp":1739836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100000646","name":"Japan Society for the Promotion of Science London","doi-asserted-by":"publisher","award":["JSPS KAKENHI Grant No. JP23H03372, No. JP23K16865"],"award-info":[{"award-number":["JSPS KAKENHI Grant No. JP23H03372, No. JP23K16865"]}],"id":[{"id":"10.13039\/501100000646","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Empir Software Eng"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s10664-025-10624-2","type":"journal-article","created":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T06:50:15Z","timestamp":1739861415000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Boosting source code learning with text-oriented data augmentation: an empirical study"],"prefix":"10.1007","volume":"30","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7742-0264","authenticated-orcid":false,"given":"Zeming","family":"Dong","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8251-1669","authenticated-orcid":false,"given":"Qiang","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yuejun","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Zhenya","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Maxime","family":"Cordy","sequence":"additional","affiliation":[]},{"given":"Mike","family":"Papadakis","sequence":"additional","affiliation":[]},{"given":"Yves","family":"Le Traon","sequence":"additional","affiliation":[]},{"given":"Jianjun","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,18]]},"reference":[{"key":"10624_CR1","doi-asserted-by":"publisher","unstructured":"Allamanis M, Barr ET, Devanbu P, Sutton C (2018) A survey of machine learning for big code and naturalness. ACM Comput Surv(CSUR) 51(4). https:\/\/doi.org\/10.1145\/3212695","DOI":"10.1145\/3212695"},{"key":"10624_CR2","unstructured":"Allamanis M, Brockschmidt M, Khademi M (2018) Learning to represent programs with graphs. In: International Conference on Learning Representations (ICLR)"},{"key":"10624_CR3","unstructured":"Allamanis M, Jackson-Flux HR, Brockschmidt M (2021) Self-supervised bug detection and repair. In: Advances in neural information processing systems"},{"key":"10624_CR4","unstructured":"Alon U, Brody S, Levy O, Yahav E (2019) code2seq: generating sequences from structured representations of code. In: International Conference on Learning Representations (ICLR)"},{"issue":"POPL","key":"10624_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3290353","volume":"3","author":"U Alon","year":"2019","unstructured":"Alon U, Zilberstein M, Levy O, Yahav E (2019) code2vec: learning distributed representations of code. Proc ACM Program Lang 3(POPL):1\u201329","journal-title":"Proc ACM Program Lang"},{"issue":"5","key":"10624_CR6","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1111\/opo.12131","volume":"34","author":"RA Armstrong","year":"2014","unstructured":"Armstrong RA (2014) When to use the b onferroni correction. Ophthalmic Physiol Opt 34(5):502\u2013508","journal-title":"Ophthalmic Physiol Opt"},{"key":"10624_CR7","unstructured":"Ben-Nun T, Jakobovits AS, Hoefler T (2018) Neural code comprehension: a learnable representation of code semantics. Adv Neural Inf Process Syst 31"},{"key":"10624_CR8","unstructured":"Bielik P, Vechev M (2020) Adversarial robustness for code. In: Proceedings of the 37th international conference on machine learning, ser. Proceedings of Machine Learning Research, vol 119. PMLR, pp 896\u2013907"},{"key":"10624_CR9","doi-asserted-by":"crossref","unstructured":"Bui ND, Yu Y, Jiang L (2021) Self-supervised contrastive learning for code retrieval and summarization via semantic-preserving transformations. In: Proceedings of the 44th international ACM SIGIR conference on research and development in information retrieval, ser. SIGIR \u201921. Association for Computing Machinery, New York, NY, USA, pp 511\u2013521. https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3404835.3462840","DOI":"10.1145\/3404835.3462840"},{"key":"10624_CR10","unstructured":"Buratti L, Pujar S, Bornea M, McCarley S, Zheng Y, Rossiello G, Morari A, Laredo J, Thost V, Zhuang Y et\u00a0al (2020) Exploring software naturalness through neural language models. arXiv:2006.12641. https:\/\/arxiv.org\/abs\/2006.12641"},{"key":"10624_CR11","unstructured":"Chen Z, Monperrus M (2018) The codrep machine learning on source code competition. arXiv:1807.03200"},{"key":"10624_CR12","doi-asserted-by":"crossref","unstructured":"Chirkova N, Troshin S (2021) Empirical study of transformers for source code. ser. ESEC\/FSE 2021. Association for Computing Machinery, New York, NY, USA, pp 703\u2013715. https:\/\/dl.acm.org\/doi\/10.1145\/3468264.3468611","DOI":"10.1145\/3468264.3468611"},{"issue":"1","key":"10624_CR13","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1017\/S1351324916000334","volume":"23","author":"KW Church","year":"2017","unstructured":"Church KW (2017) Word2vec. Nat Lang Eng 23(1):155\u2013162","journal-title":"Nat Lang Eng"},{"key":"10624_CR14","doi-asserted-by":"crossref","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the ACL: Human Language Technologies (NAACL-HLT), vol\u00a01. ACL, pp 4171\u20134186","DOI":"10.18653\/v1\/N19-1423"},{"key":"10624_CR15","unstructured":"Dinella E, Dai H, Li Z, Naik M, Song L, Wang K (2020) Hoppity: learning graph transformations to detect and fix bugs in programs. In: International conference on learning representations"},{"key":"10624_CR16","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.111328","volume":"285","author":"Z Dong","year":"2024","unstructured":"Dong Z, Hu Q, Zhang Z, Zhao J (2024) On the effectiveness of graph data augmentation for source code learning. Knowl-Based Syst 285:111328","journal-title":"Knowl-Based Syst"},{"key":"10624_CR17","doi-asserted-by":"crossref","unstructured":"Dong Z, Hu Q, Guo Y, Cordy M, Papadakis M, Zhang Z, Le\u00a0Traon Y, Zhao J (2023) Mixcode: enhancing code classification by mixup-based data augmentation. In: 2023 IEEE International Conference on Software Analysis, Evolution and Reengineering (SANER). IEEE, pp 379\u2013390","DOI":"10.1109\/SANER56733.2023.00043"},{"key":"10624_CR18","doi-asserted-by":"crossref","unstructured":"Dong Z, Hu Q, Guo Y, Zhang Z, Zhao J (2023) Boosting source code learning with text-oriented data augmentation: an empirical study. In: 2023 IEEE 23rd International Conference on Software Quality, Reliability, and Security Companion (QRS-C). IEEE, pp 383\u2013392","DOI":"10.1109\/QRS-C60940.2023.00017"},{"key":"10624_CR19","doi-asserted-by":"crossref","unstructured":"Dong Z, Hu Q, Zhang Z, Guo Y, Cordy M, Papadakis M, Le\u00a0Traon Y, Zhao J (2024) On the effectiveness of hybrid pooling in mixup-based graph learning for language processing. J Syst Softw, p 112139","DOI":"10.1016\/j.jss.2024.112139"},{"key":"10624_CR20","doi-asserted-by":"crossref","unstructured":"Fabbri A, Han S, Li H, Li H, Ghazvininejad M, Joty S, Radev D, Mehdad Y (2021) Improving zero and few-shot abstractive summarization with intermediate fine-tuning and data augmentation. In: Proceedings of the 2021 Conference of the North American Chapter of the ACL: Human Language Technologies. ACL, pp 704\u2013717","DOI":"10.18653\/v1\/2021.naacl-main.57"},{"key":"10624_CR21","doi-asserted-by":"crossref","unstructured":"Feng SY, Gangal V, Wei J, Chandar S, Vosoughi S, Mitamura T, Hovy E (2021) A survey of data augmentation approaches for nlp. In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021. Association for Computational Linguistics, pp 968\u2013988","DOI":"10.18653\/v1\/2021.findings-acl.84"},{"key":"10624_CR22","doi-asserted-by":"crossref","unstructured":"Feng Z, Guo D, Tang D, Duan N, Feng X, Gong M, Shou L, Qin B, Liu T, Jiang D, Zhou M (2020) Codebert: a pre-trained model for programming and natural languages, pp 1536\u20131547","DOI":"10.18653\/v1\/2020.findings-emnlp.139"},{"key":"10624_CR23","unstructured":"Goodfellow IJ, Shlens J, Szegedy C (2015) Explaining and harnessing adversarial examples. In: 3rd International Conference on Learning Representations (ICLR)"},{"issue":"12","key":"10624_CR24","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1145\/3318162","volume":"62","author":"CL Goues","year":"2019","unstructured":"Goues CL, Pradel M, Roychoudhury A (2019) Automated program repair. Commun ACM 62(12):56\u201365. https:\/\/doi.org\/10.1145\/3318162","journal-title":"Commun ACM"},{"key":"10624_CR25","unstructured":"Guo H, Mao Y, Zhang R (2019) Augmenting data with mixup for sentence classification: an empirical study. arXiv:1905.08941"},{"key":"10624_CR26","unstructured":"Guo D, Ren S, Lu S, Feng Z, Tang D, Liu S, Zhou L, Duan N, Svyatkovskiy A, Fu S et\u00a0al (2020) Graphcodebert: pre-training code representations with data flow. arXiv:2009.08366"},{"key":"10624_CR27","unstructured":"Hendrycks D, Dietterich T (2019) Benchmarking neural network robustness to common corruptions and perturbations. Proc Int Conf Learn Rep"},{"key":"10624_CR28","doi-asserted-by":"crossref","unstructured":"Hindle A, Barr ET, Su Z, Gabel M, Devanbu P (2012). On the naturalness of software. In: Proceedings of the 34th International Conference on Software Engineering (ICSE), ser. ICSE \u201912. IEEE Press, pp 837\u2013847","DOI":"10.1109\/ICSE.2012.6227135"},{"key":"10624_CR29","doi-asserted-by":"crossref","unstructured":"Hu Y, Ahmed UZ, Mechtaev S, Leong B, Roychoudhury A (2019) Re-factoring based program repair applied to programming assignments. In: 34th IEEE\/ACM International Conference on Automated Software Engineering (ASE), pp 388\u2013398. https:\/\/ieeexplore.ieee.org\/abstract\/document\/8952522","DOI":"10.1109\/ASE.2019.00044"},{"key":"10624_CR30","doi-asserted-by":"crossref","unstructured":"Hu Q, Guo Y, Xie X, Cordy M, Ma L, Papadakis M, Traon YL (2023) Codes: towards code model generalization under distribution shift. In: ICSE: New Ideas and Emerging Results (NIER)","DOI":"10.1109\/ICSE-NIER58687.2023.00007"},{"key":"10624_CR31","doi-asserted-by":"publisher","unstructured":"Hu X, Li G, Xia X, Lo D, Jin Z (2018) Deep code comment generation. In: Proceedings of the 26th conference on program comprehension, ser. ICPC \u201918. Association for Computing Machinery, New York, NY, USA, pp 200\u2013210. https:\/\/doi.org\/10.1145\/3196321.3196334","DOI":"10.1145\/3196321.3196334"},{"key":"10624_CR32","doi-asserted-by":"publisher","unstructured":"Jebnoun H, Ben\u00a0Braiek H, Rahman MM, Khomh F (2020) The scent of deep learning code: an empirical study. In: Proceedings of the 17th international conference on mining software repositories, ser. MSR \u201920. Association for Computing Machinery, pp 420\u2013430. https:\/\/doi.org\/10.1145\/3379597.3387479","DOI":"10.1145\/3379597.3387479"},{"key":"10624_CR33","unstructured":"Kanade A, Maniatis P, Balakrishnan G, Shi K (2020) Learning and evaluating contextual embedding of source code. In: Proceedings of the 37th international conference on machine learning, ser. ICML\u201920. JMLR.org, pp 5110\u20135121"},{"key":"10624_CR34","doi-asserted-by":"crossref","unstructured":"Kaur A, Kaur M (2016) Analysis of code refactoring impact on software quality. In: MATEC Web of Conferences, vol 57. EDP Sciences, p 02012","DOI":"10.1051\/matecconf\/20165702012"},{"key":"10624_CR35","unstructured":"Kenton JDM-WC, Toutanova LK (2019) Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, vol\u00a01. Minneapolis, Minnesota, p\u00a02"},{"key":"10624_CR36","doi-asserted-by":"publisher","unstructured":"Kimura M (2021) Why mixup improves the model performance. In: Artificial Neural Networks and Machine Learning - ICANN 2021: 30th International Conference on Artificial Neural Networks, Bratislava, Slovakia, September 14-17, 2021, Proceedings, Part II. Springer-Verlag, Berlin, Heidelberg, pp 275\u2013286. https:\/\/doi.org\/10.1007\/978-3-030-86340-1_22","DOI":"10.1007\/978-3-030-86340-1_22"},{"key":"10624_CR37","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1016\/j.aiopen.2022.03.001","volume":"3","author":"B Li","year":"2022","unstructured":"Li B, Hou Y, Che W (2022) Data augmentation approaches in natural language processing: a survey. AI Open 3:71\u201390","journal-title":"AI Open"},{"issue":"3","key":"10624_CR38","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/0098-3004(93)90090-R","volume":"19","author":"A Ma\u0107kiewicz","year":"1993","unstructured":"Ma\u0107kiewicz A, Ratajczak W (1993) Principal components analysis (pca). Comput Geosci 19(3):303\u2013342","journal-title":"Comput Geosci"},{"key":"10624_CR39","doi-asserted-by":"crossref","unstructured":"Marivate V, Sefara T (2020) Improving short text classification through global augmentation methods. In: Machine learning and knowledge extraction, pp 385\u2013399","DOI":"10.1007\/978-3-030-57321-8_21"},{"key":"10624_CR40","doi-asserted-by":"publisher","unstructured":"Mastropaolo A, Pascarella L, Guglielmi E, Ciniselli M, Scalabrino S, Oliveto R, Bavota G (2023) On the robustness of code generation techniques: an empirical study on github copilot. In: Proceedings of the 45th international conference on software engineering, ser. ICSE \u201923. IEEE Press, pp 2149\u20132160. https:\/\/doi.org\/10.1109\/ICSE48619.2023.00181","DOI":"10.1109\/ICSE48619.2023.00181"},{"key":"10624_CR41","doi-asserted-by":"crossref","unstructured":"Ma W, Zhao M, Soremekun E, Hu Q, Zhang JM, Papadakis M, Cordy M, Xie X, Traon YL (2022) Graphcode2vec: generic code embedding via lexical and program dependence analyses. In: Proceedings of the 19th international conference on mining software repositories, pp 524\u2013536","DOI":"10.1145\/3524842.3528456"},{"key":"10624_CR42","unstructured":"Mi Q, Xiao Y, Cai Z, Jia X (2021) The effectiveness of data augmentation in code readability classification. Information and Software Technology 129:106378. https:\/\/www.sciencedirect.com\/science\/article\/abs\/pii\/S0950584920301464#:~:text=The%20empirical%20results%20show%20that,reaching%20up%20to%2087.38%25%20accuracy"},{"key":"10624_CR43","doi-asserted-by":"publisher","unstructured":"Niu C, Li C, Ng V, Chen D, Ge J, Luo B (2023) An empirical comparison of pre-trained models of source code. In: Proceedings of the 45th international conference on software engineering, ser. ICSE \u201923. IEEE Press, pp 2136\u20132148. https:\/\/doi.org\/10.1109\/ICSE48619.2023.00180","DOI":"10.1109\/ICSE48619.2023.00180"},{"key":"10624_CR44","doi-asserted-by":"crossref","unstructured":"Pour MV, Li Z, Ma L, Hemmati H (2021) A search-based testing framework for deep neural networks of source code embedding. In: 14th IEEE Conference on Software Testing, Verification and Validation (ICST), pp 36\u201346","DOI":"10.1109\/ICST49551.2021.00016"},{"key":"10624_CR45","unstructured":"Puri R, Kung DS, Janssen G, Zhang W, Domeniconi G, Zolotov V, Dolby J, Chen J, Choudhury M, Decker L et\u00a0al (2021) Codenet: a large-scale ai for code dataset for learning a diversity of coding tasks. arXiv:2105.12655"},{"issue":"6","key":"10624_CR46","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1145\/2666356.2594321","volume":"49","author":"V Raychev","year":"2014","unstructured":"Raychev V, Vechev M, Yahav E (2014) Code completion with statistical language models. SIGPLAN Not 49(6):419\u2013428. https:\/\/doi.org\/10.1145\/2666356.2594321","journal-title":"SIGPLAN Not"},{"key":"10624_CR47","first-page":"29\u00a0935","volume":"34","author":"S-A Rebuffi","year":"2021","unstructured":"Rebuffi S-A, Gowal S, Calian DA, Stimberg F, Wiles O, Mann TA (2021) Data augmentation can improve robustness. Adv Neural Inf Process Syst 34:29\u00a0935-29\u00a0948","journal-title":"Adv Neural Inf Process Syst"},{"issue":"3","key":"10624_CR48","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1016\/j.eng.2019.12.012","volume":"6","author":"K Ren","year":"2020","unstructured":"Ren K, Zheng T, Qin Z, Liu X (2020) Adversarial attacks and defenses in deep learning. Engineering 6(3):346\u2013360","journal-title":"Engineering"},{"key":"10624_CR49","unstructured":"Roziere B, Gehring J, Gloeckle F, Sootla S, Gat I, Tan XE, Adi Y, Liu J, Remez T, Rapin J et\u00a0al (2023) Code llama: open foundation models for code. arXiv preprint arXiv:2308.12950"},{"issue":"1","key":"10624_CR50","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-019-0197-0","volume":"6","author":"C Shorten","year":"2019","unstructured":"Shorten C, Khoshgoftaar TM (2019) A survey on image data augmentation for deep learning. J Big Data 6(1):1\u201348","journal-title":"J Big Data"},{"key":"10624_CR51","doi-asserted-by":"crossref","unstructured":"Siow JK, Liu S, Xie X, Meng G, Liu Y (2022) Learning program semantics with code representations: an empirical study. In: 2022 IEEE international conference on Software Analysis, Evolution and Reengineering (SANER). IEEE, IEEE Computer Society, Los Alamitos, CA, USA, pp 554\u2013565. https:\/\/doi.ieeecomputersociety.org\/10.1109\/SANER53432.2022.00073","DOI":"10.1109\/SANER53432.2022.00073"},{"key":"10624_CR52","doi-asserted-by":"publisher","unstructured":"Steenhoek B, Rahman MM, Jiles R, Le W (2023) An empirical study of deep learning models for vulnerability detection. In: Proceedings of the 45th international conference on software engineering, ser. ICSE \u201923. IEEE Press, pp 2237\u20132248. https:\/\/doi.org\/10.1109\/ICSE48619.2023.00188","DOI":"10.1109\/ICSE48619.2023.00188"},{"key":"10624_CR53","doi-asserted-by":"crossref","unstructured":"Svajlenko J, Islam JF, Keivanloo I, Roy CK, Mia MM (2014) Towards a big data curated benchmark of inter-project code clones. In: Proceedings of the 2014 IEEE international conference on software maintenance and evolution, ser. ICSME \u201914. IEEE Computer Society, USA, pp 476\u2013480. https:\/\/ieeexplore.ieee.org\/document\/6976121","DOI":"10.1109\/ICSME.2014.77"},{"key":"10624_CR54","unstructured":"Touvron H, Martin L, Stone K, Albert P, Almahairi A, Babaei Y, Bashlykov N, Batra S, Bhargava P, Bhosale S et\u00a0al (2023) Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288"},{"key":"10624_CR55","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"10624_CR56","unstructured":"Verma V, Lamb A, Beckham C, Najafi A, Mitliagkas I, Lopez-Paz D, Bengio Y (2019) Manifold mixup: better representations by interpolating hidden states. In: International conference on machine learning. PMLR, pp 6438\u20136447"},{"key":"10624_CR57","doi-asserted-by":"crossref","unstructured":"Wang J, Chen H-C, Radach R, Inhoff A (1999) Reading chinese script: a cognitive analysis. Psychology Press","DOI":"10.4324\/9781410601483"},{"key":"10624_CR58","doi-asserted-by":"crossref","unstructured":"Wang D, Jia Z, Li S, Yu Y, Xiong Y, Dong W, Liao X (2022) Bridging pre-trained models and downstream tasks for source code understanding. In: Proceedings of the 44th international conference on software engineering, ser. ICSE \u201922. Association for Computing Machinery, New York, NY, USA, pp 287\u2013298. https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3510003.3510062","DOI":"10.1145\/3510003.3510062"},{"key":"10624_CR59","doi-asserted-by":"crossref","unstructured":"Wang W, Li G, Ma B, Xia X, Jin Z (2020) Detecting code clones with graph neural network and flow-augmented abstract syntax tree. In: 2020 IEEE 27th International Conference on Software Analysis, Evolution and Reengineering (SANER). IEEE, pp 261\u2013271","DOI":"10.1109\/SANER48275.2020.9054857"},{"key":"10624_CR60","doi-asserted-by":"crossref","unstructured":"Wan Y, Zhao Z, Yang M, Xu G, Ying H, Wu J, Yu PS (2018) Improving automatic source code summarization via deep reinforcement learning. In: Proceedings of the 33rd ACM\/IEEE international conference on automated software engineering, pp 397\u2013407","DOI":"10.1145\/3238147.3238206"},{"key":"10624_CR61","doi-asserted-by":"crossref","unstructured":"Wei M, Huang Y, Yang J, Wang J, Wang S (2022) Cocofuzzing: testing neural code models with coverage-guided fuzzing. IEEE Trans Reliab","DOI":"10.1109\/TR.2022.3208239"},{"key":"10624_CR62","doi-asserted-by":"crossref","unstructured":"Wei J, Zou K (2019) Eda: easy data augmentation techniques for boosting performance on text classification tasks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp 6382\u20136388","DOI":"10.18653\/v1\/D19-1670"},{"key":"10624_CR63","doi-asserted-by":"crossref","unstructured":"White M, Vendome C, Linares-V\u00e1squez M, Poshyvanyk D (2015) Toward deep learning software repositories. In: 2015 IEEE\/ACM 12th working conference on mining software repositories. IEEE, pp 334\u2013345","DOI":"10.1109\/MSR.2015.38"},{"key":"10624_CR64","doi-asserted-by":"crossref","unstructured":"Woolson RF (2007) Wilcoxon signed-rank test. Wiley encyclopedia of clinical trials, pp 1\u20133","DOI":"10.1002\/9780471462422.eoct979"},{"key":"10624_CR65","doi-asserted-by":"crossref","unstructured":"Xia M, Kong X, Anastasopoulos A, Neubig G (2019) Generalized data augmentation for low-resource translation. In: Proceedings of the 57th annual meeting of the association for computational linguistics, pp 5786\u20135796","DOI":"10.18653\/v1\/P19-1579"},{"key":"10624_CR66","unstructured":"Xie Q, Dai Z, Hovy E, Luong M-T, Le QV (2020) Unsupervised data augmentation for consistency training. In: NIPS\u201920"},{"key":"10624_CR67","doi-asserted-by":"crossref","unstructured":"Yang Z, Shi J, He J, Lo D (2022) Natural attack for pre-trained models of code. In: Proceedings of the 44th international conference on software engineering, ser. ICSE \u201922. Association for Computing Machinery, pp 1482\u20131493. https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3510003.3510146","DOI":"10.1145\/3510003.3510146"},{"key":"10624_CR68","doi-asserted-by":"crossref","unstructured":"Yan S, Yu H, Chen Y, Shen B, Jiang L (2020) Are the code snippets what we are searching for? a benchmark and an empirical study on code search with natural-language queries. In: 2020 IEEE 27th international conference on Software Analysis, Evolution and Reengineering (SANER), pp 344\u2013354","DOI":"10.1109\/SANER48275.2020.9054840"},{"issue":"OOPSLA","key":"10624_CR69","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3428230","volume":"4","author":"N Yefet","year":"2020","unstructured":"Yefet N, Alon U, Yahav E (2020) Adversarial examples for models of code. Proc ACM Program Lang 4(OOPSLA):1\u201330","journal-title":"Proc ACM Program Lang"},{"key":"10624_CR70","unstructured":"Yu AW, Dohan D, Luong T, Zhao R, Chen K, Le Q (2018) Qanet: combining local convolution with global self-attention for reading comprehension. In: International conference on learning representations. https:\/\/openreview.net\/forum?id=B14TlG-RW"},{"key":"10624_CR71","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2022.111304","volume":"190","author":"S Yu","year":"2022","unstructured":"Yu S, Wang T, Wang J (2022) Data augmentation by program transformation. J Syst Softw 190:111304","journal-title":"J Syst Softw"},{"key":"10624_CR72","doi-asserted-by":"crossref","unstructured":"Yun S, Han D, Oh SJ, Chun S, Choe J, Yoo Y (2019) Cutmix: regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6023\u20136032","DOI":"10.1109\/ICCV.2019.00612"},{"issue":"01","key":"10624_CR73","first-page":"1169","volume":"34","author":"H Zhang","year":"2020","unstructured":"Zhang H, Li Z, Li G, Ma L, Liu Y, Jin Z (2020) Generating adversarial examples for holding robustness of source code processing models. Proc AAAI Conf Artif Intell 34(01):1169\u20131176","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"10624_CR74","unstructured":"Zhang H, Cisse M, Dauphin YN, Lopez-Paz D (2018) mixup: beyond empirical risk minimization. In: International Conference on Learning Representations (ICLR)"},{"key":"10624_CR75","unstructured":"Zhang L, Deng Z, Kawaguchi K, Ghorbani A, Zou J (2021) How does mixup help with robustness and generalization?. In: International conference on learning representations"},{"key":"10624_CR76","doi-asserted-by":"crossref","unstructured":"Zhang R, Xiao W, Zhang H, Liu Y, Lin H, Yang M (2020) An empirical study on program failures of deep learning jobs. In: Proceedings of the ACM\/IEEE 42nd international conference on software engineering, ser. ICSE \u201920. Association for Computing Machinery, New York, NY, USA, pp 1159\u20131170. https:\/\/dl.acm.org\/doi\/10.1145\/3377811.3380362","DOI":"10.1145\/3377811.3380362"},{"key":"10624_CR77","doi-asserted-by":"publisher","unstructured":"Zhang X, Zhou Y, Han T, Chen T (2021) Training deep code comment generation models via data augmentation. In: Proceedings of the 12th Asia-Pacific symposium on internetware, ser. Internetware \u201920. Association for Computing Machinery, New York, NY, USA, pp 185\u2013188. https:\/\/doi.org\/10.1145\/3457913.3457937","DOI":"10.1145\/3457913.3457937"},{"key":"10624_CR78","doi-asserted-by":"crossref","unstructured":"Zhong H, Su Z (2015) An empirical study on real bug fixes. In: IEEE\/ACM 37th IEEE International Conference on Software Engineering (ICSE), vol\u00a01, pp 913\u2013923","DOI":"10.1109\/ICSE.2015.101"},{"key":"10624_CR79","unstructured":"Zhou Y, Liu S, Siow J, Du X, Liu Y (2019) Devign: effective vulnerability identification by learning comprehensive program semantics via graph neural networks. In: Proceedings of the 33rd international conference on neural information processing systems"}],"container-title":["Empirical Software Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10664-025-10624-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10664-025-10624-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10664-025-10624-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T13:28:21Z","timestamp":1763645301000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10664-025-10624-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,18]]},"references-count":79,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["10624"],"URL":"https:\/\/doi.org\/10.1007\/s10664-025-10624-2","relation":{},"ISSN":["1382-3256","1573-7616"],"issn-type":[{"value":"1382-3256","type":"print"},{"value":"1573-7616","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,18]]},"assertion":[{"value":"5 February 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}}],"article-number":"68"}}