{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T13:23:09Z","timestamp":1771680189557,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1145\/3639478.3639792","type":"proceedings-article","created":{"date-parts":[[2024,5,23]],"date-time":"2024-05-23T10:49:26Z","timestamp":1716461366000},"page":"159-161","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Beyond Accuracy and Robustness Metrics for Large Language Models for Code"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3238-1229","authenticated-orcid":false,"given":"Daniel","family":"Rodriguez-Cardenas","sequence":"first","affiliation":[{"name":"Semeru Lab, William &amp; Mary, Williamsburg, VA, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,5,23]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jacob Austin Augustus Odena Maxwell Nye Maarten Bosma Henryk Michalewski et al. 2021. Program Synthesis with Large Language Models. arXiv:cs.PL\/2108.07732"},{"key":"e_1_3_2_1_2_1","unstructured":"Federico Cassano John Gouwar Daniel Nguyen Sydney Nguyen Luna Phipps-Costin et al. 2022. MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation. http:\/\/arxiv.org\/abs\/2208.08227 arXiv:2208.08227 [cs]."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2107.03374"},{"key":"e_1_3_2_1_4_1","volume-title":"Henrique Ponde de Oliveira Pinto, et al","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, et al. 2021. Evaluating Large Language Models Trained on Code. http:\/\/arxiv.org\/abs\/2107.03374 arXiv:2107.03374 [cs]."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2019.2940179"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2021.3128234"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER53432.2022.00096"},{"key":"e_1_3_2_1_8_1","unstructured":"Dan Hendrycks Steven Basart Saurav Kadavath Mantas Mazeika Akul Arora et al. 2021. Measuring Coding Challenge Competence With APPS. CoRR abs\/2105.09938 (2021). arXiv:2105.09938 https:\/\/arxiv.org\/abs\/2105.09938"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Xinyi Hou Yanjie Zhao Yue Liu Zhou Yang Kailong Wang et al. 2023. Large Language Models for Software Engineering: A Systematic Literature Review. http:\/\/arxiv.org\/abs\/2308.10620 arXiv:2308.10620 [cs].","DOI":"10.1145\/3695988"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Alexander LeClair Aakash Bansal and Collin McMillan. 2021. Ensemble Models for Neural Source Code Summarization of Subroutines. http:\/\/arxiv.org\/abs\/2107.11423 arXiv:2107.11423 [cs].","DOI":"10.26226\/morressier.613b5418842293c031b5b62e"},{"key":"e_1_3_2_1_11_1","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu et al. 2022. Holistic Evaluation of Language Models. http:\/\/arxiv.org\/abs\/2211.09110 arXiv:2211.09110 [cs]."},{"key":"e_1_3_2_1_12_1","unstructured":"Chao Liu Xuanlin Bao Hongyu Zhang Neng Zhang Haibo Hu et al. 2023. Improving ChatGPT Prompt for Code Generation. arXiv:cs.SE\/2305.08360"},{"key":"e_1_3_2_1_13_1","volume-title":"Yuyao Wang, and Lingming Zhang.","author":"Liu Jiawei","year":"2023","unstructured":"Jiawei Liu, Chunqiu Steven Xia, Yuyao Wang, and Lingming Zhang. 2023. Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation. arXiv:cs.SE\/2305.01210"},{"key":"e_1_3_2_1_14_1","unstructured":"Shuai Lu Daya Guo Shuo Ren Junjie Huang Alexey Svyatkovskiy et al. [n. d.]. CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. arXiv:2102.04664 [cs] http:\/\/arxiv.org\/abs\/2102.04664"},{"key":"e_1_3_2_1_15_1","volume-title":"2020 IEEE\/ACM 42nd International Conference on Software Engineering (ICSE). 873--885","author":"Moran Kevin","year":"2020","unstructured":"Kevin Moran, David N. Palacio, Carlos Bernal-Cardenas, Daniel McCrystal, Denys Poshyvanyk, et al. 2020. Improving the Effectiveness of Traceability Link Recovery using Hierarchical Bayesian Networks. In 2020 IEEE\/ACM 42nd International Conference on Software Engineering (ICSE). 873--885."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER53432.2022.00069"},{"key":"e_1_3_2_1_17_1","volume-title":"Nguyen","author":"Nguyen Anh Tuan","year":"2015","unstructured":"Anh Tuan Nguyen and Tien N. Nguyen. 2015. Graph-Based Statistical Language Model for Code. In ICSE'15. IEEE Press, 858--868."},{"key":"e_1_3_2_1_19_1","volume-title":"Code completion with statistical language models. PLDI","author":"Raychev Veselin","year":"2014","unstructured":"Veselin Raychev, Martin T. Vechev, and Eran Yahav. 2014. Code completion with statistical language models. PLDI (2014)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Daniel Rodriguez-Cardenas David N. Palacio Dipin Khati Henry Burke and Denys Poshyvanyk. 2023. Benchmarking Causal Study to Interpret Large Language Models for Source Code. http:\/\/arxiv.org\/abs\/2308.12415 arXiv:2308.12415 [cs].","DOI":"10.1109\/ICSME58846.2023.00040"},{"key":"e_1_3_2_1_21_1","volume-title":"Shobha Rani Dhalipathi, et al","author":"Rosenberg Doug","year":"2020","unstructured":"Doug Rosenberg, Barry Boehm, Matt Stephens, Charles Suscheck, Shobha Rani Dhalipathi, et al. 2020. CodeBots: From Domain Model to Executable Architecture. Parallel Agile-faster delivery, fewer defects, lower cost (2020), 27--51."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2019.00021"},{"key":"e_1_3_2_1_23_1","volume-title":"2018 IEEE\/ACM 15th International Conference on Mining Software Repositories (MSR). 542--553","author":"Tufano Michele","year":"2018","unstructured":"Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, et al. 2018. Deep Learning Similarities from Different Representations of Source Code. In 2018 IEEE\/ACM 15th International Conference on Mining Software Repositories (MSR). 542--553."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3238147.3240732"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME.2019.00046"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340544"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510621"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE43902.2021.00027"},{"key":"e_1_3_2_1_29_1","unstructured":"Shiqi Wang Zheng Li Haifeng Qian Chenghao Yang Zijian Wang et al. 2022. ReCode: Robustness Evaluation of Code Generation Models. http:\/\/arxiv.org\/abs\/2212.10264 arXiv:2212.10264 [cs]."},{"key":"e_1_3_2_1_30_1","volume-title":"On Learning Meaningful Assert Statements for Unit Test Cases. In 2020 IEEE\/ACM 42nd International Conference on Software Engineering (ICSE). 1398--1409","author":"Watson Cody","year":"2020","unstructured":"Cody Watson, Michele Tufano, Kevin Moran, Gabriele Bavota, and Denys Poshyvanyk. 2020. On Learning Meaningful Assert Statements for Unit Test Cases. In 2020 IEEE\/ACM 42nd International Conference on Software Engineering (ICSE). 1398--1409."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545945.3569830"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER.2019.8668043"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2970276.2970326"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSR.2015.38"},{"key":"e_1_3_2_1_35_1","unstructured":"Robert White and Jens Krinke. 2020. ReAssert: Deep Learning for Assert Generation. http:\/\/arxiv.org\/abs\/2011.09784 arXiv:2011.09784 [cs]."},{"key":"e_1_3_2_1_36_1","volume-title":"Hellendoorn","author":"Xu Frank F.","year":"2022","unstructured":"Frank F. Xu, Uri Alon, Graham Neubig, and Vincent J. Hellendoorn. 2022. A Systematic Evaluation of Large Language Models of Code. http:\/\/arxiv.org\/abs\/2202.13169 arXiv:2202.13169 [cs]."},{"key":"e_1_3_2_1_37_1","unstructured":"Wojciech Zaremba Greg Brockman and OpenAI. 2021. OpenAI Codex. https:\/\/openai.com\/blog\/openai-codex\/."},{"key":"e_1_3_2_1_38_1","unstructured":"Yaqin Zhou Shangqing Liu Jingkai Siow Xiaoning Du and Yang Liu. [n. d.]. Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. ([n. d.])."}],"event":{"name":"ICSE-Companion '24: 2024 IEEE\/ACM 46th International Conference on Software Engineering: Companion Proceedings","location":"Lisbon Portugal","acronym":"ICSE-Companion '24","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","IEEE CS","Faculty of Engineering of University of Porto"]},"container-title":["Proceedings of the 2024 IEEE\/ACM 46th International Conference on Software Engineering: Companion Proceedings"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3639478.3639792","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3639478.3639792","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:44:32Z","timestamp":1750290272000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3639478.3639792"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":37,"alternative-id":["10.1145\/3639478.3639792","10.1145\/3639478"],"URL":"https:\/\/doi.org\/10.1145\/3639478.3639792","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]},"assertion":[{"value":"2024-05-23","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}