{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:18:48Z","timestamp":1775913528815,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T00:00:00Z","timestamp":1749600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,25]]},"DOI":"10.1145\/3713081.3731745","type":"proceedings-article","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T17:20:36Z","timestamp":1749230436000},"page":"153-160","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["On Benchmarking Code LLMs for Android Malware Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5977-1489","authenticated-orcid":false,"given":"Yiling","family":"He","sequence":"first","affiliation":[{"name":"University College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1813-3293","authenticated-orcid":false,"given":"Hongyu","family":"She","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6617-6521","authenticated-orcid":false,"given":"Xingzhi","family":"Qian","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1130-7916","authenticated-orcid":false,"given":"Xinran","family":"Zheng","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6829-9868","authenticated-orcid":false,"given":"Zhuo","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7872-6969","authenticated-orcid":false,"given":"Zhan","family":"Qin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3878-2680","authenticated-orcid":false,"given":"Lorenzo","family":"Cavallaro","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.211"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER56733.2023.00033"},{"key":"e_1_3_2_1_3_1","volume-title":"Machines in Malware Classification. In 32nd USENIX Security Symposium (USENIX Security 23)","author":"Aonzo Simone","year":"2023","unstructured":"Simone Aonzo, Yufei Han, Alessandro Mantovani, and Davide Balzarotti. 2023. Humans vs. Machines in Malware Classification. In 32nd USENIX Security Symposium (USENIX Security 23)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2014.23247"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643757"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372."},{"key":"e_1_3_2_1_7_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Chen Xinyun","year":"2024","unstructured":"Xinyun Chen, Maxwell Lin, Nathanael Sch\u00e4rli, and Denny Zhou. 2024. Teaching large language models to self-debug. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_8_1","volume-title":"d.]. Androguard. https:\/\/github.com\/androguard\/androguard","author":"Desnos Anthony","year":"2020","unstructured":"Anthony Desnos. [n. d.]. Androguard. https:\/\/github.com\/androguard\/androguard 2020."},{"key":"e_1_3_2_1_9_1","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Downing Evan","year":"2021","unstructured":"Evan Downing, Yisroel Mirsky, Kyuhong Park, and Wenke Lee. 2021. DeepReflect: Discovering Malicious Functionality through Binary Reconstruction. In 30th USENIX Security Symposium (USENIX Security 21)."},{"key":"e_1_3_2_1_10_1","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Fang Chongzhou","year":"2024","unstructured":"Chongzhou Fang, Ning Miao, Shaurya Srivastav, Jialin Liu, Ruoyu Zhang, Ruijie Fang, Asmita, Ryan Tsang, Najmeh Nazari, Han Wang, and Houman Homayoun. 2024. Large Language Models for Code Analysis: Do LLMs Really Do Their Job?. In 33rd USENIX Security Symposium (USENIX Security 24). USENIX Association, Philadelphia, PA, 829\u2013846."},{"key":"e_1_3_2_1_11_1","unstructured":"Daya Guo Qihao Zhu Dejian Yang Zhenda Xie Kai Dong Wentao Zhang Guanting Chen Xiao Bi Yu Wu YK Li et al. 2024. DeepSeek-Coder: When the Large Language Model Meets Programming-The Rise of Code Intelligence. arXiv preprint arXiv:2401.14196 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"DREAM: Combating Concept Drift with Explanatory Detection and Adaptation in Malware Classification. arXiv preprint arXiv:2405.04095","author":"He Yiling","year":"2024","unstructured":"Yiling He, Junchi Lei, Zhan Qin, and Kui Ren. 2024. DREAM: Combating Concept Drift with Explanatory Detection and Adaptation in Malware Classification. arXiv preprint arXiv:2405.04095 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2022.3168285"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3576915.3616599"},{"key":"e_1_3_2_1_15_1","volume-title":"Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436","author":"Husain Hamel","year":"2019","unstructured":"Hamel Husain, Ho-Hsiang Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019. Codesearchnet challenge: Evaluating the state of semantic code search. arXiv preprint arXiv:1909.09436 (2019)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2808769.2808780"},{"key":"e_1_3_2_1_17_1","volume-title":"Lightgbm: A highly efficient gradient boosting decision tree. Advances in neural information processing systems 30","author":"Ke Guolin","year":"2017","unstructured":"Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. Lightgbm: A highly efficient gradient boosting decision tree. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER50967.2021.00037"},{"key":"e_1_3_2_1_19_1","volume-title":"Weishi Wang, Md Rizwan Parvez, and Shafiq Joty.","author":"Matin Khan Mohammad Abdullah","year":"2023","unstructured":"Mohammad Abdullah Matin Khan, M Saiful Bari, Xuan Long Do, Weishi Wang, Md Rizwan Parvez, and Shafiq Joty. 2023. xcodeeval: A large scale multilingual multitask benchmark for code understanding, generation, translation and retrieval. arXiv preprint arXiv:2303.03004 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, et al.","author":"Kocetkov Denis","year":"2023","unstructured":"Denis Kocetkov, Raymond Li, LI Jia, Chenghao Mou, Yacine Jernite, Margaret Mitchell, Carlos Mu\u00f1oz Ferrandis, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, et al. 2023. The Stack: 3 TB of permissively licensed source code. Transactions on Machine Learning Research (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Raymond Li Loubna Ben allal Yangtian Zi Niklas Muennighoff Denis Kocetkov Chenghao Mou Marc Marone Christopher Akiki Jia LI Jenny Chim Qian Liu Evgenii Zheltonozhskii Terry Yue Zhuo Thomas Wang Olivier Dehaene Joel Lamy-Poirier Joao Monteiro Nicolas Gontier Ming-Ho Yee Logesh Kumar Umapathi Jian Zhu Ben Lipkin Muhtasham Oblokulov Zhiruo Wang Rudra Murthy Jason T Stillerman Siva Sankalp Patel Dmitry Abulkhanov Marco Zocca Manan Dey Zhihan Zhang Urvashi Bhattacharyya Wenhao Yu Sasha Luccioni Paulo Villegas Fedor Zhdanov Tony Lee Nadav Timor Jennifer Ding Claire S Schlesinger Hailey Schoelkopf Jan Ebert Tri Dao Mayank Mishra Alex Gu Carolyn Jane Anderson Brendan Dolan-Gavitt Danish Contractor Siva Reddy Daniel Fried Dzmitry Bahdanau Yacine Jernite Carlos Mu\u00f1oz Ferrandis Sean Hughes Thomas Wolf Arjun Guha Leandro Von Werra and Harm de Vries. 2023. StarCoder: may the source be with you! Transactions on Machine Learning Research (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Enhancing Android Malware Detection: The Influence of ChatGPT on Decision-centric Task. arXiv preprint arXiv:2410.04352","author":"Li Yao","year":"2024","unstructured":"Yao Li, Sen Fang, Tao Zhang, and Haipeng Cai. 2024. Enhancing Android Malware Detection: The Influence of ChatGPT on Decision-centric Task. arXiv preprint arXiv:2410.04352 (2024)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00110"},{"key":"e_1_3_2_1_24_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE56229.2023.00074"},{"key":"e_1_3_2_1_26_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1).","author":"Lu Shuai","year":"2021","unstructured":"Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Duyu Tang, et al. 2021. CodeXGLUE : A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)."},{"key":"e_1_3_2_1_27_1","volume-title":"31st USENIX Security Symposium (USENIX Security 22)","author":"Mantovani Alessandro","year":"2022","unstructured":"Alessandro Mantovani, Simone Aonzo, Yanick Fratantonio, and Davide Balzarotti. 2022. {RE-Mind}: a first look inside the mind of a reverse engineer. In 31st USENIX Security Symposium (USENIX Security 22). 2727\u20132745."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2017.23353"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.448"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2931037.2931047"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP46215.2023.10179420"},{"key":"e_1_3_2_1_33_1","volume-title":"28th USENIX security symposium (USENIX Security 19). 729\u2013746.","author":"Pendlebury Feargus","unstructured":"Feargus Pendlebury, Fabio Pierazzi, Roberto Jordaney, Johannes Kinder, and Lorenzo Cavallaro. 2019. {TESSERACT}: Eliminating experimental bias in malware classification across space and time. In 28th USENIX security symposium (USENIX Security 19). 729\u2013746."},{"key":"e_1_3_2_1_34_1","volume-title":"LAMD: Context-driven Android Malware Detection and Classification with LLMs. arXiv preprint arXiv:2502.13055","author":"Qian Xingzhi","year":"2025","unstructured":"Xingzhi Qian, Xinran Zheng, Yiling He, Shuo Yang, and Lorenzo Cavallaro. 2025. LAMD: Context-driven Android Malware Detection and Classification with LLMs. arXiv preprint arXiv:2502.13055 (2025)."},{"key":"e_1_3_2_1_35_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training."},{"key":"e_1_3_2_1_36_1","volume-title":"Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, et al.","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, et al. 2023. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422337.3447849"},{"key":"e_1_3_2_1_38_1","volume-title":"How Far Have We Gone in Binary Code Understanding Using Large Language Models. In 2024 IEEE International Conference on Software Maintenance and Evolution (ICSME). IEEE, 1\u201312","author":"Shang Xiuwei","year":"2024","unstructured":"Xiuwei Shang, Shaoyin Cheng, Guoqiang Chen, Yanming Zhang, Li Hu, Xiao Yu, Gangyang Li, Weiming Zhang, and Nenghai Yu. 2024. How Far Have We Gone in Binary Code Understanding Using Large Language Models. In 2024 IEEE International Conference on Software Maintenance and Evolution (ICSME). IEEE, 1\u201312."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2025.230338"},{"key":"e_1_3_2_1_40_1","volume-title":"Pitfalls in language models for code intelligence: A taxonomy and survey. arXiv preprint arXiv:2310.17903","author":"She Xinyu","year":"2023","unstructured":"Xinyu She, Yue Liu, Yanjie Zhao, Yiling He, Li Li, Chakkrit Tantithamthavorn, Zhan Qin, and Haoyu Wang. 2023. Pitfalls in language models for code intelligence: A taxonomy and survey. arXiv preprint arXiv:2310.17903 (2023)."},{"key":"e_1_3_2_1_41_1","first-page":"4173","article-title":"DebugBench","volume":"2024","author":"Tian Runchu","year":"2024","unstructured":"Runchu Tian, Yining Ye, Yujia Qin, Xin Cong, Yankai Lin, Yinxu Pan, Yesai Wu, Hui Haotian, Liu Weichuan, Zhiyuan Liu, et al. 2024. DebugBench: Evaluating Debugging Capability of Large Language Models. In Findings of the Association for Computational Linguistics ACL 2024. 4173\u20134198.","journal-title":"Evaluating Debugging Capability of Large Language Models. In Findings of the Association for Computational Linguistics ACL"},{"key":"e_1_3_2_1_42_1","volume-title":"James M Ghawaly, and Aisha Ali-Gombe.","author":"Walton Brandon J","year":"2025","unstructured":"Brandon J Walton, Mst Eshita Khatun, James M Ghawaly, and Aisha Ali-Gombe. 2025. Exploring Large Language Models for Semantic Analysis and Categorization of Android Malware. arXiv preprint arXiv:2501.04848 (2025)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3555652"},{"key":"e_1_3_2_1_44_1","volume-title":"ReCode: Robustness Evaluation of Code Generation Models. In The 61st Annual Meeting Of The Association For Computational Linguistics.","author":"Wang Shiqi","year":"2023","unstructured":"Shiqi Wang, Zheng Li, Haifeng Qian, Chenghao Yang, Zijian Wang, Mingyue Shang, Varun Kumar, Samson Tan, Baishakhi Ray, Parminder Bhatia, et al. 2023. ReCode: Robustness Evaluation of Code Generation Models. In The 61st Annual Meeting Of The Association For Computational Linguistics."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.685"},{"key":"e_1_3_2_1_46_1","volume-title":"International Conference on Learning Representations.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Maarten Bosma, Vincent Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2022. Finetuned Language Models are Zero-Shot Learners. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_47_1","volume-title":"Evaluating explanation without ground truth in interpretable machine learning. arXiv preprint arXiv:1907.06831","author":"Yang Fan","year":"2019","unstructured":"Fan Yang, Mengnan Du, and Xia Hu. 2019. Evaluating explanation without ground truth in interpretable machine learning. arXiv preprint arXiv:1907.06831 (2019)."},{"key":"e_1_3_2_1_48_1","volume-title":"Tapi: Towards target-specific and adversarial prompt injection against code llms. arXiv preprint arXiv:2407.09164","author":"Yang Yuchen","year":"2024","unstructured":"Yuchen Yang, Hongwei Yao, Bingrun Yang, Yiling He, Yiming Li, Tianwei Zhang, Zhan Qin, and Kui Ren. 2024. Tapi: Towards target-specific and adversarial prompt injection against code llms. arXiv preprint arXiv:2407.09164 (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.125546"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599790"}],"event":{"name":"ISSTA Companion '25: 34th ACM SIGSOFT International Symposium on Software Testing and Analysis","location":"Clarion Hotel Trondheim Trondheim Norway","acronym":"ISSTA Companion '25","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 34th ACM SIGSOFT International Symposium on Software Testing and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713081.3731745","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:09Z","timestamp":1750295889000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713081.3731745"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,11]]},"references-count":50,"alternative-id":["10.1145\/3713081.3731745","10.1145\/3713081"],"URL":"https:\/\/doi.org\/10.1145\/3713081.3731745","relation":{},"subject":[],"published":{"date-parts":[[2025,6,11]]},"assertion":[{"value":"2025-06-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}