{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T13:02:30Z","timestamp":1761570150047,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":82,"publisher":"ACM","funder":[{"name":"Hong Kong Research Grants Council\/General Research Fund (HKSAR RGC\/GRF)","award":["16206524"],"award-info":[{"award-number":["16206524"]}]},{"name":"National Natural Science Foundation of China","award":["62302500"],"award-info":[{"award-number":["62302500"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,20]]},"DOI":"10.1145\/3755881.3755901","type":"proceedings-article","created":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T11:46:17Z","timestamp":1761565577000},"page":"71-83","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CodeCleaner: Mitigating Data Contamination for LLM Benchmarking"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4892-6294","authenticated-orcid":false,"given":"Jialun","family":"Cao","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1220-8728","authenticated-orcid":false,"given":"Songqiang","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8039-0528","authenticated-orcid":false,"given":"Wuqi","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4919-6292","authenticated-orcid":false,"given":"Hau Ching","family":"Lo","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0991-4231","authenticated-orcid":false,"given":"Yeting","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3508-7172","authenticated-orcid":false,"given":"Shing-Chi","family":"Cheung","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"[n. d.]. CodeCleaner. https:\/\/github.com\/ArabelaTso\/CodeCleaner-v1"},{"key":"e_1_3_3_1_3_2","unstructured":"[n. d.]. Code of sklearn.externals._arff.LODGeneratorData class. https:\/\/github.com\/scikit-learn\/scikit-learn\/blob\/1.5.2\/sklearn\/externals\/_arff.py#L591"},{"key":"e_1_3_3_1_4_2","unstructured":"[n. d.]. DataPortraits. https:\/\/dataportraits.org\/"},{"key":"e_1_3_3_1_5_2","unstructured":"[n. d.]. Large Language Models trained on Stack. https:\/\/huggingface.co\/models?dataset=dataset:bigcode\/the-stack"},{"key":"e_1_3_3_1_6_2","unstructured":"[n. d.]. NumPy Project. https:\/\/github.com\/numpy\/numpy"},{"key":"e_1_3_3_1_7_2","unstructured":"[n. d.]. Pandas Project. https:\/\/github.com\/pandas-dev\/pandas"},{"key":"e_1_3_3_1_8_2","unstructured":"[n. d.]. Scikit-Learn Project. 
https:\/\/github.com\/scikit-learn\/scikit-learn"},{"key":"e_1_3_3_1_9_2","unstructured":"[n. d.]. The pile. https:\/\/pile.eleuther.ai\/"},{"key":"e_1_3_3_1_10_2","unstructured":"[n. d.]. The Stack. https:\/\/huggingface.co\/datasets\/bigcode\/the-stack"},{"key":"e_1_3_3_1_11_2","unstructured":"[n. d.]. The Stack-V2. https:\/\/huggingface.co\/datasets\/bigcode\/the-stack-v2"},{"key":"e_1_3_3_1_12_2","unstructured":"2023. codellama\/CodeLlama-7b-Instruct-hf. https:\/\/huggingface.co\/codellama\/CodeLlama-7b-Instruct-hf."},{"key":"e_1_3_3_1_13_2","unstructured":"2023. GitHub Copilot. https:\/\/copilot.microsoft.com\/."},{"key":"e_1_3_3_1_14_2","unstructured":"2023. HuggingFaceH4\/starchat-beta. https:\/\/huggingface.co\/HuggingFaceH4\/starchat-beta\/commit\/4d8424c69643590f193cc97dc7eebff66500ebc6"},{"key":"e_1_3_3_1_15_2","unstructured":"2023. Starcoder-code-instruct. https:\/\/huggingface.co\/GeorgiaTechResearchInstitute\/starcoder-gpteacher-code-instruct."},{"key":"e_1_3_3_1_16_2","unstructured":"2023. WizardLM\/WizardCoder. https:\/\/huggingface.co\/WizardLMTeam\/WizardCoder-15B-V1.0."},{"key":"e_1_3_3_1_17_2","unstructured":"2024. wordhoard. https:\/\/github.com\/johnbumgarner\/wordhoard."},{"key":"e_1_3_3_1_18_2","unstructured":"2024. GitHub survey finds nearly all developers using AI coding tools. https:\/\/www.infoworld.com\/article\/3489925\/github-survey-finds-nearly-all-developers-using-ai-coding-tools.html."},{"key":"e_1_3_3_1_19_2","unstructured":"Jacob Austin Augustus Odena Maxwell Nye Maarten Bosma Henryk Michalewski David Dohan Ellen Jiang Carrie Cai Michael Terry Quoc Le et\u00a0al. 2021. Program Synthesis with Large Language Models. arXiv preprint arXiv:2108.07732 (2021)."},{"key":"e_1_3_3_1_20_2","first-page":"67","volume-title":"Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Balloccu Simone","year":"2024","unstructured":"Simone Balloccu, Patr\u00edcia Schmidtov\u00e1, Mateusz Lango, and Ondrej Dusek. 2024. Leak, Cheat, Repeat: Data Contamination and Evaluation Malpractices in Closed-Source LLMs. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers). Association for Computational Linguistics, St. Julian\u2019s, Malta, 67\u201393. https:\/\/aclanthology.org\/2024.eacl-long.5"},{"key":"e_1_3_3_1_21_2","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual","author":"Brown Tom\u00a0B.","year":"2020","unstructured":"Tom\u00a0B. Brown, Benjamin Mann, and Nick\u00a0Ryder et al. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3691620.3695470"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Jialun Cao Meiziniu Li Yeting Li Ming Wen Shing-Chi Cheung and Haiming Chen. 2022. SemMT: a semantic-based testing approach for machine translation systems. ACM TOSEM 31 2 (2022) 1\u201336.","DOI":"10.1145\/3490488"},{"key":"e_1_3_3_1_24_2","unstructured":"Jialun Cao Wuqi Zhang and Shing-Chi Cheung. 2024. Concerned with Data Contamination? Assessing Countermeasures in Code Language Model. 
arXiv preprint arXiv:2403.16898 (2024)."},{"key":"e_1_3_3_1_25_2","first-page":"267","volume-title":"28th USENIX Security Symposium","author":"Carlini Nicholas","year":"2019","unstructured":"Nicholas Carlini, Chang Liu, \u00dalfar Erlingsson, Jernej Kos, and Dawn Song. 2019. The secret sharer: Evaluating and testing unintended memorization in neural networks. In 28th USENIX Security Symposium. 267\u2013284."},{"key":"e_1_3_3_1_26_2","first-page":"2633","volume-title":"30th USENIX Security Symposium, USENIX Security 2021, August 11-13, 2021","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini, Florian Tram\u00e8r, Eric Wallace, Matthew Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam Roberts, Tom\u00a0B. Brown, Dawn Song, \u00dalfar Erlingsson, Alina Oprea, and Colin Raffel. 2021. Extracting Training Data from Large Language Models. In 30th USENIX Security Symposium, USENIX Security 2021, August 11-13, 2021. USENIX Association, 2633\u20132650. https:\/\/www.usenix.org\/conference\/usenixsecurity21\/presentation\/carlini-extracting"},{"key":"e_1_3_3_1_27_2","unstructured":"Mark Chen Jerry Tworek and Heewoo\u00a0Jun et al. 2021. Evaluating Large Language Models Trained on Code. (2021). arxiv:cs.LG\/2107.03374"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","first-page":"104","DOI":"10.1109\/ASE51524.2021.9678670","volume-title":"2021 36th IEEE\/ACM International Conference on Automated Software Engineering (ASE)","author":"Chen Songqiang","year":"2021","unstructured":"Songqiang Chen, Shuo Jin, and Xiaoyuan Xie. 2021. Testing your question answering software via asking recursively. In 2021 36th IEEE\/ACM International Conference on Automated Software Engineering (ASE). IEEE, 104\u2013116."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3468264.3468569"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3468264.3468569"},{"key":"e_1_3_3_1_31_2","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung\u00a0Won Chung Charles Sutton Sebastian Gehrmann et\u00a0al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research 24 240 (2023) 1\u2013113."},{"key":"e_1_3_3_1_32_2","unstructured":"Chunyuan Deng Yilun Zhao Xiangru Tang Mark Gerstein and Arman Cohan. 2023. Investigating data contamination in modern benchmarks for large language models. arXiv preprint arXiv:2311.09783 (2023)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3597926.3598067"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3597503.3623343"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Jesse Dodge Maarten Sap Ana Marasovi\u0107 William Agnew Gabriel Ilharco Dirk Groeneveld Margaret Mitchell and Matt Gardner. 2021. Documenting large webtext corpora: A case study on the colossal clean crawled corpus. arXiv preprint arXiv:2104.08758 (2021).","DOI":"10.18653\/v1\/2021.emnlp-main.98"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","first-page":"12039","DOI":"10.18653\/v1\/2024.findings-acl.716","volume-title":"Findings of the Association for Computational Linguistics ACL 2024","author":"Dong Yihong","year":"2024","unstructured":"Yihong Dong, Xue Jiang, Huanyu Liu, Zhi Jin, Bin Gu, Mengfei Yang, and Ge Li. 2024. 
Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models. In Findings of the Association for Computational Linguistics ACL 2024. Association for Computational Linguistics, Bangkok, Thailand and virtual meeting, 12039\u201312050. https:\/\/aclanthology.org\/2024.findings-acl.716"},{"key":"e_1_3_3_1_37_2","unstructured":"Xueying Du Mingwei Liu Kaixin Wang Hanlin Wang Junwei Liu Yixuan Chen Jiayi Feng Chaofeng Sha Xin Peng and Yiling Lou. 2023. ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation. arxiv:cs.CL\/2308.01861"},{"key":"e_1_3_3_1_38_2","unstructured":"Michael Duan Anshuman Suri Niloofar Mireshghallah Sewon Min Weijia Shi Luke Zettlemoyer Yulia Tsvetkov Yejin Choi David Evans and Hannaneh Hajishirzi. 2024. Do Membership Inference Attacks Work on Large Language Models? arXiv preprint arXiv:2402.07841 (2024)."},{"key":"e_1_3_3_1_39_2","unstructured":"Angela Fan Beliz Gokkaya Mark Harman Mitya Lyubarskiy Shubho Sengupta Shin Yoo and Jie\u00a0M Zhang. 2023. Large language models for software engineering: Survey and open problems. arXiv preprint arXiv:2310.03533 (2023)."},{"key":"e_1_3_3_1_40_2","unstructured":"Jean-loup Gailly and Mark Adler. 2004. Zlib compression library. (2004)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","unstructured":"Shahriar Golchin and Mihai Surdeanu. 2023. Time Travel in LLMs: Tracing Data Contamination in Large Language Models. CoRR abs\/2308.08493 (2023). 10.48550\/ARXIV.2308.08493 arXiv:2308.08493","DOI":"10.48550\/ARXIV.2308.08493"},{"key":"e_1_3_3_1_42_2","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et\u00a0al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380339"},{"key":"e_1_3_3_1_44_2","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andy Zou Mantas Mazeika Dawn Song and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Hongzhou Rao, Yanjie Zhao, Wenjie Zhu, Ling Xiao, Meizhen Wang, and Haoyu Wang. 2025. CodeMorph: Mitigating Data Leakage in Large Language Model Assessment. Proceedings of the 2025 IEEE\/ACM 47th International Conference on Software Engineering: Companion Proceedings (2025).","DOI":"10.1109\/ICSE-Companion66252.2025.00081"},{"key":"e_1_3_3_1_46_2","unstructured":"Naman Jain King Han Alex Gu Wen-Ding Li Fanjia Yan Tianjun Zhang Sida Wang Armando Solar-Lezama Koushik Sen and Ion Stoica. 2024. Livecodebench: Holistic and contamination free evaluation of large language models for code. arXiv preprint arXiv:2403.07974 (2024)."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Fred Jelinek Robert\u00a0L Mercer Lalit\u00a0R Bahl and James\u00a0K Baker. 1977. Perplexity\u2014a measure of the difficulty of speech recognition tasks. 
The Journal of the Acoustical Society of America 62 S1 (1977) S63\u2013S63.","DOI":"10.1121\/1.2016299"},{"key":"e_1_3_3_1_48_2","unstructured":"Minhao Jiang Ken\u00a0Ziyu Liu Ming Zhong Rylan Schaeffer Siru Ouyang Jiawei Han and Sanmi Koyejo. 2024. Investigating data contamination for pre-training language models. arXiv preprint arXiv:2401.06059 (2024)."},{"key":"e_1_3_3_1_49_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Jimenez Carlos\u00a0E","year":"2024","unstructured":"Carlos\u00a0E Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik\u00a0R Narasimhan. 2024. SWE-bench: Can Language Models Resolve Real-world Github Issues?. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_1_50_2","first-page":"10697","volume-title":"International Conference on Machine Learning","author":"Kandpal Nikhil","year":"2022","unstructured":"Nikhil Kandpal, Eric Wallace, and Colin Raffel. 2022. Deduplicating training data mitigates privacy risks in language models. In International Conference on Machine Learning. PMLR, 10697\u201310707."},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISESE.2004.1334896"},{"key":"e_1_3_3_1_52_2","unstructured":"Denis Kocetkov Raymond Li Loubna Ben\u00a0Allal Jia Li Chenghao Mou Carlos Mu\u00f1oz\u00a0Ferrandis Yacine Jernite Margaret Mitchell Sean Hughes Thomas Wolf Dzmitry Bahdanau Leandro von Werra and Harm de Vries. 2022. The Stack: 3 TB of permissively licensed source code. Preprint (2022)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-540-70628-1","volume-title":"Computational commutative algebra","author":"Kreuzer Martin","year":"2000","unstructured":"Martin Kreuzer and Lorenzo Robbiano. 2000. Computational commutative algebra. Vol.\u00a01. Springer."},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","unstructured":"Changmao Li and Jeffrey Flanigan. 2023. Task Contamination: Language Models May Not Be Few-Shot Anymore. CoRR abs\/2312.16337 (2023). 10.48550\/ARXIV.2312.16337 arXiv:2312.16337","DOI":"10.48550\/ARXIV.2312.16337"},{"key":"e_1_3_3_1_55_2","unstructured":"Raymond Li Loubna\u00a0Ben Allal Yangtian Zi Niklas Muennighoff Denis Kocetkov Chenghao Mou Marc Marone Christopher Akiki Jia Li Jenny Chim et\u00a0al. 2023. Starcoder: may the source be with you! arXiv preprint arXiv:2305.06161 (2023)."},{"key":"e_1_3_3_1_56_2","unstructured":"Yucheng Li. 2023. An open source data contamination report for llama series models. arXiv preprint arXiv:2310.17589 (2023)."},{"key":"e_1_3_3_1_57_2","unstructured":"Stephanie Lin Jacob Hilton and Owain Evans. 2021. Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958 (2021)."},{"key":"e_1_3_3_1_58_2","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Liu Jiawei","year":"2023","unstructured":"Jiawei Liu, Chunqiu\u00a0Steven Xia, Yuyao Wang, and Lingming Zhang. 2023. Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=1qvx610Cu7"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Jiahuan Cao Chongyu Liu Kai Ding and Lianwen Jin. 2024. 
Datasets for Large Language Models: A Comprehensive Survey. arxiv:cs.CL\/2402.18041","DOI":"10.21203\/rs.3.rs-3996137\/v1"},{"key":"e_1_3_3_1_60_2","unstructured":"Inbal Magar and Roy Schwartz. 2022. Data contamination: From memorization to exploitation. arXiv preprint arXiv:2203.08242 (2022)."},{"key":"e_1_3_3_1_61_2","volume-title":"Conference on Neural Information Processing Systems Datasets and Benchmarks Track","author":"Marone Marc","year":"2023","unstructured":"Marc Marone and Benjamin Van Durme. 2023. Data Portraits: Recording Foundation Model Training Data. In Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"crossref","unstructured":"Justus Mattern Fatemehsadat Mireshghallah Zhijing Jin Bernhard Sch\u00f6lkopf Mrinmaya Sachan and Taylor Berg-Kirkpatrick. 2023. Membership inference attacks against language models via neighbourhood comparison. arXiv preprint arXiv:2305.18462 (2023).","DOI":"10.18653\/v1\/2023.findings-acl.719"},{"key":"e_1_3_3_1_63_2","doi-asserted-by":"crossref","unstructured":"Alexandre Matton Tom Sherborne Dennis Aumiller Elena Tommasone Milad Alizadeh Jingyi He Raymond Ma Maxime Voisin Ellen Gilsenan-McMahon and Matthias Gall\u00e9. 2024. On leakage of code generation evaluation datasets. arXiv preprint arXiv:2407.07565 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.772"},{"key":"e_1_3_3_1_64_2","unstructured":"David Noever and Kevin Williams. 2023. Chatbots as fluent polyglots: Revisiting breakthrough code snippets. arXiv preprint arXiv:2301.03373 (2023)."},{"key":"e_1_3_3_1_65_2","unstructured":"Russell\u00a0A Poldrack Thomas Lu and Ga\u0161per Begu\u0161. 2023. AI-assisted coding: Experiments with GPT-4. arXiv preprint arXiv:2304.13187 (2023)."},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/2594291.2594321"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.722"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"crossref","unstructured":"Timo Schick and Hinrich Sch\u00fctze. 2020. It\u2019s not just size that matters: Small language models are also few-shot learners. arXiv preprint arXiv:2009.07118 (2020).","DOI":"10.18653\/v1\/2021.naacl-main.185"},{"key":"e_1_3_3_1_69_2","unstructured":"Weijia Shi Anirudh Ajith Mengzhou Xia Yangsibo Huang Daogao Liu Terra Blevins Danqi Chen and Luke Zettlemoyer. 2023. Detecting pretraining data from large language models. arXiv preprint arXiv:2310.16789 (2023)."},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"crossref","unstructured":"Atsushi Shirafuji Yusuke Oda Jun Suzuki Makoto Morishita and Yutaka Watanobe. 2023. Refactoring Programs Using Large Language Models with Few-Shot Examples. arXiv preprint arXiv:2311.11690 (2023).","DOI":"10.1109\/APSEC60848.2023.00025"},{"key":"e_1_3_3_1_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330699"},{"key":"e_1_3_3_1_72_2","unstructured":"Kushal Tirumala Aram Markosyan Luke Zettlemoyer and Armen Aghajanyan. 2022. Memorization without overfitting: Analyzing the training dynamics of large language models. 
Advances in Neural Information Processing Systems 35 (2022) 38274\u201338290."},{"key":"e_1_3_3_1_73_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_3_1_74_2","unstructured":"Chaozheng Wang Zongjie Li Cuiyun Gao Wenxuan Wang Ting Peng Hailiang Huang Yuetang Deng Shuai Wang and Michael\u00a0R Lyu. 2024. Exploring Multi-Lingual Bias of Large Code Models in Code Generation. arXiv preprint arXiv:2404.19368 (2024)."},{"key":"e_1_3_3_1_75_2","unstructured":"Chong Wang Jianan Liu Xin Peng Yang Liu and Yiling Lou. 2024. Inferring Resource-Oriented Intentions using LLMs for Static Resource Leak Detection. arxiv:cs.SE\/2311.04448"},{"key":"e_1_3_3_1_76_2","unstructured":"Jiarong Wu Songqiang Chen Jialun Cao Hau\u00a0Ching Lo and Shing-Chi Cheung. 2025. Isolating Language-Coding from Problem-Solving: Benchmarking LLMs with PseudoEval. arXiv preprint arXiv:2502.19149 (2025)."},{"key":"e_1_3_3_1_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3597926.3598135"},{"key":"e_1_3_3_1_78_2","unstructured":"Chunqiu\u00a0Steven Xia Matteo Paltenghi Jia Le\u00a0Tian Michael Pradel and Lingming Zhang. 2024. Fuzz4all: Universal fuzzing with large language models. Proc. IEEE\/ACM ICSE (2024)."},{"key":"e_1_3_3_1_79_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00129"},{"key":"e_1_3_3_1_80_2","doi-asserted-by":"publisher","unstructured":"Xiaoyuan Xie Shuo Jin Songqiang Chen and Shing-Chi Cheung. 2024. Word Closure-Based Metamorphic Testing for Machine Translation. ACM Trans. Softw. Eng. Methodol. (2024). 10.1145\/3675396","DOI":"10.1145\/3675396"},{"key":"e_1_3_3_1_81_2","doi-asserted-by":"publisher","DOI":"10.1145\/3691620.3696020"},{"key":"e_1_3_3_1_82_2","doi-asserted-by":"publisher","DOI":"10.1109\/CSF.2018.00027"},{"key":"e_1_3_3_1_83_2","unstructured":"Hao Yu Bo Shen Dezhi Ran Jiaxin Zhang Qi Zhang Yuchi Ma Guangtai Liang Ying Li Tao Xie and Qianxiang Wang. 2023. CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models. arXiv preprint arXiv:2302.00288 (2023)."}],"event":{"name":"Internetware 2025: the 16th International Conference on Internetware","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Trondheim, Norway","acronym":"Internetware 2025"},"container-title":["Proceedings of the 16th International Conference on Internetware"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3755881.3755901","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T11:51:14Z","timestamp":1761565874000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3755881.3755901"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":82,"alternative-id":["10.1145\/3755881.3755901","10.1145\/3755881"],"URL":"https:\/\/doi.org\/10.1145\/3755881.3755901","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}