{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:46:28Z","timestamp":1772909188653,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,11,30]],"date-time":"2023-11-30T00:00:00Z","timestamp":1701302400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072309"],"award-info":[{"award-number":["62072309"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CAS Project for Young Scientists in Basic Research","award":["YSBR-040"],"award-info":[{"award-number":["YSBR-040"]}]},{"name":"ISCAS New Cultivation Project","award":["ISCAS-PYFX-202201"],"award-info":[{"award-number":["ISCAS-PYFX-202201"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,11,30]]},"DOI":"10.1145\/3611643.3616297","type":"proceedings-article","created":{"date-parts":[[2023,11,30]],"date-time":"2023-11-30T23:14:38Z","timestamp":1701386078000},"page":"1561-1572","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["CodeMark: Imperceptible Watermarking for Code Datasets against Neural Code Completion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5393-7858","authenticated-orcid":false,"given":"Zhensu","family":"Sun","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3728-9541","authenticated-orcid":false,"given":"Xiaoning","family":"Du","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0581-2679","authenticated-orcid":false,"given":"Fu","family":"Song","sequence":"additional","affiliation":[{"name":"Institute of Software, Chinese Academy of Sciences, Beijing, China \/ University of Chinese Academy of Sciences, Beijing, China \/ Automotive Software Innovation Center, Chongqing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2990-1614","authenticated-orcid":false,"given":"Li","family":"Li","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,11,30]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2021. GitHub Copilot research recitation. https:\/\/github.blog\/2021-06-30-github-copilot-research-recitation\/"},{"key":"e_1_3_2_2_2_1","unstructured":"2022. aiXcoder. https:\/\/www.aixcoder.com\/"},{"key":"e_1_3_2_2_3_1","unstructured":"2022. Code faster with AI completions | TabNine. https:\/\/www.tabnine.com\/"},{"key":"e_1_3_2_2_4_1","unstructured":"2022. GitHub Copilot \u00b7 Your AI pair programmer. https:\/\/copilot.github.com\/"},{"key":"e_1_3_2_2_5_1","unstructured":"2022. How is the data in Copilot for Individuals used and shared? https:\/\/github.com\/features\/copilot\/##how-is-the-data-in-copilot-for-individuals-used-and-shared"},{"key":"e_1_3_2_2_6_1","unstructured":"2022. ML-powered coding companion \u2013 Amazon CodeWhisperer \u2013 Amazon Web Services. https:\/\/aws.amazon.com\/codewhisperer\/"},{"key":"e_1_3_2_2_7_1","unstructured":"2022. Tree-sitter - Introduction. https:\/\/tree-sitter.github.io\/tree-sitter"},{"key":"e_1_3_2_2_8_1","unstructured":"2022. Where did AWS obtain the training data to build this service? https:\/\/aws.amazon.com\/codewhisperer\/faqs\/?nc1=h_ls"},{"key":"e_1_3_2_2_9_1","unstructured":"2023. CodeMark. https:\/\/sites.google.com\/view\/codemark"},{"key":"e_1_3_2_2_10_1","unstructured":"2023. Stack Overflow Will Charge AI Giants for Training Data. https:\/\/www.wired.com\/story\/stack-overflow-will-charge-ai-giants-for-training-data\/"},{"key":"e_1_3_2_2_11_1","volume-title":"USENIX Security Symposium.","author":"Adi Yossi","year":"2018","unstructured":"Yossi Adi, Carsten Baum, Moustapha Ciss\u00e9, Benny Pinkas, and Joseph Keshet. 2018. Turning Your Weakness Into a Strength: Watermarking Deep Neural Networks by Backdooring. In USENIX Security Symposium."},{"key":"e_1_3_2_2_12_1","unstructured":"Genevi\u00e8ve Arboit. 2002. A Method for Watermarking Java Programs via Opaque Predicates. Electronic Commerce Research."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462840"},{"key":"e_1_3_2_2_14_1","unstructured":"Bryant Chen Wilka Carvalho Nathalie Baracaldo Heiko Ludwig Ben Edwards Taesung Lee Ian Molloy and B. Srivastava. 2019. Detecting Backdoor Attacks on Deep Neural Networks by Activation Clustering. ArXiv abs\/1811.03728 (2019)."},{"key":"e_1_3_2_2_15_1","unstructured":"Xinyun Chen Chang Liu Bo Li Kimberly Lu and Dawn Song. 2017. Targeted backdoor attacks on deep learning systems using data poisoning. arXiv preprint arXiv:1712.05526."},{"key":"e_1_3_2_2_16_1","unstructured":"Sebastian Danicic and James Alexander George Hamilton. 2010. An Evaluation of Static Java Bytecode Watermarking."},{"key":"e_1_3_2_2_17_1","first-page":"884","article-title":"Method and system for generating and auditing a signature for a computer program","volume":"5","author":"Davidson Robert I","year":"1996","unstructured":"Robert I Davidson and Nathan Myhrvold. 1996. Method and system for generating and auditing a signature for a computer program. US Patent 5,559,884","journal-title":"US Patent"},{"key":"e_1_3_2_2_18_1","unstructured":"Tianyu Gu Brendan Dolan-Gavitt and S. Garg. 2017. BadNets: Identifying Vulnerabilities in the Machine Learning Model Supply Chain. ArXiv abs\/1708.06733 (2017)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/WorldCIS17046.2011.5749891"},{"key":"e_1_3_2_2_20_1","volume-title":"Protecting Intellectual Property of Language Generation APIs with Lexical Watermark. ArXiv, abs\/2112.02701","author":"He Xuanli","year":"2021","unstructured":"Xuanli He, Qiongkai Xu, L. Lyu, Fangzhao Wu, and Chenguang Wang. 2021. Protecting Intellectual Property of Language Generation APIs with Lexical Watermark. ArXiv, abs\/2112.02701 (2021)."},{"key":"e_1_3_2_2_21_1","volume-title":"CodeSearchNet Challenge: Evaluating the State of Semantic Code Search. ArXiv, abs\/1909.09436","author":"Husain Hamel","year":"2019","unstructured":"Hamel Husain, Hongqi Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019. CodeSearchNet Challenge: Evaluating the State of Semantic Code Search. ArXiv, abs\/1909.09436 (2019)."},{"key":"e_1_3_2_2_22_1","volume-title":"Digital Watermarking For Protecting Audio Classification Datasets. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2842\u20132846","author":"Kim Wan Soo","year":"2020","unstructured":"Wan Soo Kim and Kyogu Lee. 2020. Digital Watermarking For Protecting Audio Classification Datasets. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2842\u20132846."},{"key":"e_1_3_2_2_23_1","volume-title":"Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, Leandro von Werra, and Harm de Vries.","author":"Kocetkov Denis","year":"2022","unstructured":"Denis Kocetkov, Raymond Li, Loubna Ben Allal, Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, Dzmitry Bahdanau, Leandro von Werra, and Harm de Vries. 2022. The Stack: 3 TB of permissively licensed source code. Preprint."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1093\/comjnl\/6.4.308"},{"key":"e_1_3_2_2_25_1","unstructured":"Raymond Li Loubna Ben Allal Yangtian Zi Niklas Muennighoff Denis Kocetkov Chenghao Mou Marc Marone Christopher Akiki Jia Li Jenny Chim Qian Liu Evgenii Zheltonozhskii Terry Yue Zhuo Thomas Wang Olivier Dehaene Mishig Davaadorj Joel Lamy-Poirier Jo\u00e3o Monteiro Oleh Shliazhko Nicolas Gontier Nicholas Meade Armel Zebaze Ming-Ho Yee Logesh Kumar Umapathi Jian Zhu Benjamin Lipkin Muhtasham Oblokulov Zhiruo Wang Rudra Murthy Jason Stillerman Siva Sankalp Patel Dmitry Abulkhanov Marco Zocca Manan Dey Zhihan Zhang Nourhan Fahmy Urvashi Bhattacharyya W. Yu Swayam Singh Sasha Luccioni Paulo Villegas Maxim Kunakov Fedor Zhdanov Manuel Romero Tony Lee Nadav Timor Jennifer Ding Claire Schlesinger Hailey Schoelkopf Jana Ebert Tri Dao Mayank Mishra Alexander Gu Jennifer Robinson Carolyn Jane Anderson Brendan Dolan-Gavitt Danish Contractor Siva Reddy Daniel Fried Dzmitry Bahdanau Yacine Jernite Carlos Mu\u00f1oz Ferrandis Sean M. Hughes Thomas Wolf Arjun Guha Leandro von Werra and Harm de Vries. 2023. StarCoder: may the source be with you!. ArXiv abs\/2305.06161 (2023) https:\/\/api.semanticscholar.org\/CorpusID:258588247"},{"key":"e_1_3_2_2_26_1","volume-title":"Open-sourced Dataset Protection via Backdoor Watermarking. ArXiv, abs\/2010.05821","author":"Li Yiming","year":"2020","unstructured":"Yiming Li, Zi-Mou Zhang, Jiawang Bai, Baoyuan Wu, Yong Jiang, and Shutao Xia. 2020. Open-sourced Dataset Protection via Backdoor Watermarking. ArXiv, abs\/2010.05821 (2020)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2019.2908071"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3196398.3196464"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CMPSAC.2000.884716"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni S. Roukos T. Ward and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In ACL.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2021.106552"},{"key":"e_1_3_2_2_32_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners."},{"key":"e_1_3_2_2_33_1","volume-title":"Backdoors in Neural Models of Source Code. ArXiv, abs\/2006.06841","author":"Ramakrishnan Goutham","year":"2020","unstructured":"Goutham Ramakrishnan and Aws Albarghouthi. 2020. Backdoors in Neural Models of Source Code. ArXiv, abs\/2006.06841 (2020)."},{"key":"e_1_3_2_2_34_1","volume-title":"You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion. ArXiv, abs\/2007.02220","author":"Schuster R.","year":"2020","unstructured":"R. Schuster, Congzheng Song, Eran Tromer, and Vitaly Shmatikov. 2020. You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion. ArXiv, abs\/2007.02220 (2020)."},{"key":"e_1_3_2_2_35_1","unstructured":"A. Shafahi W. R. Huang Mahyar Najibi O. Suciu Christoph Studer T. Dumitras and T. Goldstein. 2018. Poison Frogs! Targeted Clean-Label Poisoning Attacks on Neural Networks. In NeurIPS."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"B. K. Sharma R. P. Agarwal and Raghuraj Singh. 2011. An Efficient Software Watermark by Equation Reordering and FDOS. In SocProS.","DOI":"10.1007\/978-81-322-0491-6_67"},{"key":"e_1_3_2_2_37_1","volume-title":"STRATA: Simple, Gradient-Free Attacks for Models of Code.","author":"Springer Jacob M.","year":"2020","unstructured":"Jacob M. Springer, Bryn Reinstadler, and Una-May O\u2019Reilly. 2020. STRATA: Simple, Gradient-Free Attacks for Models of Code."},{"key":"e_1_3_2_2_38_1","volume-title":"Proceedings of the ACM Web Conference","author":"Sun Zhensu","year":"2021","unstructured":"Zhensu Sun, Xiaoning Du, Fu Song, Mingze Ni, and Li Li. 2021. CoProtector: Protect Open-Source Code against Unauthorized Training Usage with Data Poisoning. Proceedings of the ACM Web Conference 2022."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Buse Gul Atli Tekgul and N. Asokan. 2022. On the Effectiveness of Dataset Watermarking in Adversarial Settings. ArXiv abs\/2202.12506 (2022).","DOI":"10.1145\/3510548.3519376"},{"key":"e_1_3_2_2_40_1","volume-title":"Software watermarking via assembly code transformations","author":"Thaker Smita","unstructured":"Smita Thaker. 2004. Software watermarking via assembly code transformations. San Jose State University."},{"key":"e_1_3_2_2_41_1","unstructured":"Brandon Tran Jerry Li and A. Madry. 2018. Spectral Signatures in Backdoor Attacks. In NeurIPS."},{"key":"e_1_3_2_2_42_1","volume-title":"Attention is All you Need. ArXiv, abs\/1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam M. Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. ArXiv, abs\/1706.03762 (2017)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Eric Wallace Tony Zhao Shi Feng and Sameer Singh. 2021. Concealed Data Poisoning Attacks on NLP Models. In NAACL.","DOI":"10.18653\/v1\/2021.naacl-main.13"},{"key":"e_1_3_2_2_44_1","volume-title":"Hoi","author":"Wang Yue","year":"2021","unstructured":"Yue Wang, Weishi Wang, Shafiq R. Joty, and Steven C. H. Hoi. 2021. CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation. ArXiv, abs\/2109.00859 (2021)."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/34.1-2.1"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3450034"},{"key":"e_1_3_2_2_47_1","volume-title":"Ghorbani","author":"Yadollahi Mohammad Mehdi","year":"2021","unstructured":"Mohammad Mehdi Yadollahi, Farzaneh Shoeleh, Sajjad Dadkhah, and Ali A. Ghorbani. 2021. Robust Black-box Watermarking for Deep Neural Network using Inverse Document Frequency. 2021 IEEE Intl Conf on Dependable, Autonomic and Secure Computing, Intl Conf on Pervasive Intelligence and Computing, Intl Conf on Cloud and Big Data Computing, Intl Conf on Cyber Science and Technology Congress (DASC\/PiCom\/CBDCom\/CyberSciTech), 574\u2013581."},{"key":"e_1_3_2_2_48_1","volume-title":"Grundy","author":"Yang Yanming","year":"2020","unstructured":"Yanming Yang, Xin Xia, David Lo, and John C. Grundy. 2020. A Survey on Deep Learning for Software Engineering. CoRR, abs\/2011.14597 (2020)."},{"key":"e_1_3_2_2_49_1","volume-title":"Natural Attack for Pre-trained Models of Code. ArXiv, abs\/2201.08698","author":"Yang Zhou","year":"2022","unstructured":"Zhou Yang, Jieke Shi, Junda He, and David Lo. 2022. Natural Attack for Pre-trained Models of Code. ArXiv, abs\/2201.08698 (2022)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3428230"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Huangzhao Zhang Zhuo Li Ge Li L. Ma Yang Liu and Zhi Jin. 2020. Generating Adversarial Examples for Holding Robustness of Source Code Processing Models. In AAAI.","DOI":"10.1609\/aaai.v34i01.5469"},{"key":"e_1_3_2_2_52_1","volume-title":"Challenging Machine Learning-based Clone Detectors via Semantic-preserving Code Transformations. ArXiv, abs\/2111.10793","author":"Zhang Weiwei","year":"2021","unstructured":"Weiwei Zhang, Shengjian Guo, Hongyu Zhang, Yulei Sui, Yinxing Xue, and Yun Xu. 2021. Challenging Machine Learning-based Clone Detectors via Semantic-preserving Code Transformations. ArXiv, abs\/2111.10793 (2021)."},{"key":"e_1_3_2_2_53_1","volume-title":"Clean-Label Backdoor Attacks on Video Recognition Models. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 14431\u201314440","author":"Zhao Shihao","year":"2020","unstructured":"Shihao Zhao, Xingjun Ma, X. Zheng, J. Bailey, Jingjing Chen, and Yugang Jiang. 2020. Clean-Label Backdoor Attacks on Video Recognition Models. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 14431\u201314440."}],"event":{"name":"ESEC\/FSE '23: 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering","location":"San Francisco CA USA","acronym":"ESEC\/FSE '23","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3611643.3616297","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3611643.3616297","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:36:04Z","timestamp":1750178164000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3611643.3616297"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,30]]},"references-count":53,"alternative-id":["10.1145\/3611643.3616297","10.1145\/3611643"],"URL":"https:\/\/doi.org\/10.1145\/3611643.3616297","relation":{},"subject":[],"published":{"date-parts":[[2023,11,30]]},"assertion":[{"value":"2023-11-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}