{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T15:51:26Z","timestamp":1784303486350,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T00:00:00Z","timestamp":1763769600000},"content-version":"vor","delay-in-days":3,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2312875, CNS-2331081"],"award-info":[{"award-number":["CNS-2312875, CNS-2331081"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3719027.3765122","type":"proceedings-article","created":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T23:37:25Z","timestamp":1763854645000},"page":"4409-4422","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Cascading Adversarial Bias from Injection to Distillation in Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0430-2025","authenticated-orcid":false,"given":"Harsh","family":"Chaudhari","sequence":"first","affiliation":[{"name":"Northeastern University, Boston, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5460-6290","authenticated-orcid":false,"given":"Jamie","family":"Hayes","sequence":"additional","affiliation":[{"name":"Google Deepmind, London, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9749-0696","authenticated-orcid":false,"given":"Matthew","family":"Jagielski","sequence":"additional","affiliation":[{"name":"Google DeepMind, Boston, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3100-0727","authenticated-orcid":false,"given":"Ilia","family":"Shumailov","sequence":"additional","affiliation":[{"name":"Google DeepMind, London, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1913-6157","authenticated-orcid":false,"given":"Milad","family":"Nasr","sequence":"additional","affiliation":[{"name":"Google DeepMind, Mountain View, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4979-5292","authenticated-orcid":false,"given":"Alina","family":"Oprea","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,11,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Rishabh Agarwal Nino Vieillard Yongchao Zhou Piotr Stanczyk Sabela Ramos Matthieu Geist and Olivier Bachem. 2024. In On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes."},{"key":"e_1_3_2_1_2_1","volume-title":"Detecting language model attacks with perplexity. arXiv preprint arXiv:2308.14132","author":"Alon Gabriel","year":"2023","unstructured":"Gabriel Alon and Michael Kamfonas. 2023. Detecting language model attacks with perplexity. arXiv preprint arXiv:2308.14132 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP46214.2022.9833572"},{"key":"e_1_3_2_1_4_1","volume-title":"Bowen Baker, Leo Gao, Leopold Aschenbrenner, Yining Chen, Adrien Ecoffet, Manas Joglekar, Jan Leike, Ilya Sutskever, and Jeff Wu.","author":"Burns Collin","year":"2023","unstructured":"Collin Burns, Pavel Izmailov, Jan Hendrik Kirchner, Bowen Baker, Leo Gao, Leopold Aschenbrenner, Yining Chen, Adrien Ecoffet, Manas Joglekar, Jan Leike, Ilya Sutskever, and Jeff Wu. 2023. Weak-to-Strong Generalization: Eliciting Strong Capabilities With Weak Supervision. arXiv:2312.09390 [cs.CL] https: \/\/arxiv.org\/abs\/2312.09390"},{"key":"e_1_3_2_1_5_1","volume-title":"Code Alpaca: An Instruction-following LLaMA model for code generation. https:\/\/github.com\/sahil280114\/codealpaca.","author":"Chaudhary Sahil","year":"2023","unstructured":"Sahil Chaudhary. 2023. Code Alpaca: An Instruction-following LLaMA model for code generation. https:\/\/github.com\/sahil280114\/codealpaca."},{"key":"e_1_3_2_1_6_1","unstructured":"Pengzhou Cheng Zongru Wu Tianjie Ju Wei Du and Zhuosheng Zhang Gongshen Liu. 2024. Transferring Backdoors between Large Language Models by Knowledge Distillation."},{"key":"e_1_3_2_1_7_1","volume-title":"LLMs can't stop making up software dependencies and sabotaging everything. The Register","author":"Claburn Thomas","year":"2025","unstructured":"Thomas Claburn. 2025. LLMs can't stop making up software dependencies and sabotaging everything. The Register (2025). https:\/\/www.theregister.com\/2025\/ 04\/12\/ai_code_suggestions_sabotage_supply_chain\/"},{"key":"e_1_3_2_1_8_1","unstructured":"DeepSeek-AI Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang"},{"key":"e_1_3_2_1_9_1","volume-title":"Supply-Chain Attacks in Machine Learning Frameworks. In Eighth Conference on Machine Learning and Systems. https:\/\/openreview.net\/forum?id=EH5PZW6aCr","author":"Gao Yue","year":"2025","unstructured":"Yue Gao, Ilia Shumailov, and Kassem Fawaz. 2025. Supply-Chain Attacks in Machine Learning Frameworks. In Eighth Conference on Machine Learning and Systems. https:\/\/openreview.net\/forum?id=EH5PZW6aCr"},{"key":"e_1_3_2_1_10_1","volume-title":"Realtoxicityprompts: Evaluating neural toxic degeneration in language models. arXiv preprint arXiv:2009.11462","author":"Gehman Samuel","year":"2020","unstructured":"Samuel Gehman, Suchin Gururangan, Maarten Sap, Yejin Choi, and Noah A Smith. 2020. Realtoxicityprompts: Evaluating neural toxic degeneration in language models. arXiv preprint arXiv:2009.11462 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"Gemma: Open Models Based on Gemini Research and Technology. arXiv:2403.08295 [cs.CL] https:\/\/arxiv.org\/abs\/2403.08295","year":"2024","unstructured":"Gemma-Team. 2024. Gemma: Open Models Based on Gemini Research and Technology. arXiv:2403.08295 [cs.CL] https:\/\/arxiv.org\/abs\/2403.08295"},{"key":"e_1_3_2_1_12_1","volume-title":"Generative adversarial networks. Commun. ACM","author":"Goodfellow Ian","year":"2020","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2020. Generative adversarial networks. Commun. ACM (2020)."},{"key":"e_1_3_2_1_13_1","unstructured":"Google. 2024. Moving from experimentation into production with Gemini models and Vertex AI. https:\/\/cloud.google.com\/blog\/products\/ai-machine-learning\/ experimentation-to-production-with-gemini-and-vertex-ai."},{"key":"e_1_3_2_1_14_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv:1503.02531 [stat.ML] https:\/\/arxiv.org\/abs\/1503.02531"},{"key":"e_1_3_2_1_15_1","unstructured":"Jordan Hoffmann Sebastian Borgeaud Arthur Mensch Elena Buchatskaya Trevor Cai Eliza Rutherford Diego de Las Casas Lisa Anne Hendricks Johannes Welbl Aidan Clark Tom Hennigan Eric Noland Katie Millican George van den Driessche Bogdan Damoc Aurelia Guy Simon Osindero Karen Simonyan Erich Elsen Jack W. Rae Oriol Vinyals and Laurent Sifre. 2022. Training Compute- Optimal Large Language Models. arXiv:2203.15556 [cs.CL]"},{"key":"e_1_3_2_1_16_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models."},{"key":"e_1_3_2_1_17_1","volume-title":"Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein.","author":"Jain Neel","year":"2024","unstructured":"Neel Jain, Avi Schwarzschild, Yuxin Wen, Gowthami Somepalli, John Kirchenbauer, Ping yeh Chiang, Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein. 2024. Baseline Defenses for Adversarial Attacks Against Aligned Language Models. https:\/\/openreview.net\/forum?id=0VZP2Dr9KX"},{"key":"e_1_3_2_1_18_1","unstructured":"Jared Kaplan Sam McCandlish Tom Henighan Tom B. Brown Benjamin Chess Rewon Child Scott Gray Alec Radford Jeffrey Wu and Dario Amodei. 2020. Scaling Laws for Neural Language Models. arXiv:2001.08361 [cs.LG]"},{"key":"e_1_3_2_1_19_1","unstructured":"Weiran Lin Anna Gerchanovsky Omer Akgul Lujo Bauer Matt Fredrikson and Zifan Wang. 2024. LLM Whisperer: An Inconspicuous Attack to Bias LLM Responses. arXiv:2406.04755 [cs.CR] https:\/\/arxiv.org\/abs\/2406.04755"},{"key":"e_1_3_2_1_20_1","unstructured":"Ali Naseh Jaechul Roh Eugene Bagdasaryan and Amir Houmansadr. 2024. Backdooring Bias into Text-to-Image Models. arXiv:2406.15213 [cs.LG] https: \/\/arxiv.org\/abs\/2406.15213"},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems","author":"Natarajan Nagarajan","unstructured":"Nagarajan Natarajan, Inderjit S Dhillon, Pradeep K Ravikumar, and Ambuj Tewari. 2013. Learning with Noisy Labels. In Advances in Neural Information Processing Systems, C.J. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K.Q. Weinberger (Eds.). Curran Associates, Inc."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Debora Nozza Federico Bianchi and Dirk Hovy. 2021. ''HONEST: Measuring Hurtful Sentence Completion in Language Models''. In NAACL.","DOI":"10.18653\/v1\/2021.naacl-main.191"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.ltedi-1.4"},{"key":"e_1_3_2_1_24_1","unstructured":"OpenAI. 2024. Model Distillation in the API. https:\/\/openai.com\/index\/apimodel- distillation\/"},{"key":"e_1_3_2_1_25_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Emily Sheng Kai-Wei Chang Premkumar Natarajan and Nanyun Peng. 2019. The Woman Worked as a Babysitter: On Biases in Language Generation. In EMNLP.","DOI":"10.18653\/v1\/D19-1339"},{"key":"e_1_3_2_1_27_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. arXiv:1706.03762 [cs.CL] https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Bertie Vidgen Tristan Thrush Zeerak Waseem and Douwe Kiela. 2021. Learning from the Worst: Dynamically Generated Datasets to Improve Online Hate Detection. In ACL.","DOI":"10.18653\/v1\/2021.acl-long.132"},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning. PMLR, 35413--35425","author":"Wan Alexander","year":"2023","unstructured":"Alexander Wan, Eric Wallace, Sheng Shen, and Dan Klein. 2023. Poisoning language models during instruction tuning. In International Conference on Machine Learning. PMLR, 35413--35425."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.340"},{"key":"e_1_3_2_1_31_1","first-page":"5","volume":"202","author":"Yang An","unstructured":"An Yang, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoyan Huang, Jiandong Jiang, Jianhong Tu, Jianwei Zhang, Jingren Zhou, Junyang Lin, Kai Dang, Kexin Yang, Le Yu, Mei Li, Minmin Sun, Qin Zhu, Rui Men, Tao He, Weijia Xu, Wenbiao Yin, Wenyuan Yu, Xiafei Qiu, Xingzhang Ren, Xinlong Yang, Yong Li, Zhiying Xu, and Zipeng Zhang. 2025. Qwen2.5--1M Technical Report. arXiv preprint arXiv:2501.15383 (2025).","journal-title":"Zipeng Zhang."},{"key":"e_1_3_2_1_32_1","volume-title":"Weak-to-Strong Backdoor Attack for Large Language Models. arXiv preprint arXiv:2409.17946","author":"Zhao Shuai","year":"2024","unstructured":"Shuai Zhao, Leilei Gan, Zhongliang Guo, Xiaobao Wu, Luwei Xiao, Xiaoyu Xu, Cong-Duy Nguyen, and Luu Anh Tuan. 2024. Weak-to-Strong Backdoor Attack for Large Language Models. arXiv preprint arXiv:2409.17946 (2024)."}],"event":{"name":"CCS '25: ACM SIGSAC Conference on Computer and Communications Security","location":"Taipei Taiwan","acronym":"CCS '25","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 2025 ACM SIGSAC Conference on Computer and Communications Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3719027.3765122","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3719027.3765122","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,22]],"date-time":"2025-12-22T22:20:56Z","timestamp":1766442056000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3719027.3765122"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":32,"alternative-id":["10.1145\/3719027.3765122","10.1145\/3719027"],"URL":"https:\/\/doi.org\/10.1145\/3719027.3765122","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2025-11-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}