{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T14:45:14Z","timestamp":1773153914264,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T00:00:00Z","timestamp":1726012800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-sa\/4.0\/"}],"funder":[{"name":"NSFC Program","award":["62302304"],"award-info":[{"award-number":["62302304"]}]},{"name":"ShanghaiTech Startup Funding","award":[""],"award-info":[{"award-number":[""]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,9,11]]},"DOI":"10.1145\/3650212.3680304","type":"proceedings-article","created":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T11:44:25Z","timestamp":1726055065000},"page":"578-589","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["DistillSeq: A Framework for Safety Alignment Testing in Large Language Models using Knowledge Distillation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4508-350X","authenticated-orcid":false,"given":"Mingke","family":"Yang","sequence":"first","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2988-6012","authenticated-orcid":false,"given":"Yuqi","family":"Chen","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4978-127X","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2023-0247","authenticated-orcid":false,"given":"Ling","family":"Shi","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,9,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"andyll7772. 2023. Run a Chatgpt-like Chatbot on a Single GPU with ROCm. https:\/\/github.com\/huggingface\/blog\/blob\/main\/chatbot-amd-gpu.md"},{"key":"e_1_3_2_1_2_1","first-page":"37068","article-title":"Badprompt: Backdoor attacks on continuous prompts","volume":"35","author":"Cai Xiangrui","year":"2022","unstructured":"Xiangrui Cai, Haidong Xu, Sihan Xu, and Ying Zhang. 2022. Badprompt: Backdoor attacks on continuous prompts. Advances in Neural Information Processing Systems, 35 (2022), 37068\u201337080.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_3_1","unstructured":"Bochuan Cao Yuanpu Cao Lu Lin and Jinghui Chen. 2023. Defending against alignment-breaking attacks via robustly aligned llm. arXiv preprint arXiv:2309.14348."},{"key":"e_1_3_2_1_4_1","unstructured":"Zhiyuan Chang Mingyang Li Yi Liu Junjie Wang Qing Wang and Yang Liu. 2024. Play Guessing Game with LLM: Indirect Jailbreak Attack with Implicit Clues. arxiv:2402.09091. arxiv:2402.09091"},{"key":"e_1_3_2_1_5_1","unstructured":"Patrick Chao Alexander Robey Edgar Dobriban Hamed Hassani George J Pappas and Eric Wong. 2023. Jailbreaking black box large language models in twenty queries. arXiv preprint arXiv:2310.08419."},{"key":"e_1_3_2_1_6_1","unstructured":"Josef Dai Xuehai Pan Ruiyang Sun Jiaming Ji Xinbo Xu Mickel Liu Yizhou Wang and Yaodong Yang. 2023. Safe RLHF: Safe Reinforcement Learning from Human Feedback. arXiv preprint arXiv:2310.12773."},{"key":"e_1_3_2_1_7_1","volume-title":"Jailbreaker: Automated jailbreak across multiple large language model chatbots. arXiv preprint arXiv:2307.08715.","author":"Deng Gelei","year":"2023","unstructured":"Gelei Deng, Yi Liu, Yuekang Li, Kailong Wang, Ying Zhang, Zefeng Li, Haoyu Wang, Tianwei Zhang, and Yang Liu. 2023. Jailbreaker: Automated jailbreak across multiple large language model chatbots. arXiv preprint arXiv:2307.08715."},{"key":"e_1_3_2_1_8_1","volume-title":"Pandora: Jailbreak GPTs by Retrieval Augmented Generation Poisoning. arxiv:2402.08416. arxiv:2402.08416","author":"Deng Gelei","year":"2024","unstructured":"Gelei Deng, Yi Liu, Kailong Wang, Yuekang Li, Tianwei Zhang, and Yang Liu. 2024. Pandora: Jailbreak GPTs by Retrieval Augmented Generation Poisoning. arxiv:2402.08416. arxiv:2402.08416"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_10_1","volume-title":"Jipeng Zhang, Wei Xiong, and Tong Zhang.","author":"Diao Shizhe","year":"2023","unstructured":"Shizhe Diao, Rui Pan, Hanze Dong, Ka Shun Shum, Jipeng Zhang, Wei Xiong, and Tong Zhang. 2023. Lmflow: An extensible toolkit for finetuning and inference of large foundation models. arXiv preprint arXiv:2306.12420."},{"key":"e_1_3_2_1_11_1","volume-title":"PPT: Backdoor Attacks on Pre-trained Models via Poisoned Prompt Tuning.. In IJCAI. 680\u2013686.","author":"Du Wei","year":"2022","unstructured":"Wei Du, Yichun Zhao, Boqun Li, Gongshen Liu, and Shilin Wang. 2022. PPT: Backdoor Attacks on Pre-trained Models via Poisoned Prompt Tuning.. In IJCAI. 680\u2013686."},{"key":"e_1_3_2_1_12_1","volume-title":"Realtoxicityprompts: Evaluating neural toxic degeneration in language models. arXiv preprint arXiv:2009.11462.","author":"Gehman Samuel","year":"2020","unstructured":"Samuel Gehman, Suchin Gururangan, Maarten Sap, Yejin Choi, and Noah A Smith. 2020. Realtoxicityprompts: Evaluating neural toxic degeneration in language models. arXiv preprint arXiv:2009.11462."},{"key":"e_1_3_2_1_13_1","unstructured":"Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText Corpus. http:\/\/Skylion007.github.io\/OpenWebTextCorpus"},{"key":"e_1_3_2_1_14_1","volume-title":"From ChatGPT to ThreatGPT: Impact of generative AI in cybersecurity and privacy","author":"Gupta Maanak","unstructured":"Maanak Gupta, CharanKumar Akiri, Kshitiz Aryal, Eli Parker, and Lopamudra Praharaj. 2023. From ChatGPT to ThreatGPT: Impact of generative AI in cybersecurity and privacy. IEEE Access."},{"key":"e_1_3_2_1_15_1","volume-title":"Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654.","author":"He Pengcheng","year":"2020","unstructured":"Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. 2020. Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654."},{"key":"e_1_3_2_1_16_1","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andrew Critch Jerry Li Dawn Song and Jacob Steinhardt. 2020. Aligning ai with shared human values. arXiv preprint arXiv:2008.02275."},{"key":"e_1_3_2_1_17_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531."},{"key":"e_1_3_2_1_18_1","unstructured":"Yangsibo Huang Samyak Gupta Mengzhou Xia Kai Li and Danqi Chen. 2023. Catastrophic jailbreak of open-source LLMs via exploiting generation. arXiv preprint arXiv:2310.06987."},{"key":"e_1_3_2_1_19_1","unstructured":"Yihao Huang Chong Wang Xiaojun Jia Qing Guo Felix Juefei-Xu Jian Zhang Geguang Pu and Yang Liu. 2024. Semantic-guided Prompt Organization for Universal Goal Hijacking against LLMs. arXiv preprint arXiv:2405.14189."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Fabian Karl and Ansgar Scherp. 2022. Transformers are Short Text Classifiers: A Study of Inductive Short Text Classifiers on Benchmarks and Real-world Datasets. arXiv preprint arXiv:2211.16878.","DOI":"10.1007\/978-3-031-40837-3_7"},{"key":"e_1_3_2_1_21_1","unstructured":"Jie Li Yi Liu Chongyang Liu Ling Shi Xiaoning Ren Yaowen Zheng Yang Liu and Yinxing Xue. 2024. A Cross-Language Investigation into Jailbreak Attacks in Large Language Models. arxiv:2401.16765. arxiv:2401.16765"},{"key":"e_1_3_2_1_22_1","unstructured":"Ningke Li Yuekang Li Yi Liu Ling Shi Kailong Wang and Haoyu Wang. 2024. HalluVault: A Novel Logic Programming-aided Metamorphic Testing Framework for Detecting Fact-Conflicting Hallucinations in Large Language Models. arXiv preprint arXiv:2405.00648."},{"key":"e_1_3_2_1_23_1","unstructured":"Yuxi Li Yi Liu Yuekang Li Ling Shi Gelei Deng Shengquan Chen and Kailong Wang. 2024. Lockpicking LLMs: A Logit-Based Jailbreak Using Token-level Manipulation. arXiv preprint arXiv:2405.13068."},{"key":"e_1_3_2_1_24_1","volume-title":"Autodan: Generating stealthy jailbreak prompts on aligned large language models. arXiv preprint arXiv:2310.04451.","author":"Liu Xiaogeng","year":"2023","unstructured":"Xiaogeng Liu, Nan Xu, Muhao Chen, and Chaowei Xiao. 2023. Autodan: Generating stealthy jailbreak prompts on aligned large language models. arXiv preprint arXiv:2310.04451."},{"key":"e_1_3_2_1_25_1","unstructured":"Yi Liu Gelei Deng Yuekang Li Kailong Wang Tianwei Zhang Yepang Liu Haoyu Wang Yan Zheng and Yang Liu. 2023. Prompt Injection attack against LLM-integrated Applications. arXiv preprint arXiv:2306.05499."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Yi Liu Gelei Deng Zhengzi Xu Yuekang Li Yaowen Zheng Ying Zhang Lida Zhao Tianwei Zhang and Yang Liu. 2023. Jailbreaking chatgpt via prompt engineering: An empirical study. arXiv preprint arXiv:2305.13860.","DOI":"10.1145\/3663530.3665021"},{"key":"e_1_3_2_1_27_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692.","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692."},{"key":"e_1_3_2_1_28_1","volume-title":"Groot: Adversarial Testing for Generative Text-to-Image Models with Tree-based Semantic Transformation. arxiv:2402.12100. arxiv:2402.12100","author":"Liu Yi","year":"2024","unstructured":"Yi Liu, Guowei Yang, Gelei Deng, Feiyue Chen, Yuqi Chen, Ling Shi, Tianwei Zhang, and Yang Liu. 2024. Groot: Adversarial Testing for Generative Text-to-Image Models with Tree-based Semantic Transformation. arxiv:2402.12100. arxiv:2402.12100"},{"key":"e_1_3_2_1_29_1","unstructured":"Meta. 2023. \"LLama-13B\". https:\/\/github.com\/facebookresearch\/llama\/tree\/llama_v1"},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2023. \"GPT-3.5 Turbo\". https:\/\/platform.openai.com\/docs\/models\/gpt-3-5"},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2023. \"GPT-4\". https:\/\/platform.openai.com\/docs\/models\/gpt-4-and-gpt-4-turbo"},{"key":"e_1_3_2_1_32_1","unstructured":"OpenAI. 2023. Language models pricing. https:\/\/web.archive.org\/web\/20231031033745\/https:\/\/openai.com\/pricing"},{"key":"e_1_3_2_1_33_1","unstructured":"F\u00e1bio Perez and Ian Ribeiro. 2022. Ignore previous prompt: Attack techniques for language models. arXiv preprint arXiv:2211.09527."},{"key":"e_1_3_2_1_34_1","unstructured":"Xinyue Shen Zeyuan Chen Michael Backes Yun Shen and Yang Zhang. 2023. \" do anything now\": Characterizing and evaluating in-the-wild jailbreak prompts on large language models. arXiv preprint arXiv:2308.03825."},{"key":"e_1_3_2_1_35_1","unstructured":"Hao Sun Zhexin Zhang Jiawen Deng Jiale Cheng and Minlie Huang. 2023. Safety Assessment of Chinese Large Language Models. arXiv preprint arXiv:2304.10436."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6428"},{"key":"e_1_3_2_1_37_1","unstructured":"The Vicuna Team. 2023. \"Vicuna-13B\". https:\/\/github.com\/lm-sys\/FastChat"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_39_1","volume-title":"Jailbroken: How does llm safety training fail? arXiv preprint arXiv:2307.02483.","author":"Wei Alexander","year":"2023","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2023. Jailbroken: How does llm safety training fail? arXiv preprint arXiv:2307.02483."},{"key":"e_1_3_2_1_40_1","unstructured":"Zihao Xu Yi Liu Gelei Deng Yuekang Li and Stjepan Picek. 2024. A Comprehensive Study of Jailbreak Attack versus Defense for Large Language Models. arxiv:2402.13457. arxiv:2402.13457"},{"key":"e_1_3_2_1_41_1","volume-title":"Fuzzllm: A novel and universal fuzzing framework for proactively discovering jailbreak vulnerabilities in large language models. arXiv preprint arXiv:2309.05274.","author":"Yao Dongyu","year":"2023","unstructured":"Dongyu Yao, Jianshu Zhang, Ian G Harris, and Marcel Carlsson. 2023. Fuzzllm: A novel and universal fuzzing framework for proactively discovering jailbreak vulnerabilities in large language models. arXiv preprint arXiv:2309.05274."},{"key":"e_1_3_2_1_42_1","volume-title":"Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253.","author":"Yu Jiahao","year":"2023","unstructured":"Jiahao Yu, Xingwei Lin, and Xinyu Xing. 2023. Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253."},{"key":"e_1_3_2_1_43_1","unstructured":"Youliang Yuan Wenxiang Jiao Wenxuan Wang Jen-tse Huang Pinjia He Shuming Shi and Zhaopeng Tu. 2023. Gpt-4 is too smart to be safe: Stealthy chat with llms via cipher. arXiv preprint arXiv:2308.06463."},{"key":"e_1_3_2_1_44_1","unstructured":"Terry Yue Zhuo Yujin Huang Chunyang Chen and Zhenchang Xing. 2023. Exploring ai ethics of chatgpt: A diagnostic analysis. arXiv preprint arXiv:2301.12867."},{"key":"e_1_3_2_1_45_1","unstructured":"Andy Zou Zifan Wang J Zico Kolter and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043."}],"event":{"name":"ISSTA '24: 33rd ACM SIGSOFT International Symposium on Software Testing and Analysis","location":"Vienna Austria","acronym":"ISSTA '24","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","AITO"]},"container-title":["Proceedings of the 33rd ACM SIGSOFT International Symposium on Software Testing and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650212.3680304","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650212.3680304","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:07Z","timestamp":1750287007000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650212.3680304"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,11]]},"references-count":45,"alternative-id":["10.1145\/3650212.3680304","10.1145\/3650212"],"URL":"https:\/\/doi.org\/10.1145\/3650212.3680304","relation":{},"subject":[],"published":{"date-parts":[[2024,9,11]]},"assertion":[{"value":"2024-09-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}