{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T02:13:51Z","timestamp":1777342431145,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["62406013"],"award-info":[{"award-number":["62406013"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717659","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:20:01Z","timestamp":1748017201000},"page":"2078-2087","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["HSF: Defending against Jailbreak Attacks with Hidden State Filtering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-9494-5640","authenticated-orcid":false,"given":"Cheng","family":"Qian","sequence":"first","affiliation":[{"name":"School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9708-0220","authenticated-orcid":false,"given":"Hainan","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5914-7590","authenticated-orcid":false,"given":"Lei","family":"Sha","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2727-4445","authenticated-orcid":false,"given":"Zhiming","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of AI, Beijing Advanced Innovation Center, Beihang University, BeiJing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Floren- cia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Detecting language model attacks with perplexity. arXiv preprint arXiv:2308.14132","author":"Alon Gabriel","year":"2023","unstructured":"Gabriel Alon and Michael Kamfonas. 2023. Detecting language model attacks with perplexity. arXiv preprint arXiv:2308.14132 (2023)."},{"key":"e_1_3_2_2_3_1","volume-title":"et al","author":"Bai Yuntao","year":"2022","unstructured":"Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, et al . 2022. Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv preprint arXiv:2204.05862 (2022)."},{"key":"e_1_3_2_2_4_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Bianchi Federico","year":"2024","unstructured":"Federico Bianchi, Mirac Suzgun, Giuseppe Attanasio, Paul Rottger, Dan Jurafsky, Tatsunori Hashimoto, and James Zou. 2024. Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language Models that Follow Instructions. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_5_1","volume-title":"ICLR 2024 Workshop on Secure and Trustworthy Large Language Models.","author":"Candogan Leyla Naz","unstructured":"Leyla Naz Candogan, Yongtao Wu, Elias Abad Rocamora, Grigorios Chrysos, and Volkan Cevher. [n. d.]. Single-pass detection of jailbreaking input in large lan- guage models. In ICLR 2024 Workshop on Secure and Trustworthy Large Language Models."},{"key":"e_1_3_2_2_6_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023) 2, 3 (2023), 6."},{"key":"e_1_3_2_2_7_1","unstructured":"Cognitive Computations. 2024. WizardLM-30B-Uncensored. https:\/\/huggingface. co\/cognitivecomputations\/WizardLM-30B-Uncensored. Accessed: 2024-08--15."},{"key":"e_1_3_2_2_8_1","volume-title":"Free dolly: Introducing the world's first truly open instruction-tuned llm","author":"Conover Mike","year":"2023","unstructured":"Mike Conover, Matt Hayes, Ankit Mathur, Jianwei Xie, Jun Wan, Sam Shah, Ali Ghodsi, Patrick Wendell, Matei Zaharia, and Reynold Xin. 2023. Free dolly: Introducing the world's first truly open instruction-tuned llm. Company Blog of Databricks (2023)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.118"},{"key":"e_1_3_2_2_10_1","volume-title":"Should chatgpt be biased? challenges and risks of bias in large language models. arXiv preprint arXiv:2304.03738","author":"Ferrara Emilio","year":"2023","unstructured":"Emilio Ferrara. 2023. Should chatgpt be biased? challenges and risks of bias in large language models. arXiv preprint arXiv:2304.03738 (2023)."},{"key":"e_1_3_2_2_11_1","volume-title":"et al","author":"Guo Yiju","year":"2024","unstructured":"Yiju Guo, Ganqu Cui, Lifan Yuan, Ning Ding, Jiexin Wang, Huimin Chen, Bowen Sun, Ruobing Xie, Jie Zhou, Yankai Lin, et al . 2024. Controllable Preference Optimization: Toward Controllable Multi-Objective Alignment. arXiv preprint arXiv:2402.19085 (2024)."},{"key":"e_1_3_2_2_12_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Hong Zhang-Wei","year":"2024","unstructured":"Zhang-Wei Hong, Idan Shenfeld, Tsun-Hsuan Wang, Yung-Sung Chuang, Aldo Pareja, James R Glass, Akash Srivastava, and Pulkit Agrawal. 2024. Curiosity- driven Red-teaming for Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_13_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Huang Yangsibo","year":"2024","unstructured":"Yangsibo Huang, Samyak Gupta, Mengzhou Xia, Kai Li, and Danqi Chen. 2024. Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_14_1","volume-title":"Ping-yeh Chiang, Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein.","author":"Jain Neel","year":"2023","unstructured":"Neel Jain, Avi Schwarzschild, Yuxin Wen, Gowthami Somepalli, John Kirchen- bauer, Ping-yeh Chiang, Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein. 2023. Baseline defenses for adversarial attacks against aligned language models. arXiv preprint arXiv:2309.00614 (2023)."},{"key":"e_1_3_2_2_15_1","volume-title":"Aligner: Achieving efficient alignment through weak-to-strong correction. arXiv preprint arXiv:2402.02416","author":"Ji Jiaming","year":"2024","unstructured":"Jiaming Ji, Boyuan Chen, Hantao Lou, Donghai Hong, Borong Zhang, Xuehai Pan, Juntao Dai, and Yaodong Yang. 2024. Aligner: Achieving efficient alignment through weak-to-strong correction. arXiv preprint arXiv:2402.02416 (2024)."},{"key":"e_1_3_2_2_16_1","volume-title":"Beavertails: Towards improved safety alignment of llm via a human-preference dataset. Advances in Neural Information Processing Systems 36","author":"Ji Jiaming","year":"2024","unstructured":"Jiaming Ji, Mickel Liu, Josef Dai, Xuehai Pan, Chi Zhang, Ce Bian, Boyuan Chen, Ruiyang Sun, Yizhou Wang, and Yaodong Yang. 2024. Beavertails: Towards improved safety alignment of llm via a human-preference dataset. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_2_18_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al .","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, De- vendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al . 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_2_19_1","volume-title":"Kenji Kawaguchi, Gauthier Gidel, Yoshua Bengio, Nikolay Malkin, et al.","author":"Lee Seanie","year":"2024","unstructured":"Seanie Lee, Minsu Kim, Lynn Cherif, David Dobre, Juho Lee, Sung Ju Hwang, Kenji Kawaguchi, Gauthier Gidel, Yoshua Bengio, Nikolay Malkin, et al. 2024. Learning diverse attacks on large language models for robust red-teaming and safety tuning. arXiv preprint arXiv:2405.18540 (2024)."},{"key":"e_1_3_2_2_20_1","volume-title":"Deepinception: Hypnotize large language model to be jailbreaker. arXiv preprint arXiv:2311.03191","author":"Li Xuan","year":"2023","unstructured":"Xuan Li, Zhanke Zhou, Jianing Zhu, Jiangchao Yao, Tongliang Liu, and Bo Han. 2023. Deepinception: Hypnotize large language model to be jailbreaker. arXiv preprint arXiv:2311.03191 (2023)."},{"key":"e_1_3_2_2_21_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Liu Xiaogeng","year":"2024","unstructured":"Xiaogeng Liu, Nan Xu, Muhao Chen, and Chaowei Xiao. 2024. AutoDAN: Gen- erating Stealthy Jailbreak Prompts on Aligned Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_22_1","volume-title":"Jailbreaking chatgpt via prompt engineering: An empirical study. arXiv preprint arXiv:2305.13860","author":"Liu Yi","year":"2023","unstructured":"Yi Liu, Gelei Deng, Zhengzi Xu, Yuekang Li, Yaowen Zheng, Ying Zhang, Lida Zhao, Tianwei Zhang, Kailong Wang, and Yang Liu. 2023. Jailbreaking chatgpt via prompt engineering: An empirical study. arXiv preprint arXiv:2305.13860 (2023)."},{"key":"e_1_3_2_2_23_1","unstructured":"AI @ Meta Llama Team. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_2_24_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems 35 (2022) 27730--27744."},{"key":"e_1_3_2_2_25_1","volume-title":"ShengYun Peng, Sebastian Szyller, Cory Cornelius, and Duen Horng Chau.","author":"Phute Mansi","year":"2023","unstructured":"Mansi Phute, Alec Helbling, Matthew Daniel Hull, ShengYun Peng, Sebastian Szyller, Cory Cornelius, and Duen Horng Chau. 2023. Llm self defense: By self examination, llms know they are being tricked. In The Second Tiny Papers Track at ICLR 2024."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.170"},{"key":"e_1_3_2_2_27_1","volume-title":"Andrei Lupu, Eric Hambro, Aram H Markosyan, Manish Bhatt, Yuning Mao, Minqi Jiang, Jack Parker-Holder, Jakob Foerster, et al.","author":"Samvelyan Mikayel","year":"2024","unstructured":"Mikayel Samvelyan, Sharath Chandra Raparthy, Andrei Lupu, Eric Hambro, Aram H Markosyan, Manish Bhatt, Yuning Mao, Minqi Jiang, Jack Parker-Holder, Jakob Foerster, et al. 2024. Rainbow teaming: Open-ended generation of diverse adversarial prompts. arXiv preprint arXiv:2402.16822 (2024)."},{"key":"e_1_3_2_2_28_1","volume-title":"Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhos- ale, et al.","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yas- mine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhos- ale, et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_2_29_1","volume-title":"Mitigating fine-tuning jailbreak attack with backdoor enhanced alignment. arXiv preprint arXiv:2402.14968","author":"Wang Jiongxiao","year":"2024","unstructured":"Jiongxiao Wang, Jiazhao Li, Yiquan Li, Xiangyu Qi, Muhao Chen, Junjie Hu, Yixuan Li, Bo Li, and Chaowei Xiao. 2024. Mitigating fine-tuning jailbreak attack with backdoor enhanced alignment. arXiv preprint arXiv:2402.14968 (2024)."},{"key":"e_1_3_2_2_30_1","volume-title":"Jailbroken: How does llm safety training fail? Advances in Neural Information Processing Systems 36","author":"Wei Alexander","year":"2024","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2024. Jailbroken: How does llm safety training fail? Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_2_31_1","volume-title":"Jailbreak and guard aligned language models with only few in-context demonstrations. arXiv preprint arXiv:2310.06387","author":"Wei Zeming","year":"2023","unstructured":"Zeming Wei, Yifei Wang, and Yisen Wang. 2023. Jailbreak and guard aligned language models with only few in-context demonstrations. arXiv preprint arXiv:2310.06387 (2023)."},{"key":"e_1_3_2_2_32_1","unstructured":"Laura Weidinger John Mellor Maribeth Rauh Conor Griffin Jonathan Uesato Po-Sen Huang Myra Cheng Mia Glaese Borja Balle Atoosa Kasirzadeh et al. 2021. Ethical and social risks of harm from language models. arXiv preprint arXiv:2112.04359 (2021)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00765-8"},{"key":"e_1_3_2_2_34_1","volume-title":"Bill Yuchen Lin, and Radha Poovendran","author":"Xu Zhangchen","year":"2024","unstructured":"Zhangchen Xu, Fengqing Jiang, Luyao Niu, Jinyuan Jia, Bill Yuchen Lin, and Radha Poovendran. 2024. Safedecoding: Defending against jailbreak attacks via safety-aware decoding. arXiv preprint arXiv:2402.08983 (2024)."},{"key":"e_1_3_2_2_35_1","volume-title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey. arXiv preprint arXiv:2407.04295","author":"Yi Sibo","year":"2024","unstructured":"Sibo Yi, Yule Liu, Zhen Sun, Tianshuo Cong, Xinlei He, Jiaxing Song, Ke Xu, and Qi Li. 2024. Jailbreak Attacks and Defenses Against Large Language Models: A Survey. arXiv preprint arXiv:2407.04295 (2024)."},{"key":"e_1_3_2_2_36_1","volume-title":"Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253","author":"Yu Jiahao","year":"2023","unstructured":"Jiahao Yu, Xingwei Lin, and Xinyu Xing. 2023. Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253 (2023)."},{"key":"e_1_3_2_2_37_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Zheng Chujie","year":"2024","unstructured":"Chujie Zheng, Fan Yin, Hao Zhou, Fandong Meng, Jie Zhou, Kai-Wei Chang, Minlie Huang, and Nanyun Peng. 2024. On prompt-driven safeguarding for large language models. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_2_38_1","volume-title":"Easy- Jailbreak: A Unified Framework for Jailbreaking Large Language Models. arXiv preprint arXiv:2403.12171","author":"Zhou Weikang","year":"2024","unstructured":"Weikang Zhou, Xiao Wang, Limao Xiong, Han Xia, Yingshuang Gu, Mingxu Chai, Fukang Zhu, Caishuang Huang, Shihan Dou, Zhiheng Xi, et al. 2024. Easy- Jailbreak: A Unified Framework for Jailbreaking Large Language Models. arXiv preprint arXiv:2403.12171 (2024)."},{"key":"e_1_3_2_2_39_1","volume-title":"Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J Zico Kolter, and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:10:04Z","timestamp":1759893004000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":39,"alternative-id":["10.1145\/3701716.3717659","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717659","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}