{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T02:07:22Z","timestamp":1777342042615,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302101, 62402114, U2436207"],"award-info":[{"award-number":["62302101, 62402114, U2436207"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key Research and Development Program","award":["2024YFF0618800"],"award-info":[{"award-number":["2024YFF0618800"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,22]]},"DOI":"10.1145\/3696410.3714632","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:47:11Z","timestamp":1745362031000},"page":"872-883","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["You Can't Eat Your Cake and Have It Too: The Performance Degradation of LLMs with Jailbreak Defense"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2797-680X","authenticated-orcid":false,"given":"Wuyuao","family":"Mai","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1811-9432","authenticated-orcid":false,"given":"Geng","family":"Hong","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8088-316X","authenticated-orcid":false,"given":"Pei","family":"Chen","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1394-0395","authenticated-orcid":false,"given":"Xudong","family":"Pan","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9032-8063","authenticated-orcid":false,"given":"Baojun","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0726-9996","authenticated-orcid":false,"given":"Yuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0083-733X","authenticated-orcid":false,"given":"Haixin","family":"Duan","sequence":"additional","affiliation":[{"name":"Quancheng Laboratory, Jinan, China and Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9714-5545","authenticated-orcid":false,"given":"Min","family":"Yang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Meta AI. 2023a. Llama 2 7B. https:\/\/huggingface.co\/meta-llama\/Llama-2--7b-hf. Hugging Face Model Hub.."},{"key":"e_1_3_2_1_2_1","unstructured":"Mistral AI. 2023b. Mistral 7B Instruct v0.2. https:\/\/huggingface.co\/mistralai\/Mistral-7B-Instruct-v0.2. Hugging Face Model Hub.."},{"key":"e_1_3_2_1_3_1","unstructured":"Mistral AI. 2023c. Mistral 7B Instruct v0.3. https:\/\/huggingface.co\/mistralai\/Mistral-7B-Instruct-v0.3. Hugging Face Model Hub.."},{"key":"e_1_3_2_1_4_1","unstructured":"Meta AI. 2024a. Meta Llama 3 8B Instruct. https:\/\/huggingface.co\/meta-llama\/Meta-Llama-3--8B-Instruct. Hugging Face Model Hub.."},{"key":"e_1_3_2_1_5_1","unstructured":"Mistral AI. 2024b. Mistral AI. https:\/\/mistral.ai\/. A company focusing on open language models.."},{"key":"e_1_3_2_1_6_1","unstructured":"Alex Albert. 2024. Jailbreak. http:\/\/www.jailbreakchat.com. Accessed: 2024--10--14."},{"key":"e_1_3_2_1_7_1","unstructured":"Gabriel Alon and Michael Kamfonas. 2023. Detecting Language Model Attacks with Perplexity. arxiv: 2308.14132 [cs.CL] https:\/\/arxiv.org\/abs\/2308.14132"},{"key":"e_1_3_2_1_8_1","volume-title":"Automatic Pseudo-Harmful Prompt Generation for Evaluating False Refusals in Large Language Models. In First Conference on Language Modeling. https:\/\/openreview.net\/forum?id=ljFgX6A8NL","author":"An Bang","year":"2024","unstructured":"Bang An, Sicheng Zhu, Ruiyi Zhang, Michael-Andrei Panaitescu-Liess, Yuancheng Xu, and Furong Huang. 2024. Automatic Pseudo-Harmful Prompt Generation for Evaluating False Refusals in Large Language Models. In First Conference on Language Modeling. https:\/\/openreview.net\/forum?id=ljFgX6A8NL"},{"key":"e_1_3_2_1_9_1","unstructured":"LLM Attacks. 2024. LLM Attacks - AdvBench Data. https:\/\/github.com\/llm-attacks\/llm-attacks\/tree\/main\/data\/advbench. Accessed: 2024--10--14."},{"key":"e_1_3_2_1_10_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=gT5hALch9z","author":"Bianchi Federico","year":"2024","unstructured":"Federico Bianchi, Mirac Suzgun, Giuseppe Attanasio, Paul Rottger, Dan Jurafsky, Tatsunori Hashimoto, and James Zou. 2024. Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language Models that Follow Instructions. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=gT5hALch9z"},{"key":"e_1_3_2_1_11_1","volume-title":"Language Models are Few-Shot Learners. (2020). arxiv","author":"Brown Tom B.","year":"2005","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020a. Language Models are Few-Shot Learners. (2020). arxiv: 2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_12_1","volume-title":"Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei.","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeff Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Ma teusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020b. Language Models are Few-Shot Learners. ArXiv, Vol. abs\/2005.14165 (2020). https:\/\/api.semanticscholar.org\/CorpusID:218971783"},{"key":"e_1_3_2_1_13_1","unstructured":"OpenAI Community. 2024. Why ChatGPT 4.0 is getting stupider and stupider. https:\/\/community.openai.com\/t\/why-chatgpt-4-0-is-getting-stupider-and-stupider\/590741. Accessed: 2024--10--14."},{"key":"e_1_3_2_1_14_1","volume-title":"Multilingual Jailbreak Challenges in Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=vESNKdEMGp","author":"Deng Yue","year":"2024","unstructured":"Yue Deng, Wenxuan Zhang, Sinno Jialin Pan, and Lidong Bing. 2024. Multilingual Jailbreak Challenges in Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=vESNKdEMGp"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.88"},{"key":"e_1_3_2_1_16_1","unstructured":"Deutsche Welle (DW). 2023. Is ChatGPT Getting Dumber? https:\/\/www.dw.com\/en\/is-chatgpt-getting-dumber\/a-66352529 Accessed: 2024--10--14."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Ubiquitous Security. https:\/\/api.semanticscholar.org\/CorpusID:266359311","author":"Esmradi Aysan","year":"2023","unstructured":"Aysan Esmradi, Daniel Wankit Yip, and Chun Fai Chan. 2023. A Comprehensive Survey of Attack Techniques, Implementation, and Mitigation Strategies in Large Language Models. In International Conference on Ubiquitous Security. https:\/\/api.semanticscholar.org\/CorpusID:266359311"},{"key":"e_1_3_2_1_18_1","volume-title":"JailbreakLens: Visual Analysis of Jailbreak Attacks Against Large Language Models. ArXiv","author":"Feng Yingchaojie","year":"2024","unstructured":"Yingchaojie Feng, Zhizhang Chen, Zhining Kang, Sijia Wang, Minfeng Zhu, Wei Zhang, and Wei Chen. 2024. JailbreakLens: Visual Analysis of Jailbreak Attacks Against Large Language Models. ArXiv, Vol. abs\/2404.08793 (2024). https:\/\/api.semanticscholar.org\/CorpusID:269149510"},{"key":"e_1_3_2_1_19_1","unstructured":"Victor Gallego. 2024. Configurable Safety Tuning of Language Models with Synthetic Preference Data. arxiv: 2404.00495 [cs.CL]"},{"key":"e_1_3_2_1_20_1","volume-title":"Attacking large language models with projected gradient descent. arXiv preprint arXiv:2402.09154","author":"Geisler Simon","year":"2024","unstructured":"Simon Geisler, Tom Wollschl\u00e4ger, MHI Abdalla, Johannes Gasteiger, and Stephan G\u00fcnnemann. 2024. Attacking large language models with projected gradient descent. arXiv preprint arXiv:2402.09154 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"17002","author":"Guo Xingang","year":"2024","unstructured":"Xingang Guo, Fangxu Yu, Huan Zhang, Lianhui Qin, and Bin Hu. 2024. COLD-Attack: Jailbreaking LLMs with Stealthiness and Controllability. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235), Ruslan Salakhutdinov, Zico Kolter, Katherine Heller, Adrian Weller, Nuria Oliver, Jonathan Scarlett, and Felix Berkenkamp (Eds.). PMLR, 16974--17002. https:\/\/proceedings.mlr.press\/v235\/guo24i.html"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. Proceedings of the International Conference on Learning Representations (ICLR) (2021)."},{"key":"e_1_3_2_1_23_1","unstructured":"Caishuang Huang Wanxu Zhao Rui Zheng Huijie Lv Shihan Dou Sixian Li Xiao Wang Enyu Zhou Junjie Ye Yuming Yang et al. 2024. SafeAligner: Safety Alignment against Jailbreak Attacks via Response Disparity Guidance. arXiv preprint arXiv:2406.18118 (2024)."},{"key":"e_1_3_2_1_24_1","unstructured":"Albert Qiaochu Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra Singh Chaplot Diego de Las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L'elio Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2023. Mistral 7B. ArXiv Vol. abs\/2310.06825 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263830494"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.809"},{"key":"e_1_3_2_1_26_1","volume-title":"JailbreakHunter: A Visual Analytics Approach for Jailbreak Prompts Discovery from Large-Scale Human-LLM Conversational Datasets. ArXiv","author":"Jin Zhihua","year":"2024","unstructured":"Zhihua Jin, Shiyi Liu, Haotian Li, Xun Zhao, and Huamin Qu. 2024. JailbreakHunter: A Visual Analytics Approach for Jailbreak Prompts Discovery from Large-Scale Human-LLM Conversational Datasets. ArXiv, Vol. abs\/2407.03045 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270924032"},{"key":"e_1_3_2_1_27_1","volume-title":"Break the Breakout: Reinventing LM Defense Against Jailbreak Attacks with Self-Refinement. arXiv preprint arXiv:2402.15180","author":"Kim Heegyu","year":"2024","unstructured":"Heegyu Kim, Sehyun Yuk, and Hyunsouk Cho. 2024. Break the Breakout: Reinventing LM Defense Against Jailbreak Attacks with Self-Refinement. arXiv preprint arXiv:2402.15180 (2024)."},{"key":"e_1_3_2_1_28_1","unstructured":"UMD Huang Lab. 2024. False Refusal. https:\/\/github.com\/umd-huang-lab\/FalseRefusal. Accessed: 2024--10--14."},{"key":"e_1_3_2_1_29_1","volume-title":"Hashimoto","author":"Li Xuechen","year":"2023","unstructured":"Xuechen Li, Tianyi Zhang, Yann Dubois, Rohan Taori, Ishaan Gulrajani, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. AlpacaEval: An Automatic Evaluator of Instruction-following Models. https:\/\/github.com\/tatsu-lab\/alpaca_eval."},{"key":"e_1_3_2_1_30_1","volume-title":"AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=7Jwpw4qKkb","author":"Liu Xiaogeng","year":"2024","unstructured":"Xiaogeng Liu, Nan Xu, Muhao Chen, and Chaowei Xiao. 2024. AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=7Jwpw4qKkb"},{"key":"e_1_3_2_1_31_1","volume-title":"Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study. ArXiv","author":"Liu Yi","year":"2023","unstructured":"Yi Liu, Gelei Deng, Zhengzi Xu, Yuekang Li, Yaowen Zheng, Ying Zhang, Lida Zhao, Tianwei Zhang, and Yang Liu. 2023. Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study. ArXiv, Vol. abs\/2305.13860 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258841501"},{"key":"e_1_3_2_1_32_1","unstructured":"LMSys. 2023. Vicuna 7B v1.5. https:\/\/huggingface.co\/lmsys\/vicuna-7b-v1.5. Hugging Face Model Hub.."},{"key":"e_1_3_2_1_33_1","unstructured":"Wuyuao Mai Geng Hong Pei Chen Xudong Pan Baojun Liu Yuan Zhang Haixin Duan and Min Yang. 2025. USEBench. https:\/\/github.com\/Marphownio\/USEBench Accessed: 2025-01--24."},{"key":"e_1_3_2_1_34_1","unstructured":"Inc. Meta Platforms. 2024. Meta Platforms Inc. https:\/\/about.meta.com\/. Formerly known as Facebook Inc.."},{"key":"e_1_3_2_1_35_1","volume-title":"ICLR 2024 Workshop on Secure and Trustworthy Large Language Models. https:\/\/openreview.net\/forum?id=q0PbfNwLBq","author":"Mo Yichuan","year":"2024","unstructured":"Yichuan Mo, Yuji Wang, Zeming Wei, and Yisen Wang. 2024. Fight Back Against Jailbreaking via Prompt Adversarial Tuning. In ICLR 2024 Workshop on Secure and Trustworthy Large Language Models. https:\/\/openreview.net\/forum?id=q0PbfNwLBq"},{"key":"e_1_3_2_1_36_1","unstructured":"Ollmer. 2024. MMLU. https:\/\/github.com\/ollmer\/mmlu. Accessed: 2024--10--14."},{"key":"e_1_3_2_1_37_1","unstructured":"OpenAI. 2022. ChatGPT: Improving Language Understanding with Human Feedback. https:\/\/openai.com\/research\/chatgpt. Accessed: 2024--10--15.."},{"key":"e_1_3_2_1_39_1","unstructured":"OpenAI. 2024. OpenAI. https:\/\/www.openai.com. Accessed: 2024--10--15.."},{"key":"e_1_3_2_1_40_1","volume-title":"PAT: Practice Algorithm Testing. https:\/\/github.com\/rain152\/PAT. GitHub repository.","year":"2024","unstructured":"Rain152. 2024. PAT: Practice Algorithm Testing. https:\/\/github.com\/rain152\/PAT. GitHub repository."},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024","author":"Rao Abhinav Sukumar","year":"2024","unstructured":"Abhinav Sukumar Rao, Atharva Roshan Naik, Sachin Vashistha, Somak Aditya, and Monojit Choudhury. 2024. Tricking LLMs into Disobedience: Formalizing, Analyzing, and Detecting Jailbreaks. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), Nicoletta Calzolari, Min-Yen Kan, Veronique Hoste, Alessandro Lenci, Sakriani Sakti, and Nianwen Xue (Eds.). ELRA and ICCL, Torino, Italia, 16802--16830. https:\/\/aclanthology.org\/2024.lrec-main.1462"},{"key":"e_1_3_2_1_42_1","volume-title":"SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks. arXiv preprint arXiv:2310.03684","author":"Robey Alexander","year":"2023","unstructured":"Alexander Robey, Eric Wong, Hamed Hassani, and George J Pappas. 2023. SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks. arXiv preprint arXiv:2310.03684 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_44_1","unstructured":"thu coai. 2024. JailbreakDefense: Defending Large Language Models Against Jailbreaking Attacks Through Goal Prioritization. https:\/\/github.com\/thu-coai\/JailbreakDefense_GoalPriority. GitHub repository."},{"key":"e_1_3_2_1_45_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. ArXiv","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. ArXiv, Vol. abs\/2302.13971 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257219404"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","unstructured":"Alex Wang Amanpreet Singh Julian Michael Felix Hill Omer Levy and Samuel Bowman. 2018. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP Tal Linzen Grzegorz Chrupa\u0142a and Afra Alishahi (Eds.). Association for Computational Linguistics Brussels Belgium 353--355. https:\/\/doi.org\/10.18653\/v1\/W18--5446","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_47_1","volume-title":"F. Xia, Quoc Le, and Denny Zhou.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Huai hsin Chi, F. Xia, Quoc Le, and Denny Zhou. 2022. Chain of Thought Prompting Elicits Reasoning in Large Language Models. ArXiv, Vol. abs\/2201.11903 (2022). https:\/\/api.semanticscholar.org\/CorpusID:246411621"},{"key":"e_1_3_2_1_48_1","volume-title":"Jailbreak and guard aligned language models with only few in-context demonstrations. arXiv preprint arXiv:2310.06387","author":"Wei Zeming","year":"2023","unstructured":"Zeming Wei, Yifei Wang, and Yisen Wang. 2023. Jailbreak and guard aligned language models with only few in-context demonstrations. arXiv preprint arXiv:2310.06387 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00765-8"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_52_1","volume-title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey. arXiv preprint arXiv:2407.04295","author":"Yi Sibo","year":"2024","unstructured":"Sibo Yi, Yule Liu, Zhen Sun, Tianshuo Cong, Xinlei He, Jiaxing Song, Ke Xu, and Qi Li. 2024. Jailbreak Attacks and Defenses Against Large Language Models: A Survey. arXiv preprint arXiv:2407.04295 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Refuse whenever you feel unsafe: Improving safety in llms via decoupled refusal training. arXiv preprint arXiv:2407.09121","author":"Yuan Youliang","year":"2024","unstructured":"Youliang Yuan, Wenxiang Jiao, Wenxuan Wang, Jen-tse Huang, Jiahao Xu, Tian Liang, Pinjia He, and Zhaopeng Tu. 2024. Refuse whenever you feel unsafe: Improving safety in llms via decoupled refusal training. arXiv preprint arXiv:2407.09121 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Autodefense: Multi-agent llm defense against jailbreak attacks. arXiv preprint arXiv:2403.04783","author":"Zeng Yifan","year":"2024","unstructured":"Yifan Zeng, Yiran Wu, Xiao Zhang, Huazheng Wang, and Qingyun Wu. 2024. Autodefense: Multi-agent llm defense against jailbreak attacks. arXiv preprint arXiv:2403.04783 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Safe Unlearning: A Surprisingly Effective and Generalizable Solution to Defend Against Jailbreak Attacks. arXiv preprint arXiv:2407.02855","author":"Zhang Zhexin","year":"2024","unstructured":"Zhexin Zhang, Junxiao Yang, Pei Ke, Shiyao Cui, Chujie Zheng, Hongning Wang, and Minlie Huang. 2024. Safe Unlearning: A Surprisingly Effective and Generalizable Solution to Defend Against Jailbreak Attacks. arXiv preprint arXiv:2407.02855 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Levine (Eds.)","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, Hao Zhang, Joseph E Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 46595--46623. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/91f18a1287b398d378ef22505bf41832-Paper-Datasets_and_Benchmarks.pdf"},{"key":"e_1_3_2_1_57_1","unstructured":"Sicheng Zhu Ruiyi Zhang Bang An Gang Wu Joe Barrow Zichao Wang Furong Huang Ani Nenkova and Tong Sun. 2023. AutoDAN: Interpretable Gradient-Based Adversarial Attacks on Large Language Models."},{"key":"e_1_3_2_1_58_1","volume-title":"Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J Zico Kolter, and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714632","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714632","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:56Z","timestamp":1750295936000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714632"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":56,"alternative-id":["10.1145\/3696410.3714632","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714632","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}