{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,7]],"date-time":"2026-07-07T15:38:19Z","timestamp":1783438699959,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,22]]},"DOI":"10.1145\/3696410.3714654","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:57:28Z","timestamp":1745362648000},"page":"863-871","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Dual Intention Escape: Penetrating and Toxic Jailbreak Attack against Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8015-0430","authenticated-orcid":false,"given":"Yanni","family":"Xue","sequence":"first","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5884-3412","authenticated-orcid":false,"given":"Jiakai","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhongguancun Laboratory, Beijing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2129-9182","authenticated-orcid":false,"given":"Zixin","family":"Yin","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1936-9396","authenticated-orcid":false,"given":"Yuqing","family":"Ma","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7391-7539","authenticated-orcid":false,"given":"Haotong","family":"Qin","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5695-2009","authenticated-orcid":false,"given":"Renshuai","family":"Tao","sequence":"additional","affiliation":[{"name":"Institute of Information Science, Beijing Jiaotong University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7618-3275","authenticated-orcid":false,"given":"Xianglong","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China, Zhongguancun Laboratory, Beijing, China, and Institute of Data Space, Hefei Comprehensive National Science Center, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Amanda Askell Yuntao Bai Anna Chen Dawn Drain Deep Ganguli Tom Henighan Andy Jones Nicholas Joseph Ben Mann Nova DasSarma et al. 2021. A general language assistant as a laboratory for alignment. arXiv preprint arXiv:2112.00861 (2021)."},{"key":"e_1_3_2_1_3_1","unstructured":"Yupeng Chang Xu Wang Jindong Wang Yuan Wu Kaijie Zhu Hao Chen Linyi Yang Xiaoyuan Yi Cunxiang Wang Yidong Wang et al. 2023. A survey on evaluation of large language models. arXiv preprint arXiv:2307.03109 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Jailbreaking black box large language models in twenty queries. arXiv preprint arXiv:2310.08419","author":"Chao Patrick","year":"2023","unstructured":"Patrick Chao, Alexander Robey, Edgar Dobriban, Hamed Hassani, George J Pappas, and Eric Wong. 2023. Jailbreaking black box large language models in twenty queries. arXiv preprint arXiv:2310.08419 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"When LLM Meets DRL: Advancing Jailbreaking Efficiency via DRL-guided Search. arXiv preprint arXiv:2406.08705","author":"Chen Xuan","year":"2024","unstructured":"Xuan Chen, Yuzhou Nie, Wenbo Guo, and Xiangyu Zhang. 2024a. When LLM Meets DRL: Advancing Jailbreaking Efficiency via DRL-guided Search. arXiv preprint arXiv:2406.08705 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"RL-JACK: Reinforcement Learning-powered Black-box Jailbreaking Attack against LLMs. arXiv preprint arXiv:2406.08725","author":"Chen Xuan","year":"2024","unstructured":"Xuan Chen, Yuzhou Nie, Lu Yan, Yunshu Mao, Wenbo Guo, and Xiangyu Zhang. 2024b. RL-JACK: Reinforcement Learning-powered Black-box Jailbreaking Attack against LLMs. arXiv preprint arXiv:2406.08725 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. URL: https:\/\/lmsys.org\/blog\/2023-03--30-vicuna\/."},{"key":"e_1_3_2_1_8_1","volume-title":"Comprehensive assessment of jailbreak attacks against llms. arXiv preprint arXiv:2402.05668","author":"Chu Junjie","year":"2024","unstructured":"Junjie Chu, Yugeng Liu, Ziqing Yang, Xinyue Shen, Michael Backes, and Yang Zhang. 2024. Comprehensive assessment of jailbreak attacks against llms. arXiv preprint arXiv:2402.05668 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Deep Ganguli Liane Lovitt Jackson Kernion Amanda Askell Yuntao Bai Saurav Kadavath Ben Mann Ethan Perez Nicholas Schiefer Kamal Ndousse et al. 2022. Red teaming language models to reduce harms: Methods scaling behaviors and lessons learned. arXiv preprint arXiv:2209.07858 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"e_1_3_2_1_12_1","volume-title":"Generative language models and automated influence operations: Emerging threats and potential mitigations. arXiv preprint arXiv:2301.04246","author":"Goldstein Josh A","year":"2023","unstructured":"Josh A Goldstein, Girish Sastry, Micah Musser, Renee DiResta, Matthew Gentzel, and Katerina Sedova. 2023. Generative language models and automated influence operations: Emerging threats and potential mitigations. arXiv preprint arXiv:2301.04246 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Cold-attack: Jailbreaking llms with stealthiness and controllability. arXiv preprint arXiv:2402.08679","author":"Guo Xingang","year":"2024","unstructured":"Xingang Guo, Fangxu Yu, Huan Zhang, Lianhui Qin, and Bin Hu. 2024. Cold-attack: Jailbreaking llms with stealthiness and controllability. arXiv preprint arXiv:2402.08679 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Large language models can be used to effectively scale spear phishing campaigns. arXiv preprint arXiv:2305.06972","author":"Hazell Julian","year":"2023","unstructured":"Julian Hazell. 2023. Large language models can be used to effectively scale spear phishing campaigns. arXiv preprint arXiv:2305.06972 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Efficient LLM Jailbreak via Adaptive Dense-to-sparse Constrained Optimization. arXiv preprint arXiv:2405.09113","author":"Hu Kai","year":"2024","unstructured":"Kai Hu, Weichen Yu, Tianjun Yao, Xiang Li, Wenhe Liu, Lijun Yu, Yining Li, Kai Chen, Zhiqiang Shen, and Matt Fredrikson. 2024. Efficient LLM Jailbreak via Adaptive Dense-to-sparse Constrained Optimization. arXiv preprint arXiv:2405.09113 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Huang Yangsibo","unstructured":"Yangsibo Huang, Samyak Gupta, Mengzhou Xia, Kai Li, and Danqi Chen. [n.,d.]. Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_17_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Artprompt: Ascii art-based jailbreak attacks against aligned llms. arXiv preprint arXiv:2402.11753","author":"Jiang Fengqing","year":"2024","unstructured":"Fengqing Jiang, Zhangchen Xu, Luyao Niu, Zhen Xiang, Bhaskar Ramasubramanian, Bo Li, and Radha Poovendran. 2024b. Artprompt: Ascii art-based jailbreak attacks against aligned llms. arXiv preprint arXiv:2402.11753 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Unlocking Adversarial Suffix Optimization Without Affirmative Phrases: Efficient Black-box Jailbreaking via LLM as Optimizer. arXiv preprint arXiv:2408.11313","author":"Jiang Weipeng","year":"2024","unstructured":"Weipeng Jiang, Zhenting Wang, Juan Zhai, Shiqing Ma, Zhengyu Zhao, and Chao Shen. 2024a. Unlocking Adversarial Suffix Optimization Without Affirmative Phrases: Efficient Black-box Jailbreaking via LLM as Optimizer. arXiv preprint arXiv:2408.11313 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/SPW63631.2024.00018"},{"key":"e_1_3_2_1_21_1","volume-title":"Drattack: Prompt decomposition and reconstruction makes powerful llm jailbreakers. arXiv preprint arXiv:2402.16914","author":"Li Xirui","year":"2024","unstructured":"Xirui Li, Ruochen Wang, Minhao Cheng, Tianyi Zhou, and Cho-Jui Hsieh. 2024. Drattack: Prompt decomposition and reconstruction makes powerful llm jailbreakers. arXiv preprint arXiv:2402.16914 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Liu Tong","year":"2024","unstructured":"Tong Liu, Yingjie Zhang, Zhe Zhao, Yinpeng Dong, Guozhu Meng, and Kai Chen. 2024. Making them ask and answer: Jailbreaking large language models in few queries via disguise and reconstruction. In 33rd USENIX Security Symposium (USENIX Security 24). 4711--4728."},{"key":"e_1_3_2_1_23_1","volume-title":"Autodan: Generating stealthy jailbreak prompts on aligned large language models. arXiv preprint arXiv:2310.04451","author":"Liu Xiaogeng","year":"2023","unstructured":"Xiaogeng Liu, Nan Xu, Muhao Chen, and Chaowei Xiao. 2023. Autodan: Generating stealthy jailbreak prompts on aligned large language models. arXiv preprint arXiv:2310.04451 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Tree of attacks: Jailbreaking black-box llms automatically. arXiv preprint arXiv:2312.02119","author":"Mehrotra Anay","year":"2023","unstructured":"Anay Mehrotra, Manolis Zampetakis, Paul Kassianik, Blaine Nelson, Hyrum Anderson, Yaron Singer, and Amin Karbasi. 2023. Tree of attacks: Jailbreaking black-box llms automatically. arXiv preprint arXiv:2312.02119 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Decision Making in Emergency Medicine: Biases, Errors and Solutions","author":"Morgenstern Justin","year":"2021","unstructured":"Justin Morgenstern. 2021. Availability Bias. Decision Making in Emergency Medicine: Biases, Errors and Solutions (2021), 47--52."},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2022. ChatGPT: Optimizing Language Models for Dialogue. https:\/\/openai.com\/blog\/chatgpt\/. Accessed: 2024-05--28."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2020.02022"},{"key":"e_1_3_2_1_28_1","volume-title":"do anything now'': Characterizing and evaluating in-the-wild jailbreak prompts on large language models. arXiv preprint arXiv:2308.03825","author":"Shen Xinyue","year":"2023","unstructured":"Xinyue Shen, Zeyuan Chen, Michael Backes, Yun Shen, and Yang Zhang. 2023. '' do anything now'': Characterizing and evaluating in-the-wild jailbreak prompts on large language models. arXiv preprint arXiv:2308.03825 (2023)."},{"key":"e_1_3_2_1_29_1","unstructured":"Qwen Team. 2024. Qwen2.5: A Party of Foundation Models. https:\/\/qwenlm.github.io\/blog\/qwen2.5\/"},{"key":"e_1_3_2_1_30_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Wei Alexander","year":"2024","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2024. Jailbroken: How does llm safety training fail? Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00765-8"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.443"},{"key":"e_1_3_2_1_34_1","volume-title":"Chain of Attack: a Semantic-Driven Contextual Multi-Turn attacker for LLM. arXiv preprint arXiv:2405.05610","author":"Yang Xikang","year":"2024","unstructured":"Xikang Yang, Xuehai Tang, Songlin Hu, and Jizhong Han. 2024. Chain of Attack: a Semantic-Driven Contextual Multi-Turn attacker for LLM. arXiv preprint arXiv:2405.05610 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"A survey on large language model (llm) security and privacy: The good, the bad, and the ugly. High-Confidence Computing","author":"Yao Yifan","year":"2024","unstructured":"Yifan Yao, Jinhao Duan, Kaidi Xu, Yuanfang Cai, Zhibo Sun, and Yue Zhang. 2024. A survey on large language model (llm) security and privacy: The good, the bad, and the ugly. High-Confidence Computing (2024), 100211."},{"key":"e_1_3_2_1_36_1","volume-title":"Low-resource languages jailbreak gpt-4. arXiv preprint arXiv:2310.02446","author":"Yong Zheng-Xin","year":"2023","unstructured":"Zheng-Xin Yong, Cristina Menghini, and Stephen H Bach. 2023. Low-resource languages jailbreak gpt-4. arXiv preprint arXiv:2310.02446 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253","author":"Yu Jiahao","year":"2023","unstructured":"Jiahao Yu, Xingwei Lin, Zheng Yu, and Xinyu Xing. 2023. Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Yu Jiahao","year":"2024","unstructured":"Jiahao Yu, Xingwei Lin, Zheng Yu, and Xinyu Xing. 2024a. {LLM-Fuzzer}: Scaling Assessment of Large Language Model Jailbreaks. In 33rd USENIX Security Symposium (USENIX Security 24). 4657--4674."},{"key":"e_1_3_2_1_39_1","volume-title":"Don't Listen To Me: Understanding and Exploring Jailbreak Prompts of Large Language Models. arXiv preprint arXiv:2403.17336","author":"Yu Zhiyuan","year":"2024","unstructured":"Zhiyuan Yu, Xiaogeng Liu, Shunning Liang, Zach Cameron, Chaowei Xiao, and Ning Zhang. 2024b. Don't Listen To Me: Understanding and Exploring Jailbreak Prompts of Large Language Models. arXiv preprint arXiv:2403.17336 (2024)."},{"key":"e_1_3_2_1_40_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric. P Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arxiv: 2306.05685 [cs.CL]"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581318"},{"key":"e_1_3_2_1_42_1","unstructured":"Weikang Zhou Xiao Wang Limao Xiong Han Xia Yingshuang Gu Mingxu Chai Fukang Zhu Caishuang Huang Shihan Dou Zhiheng Xi Rui Zheng Songyang Gao Yicheng Zou Hang Yan Yifan Le Ruohui Wang Lijun Li Jing Shao Tao Gui Qi Zhang and Xuanjing Huang. 2024. EasyJailbreak: A Unified Framework for Jailbreaking Large Language Models. arxiv: 2403.12171 [cs.CL]"},{"key":"e_1_3_2_1_43_1","volume-title":"Don't Say No: Jailbreaking LLM by Suppressing Refusal. arXiv preprint arXiv:2404.16369","author":"Zhou Yukai","year":"2024","unstructured":"Yukai Zhou and Wenjie Wang. 2024. Don't Say No: Jailbreaking LLM by Suppressing Refusal. arXiv preprint arXiv:2404.16369 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Diyi Yang, and Xing Xie.","author":"Zhu Kaijie","year":"2023","unstructured":"Kaijie Zhu, Jiaao Chen, Jindong Wang, Neil Zhenqiang Gong, Diyi Yang, and Xing Xie. 2023a. DyVal: Graph-informed Dynamic Evaluation of Large Language Models. arXiv preprint arXiv:2309.17167 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Yue Zhang, et al.","author":"Zhu Kaijie","year":"2023","unstructured":"Kaijie Zhu, Jindong Wang, Jiaheng Zhou, Zichen Wang, Hao Chen, Yidong Wang, Linyi Yang, Wei Ye, Neil Zhenqiang Gong, Yue Zhang, et al. 2023b. PromptBench: Towards Evaluating the Robustness of Large Language Models on Adversarial Prompts. arXiv preprint arXiv:2306.04528 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"PromptBench: A Unified Library for Evaluation of Large Language Models. arXiv preprint arXiv:2312.07910","author":"Zhu Kaijie","year":"2023","unstructured":"Kaijie Zhu, Qinlin Zhao, Hao Chen, Jindong Wang, and Xing Xie. 2023c. PromptBench: A Unified Library for Evaluation of Large Language Models. arXiv preprint arXiv:2312.07910 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J Zico Kolter, and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714654","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714654","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:56Z","timestamp":1750295936000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714654"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":47,"alternative-id":["10.1145\/3696410.3714654","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714654","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}