{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T15:47:07Z","timestamp":1771516027429,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3760899","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T01:03:42Z","timestamp":1762563822000},"page":"4817-4821","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Jailbreaking LLMs Through Alignment Vulnerabilities in Out-of-Distribution Settings"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8315-7972","authenticated-orcid":false,"given":"Yue","family":"Huang","sequence":"first","affiliation":[{"name":"University of Notre Dame, South Bend, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8259-8531","authenticated-orcid":false,"given":"Jingyu","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Notre Dame, South Bend, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9848-2557","authenticated-orcid":false,"given":"Dongping","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Maryland, College Park, MD, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2522-6858","authenticated-orcid":false,"given":"Bingda","family":"Tang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6937-4180","authenticated-orcid":false,"given":"Yao","family":"Wan","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1539-7939","authenticated-orcid":false,"given":"Lichao","family":"Sun","sequence":"additional","affiliation":[{"name":"Lehigh University, Bethlehem, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3491-5968","authenticated-orcid":false,"given":"Philip","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Illinois Chicago, Chicgao, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3574-5665","authenticated-orcid":false,"given":"Xiangliang","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Notre Dame, South Bend, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Principal component analysis","author":"Abdi Herv\u00e9","year":"2010","unstructured":"Herv\u00e9 Abdi and Lynne J Williams. 2010. Principal component analysis. Wiley interdisciplinary reviews: computational statistics, Vol. 2, 4 (2010), 433-459."},{"key":"e_1_3_2_1_2_1","unstructured":"Gabriel Alon and Michael Kamfonas. 2023. Detecting Language Model Attacks with Perplexity. arXiv:2308.14132 [cs.CL]"},{"key":"e_1_3_2_1_3_1","unstructured":"Maksym Andriushchenko Francesco Croce and Nicolas Flammarion. 2024. Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks. arXiv:2404.02151 [cs.CR]"},{"key":"e_1_3_2_1_4_1","unstructured":"Patrick Chao Alexander Robey Edgar Dobriban Hamed Hassani George J. Pappas and Eric Wong. 2023. Jailbreaking Black Box Large Language Models in Twenty Queries. arXiv:2310.08419 [cs.LG]"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Patrick Chao Alexander Robey Edgar Dobriban Hamed Hassani George J. Pappas and Eric Wong. 2024. Jailbreaking Black Box Large Language Models in Twenty Queries. arXiv:2310.08419 [cs.LG] https:\/\/arxiv.org\/abs\/2310.08419","DOI":"10.1109\/SaTML64287.2025.00010"},{"key":"e_1_3_2_1_6_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_1_7_1","unstructured":"Paul Christiano Jan Leike Tom B. Brown Miljan Martic Shane Legg and Dario Amodei. 2023. Deep reinforcement learning from human preferences. arXiv:1706.03741 [stat.ML]"},{"key":"e_1_3_2_1_8_1","unstructured":"Yangsibo Huang Samyak Gupta Mengzhou Xia Kai Li and Danqi Chen. 2023. Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation. arXiv:2310.06987 [cs.CL]"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3651509"},{"key":"e_1_3_2_1_10_1","volume-title":"Position: TrustLLM: Trustworthiness in Large Language Models. In International Conference on Machine Learning. PMLR","author":"Huang Yue","year":"2024","unstructured":"Yue Huang, Lichao Sun, Haoran Wang, Siyuan Wu, Qihui Zhang, Yuan Li, Chujie Gao, Yixin Huang, Wenhan Lyu, Yixuan Zhang, et al., 2024b. Position: TrustLLM: Trustworthiness in Large Language Models. In International Conference on Machine Learning. PMLR, 20166-20270."},{"key":"e_1_3_2_1_11_1","volume-title":"Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein.","author":"Jain Neel","year":"2023","unstructured":"Neel Jain, Avi Schwarzschild, Yuxin Wen, Gowthami Somepalli, John Kirchenbauer, Ping yeh Chiang, Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein. 2023. Baseline Defenses for Adversarial Attacks Against Aligned Language Models. arXiv:2309.00614 [cs.LG]"},{"key":"e_1_3_2_1_12_1","unstructured":"Xuan Li Zhanke Zhou Jianing Zhu Jiangchao Yao Tongliang Liu and Bo Han. 2024. DeepInception: Hypnotize Large Language Model to Be Jailbreaker. arXiv:2311.03191 [cs.LG]"},{"key":"e_1_3_2_1_13_1","unstructured":"Xiaogeng Liu Nan Xu Muhao Chen and Chaowei Xiao. 2023. AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models. arXiv:2310.04451 [cs.CL]"},{"key":"e_1_3_2_1_14_1","unstructured":"Meta. 2024. Build the future of AI with Meta Llama 3. (2024). https:\/\/llama.meta.com\/llama3\/"},{"key":"e_1_3_2_1_15_1","unstructured":"OpenAI. 2023. GPT-4. https:\/\/openai.com\/gpt-4."},{"key":"e_1_3_2_1_16_1","unstructured":"Mansi Phute Alec Helbling Matthew Hull ShengYun Peng Sebastian Szyller Cory Cornelius and Duen Horng Chau. 2023. LLM Self Defense: By Self Examination LLMs Know They Are Being Tricked. arXiv:2308.07308 [cs.CL]"},{"key":"e_1_3_2_1_17_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_1_18_1","volume":"202","author":"Robey Alexander","unstructured":"Alexander Robey, Eric Wong, Hamed Hassani, and George J. Pappas. 2023. SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks. arXiv:2310.03684 [cs.LG]","journal-title":"George J. Pappas."},{"key":"e_1_3_2_1_19_1","volume-title":"Trustllm: Trustworthiness in large language models. arXiv preprint arXiv:2401.05561","author":"Sun Lichao","year":"2024","unstructured":"Lichao Sun, Yue Huang, Haoran Wang, Siyuan Wu, Qihui Zhang, Chujie Gao, Yixin Huang, Wenhan Lyu, Yixuan Zhang, Xiner Li, et al., 2024. Trustllm: Trustworthiness in large language models. arXiv preprint arXiv:2401.05561 (2024)."},{"key":"e_1_3_2_1_20_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_21_1","volume-title":"Jailbroken: How Does LLM Safety Training Fail? arXiv:2307.02483 [cs.LG]","author":"Wei Alexander","year":"2023","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2023a. Jailbroken: How Does LLM Safety Training Fail? arXiv:2307.02483 [cs.LG]"},{"key":"e_1_3_2_1_22_1","unstructured":"Zeming Wei Yifei Wang and Yisen Wang. 2023b. Jailbreak and Guard Aligned Language Models with Only Few In-Context Demonstrations. arXiv:2310.06387 [cs.LG]"},{"key":"e_1_3_2_1_23_1","volume-title":"ChatGPT DAN Mode Prompt","author":"Williams Jake","year":"2024","unstructured":"Jake Williams. 2024. ChatGPT DAN Mode Prompt 2024. Medium (2024). https:\/\/medium.com\/@dakseymain\/chatgpt-dan-mode-prompt-2024-295298be7e4e"},{"key":"e_1_3_2_1_24_1","volume-title":"Wenbo Guo, Han Liu, and Xinyu Xing.","author":"Yu Jiahao","year":"2024","unstructured":"Jiahao Yu, Haozheng Luo, Jerry Yao-Chieh Hu, Wenbo Guo, Han Liu, and Xinyu Xing. 2024. Enhancing Jailbreak Attack Against Large Language Models through Silent Tokens. arXiv:2405.20653 [cs.AI]"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Yi Zeng Hongpeng Lin Jingwen Zhang Diyi Yang Ruoxi Jia and Weiyan Shi. 2024. How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to Challenge AI Safety by Humanizing LLMs. arXiv:2401.06373 [cs.CL]","DOI":"10.18653\/v1\/2024.acl-long.773"},{"key":"e_1_3_2_1_26_1","unstructured":"Yufeng Zhang Fengzhuo Zhang Zhuoran Yang and Zhaoran Wang. 2023. What and How does In-Context Learning Learn? Bayesian Model Averaging Parameterization and Generalization. arXiv:2305.19420 [stat.ML]"},{"key":"e_1_3_2_1_27_1","unstructured":"Chujie Zheng Fan Yin Hao Zhou Fandong Meng Jie Zhou Kai-Wei Chang Minlie Huang and Nanyun Peng. 2024. On Prompt-Driven Safeguarding for Large Language Models. arXiv:2401.18018 [cs.LG]"},{"key":"e_1_3_2_1_28_1","unstructured":"Andy Zhou Bo Li and Haohan Wang. 2024. Robust Prompt Optimization for Defending Language Models Against Jailbreaking Attacks. arXiv:2401.17263 [cs.LG]"},{"key":"e_1_3_2_1_29_1","volume-title":"Yue Zhang, et al.","author":"Zhu Kaijie","year":"2023","unstructured":"Kaijie Zhu, Jindong Wang, Jiaheng Zhou, Zichen Wang, Hao Chen, Yidong Wang, Linyi Yang, Wei Ye, Neil Zhenqiang Gong, Yue Zhang, et al., 2023. Promptbench: Towards evaluating the robustness of large language models on adversarial prompts. arXiv preprint arXiv:2306.04528 (2023)."},{"key":"e_1_3_2_1_30_1","unstructured":"Andy Zou Zifan Wang Nicholas Carlini Milad Nasr J. Zico Kolter and Matt Fredrikson. 2023. Universal and Transferable Adversarial Attacks on Aligned Language Models. arXiv:2307.15043 [cs.CL]"}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","location":"Seoul Republic of Korea","acronym":"CIKM '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3760899","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T01:14:04Z","timestamp":1765502044000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3760899"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":30,"alternative-id":["10.1145\/3746252.3760899","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3760899","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}