{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T04:06:35Z","timestamp":1779422795383,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T00:00:00Z","timestamp":1779753600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2229876"],"award-info":[{"award-number":["2229876"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,26]]},"DOI":"10.1145\/3786335.3813133","type":"proceedings-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T03:16:22Z","timestamp":1779419782000},"page":"839-854","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Exploring and Developing a Pre-Model Safeguard with Draft Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9280-8493","authenticated-orcid":false,"given":"Hongyu","family":"Cai","sequence":"first","affiliation":[{"name":"Purdue University, West Lafayette, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1631-6064","authenticated-orcid":false,"given":"Arjun","family":"Arunasalam","sequence":"additional","affiliation":[{"name":"Florida International University, Miami, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6071-5211","authenticated-orcid":false,"given":"Yiming","family":"Liang","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2862-5286","authenticated-orcid":false,"given":"Antonio","family":"Bianchi","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7362-8905","authenticated-orcid":false,"given":"Z. Berkay","family":"Celik","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,5,26]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Marah\u00a0I. Abdin Sam\u00a0Ade Jacobs Ammar\u00a0Ahmad Awan Jyoti Aneja Ahmed Awadallah et\u00a0al. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. https:\/\/www.microsoft.com\/en-us\/research\/publication\/phi-3-technical-report-a-highly-capable-language-model-locally-on-your-phone\/. arXiv (2024)."},{"key":"e_1_3_3_1_3_2","unstructured":"Loubna\u00a0Ben Allal Anton Lozhkov Elie Bakouch Leandro von Werra and Thomas Wolf. 2024. SmolLM - blazingly fast and remarkably powerful. https:\/\/huggingface.co\/blog\/smollm."},{"key":"e_1_3_3_1_4_2","unstructured":"Gabriel Alon and Michael\u00a0J. Kamfonas. 2023. Detecting Language Model Attacks With Perplexity. https:\/\/openreview.net\/forum?id=lNLVvdHyAw. arXiv (2023)."},{"key":"e_1_3_3_1_5_2","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang et\u00a0al. 2023. Qwen Technical Report. arXiv (2023)."},{"key":"e_1_3_3_1_6_2","unstructured":"Yuntao Bai Andy Jones Kamal Ndousse Amanda Askell Anna Chen et\u00a0al. 2022. Training a Helpful and Harmless Assistant With Reinforcement Learning From Human Feedback. arXiv (2022)."},{"key":"e_1_3_3_1_7_2","unstructured":"Yuntao Bai Saurav Kadavath Sandipan Kundu Amanda Askell Jackson Kernion et\u00a0al. 2022. Constitutional AI: Harmlessness from AI Feedback. arXiv (2022)."},{"key":"e_1_3_3_1_8_2","unstructured":"Nikhil Bhendawade Irina Belousova Qichen Fu Henry Mason Mohammad Rastegari et\u00a0al. 2024. Speculative Streaming: Fast LLM Inference without Auxiliary Models. arXiv (2024)."},{"key":"e_1_3_3_1_9_2","unstructured":"Bochuan Cao Yuanpu Cao Lu Lin and Jinghui Chen. 2023. Defending Against Alignment-Breaking Attacks via Robustly Aligned LLM. Annual Meeting of the Association for Computational Linguistics (2023)."},{"key":"e_1_3_3_1_10_2","unstructured":"Patrick Chao Edoardo Debenedetti Alexander Robey Maksym Andriushchenko Francesco Croce et\u00a0al. 2024. JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models. NeurIPS Datasets and Benchmarks Track (2024)."},{"key":"e_1_3_3_1_11_2","unstructured":"Patrick Chao Alexander Robey Edgar Dobriban Hamed Hassani George\u00a0J. Pappas et\u00a0al. 2023. Jailbreaking Black Box Large Language Models in Twenty Queries. IEEE Conference on Secure and Trustworthy Machine Learning (2023)."},{"key":"e_1_3_3_1_12_2","unstructured":"Charlie Chen Sebastian Borgeaud Geoffrey Irving Jean-Baptiste Lespiau Laurent Sifre et\u00a0al. 2023. Accelerating Large Language Model Decoding with Speculative Sampling. arXiv (2023)."},{"key":"e_1_3_3_1_13_2","unstructured":"Yann Dubois Bal\u00e1zs Galambosi Percy Liang and Tatsunori\u00a0B Hashimoto. 2024. Length-Controlled AlpacaEval: A Simple Way to Debias Automatic Evaluators. Conference on Language Modeling (2024)."},{"key":"e_1_3_3_1_14_2","unstructured":"Yann Dubois Xuechen Li Rohan Taori Tianyi Zhang Ishaan Gulrajani et\u00a0al. 2023. AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. https:\/\/openreview.net\/forum?id=4hturzLcKX. Conference on Neural Information Processing Systems (2023)."},{"key":"e_1_3_3_1_15_2","unstructured":"Jaiden Fairoze Sanjam Garg Keewoo Lee and Mingyuan Wang. 2025. Bypassing Prompt Guards in Production with Controlled-Release Prompting. https:\/\/arxiv.org\/abs\/2510.01529. arXiv (2025)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Markus Freitag and Yaser Al-Onaizan. 2017. Beam Search Strategies for Neural Machine Translation. https:\/\/aclanthology.org\/W17-3207\/. Workshop on Neural Machine Translation (2017).","DOI":"10.18653\/v1\/W17-3207"},{"key":"e_1_3_3_1_17_2","unstructured":"Ari Holtzman Jan Buys Li Du Maxwell Forbes and Yejin Choi. 2020. The Curious Case of Neural Text Degeneration. https:\/\/openreview.net\/forum?id=rygGQyrFvH. International Conference on Learning Representations (2020)."},{"key":"e_1_3_3_1_18_2","unstructured":"Yangsibo Huang Samyak Gupta Mengzhou Xia Kai Li and Danqi Chen. 2023. Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation. International Conference on Learning Representations (2023)."},{"key":"e_1_3_3_1_19_2","unstructured":"Hakan Inan K. Upasani Jianfeng Chi Rashi Rungta Krithika Iyer et\u00a0al. 2023. Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations. https:\/\/bit.ly\/42ir2JB. arXiv (2023)."},{"key":"e_1_3_3_1_20_2","unstructured":"Neel Jain Avi Schwarzschild Yuxin Wen Gowthami Somepalli John Kirchenbauer et\u00a0al. 2023. Baseline defenses for adversarial attacks against aligned language models. arXiv (2023)."},{"key":"e_1_3_3_1_21_2","unstructured":"Harrison Lee Samrat Phatale Hassan Mansoor Kellie Lu Thomas Mesnard et\u00a0al. 2023. RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback. International Conference on Machine Learning (2023)."},{"key":"e_1_3_3_1_22_2","unstructured":"Yaniv Leviathan Matan Kalman and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. International Conference on Machine Learning (2023)."},{"key":"e_1_3_3_1_23_2","unstructured":"Meng Li Wangmeng Zuo and Lei Zhang. 2021. Early Stopping for Deep Image Prior. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)."},{"key":"e_1_3_3_1_24_2","unstructured":"Xuechen Li Tianyi Zhang Yann Dubois Rohan Taori Ishaan Gulrajani et\u00a0al. 2023. AlpacaEval: An Automatic Evaluator of Instruction-following Models. https:\/\/github.com\/tatsu-lab\/alpaca_eval. GitHub repository (2023). https:\/\/bit.ly\/4jhOvkn"},{"key":"e_1_3_3_1_25_2","unstructured":"Xuan Li Zhanke Zhou Jianing Zhu Jiangchao Yao Tongliang Liu and Bo Han. 2023. Deepinception: Hypnotize large language model to be jailbreaker. arXiv (2023)."},{"key":"e_1_3_3_1_26_2","unstructured":"Bill\u00a0Yuchen Lin Abhilasha Ravichander Ximing Lu Nouha Dziri Melanie Sclar et\u00a0al. 2024. The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning. https:\/\/openreview.net\/forum?id=wxJ0eXwwda. International Conference on Learning Representations (2024)."},{"key":"e_1_3_3_1_27_2","unstructured":"Tong Liu Yingjie Zhang Zhe Zhao Yinpeng Dong Guozhu Meng et\u00a0al. 2024. Making them ask and answer: Jailbreaking large language models in few queries via disguise and reconstruction. USENIX Security Symposium (2024)."},{"key":"e_1_3_3_1_28_2","unstructured":"Xiaoxuan Liu Lanxiang Hu Peter Bailis Ion Stoica Zhijie Deng et\u00a0al. 2023. Online Speculative Decoding. International Conference on Machine Learning (2023)."},{"key":"e_1_3_3_1_29_2","unstructured":"Xiaogeng Liu Nan Xu Muhao Chen and Chaowei Xiao. 2023. AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models. https:\/\/openreview.net\/forum?id=7Jwpw4qKkb. International Conference on Learning Representations (2023)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Yi Liu Gelei Deng Zhengzi Xu Yuekang Li Yaowen Zheng et\u00a0al. 2024. Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study. arXiv (2024).","DOI":"10.1145\/3663530.3665021"},{"key":"e_1_3_3_1_31_2","unstructured":"AI\u00a0@\u00a0Meta Llama\u00a0Team. 2024. The Llama 3 Herd of Models. https:\/\/arxiv.org\/abs\/2407.21783. arXiv (2024)."},{"key":"e_1_3_3_1_32_2","unstructured":"Mantas Mazeika Long Phan Xuwang Yin Andy Zou Zifan Wang et\u00a0al. 2024. HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal. International Conference on Machine Learning (2024)."},{"key":"e_1_3_3_1_33_2","unstructured":"Meta. 2024. Meta Llama Guard 2. https:\/\/bit.ly\/42l2a42."},{"key":"e_1_3_3_1_34_2","unstructured":"Meta-Llama. 2024. Prompt-Guard. https:\/\/github.com\/meta-llama\/PurpleLlama\/tree\/main\/Prompt-Guard."},{"key":"e_1_3_3_1_35_2","unstructured":"NVIDIA. [n. d.]. NVIDIA Data Center Deep Learning Product Performance AI Inference. https:\/\/developer.nvidia.com\/deep-learning-performance-training-inference\/ai-inference. NVIDIA Developer ([n. d.]). https:\/\/developer.nvidia.com\/deep-learning-performance-training-inference\/ai-inference"},{"key":"e_1_3_3_1_36_2","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad et\u00a0al. 2024. GPT-4 Technical Report. arXiv (2024)."},{"key":"e_1_3_3_1_37_2","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright et\u00a0al. 2022. Training Language Models to Follow Instructions With Human Feedback. Annual Conference on Neural Information Processing Systems (2022)."},{"key":"e_1_3_3_1_38_2","unstructured":"Mansi Phute Alec Helbling Matthew Hull ShengYun Peng Sebastian Szyller et\u00a0al. 2024. LLM Self Defense: By Self Examination LLMs Know They Are Being Tricked. Tiny Papers Track at ICLR (2024)."},{"key":"e_1_3_3_1_39_2","unstructured":"Xiangyu Qi Yi Zeng Tinghao Xie Pin-Yu Chen Ruoxi Jia et\u00a0al. 2024. Fine-tuning Aligned Language Models Compromises Safety Even When Users Do Not Intend To! https:\/\/openreview.net\/forum?id=hTEGyKf0dZ. International Conference on Learning Representations (2024)."},{"key":"e_1_3_3_1_40_2","unstructured":"Alexander Robey Eric Wong Hamed Hassani and George\u00a0J. Pappas. 2023. SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks. Transactions on Machine Learning Research (2023)."},{"key":"e_1_3_3_1_41_2","unstructured":"Sayak\u00a0Saha Roy Poojitha Thota Krishna\u00a0Vamsi Naragam and Shirin Nilizadeh. 2024. From Chatbots to Phishbots?: Phishing Scam Generation in Commercial Large Language Models. IEEE Symposium on Security and Privacy (2024)."},{"key":"e_1_3_3_1_42_2","unstructured":"Vinu\u00a0Sankar Sadasivan Shoumik Saha Gaurang Sriramanan Priyatham Kattakinda Atoosa Chegini and Soheil Feizi. 2024. Fast Adversarial Attacks on Language Models In One GPU Minute. https:\/\/arxiv.org\/abs\/2402.15570. International Conference on Machine Learning (2024)."},{"key":"e_1_3_3_1_43_2","unstructured":"Leo Schwinn and Simon Geisler. 2024. Revisiting the Robust Alignment of Circuit Breakers. https:\/\/arxiv.org\/abs\/2407.15902. arXiv (2024)."},{"key":"e_1_3_3_1_44_2","unstructured":"Yanshen Sun Jianfeng He Limeng Cui Shuo Lei and Chang-Tien Lu. 2024. Exploring the Deceptive Power of LLM-Generated Fake News: A Study of Real-World Detection Challenges. arXiv (2024)."},{"key":"e_1_3_3_1_45_2","unstructured":"Gemma Team. 2024. Gemma: Introducing new state-of-the-art open models. https:\/\/blog.google\/technology\/developers\/gemma-open-models\/. Google (2024). https:\/\/blog.google\/technology\/developers\/gemma-open-models\/"},{"key":"e_1_3_3_1_46_2","unstructured":"Llama Team. [n. d.]. Prompt Guard-86M | Model Cards and Prompt formats. https:\/\/www.llama.com\/docs\/model-cards-and-prompt-formats\/prompt-guard\/."},{"key":"e_1_3_3_1_47_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi et\u00a0al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv (2023)."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Eric Wallace Tony Zhao Shi Feng and Sameer Singh. 2021. Concealed Data Poisoning Attacks on NLP Models. Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (2021).","DOI":"10.18653\/v1\/2021.naacl-main.13"},{"key":"e_1_3_3_1_49_2","unstructured":"Alexander Wan Eric Wallace Sheng Shen and Dan Klein. 2023. Poisoning Language Models During Instruction Tuning. International Conference on Machine Learning (2023)."},{"key":"e_1_3_3_1_50_2","unstructured":"Zeming Wei Yifei Wang and Yisen Wang. 2023. Jailbreak and guard aligned language models with only few in-context demonstrations. arXiv (2023)."},{"key":"e_1_3_3_1_51_2","unstructured":"Xiaofei Wen Wenxuan Zhou Wenjie\u00a0Jacky Mo and Muhao Chen. 2025. ThinkGuard: Deliberative Slow Thinking Leads to Cautious Guardrails. https:\/\/arxiv.org\/abs\/2502.13458. arXiv (2025)."},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"crossref","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue et\u00a0al. 2020. Transformers: State-of-the-Art Natural Language Processing. Conference on Empirical Methods in Natural Language Processing: System Demonstrations (2020).","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"crossref","unstructured":"Yueqi Xie Minghong Fang Renjie Pi and Neil Gong. 2024. GradSafe: Detecting Jailbreak Prompts for LLMs via Safety-Critical Gradient Analysis. Annual Meeting of the Association for Computational Linguistics (2024).","DOI":"10.18653\/v1\/2024.acl-long.30"},{"key":"e_1_3_3_1_54_2","unstructured":"Zhangchen Xu Fengqing Jiang Luyao Niu Jinyuan Jia Bill\u00a0Yuchen Lin et\u00a0al. 2024. SafeDecoding: Defending against Jailbreak Attacks via Safety-Aware Decoding. Annual Meeting of the Association for Computational Linguistics (2024)."},{"key":"e_1_3_3_1_55_2","unstructured":"Jiahao Yu Xingwei Lin Zheng Yu and Xinyu Xing. 2023. Gptfuzzer: Red teaming large language models with auto-generated jailbreak prompts. arXiv (2023)."},{"key":"e_1_3_3_1_56_2","unstructured":"Youliang Yuan Wenxiang Jiao Wenxuan Wang Jen-tse Huang Pinjia He Shuming Shi and Zhaopeng Tu. 2024. GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher. International Conference on Learning Representations (2024)."},{"key":"e_1_3_3_1_57_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen et\u00a0al. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv (2022)."},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"crossref","unstructured":"Zhexin Zhang Junxiao Yang Pei Ke and Minlie Huang. 2023. Defending Large Language Models Against Jailbreaking Attacks Through Goal Prioritization. Annual Meeting of the Association for Computational Linguistics (2023).","DOI":"10.18653\/v1\/2024.acl-long.481"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Wei Zhao Zhe Li Yige Li Ye Zhang and Jun Sun. 2024. Defending Large Language Models Against Jailbreak Attacks via Layer-specific Editing. Empirical Methods in Natural Language Processing (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.293"},{"key":"e_1_3_3_1_60_2","unstructured":"Xuandong Zhao Xianjun Yang Tianyu Pang Chao Du Lei Li et\u00a0al. 2024. Weak-to-Strong Jailbreaking on Large Language Models. International Conference on Machine Learning (2024)."},{"key":"e_1_3_3_1_61_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu et\u00a0al. 2024. Judging LLM-as-a-judge with MT-bench and Chatbot Arena. International Conference on Neural Information Processing Systems (2024)."},{"key":"e_1_3_3_1_62_2","unstructured":"Chunting Zhou Pengfei Liu Puxin Xu Srini Iyer Jiao Sun et\u00a0al. 2023. LIMA: Less Is More for Alignment. https:\/\/openreview.net\/forum?id=KBMOKmX2he. Conference on Neural Information Processing Systems (2023)."},{"key":"e_1_3_3_1_63_2","unstructured":"Daniel\u00a0M Ziegler Nisan Stiennon Jeffrey Wu Tom\u00a0B Brown Alec Radford et\u00a0al. 2019. Fine-Tuning Language Models From Human Preferences. arXiv (2019)."},{"key":"e_1_3_3_1_64_2","unstructured":"Andy Zou Long Phan Justin Wang Derek Duenas Maxwell Lin Maksym Andriushchenko Ruiqi Wang Zico Kolter Matt Fredrikson and Dan Hendrycks. 2024. Improving Alignment and Robustness with Circuit Breakers. https:\/\/arxiv.org\/abs\/2406.04313. arXiv (2024)."},{"key":"e_1_3_3_1_65_2","unstructured":"Andy Zou Zifan Wang J.\u00a0Zico Kolter and Matt Fredrikson. 2023. Universal and Transferable Adversarial Attacks on Aligned Language Models. arXiv (2023)."}],"event":{"name":"CAIS '26: ACM Conference on AI and Agentic Systems","location":"San Jose CA USA","acronym":"CAIS '26"},"container-title":["Proceedings of the ACM Conference on AI and Agentic Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3786335.3813133","content-type":"text\/html","content-version":"vor","intended-application":"syndication"}],"deposited":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T03:19:23Z","timestamp":1779419963000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3786335.3813133"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,26]]},"references-count":64,"alternative-id":["10.1145\/3786335.3813133","10.1145\/3786335"],"URL":"https:\/\/doi.org\/10.1145\/3786335.3813133","relation":{},"subject":[],"published":{"date-parts":[[2026,5,26]]},"assertion":[{"value":"2026-05-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}