{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T01:43:03Z","timestamp":1769391783764,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"name":"DARPA","award":["885000"],"award-info":[{"award-number":["885000"]}]},{"name":"NSF","award":["CCF-FMiTF-1836978"],"award-info":[{"award-number":["CCF-FMiTF-1836978"]}]},{"name":"ONR","award":["N00014-21-1-2492"],"award-info":[{"award-number":["N00014-21-1-2492"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3733799.3762980","type":"proceedings-article","created":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T11:38:49Z","timestamp":1767094729000},"page":"218-229","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["How Not to Detect Prompt Injections with an LLM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1881-3408","authenticated-orcid":false,"given":"Sarthak","family":"Choudhary","sequence":"first","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, Madison, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6324-0315","authenticated-orcid":false,"given":"Divyam","family":"Anshumaan","sequence":"additional","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, Madison, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7451-0976","authenticated-orcid":false,"given":"Nils","family":"Palumbo","sequence":"additional","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, Madison, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5877-0436","authenticated-orcid":false,"given":"Somesh","family":"Jha","sequence":"additional","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, Madison, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,12,30]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2023. Instruction Defense. https:\/\/learnprompting.org\/docs\/prompt_hacking\/defensive_measures\/instruction. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_3_2","unstructured":"2023. Random Sequence Enclosure. https:\/\/learnprompting.org\/docs\/prompt_hacking\/defensive_measures\/random_sequence. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_4_2","unstructured":"2024. Poe. https:\/\/poe.com\/. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_5_2","unstructured":"2025. Microsoft Copilot. https:\/\/copilot.microsoft.com. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_6_2","unstructured":"Anthropic. 2025. Claude 4 Sonnet. https:\/\/www.anthropic.com\/claude\/sonnet."},{"key":"e_1_3_3_2_7_2","unstructured":"R.\u00a0G.\u00a0Stuart Armstrong. 2023. Using GPT-Eliezer against ChatGPT Jailbreaking. https:\/\/www.alignmentforum.org\/posts\/pNcFYZnPdXyL2RfgA\/using-gpt-eliezer-against-chatgpt-jailbreaking. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/SFCS.1997.646128"},{"key":"e_1_3_3_2_9_2","unstructured":"Sizhe Chen Julien Piet Chawin Sitawarin and David Wagner. 2024. Struq: Defending against prompt injection with structured queries. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.06363 (2024)."},{"key":"e_1_3_3_2_10_2","unstructured":"Sizhe Chen Arman Zharmagambetov Saeed Mahloujifar Kamalika Chaudhuri and Chuan Guo. 
2024. Aligning LLMs to be robust against prompt injection. arXiv preprint arXiv:2410.05451 (2024)."},{"key":"e_1_3_3_2_11_2","unstructured":"Sarthak Choudhary. 2025. DataFlip. https:\/\/github.com\/sarthak-choudhary\/DataFlip. Accessed: 2025-07-17."},{"key":"e_1_3_3_2_12_2","unstructured":"Manuel Costa, Boris K\u00f6pf, Aashish Kolluri, Andrew Paverd, Mark Russinovich, Ahmed Salem, Shruti Tople, Lukas Wutschitz, and Santiago Zanella-B\u00e9guelin. 2025. Securing AI Agents with Information-Flow Control. arXiv preprint arXiv:2505.23643 (2025)."},{"key":"e_1_3_3_2_13_2","unstructured":"Edoardo Debenedetti, Ilia Shumailov, Tianqi Fan, Jamie Hayes, Nicholas Carlini, Daniel Fabian, Christoph Kern, Chongyang Shi, Andreas Terzis, and Florian Tram\u00e8r. 2025. Defeating prompt injections by design. arXiv preprint arXiv:2503.18813 (2025)."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/103418.103474"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1007\/11681878_14"},{"key":"e_1_3_3_2_16_2","unstructured":"DeepSeek-AI et al. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3605764.3623985"},{"key":"e_1_3_3_2_18_2","unstructured":"Rich Harang. 2023. Securing LLM Systems Against Prompt Injection. https:\/\/developer.nvidia.com\/blog\/securing-llm-systems-against-prompt-injection. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_19_2","unstructured":"Keegan Hines, Gary Lopez, Matthew Hall, Federico Zarfati, Yonatan Zunger, and Emre K\u0131c\u0131man. 2024. Defending Against Indirect Prompt Injection Attacks With Spotlighting. (2024)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3658644.3670370"},{"key":"e_1_3_3_2_21_2","unstructured":"Neel Jain, Avi Schwarzschild, Yuxin Wen, Gowthami Somepalli, John Kirchenbauer, Ping-yeh Chiang, Micah Goldblum, Aniruddha Saha, Jonas Geiping, and Tom Goldstein. 2023. Baseline defenses for adversarial attacks against aligned language models. arXiv preprint arXiv:2309.00614 (2023)."},{"key":"e_1_3_3_2_22_2","unstructured":"Albert\u00a0Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra\u00a0Singh Chaplot, Diego de\u00a0las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, L\u00e9lio\u00a0Renard Lavaud, Marie-Anne Lachaux, Pierre Stock, Teven\u00a0Le Scao, Thibaut Lavril, Thomas Wang, Timoth\u00e9e Lacroix, and William\u00a0El Sayed. 2023. Mistral 7B. arXiv:2310.06825\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48405-1_25"},{"key":"e_1_3_3_2_24_2","unstructured":"Learn Prompting. 2023. Sandwich Defense. https:\/\/learnprompting.org\/docs\/prompt%20hacking\/defensive%20measures\/sandwich%20defense. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_25_2","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74\u201381. https:\/\/aclanthology.org\/W04-1013\/"},{"key":"e_1_3_3_2_26_2","unstructured":"Xiaogeng Liu, Zhiyuan Yu, Yizhe Zhang, Ning Zhang, and Chaowei Xiao. 2024. 
Automatic and universal prompt injection attacks against large language models. arXiv preprint arXiv:2403.04957 (2024)."},{"key":"e_1_3_3_2_27_2","first-page":"1831","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Liu Yupei","year":"2024","unstructured":"Yupei Liu, Yuqi Jia, Runpeng Geng, Jinyuan Jia, and Neil\u00a0Zhenqiang Gong. 2024. Formalizing and benchmarking prompt injection attacks and defenses. In 33rd USENIX Security Symposium (USENIX Security 24). 1831\u20131847."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/SP61157.2025.00250"},{"key":"e_1_3_3_2_29_2","unstructured":"A. Mendes. 2023. Ultimate ChatGPT Prompt Engineering Guide for General Users and Developers. https:\/\/www.imaginarycloud.com\/blog\/chatgpt-prompt-engineering. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_30_2","unstructured":"Meta. 2025. The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/."},{"key":"e_1_3_3_2_31_2","unstructured":"Y. Nakajima. 2022. Yohei\u2019s Blog Post. https:\/\/twitter.com\/yoheinakajima\/status\/1582844144640471040. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2008.33"},{"key":"e_1_3_3_2_33_2","unstructured":"OpenAI. 2025. Introducing GPT-4.1 in the API. https:\/\/openai.com\/index\/gpt-4-1\/."},{"key":"e_1_3_3_2_34_2","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et\u00a0al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems 35 (2022), 27730\u201327744."},{"key":"e_1_3_3_2_35_2","unstructured":"OWASP. 2023. OWASP Top 10 for LLM Applications. https:\/\/llmtop10.com. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689932.3694764"},{"key":"e_1_3_3_2_37_2","volume-title":"NeurIPS ML Safety Workshop","author":"Perez F\u00e1bio","year":"2022","unstructured":"F\u00e1bio Perez and Ian Ribeiro. 2022. Ignore Previous Prompt: Attack Techniques For Language Models. In NeurIPS ML Safety Workshop."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-70879-4_6"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.494"},{"key":"e_1_3_3_2_40_2","unstructured":"L. Reid. 2024. Generative AI in Search: Let Google do the Searching for You. https:\/\/blog.google\/products\/search\/generative-ai-google-search-may-2024\/. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_41_2","unstructured":"Pierangela Samarati and Latanya Sweeney. 1998. Protecting privacy when disclosing information: k-anonymity and its enforcement through generalization and suppression. (1998)."},{"key":"e_1_3_3_2_42_2","unstructured":"V. Schermerhorn. 2023. How Amazon Continues to Improve the Customer Reviews Experience with Generative AI. https:\/\/www.aboutamazon.com\/news\/amazon-ai\/amazon-improves-customer-reviews-with-generative-ai. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_43_2","unstructured":"J. Selvi. 2022. Exploring Prompt Injection Attacks. https:\/\/research.nccgroup.com\/2022\/12\/05\/exploring-prompt-injection-attacks\/. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_44_2","unstructured":"Zedian Shao, Hongbin Liu, Jaden Mu, and Neil\u00a0Zhenqiang Gong. 2024. Making LLMs vulnerable to prompt injection via poisoning alignment. 
arXiv e-prints (2024) arXiv\u20132410."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Latanya Sweeney. 2002. Achieving k-anonymity privacy protection using generalization and suppression. International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems 10, 05 (2002), 571\u2013588.","DOI":"10.1142\/S021848850200165X"},{"key":"e_1_3_3_2_46_2","unstructured":"Eric Wallace, Kai Xiao, Reimar Leike, Lilian Weng, Johannes Heidecke, and Alex Beutel. 2024. The instruction hierarchy: Training LLMs to prioritize privileged instructions. arXiv preprint arXiv:2404.13208 (2024)."},{"key":"e_1_3_3_2_47_2","unstructured":"Simon Willison. 2022. Prompt injection attacks against GPT-3. https:\/\/simonwillison.net\/2022\/Sep\/12\/prompt-injection\/. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_48_2","unstructured":"Simon Willison. 2023. Delimiters Won\u2019t Save You from Prompt Injection. https:\/\/simonwillison.net\/2023\/May\/11\/delimiters-wont-save-you. Accessed: 2025-06-28."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"M. Wooldridge and N. R. Jennings. 1995. Intelligent Agents: Theory and Practice. The Knowledge Engineering Review (1995).","DOI":"10.1017\/S0269888900008122"},{"key":"e_1_3_3_2_50_2","unstructured":"Jingwei Yi, Yueqi Xie, Bin Zhu, Emre Kiciman, Guangzhong Sun, Xing Xie, and Fangzhao Wu. 2023. Benchmarking and defending against indirect prompt injection attacks on large language models. arXiv preprint arXiv:2312.14197 (2023)."},{"key":"e_1_3_3_2_51_2","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J\u00a0Zico Kolter, and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"AISec '25: Proceedings of the 2025 Workshop on Artificial Intelligence and Security","location":"Taipei, Taiwan","acronym":"AISec '25","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 18th ACM Workshop on Artificial Intelligence and Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3733799.3762980","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T11:53:15Z","timestamp":1767095595000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3733799.3762980"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"references-count":50,"alternative-id":["10.1145\/3733799.3762980","10.1145\/3733799"],"URL":"https:\/\/doi.org\/10.1145\/3733799.3762980","relation":{},"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"2025-12-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
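
The record above is the JSON body returned by the public Crossref REST API for this paper's DOI, with the work metadata nested under the "message" key. As a minimal sketch of how such a record can be fetched and read, assuming network access and the third-party requests package; the script name and mailto contact in the User-Agent header are illustrative placeholders, not part of the record:

# Minimal sketch: fetch and summarize a Crossref work record like the one above.
# Assumes the public api.crossref.org endpoint and the "requests" package;
# the User-Agent value is a placeholder (Crossref etiquette suggests a contact).
import requests

DOI = "10.1145/3733799.3762980"  # the "DOI" field of the record above

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-script/0.1 (mailto:you@example.org)"},
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # the payload sits under "message", as above

# Every field read here appears in the record: title, author, event, page, DOI.
authors = ", ".join(f"{a.get('given', '')} {a.get('family', '')}".strip()
                    for a in work.get("author", []))
print(work["title"][0])                      # How Not to Detect Prompt Injections with an LLM
print(authors)                               # Choudhary, Anshumaan, Palumbo, Jha
print(work.get("event", {}).get("acronym"))  # AISec '25
print(work.get("page"), work.get("DOI"))     # 218-229 10.1145/3733799.3762980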