{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T11:29:43Z","timestamp":1777462183800,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","funder":[{"DOI":"10.13039\/100014013","name":"UK Research and Innovation","doi-asserted-by":"publisher","award":["EP\/W005271\/1"],"award-info":[{"award-number":["EP\/W005271\/1"]}],"id":[{"id":"10.13039\/100014013","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3805621.3807629","type":"proceedings-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:08:45Z","timestamp":1777381725000},"page":"315-322","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Graph-Based Detection of Jailbreak and Prompt-Leakage Attacks in LLMs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3399-2440","authenticated-orcid":false,"given":"Javad","family":"Forough","sequence":"first","affiliation":[{"name":"Department of Computing, Imperial College London, London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1905-1611","authenticated-orcid":false,"given":"Mohammad M","family":"Maheri","sequence":"additional","affiliation":[{"name":"Department of Computing, Imperial College London, London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5895-8903","authenticated-orcid":false,"given":"Hamed","family":"Haddadi","sequence":"additional","affiliation":[{"name":"Department of Computing, Imperial College London, London, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073","author":"Bai Yuntao","year":"2022","unstructured":"Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, et al. 2022. Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150 (2020)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency (FAccT). 610\u2013623","author":"Bender Emily M.","year":"2021","unstructured":"Emily M. Bender, Timnit Gebru, Angelina McMillan-Major, and Margaret Mitchell. 2021. On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?. In Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency (FAccT). 610\u2013623."},{"key":"e_1_3_2_1_4_1","unstructured":"Rishi Bommasani Drew A. Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx et al. 2021. On the Opportunities and Risks of Foundation Models. arXiv preprint arXiv:2108.07258 (2021)."},{"key":"e_1_3_2_1_5_1","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini, Florian Tramer, Eric Wallace, Matthew Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam Roberts, Tom Brown, Dawn Song, \u00dalfar Erlingsson, Alina Oprea, and Colin Raffel. 2021. Extracting Training Data from Large Language Models. In 30th USENIX Security Symposium (USENIX Security 21). 2633\u20132650."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the Network and Distributed System Security Symposium (NDSS). 1\u201312","author":"Deng Gelei","year":"2024","unstructured":"Gelei Deng, Yi Liu, Yuekang Li, Kailong Wang, Ying Zhang, Zefeng Li, Haoyu Wang, Tianwei Zhang, and Yang Liu. 2024. MasterKey: Automated Jailbreak Across Multiple Large Language Model Chatbots. In Proceedings of the Network and Distributed System Security Symposium (NDSS). 1\u201312."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT).","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP).","author":"DeYoung Jay","year":"2023","unstructured":"Jay DeYoung, Kevin Allen, and Liang Zhang. 2023. MACE: Model-Agnostic Classifiers for Explaining Adversarial Prompts. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP)."},{"key":"e_1_3_2_1_9_1","volume-title":"2022 IEEE 42nd International Conference on Distributed Computing Systems (ICDCS). IEEE, 1155\u20131165","author":"Forough Javad","year":"2022","unstructured":"Javad Forough, Monowar Bhuyan, and Erik Elmroth. 2022. Dela: A deep ensemble learning approach for cross-layer vsi-ddos detection on the edge. In 2022 IEEE 42nd International Conference on Distributed Computing Systems (ICDCS). IEEE, 1155\u20131165."},{"key":"e_1_3_2_1_10_1","volume-title":"Efficient anomaly detection for edge clouds: mitigating data and resource constraints","author":"Forough Javad","year":"2024","unstructured":"Javad Forough, Hamed Haddadi, Monowar Bhuyan, and Erik Elmroth. 2024. Efficient anomaly detection for edge clouds: mitigating data and resource constraints. IEEE Access (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Smith","author":"Gehman Samuel","year":"2020","unstructured":"Samuel Gehman, Suchin Gururangan, Maarten Sap, Yejin Choi, and Noah A. Smith. 2020. RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models. In Findings of the Association for Computational Linguistics: EMNLP 2020. 3356\u20133369."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Kai Greshake et al. 2023. Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. arXiv preprint arXiv:2302.12173 (2023).","DOI":"10.1145\/3605764.3623985"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of ICLR","author":"Dan","year":"2021","unstructured":"Dan Hendrycks et al. 2021. Aligning AI with Shared Human Values. Proceedings of ICLR (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658644.3670370"},{"key":"e_1_3_2_1_15_1","volume-title":"Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations. arXiv preprint arXiv:2312.06674","author":"Hakan Inan","year":"2023","unstructured":"Hakan Inan et al. 2023. Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations. arXiv preprint arXiv:2312.06674 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3700838.3703687"},{"key":"e_1_3_2_1_17_1","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga et al. 2022. Holistic Evaluation of Language Models. arXiv preprint arXiv:2211.09110 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_19_1","volume-title":"Teleportation-Based Defenses for Privacy in Approximate Machine Unlearning. arXiv preprint arXiv:2512.00272","author":"Maheri Mohammad M","year":"2025","unstructured":"Mohammad M Maheri, Xavier Cadet, Peter Chin, and Hamed Haddadi. 2025. Teleportation-Based Defenses for Privacy in Approximate Machine Unlearning. arXiv preprint arXiv:2512.00272 (2025)."},{"key":"e_1_3_2_1_20_1","volume-title":"ZK-APEX: Zero-Knowledge Approximate Personalized Unlearning with Executable Proofs. arXiv preprint arXiv:2512.09953","author":"Maheri Mohammad M","year":"2025","unstructured":"Mohammad M Maheri, Sunil Cotterill, Alex Davidson, and Hamed Haddadi. 2025. ZK-APEX: Zero-Knowledge Approximate Personalized Unlearning with Executable Proofs. arXiv preprint arXiv:2512.09953 (2025)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1002\/asi.23062"},{"key":"e_1_3_2_1_22_1","volume-title":"Scalable extraction of training data from (production) language models. arXiv preprint arXiv:2311.17035","author":"Nasr Milad","year":"2023","unstructured":"Milad Nasr, Nicholas Carlini, Jonathan Hayase, Matthew Jagielski, A Feder Cooper, Daphne Ippolito, Christopher A Choquette-Choo, Eric Wallace, Florian Tram\u00e8r, and Katherine Lee. 2023. Scalable extraction of training data from (production) language models. arXiv preprint arXiv:2311.17035 (2023)."},{"key":"e_1_3_2_1_23_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_24_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin et al. 2022. Training language models to follow instructions with human feedback. arXiv preprint arXiv:2203.02155 (2022)."},{"key":"e_1_3_2_1_25_1","unstructured":"Xudong Pan et al. 2023. Backdoor Attacks on Language Models. arXiv preprint arXiv:2303.02564 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Red teaming language models with language models. arXiv preprint arXiv:2202.03286","author":"Perez Ethan","year":"2022","unstructured":"Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. 2022. Red teaming language models with language models. arXiv preprint arXiv:2202.03286 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"RNN, LSTM, GRU. arXiv preprint arXiv:2305.17473","author":"Shiri Farhad Mortezapour","year":"2023","unstructured":"Farhad Mortezapour Shiri, Thinagaran Perumal, Norwati Mustapha, and Raihani Mohamed. 2023. A comprehensive overview and comparative analysis on deep learning models: CNN, RNN, LSTM, GRU. arXiv preprint arXiv:2305.17473 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of EMNLP-IJCNLP","author":"Eric","year":"2019","unstructured":"Eric Wallace et al. 2019. Universal Adversarial Triggers for Attacking and Analyzing NLP. Proceedings of EMNLP-IJCNLP (2019)."},{"key":"e_1_3_2_1_29_1","first-page":"80079","article-title":"Jailbroken: How does llm safety training fail","volume":"36","author":"Wei Alexander","year":"2023","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2023. Jailbroken: How does llm safety training fail? Advances in Neural Information Processing Systems 36 (2023), 80079\u201380110.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","unstructured":"Laura Weidinger Jonathan Uesato Maribeth Rauh Conor Griffin John Mellor Po-Sen Huang et al. 2021. Ethical and social risks of harm from Language Models. arXiv preprint arXiv:2112.04359 (2021)."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 34th USENIX Security Symposium.","author":"Xu Kai","year":"2025","unstructured":"Kai Xu, Pranav Gopalakrishnan, Emily Choi, and Zhiyuan Lin. 2025. Adversarial Prompt Attacks and Defenses: A Systematic Evaluation. In Proceedings of the 34th USENIX Security Symposium."},{"key":"e_1_3_2_1_32_1","volume-title":"Jailbreak attacks and defenses against large language models: A survey. arXiv preprint arXiv:2407.04295","author":"Yi Sibo","year":"2024","unstructured":"Sibo Yi, Yule Liu, Zhen Sun, Tianshuo Cong, Xinlei He, Jiaxing Song, Ke Xu, and Qi Li. 2024. Jailbreak attacks and defenses against large language models: A survey. arXiv preprint arXiv:2407.04295 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Yu Jiahao","year":"2024","unstructured":"Jiahao Yu, Xingwei Lin, Zheng Yu, and Xinyu Xing. 2024. {LLM-Fuzzer}: Scaling assessment of large language model jailbreaks. In 33rd USENIX Security Symposium (USENIX Security 24). 4657\u20134674."},{"key":"e_1_3_2_1_34_1","volume-title":"LLM-Guard: Detecting and Filtering Harmful Prompts in Large Language Models. Transactions on Machine Learning Research","author":"Yung Felix","year":"2024","unstructured":"Felix Yung, Trisha Banerjee, Shuhan Duan, and Heng Ji. 2024. LLM-Guard: Detecting and Filtering Harmful Prompts in Large Language Models. Transactions on Machine Learning Research (2024). To appear."},{"key":"e_1_3_2_1_35_1","volume-title":"33rd USENIX Security Symposium (USENIX Security 24)","author":"Zhang Rui","year":"2024","unstructured":"Rui Zhang, Hongwei Li, Rui Wen, Wenbo Jiang, Yuan Zhang, Michael Backes, Yun Shen, and Yang Zhang. 2024. Instruction backdoor attacks against customized {LLMs}. In 33rd USENIX Security Symposium (USENIX Security 24). 1849\u20131866."},{"key":"e_1_3_2_1_36_1","unstructured":"Andy Zou et al. 2024. Instruction-Level Backdoor Attacks on Aligned Language Models. arXiv preprint arXiv:2402.08646 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J Zico Kolter, and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"EuroSys '26: 21st European Conference on Computer Systems","location":"Edinburgh Scotland Uk","acronym":"EuroMLSys '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Sixth European Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3805621.3807629","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:16:45Z","timestamp":1777382205000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805621.3807629"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,27]]},"references-count":37,"alternative-id":["10.1145\/3805621.3807629","10.1145\/3805621"],"URL":"https:\/\/doi.org\/10.1145\/3805621.3807629","relation":{},"subject":[],"published":{"date-parts":[[2026,4,27]]},"assertion":[{"value":"2026-04-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}