{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T06:14:07Z","timestamp":1775628847445,"version":"3.50.1"},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T00:00:00Z","timestamp":1764720000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T00:00:00Z","timestamp":1764720000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,3]]},"DOI":"10.1109\/icmla66185.2025.00172","type":"proceedings-article","created":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T19:54:58Z","timestamp":1775591698000},"page":"1116-1123","source":"Crossref","is-referenced-by-count":0,"title":["Jailbreaking Large Language Models: Safety Alignment, Response Quality, Computational Cost"],"prefix":"10.1109","author":[{"given":"Jonas","family":"Rosengren","sequence":"first","affiliation":[{"name":"Lovable Labs,Stockholm,Sweden,SE-111 37"}]},{"given":"Joel","family":"Brynielsson","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology,Stockholm,Sweden,SE-100 44"}]},{"given":"Fredrik","family":"Johansson","sequence":"additional","affiliation":[{"name":"FOI Swedish Defence Research Agency,Stockholm,Sweden,SE-164 90"}]},{"given":"Patrik","family":"Jonell","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proceedings of the 34th Conference on Neural Information Processing Systems (NeurIPS 2020)","author":"Brown"},{"key":"ref2","article-title":"DeepSeek-R1: Incentivizing reasoning capability in llms via reinforcement learning","year":"2025"},{"key":"ref3","first-page":"3356","article-title":"RealToxicityPrompts: Evaluating neural toxic degeneration in language models","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","author":"Gehman"},{"key":"ref4","article-title":"Defending against neural fake news","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Zellers","year":"2019"},{"key":"ref5","article-title":"Ethical and social risks of harm from language models","author":"Weidinger","year":"2021"},{"key":"ref6","doi-asserted-by":"crossref","DOI":"10.1101\/2024.04.07.24305462","article-title":"Risks from language models for automated mental healthcare: Ethics and structure for implementation","volume-title":"First Conference on Language Modeling","author":"Grabb"},{"key":"ref7","article-title":"Generative AI misuse: A taxonomy of tactics and insights from realworld data","author":"Marchal","year":"2024"},{"key":"ref8","article-title":"A general language assistant as a laboratory for alignment","author":"Askell","year":"2021"},{"key":"ref9","article-title":"Training a helpful and harmless assistant with reinforcement learning from human feedback","author":"Bai","year":"2022"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2011"},{"key":"ref11","article-title":"Jailbroken: How does LLM safety training fail?","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Wei"},{"key":"ref12","article-title":"Universal and transferable adversarial attacks on aligned language models","author":"Zou","year":"2023"},{"key":"ref13","article-title":"Catastrophic jailbreak of open-source LLMs via exploiting generation","volume-title":"The Twelfth International Conference on Learning Representations","author":"Huang"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4322"},{"key":"ref15","article-title":"Assessing the brittleness of safety alignment via pruning and low-rank modifications","volume-title":"ICLR 2024 Workshop on Mathematical and Empirical Understanding of Foundation Models","author":"Wei"},{"key":"ref16","article-title":"Fine-tuning aligned language models compromises safety, even when users do not intend to","volume-title":"ICLR","author":"Qi","year":"2024"},{"key":"ref17","article-title":"Harmbench: A standardized evaluation framework for automated red teaming and robust refusal","volume-title":"ICML","author":"Mazeika","year":"2024"},{"key":"ref18","first-page":"55005","article-title":"JailbreakBench: An open robustness benchmark for jailbreaking large language models","volume-title":"Advances in Neural Information Processing Systems","volume":"37","author":"Chao","year":"2024"},{"key":"ref19","article-title":"Safe RLHF: Safe reinforcement learning from human feedback","volume-title":"The Twelfth International Conference on Learning Representations","author":"Dai"},{"key":"ref20","article-title":"Constitutional classifiers: Defending against universal jailbreaks across thousands of hours of red teaming","author":"Sharma","year":"2025"},{"key":"ref21","article-title":"Measuring massive multitask language understanding","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks"},{"key":"ref22","first-page":"4791","article-title":"HellaSwag: Can a machine really finish your sentence?","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","author":"Zellers"},{"key":"ref23","first-page":"13025","article-title":"InFoBench: Evaluating instruction following ability in large language models","volume-title":"Findings of the Association for Computational Linguistics: ACL 2024","author":"Qin"},{"key":"ref24","first-page":"163","article-title":"Can large language models serve as effective classifiers for hierarchical multi-label classification of scientific documents at industrial scale?","volume-title":"Proceedings of the 31st International Conference on Computational Linguistics: Industry Track","author":"Tabatabaei"},{"key":"ref25","article-title":"Large language models for text classification: Case study and comprehensive review","author":"Kostina","year":"2025"},{"key":"ref26","article-title":"Giving Claude a role with a system prompt","year":"2024"},{"key":"ref27","article-title":"Jailbreaking in genai: Techniques and ethical implications","year":"2024"},{"key":"ref28","article-title":"ChatGPT \u201dDAN\" (and other \"jailbreaks\")","author":"Lee","year":"2023"},{"key":"ref29","article-title":"Stanford Alpaca: An instruction-following Llama model","author":"Taori","year":"2023"},{"key":"ref30","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"International Conference on Learning Representations","author":"Hu"}],"event":{"name":"2025 International Conference on Machine Learning and Applications (ICMLA)","location":"Boca Raton, FL, USA","start":{"date-parts":[[2025,12,3]]},"end":{"date-parts":[[2025,12,5]]}},"container-title":["2025 International Conference on Machine Learning and Applications (ICMLA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11471302\/11471304\/11471392.pdf?arnumber=11471392","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T05:38:42Z","timestamp":1775626722000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11471392\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,3]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/icmla66185.2025.00172","relation":{},"subject":[],"published":{"date-parts":[[2025,12,3]]}}}