{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:50:53Z","timestamp":1755802253928,"version":"3.44.0"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032020178"},{"type":"electronic","value":"9783032020185"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02018-5_39","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T10:16:12Z","timestamp":1755771372000},"page":"537-565","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Safety Retrofitting Against Jailbreaking for\u00a0LLMs"],"prefix":"10.1007","author":[{"given":"Dario","family":"Garcia-Gasulla","sequence":"first","affiliation":[]},{"given":"Adri\u00e1n","family":"Tormos","sequence":"additional","affiliation":[]},{"given":"Anna","family":"Arias-Duart","sequence":"additional","affiliation":[]},{"given":"Daniel","family":"Hinjos","sequence":"additional","affiliation":[]},{"given":"Oscar","family":"Molina-Sedano","sequence":"additional","affiliation":[]},{"given":"Ashwin Kumar","family":"Gurarajan","sequence":"additional","affiliation":[]},{"given":"Maria 
Eugenia","family":"Cardello","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"39_CR1","unstructured":"Abdin, M., et\u00a0al.: Phi-3 technical report: a highly capable language model locally on your phone (2024). https:\/\/arxiv.org\/abs\/2404.14219"},{"key":"39_CR2","unstructured":"AI@Meta: LLaMA 3 model card (2024). https:\/\/github.com\/meta-llama\/llama3\/blob\/main\/MODEL_CARD.md"},{"key":"39_CR3","unstructured":"Andriushchenko, M., Flammarion, N.: Does refusal training in LLMs generalize to the past tense? (2024). https:\/\/arxiv.org\/abs\/2407.11969"},{"key":"39_CR4","unstructured":"Bai, J., et\u00a0al.: Qwen 1 technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"39_CR5","unstructured":"Bai, J., et\u00a0al.: Qwen 2 technical report (2023)"},{"key":"39_CR6","unstructured":"Bianchi, F., et\u00a0al.: Safety-tuned LLaMAs: lessons from improving the safety of large language models that follow instructions. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"39_CR7","unstructured":"Chao, P., et\u00a0al.: JailBreakBench: an open robustness benchmark for jailbreaking large language models (2024). https:\/\/arxiv.org\/abs\/2404.01318"},{"key":"39_CR8","unstructured":"Chao, P., et\u00a0al.: Jailbreaking black box large language models in twenty queries (2024). https:\/\/arxiv.org\/abs\/2310.08419"},{"key":"39_CR9","unstructured":"Chen, S., et\u00a0al.: Red teaming GPT-4v: are GPT-4v safe against uni\/multi-modal jailbreak attacks? (2024). https:\/\/arxiv.org\/abs\/2404.03411"},{"key":"39_CR10","unstructured":"Chowdhury, A.G., et\u00a0al.: Breaking down the defenses: a comparative survey of attacks on large language models (2024). https:\/\/arxiv.org\/abs\/2403.04786"},{"key":"39_CR11","unstructured":"Christiano, P., et\u00a0al.: Deep reinforcement learning from human preferences (2023). 
https:\/\/arxiv.org\/abs\/1706.03741"},{"key":"39_CR12","unstructured":"Cui, J., et al.: Or-bench: an over-refusal benchmark for large language models. arXiv preprint arXiv:2405.20947 (2024)"},{"key":"39_CR13","doi-asserted-by":"crossref","unstructured":"Deng, G., et\u00a0al.: MasterKey: automated jailbreaking of large language model chatbots. In: Proceedings 2024 Network and Distributed System Security Symposium (2024)","DOI":"10.14722\/ndss.2024.24188"},{"key":"39_CR14","doi-asserted-by":"crossref","unstructured":"Ding, P., et\u00a0al.: A wolf in sheep\u2018s clothing: generalized nested jailbreak prompts can fool large language models easily. In: Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers) (2024)","DOI":"10.18653\/v1\/2024.naacl-long.118"},{"key":"39_CR15","unstructured":"Feng, D., et\u00a0al.: Towards analyzing and understanding the limitations of DPO: a theoretical perspective (2024). https:\/\/arxiv.org\/abs\/2404.04626"},{"key":"39_CR16","unstructured":"Fourrier, C., et\u00a0al.: Open LLM leaderboard v2 (2024). https:\/\/huggingface.co\/spaces\/open-llm-leaderboard\/open_llm_leaderboard"},{"key":"39_CR17","unstructured":"Ganguli, D., et\u00a0al.: Red teaming language models to reduce harms: methods, scaling behaviors, and lessons learned (2022). https:\/\/arxiv.org\/abs\/2209.07858"},{"key":"39_CR18","doi-asserted-by":"publisher","unstructured":"Gao, L., et\u00a0al.: A framework for few-shot language model evaluation (2024). https:\/\/doi.org\/10.5281\/zenodo.12608602","DOI":"10.5281\/zenodo.12608602"},{"key":"39_CR19","unstructured":"Hendrycks, D., et\u00a0al.: Measuring massive multitask language understanding (2021). https:\/\/arxiv.org\/abs\/2009.03300"},{"key":"39_CR20","unstructured":"Hu, J., et\u00a0al.: OpenRLHF: an easy-to-use, scalable and high-performance RLHF framework (2024). 
https:\/\/arxiv.org\/abs\/2405.11143"},{"key":"39_CR21","unstructured":"Huang, Y., et\u00a0al.: Catastrophic jailbreak of open-source LLMs via exploiting generation. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"39_CR22","unstructured":"Intel: Orca DPO pairs. https:\/\/huggingface.co\/datasets\/Intel\/orca_dpo_pairs"},{"key":"39_CR23","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b (2023). https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"39_CR24","doi-asserted-by":"crossref","unstructured":"Khaki, S., et\u00a0al.: RS-DPO: a hybrid rejection sampling and direct preference optimization method for alignment of large language models (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.108"},{"key":"39_CR25","unstructured":"Kim, G.H., et\u00a0al.: SafeDPO: a simple approach to direct preference optimization with enhanced safety. arXiv preprint arXiv:2505.20065 (2025)"},{"key":"39_CR26","unstructured":"Kwon, W., et\u00a0al.: Efficient memory management for large language model serving with pagedattention (2023). https:\/\/arxiv.org\/abs\/2309.06180"},{"key":"39_CR27","unstructured":"Lambert, N., et\u00a0al.: Tulu 3: pushing frontiers in open language model post-training (2024). https:\/\/arxiv.org\/abs\/2411.15124"},{"key":"39_CR28","unstructured":"Li, X., et\u00a0al.: DeepInception: hypnotize large language model to be jailbreaker (2024). https:\/\/arxiv.org\/abs\/2311.03191"},{"key":"39_CR29","unstructured":"Liu, X., et\u00a0al.: AutoDAN: generating stealthy jailbreak prompts on aligned large language models (2024). https:\/\/arxiv.org\/abs\/2310.04451"},{"key":"39_CR30","unstructured":"Llama\u00a0Team, A..M.: The LLaMA 3 herd of models (2024). https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"39_CR31","unstructured":"Mazeika, M., et\u00a0al.: TDC 2023 (LLM edition): the trojan detection challenge. 
In: NeurIPS Competition Track (2023)"},{"key":"39_CR32","unstructured":"Mazeika, M., et\u00a0al.: HarmBench: a standardized evaluation framework for automated red teaming and robust refusal (2024)"},{"key":"39_CR33","unstructured":"Mou, Y., Zhang, S., Ye, W.: SG-bench: evaluating LLM safety generalization across diverse tasks and prompt types (2024). https:\/\/arxiv.org\/abs\/2410.21965"},{"key":"39_CR34","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback (2022). https:\/\/arxiv.org\/abs\/2203.02155"},{"key":"39_CR35","unstructured":"Park, R., et\u00a0al.: Disentangling length from quality in direct preference optimization (2024). https:\/\/arxiv.org\/abs\/2403.19159"},{"key":"39_CR36","doi-asserted-by":"crossref","unstructured":"Perez, E., et\u00a0al.: Red teaming language models with language models (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.225"},{"key":"39_CR37","unstructured":"Rafailov, R., et\u00a0al.: Direct preference optimization: your language model is secretly a reward model (2024). https:\/\/arxiv.org\/abs\/2305.18290"},{"key":"39_CR38","unstructured":"Rafailov, R., et\u00a0al.: Scaling laws for reward model overoptimization in direct alignment algorithms (2024). https:\/\/arxiv.org\/abs\/2406.02900"},{"key":"39_CR39","unstructured":"Ramamurthy, R., et\u00a0al.: Is reinforcement learning (not) for natural language processing: Benchmarks, baselines, and building blocks for natural language policy optimization (2023). https:\/\/arxiv.org\/abs\/2210.01241"},{"key":"39_CR40","unstructured":"Saeidi, A., Verma, S., Baral, C.: Insights into alignment: evaluating DPO and its variants across multiple tasks (2024). https:\/\/arxiv.org\/abs\/2404.14723"},{"key":"39_CR41","unstructured":"Samvelyan, M., et\u00a0al.: Rainbow teaming: open-ended generation of diverse adversarial prompts. 
In: ICLR 2024 Workshop on Secure and Trustworthy Large Language Models (2024)"},{"key":"39_CR42","unstructured":"Samvelyan, M., et\u00a0al.: Rainbow teaming: open-ended generation of diverse adversarial prompts (2024). https:\/\/arxiv.org\/abs\/2402.16822"},{"key":"39_CR43","doi-asserted-by":"publisher","unstructured":"Shaikh, O., et\u00a0al.: On second thought, let\u2018s not think step by step! Bias and toxicity in zero-shot reasoning. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (2023). https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.244","DOI":"10.18653\/v1\/2023.acl-long.244"},{"key":"39_CR44","doi-asserted-by":"crossref","unstructured":"Shen, X., et\u00a0al.: \u201cDo anything now\u201d: characterizing and evaluating in-the-wild jailbreak prompts on large language models. In: ACM SIGSAC Conference on Computer and Communications Security (CCS). ACM (2024)","DOI":"10.1145\/3658644.3670388"},{"key":"39_CR45","doi-asserted-by":"publisher","unstructured":"Strubell, E., et\u00a0al.: Energy and policy considerations for deep learning in NLP. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 3645\u20133650 (2019). https:\/\/doi.org\/10.18653\/v1\/P19-1355","DOI":"10.18653\/v1\/P19-1355"},{"key":"39_CR46","unstructured":"Su, J., et\u00a0al.: Mission impossible: a statistical perspective on jailbreaking LLMs (2024). https:\/\/arxiv.org\/abs\/2408.01420"},{"key":"39_CR47","doi-asserted-by":"crossref","unstructured":"Sun, D.Q., et\u00a0al.: Delphi: data for evaluating LLMs\u2019 performance in handling controversial issues. In: EMNLP (2023)","DOI":"10.18653\/v1\/2023.emnlp-industry.76"},{"key":"39_CR48","doi-asserted-by":"publisher","unstructured":"Team, G.: Gemma (2024). https:\/\/doi.org\/10.34740\/KAGGLE\/M\/3301, https:\/\/www.kaggle.com\/m\/3301","DOI":"10.34740\/KAGGLE\/M\/3301"},{"key":"39_CR49","unstructured":"Team, L.: Meta LLaMA guard 2 (2024). 
https:\/\/github.com\/meta-llama\/PurpleLlama\/blob\/main\/Llama-Guard2\/MODEL_CARD.md"},{"key":"39_CR50","unstructured":"Tedeschi, S., et\u00a0al.: Alert: a comprehensive benchmark for assessing large language models\u2019 safety through red teaming (2024). https:\/\/arxiv.org\/abs\/2404.08676"},{"key":"39_CR51","unstructured":"Vidgen, B., et\u00a0al.: SimpleSafetyTests: a test suite for identifying critical safety risks in large language models (2024). https:\/\/arxiv.org\/abs\/2311.08370"},{"key":"39_CR52","unstructured":"Wang, Y., et\u00a0al.: Do-not-answer: evaluating safeguards in LLMs. In: Graham, Y., Purver, M. (eds.) Findings of the Association for Computational Linguistics: EACL 2024 (2024). https:\/\/aclanthology.org\/2024.findings-eacl.61"},{"key":"39_CR53","unstructured":"Wei, A., et\u00a0al.: Jailbroken: how does LLM safety training fail? In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"39_CR54","unstructured":"Wolf, Y., et\u00a0al.: Tradeoffs between alignment and helpfulness in language models with representation engineering (2024). https:\/\/arxiv.org\/abs\/2401.16332"},{"key":"39_CR55","doi-asserted-by":"crossref","unstructured":"Yang, A., et\u00a0al.: Qwen2.5 technical report (2025)","DOI":"10.2172\/2497340"},{"key":"39_CR56","unstructured":"Yi, S., et\u00a0al.: Jailbreak attacks and defenses against large language models: a survey (2024). https:\/\/arxiv.org\/abs\/2407.04295"},{"key":"39_CR57","unstructured":"Young, A., et\u00a0al.: Yi: open foundation models by 01. AI. arXiv preprint arXiv:2403.04652 (2024)"},{"key":"39_CR58","unstructured":"Yu, J., et\u00a0al.: GPTFuzzer: red teaming large language models with auto-generated jailbreak prompts (2024). https:\/\/arxiv.org\/abs\/2309.10253"},{"key":"39_CR59","unstructured":"Zhu, B., et\u00a0al.: Starling-7B: improving helpfulness and harmlessness with RLAIF. 
In: First Conference on Language Modeling (2024)"},{"key":"39_CR60","unstructured":"Zou, A., et\u00a0al.: Universal and transferable adversarial attacks on aligned language models (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Safety, Reliability, and Security. SAFECOMP 2025 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02018-5_39","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T10:16:31Z","timestamp":1755771391000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02018-5_39"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032020178","9783032020185"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02018-5_39","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SAFECOMP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Computer Safety, Reliability, and Security","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Stockholm","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sweden","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"44","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"safecomp2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/safecomp2025.se\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}