{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T15:13:41Z","timestamp":1781190821154,"version":"3.54.1"},"publisher-location":"Singapore","reference-count":47,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819596935","type":"print"},{"value":"9789819596942","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-9694-2_8","type":"book-chapter","created":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T14:37:53Z","timestamp":1781188673000},"page":"97-113","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Review of\u00a0LLM Jailbreaks: White-Box and\u00a0Black-Box Perspectives on\u00a0Attacks, Defenses, and\u00a0Critical Metrics"],"prefix":"10.1007","author":[{"given":"Shuyuan","family":"Liu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiawei","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhaoxia","family":"Yin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,5,1]]},"reference":[{"key":"8_CR1","unstructured":"Alon, G., Kamfonas, M.: Detecting language model attacks with perplexity (2023). https:\/\/arxiv.org\/abs\/2308.14132"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Cao, B., Cao, Y., Lin, L., Chen, J.: Defending against alignment-breaking attacks via robustly aligned LLM (2023)","DOI":"10.18653\/v1\/2024.acl-long.568"},{"key":"8_CR3","doi-asserted-by":"publisher","unstructured":"Cheng, J., Danescu-Niculescu-Mizil, C., Leskovec, J.: Antisocial behavior in online discussion communities. Proc. Int. AAAI Conf. Web Soc. Media 9(1), 61\u201370 (2021). https:\/\/doi.org\/10.1609\/icwsm.v9i1.14583. https:\/\/ojs.aaai.org\/index.php\/ICWSM\/article\/view\/14583","DOI":"10.1609\/icwsm.v9i1.14583"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Chu, J., Liu, Y., Yang, Z., Shen, X., Backes, M., Zhang, Y.: Jailbreakradar: comprehensive assessment of jailbreak attacks against LLMs. arXiv preprint arXiv:2402.05668 (2024)","DOI":"10.18653\/v1\/2025.acl-long.1045"},{"key":"8_CR5","doi-asserted-by":"publisher","unstructured":"Cui, S., et al.: FFT: towards harmlessness evaluation and analysis for LLMs with factuality, fairness, toxicity. CoRR abs\/2311.18580 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2311.18580","DOI":"10.48550\/ARXIV.2311.18580"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Ding, P., et al.: A wolf in sheep\u2019s clothing: generalized nested jailbreak prompts can fool large language models easily (2023)","DOI":"10.18653\/v1\/2024.naacl-long.118"},{"key":"8_CR7","doi-asserted-by":"publisher","unstructured":"Dong, Z., Zhou, Z., Yang, C., Shao, J., Qiao, Y.: Attacks, defenses and evaluations for LLM conversation safety: a survey. In: Duh, K., Gomez, H., Bethard, S. (eds.) Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 6734\u20136747. Association for Computational Linguistics, Mexico City, Mexico (2024). https:\/\/doi.org\/10.18653\/v1\/2024.naacl-long.375. https:\/\/aclanthology.org\/2024.naacl-long.375\/","DOI":"10.18653\/v1\/2024.naacl-long.375"},{"key":"8_CR8","unstructured":"Du, Y., Zhao, S., Ma, M., Chen, Y., Qin, B.: Analyzing the inherent response tendency of LLMs: real-world instructions-driven jailbreak (2024). https:\/\/arxiv.org\/abs\/2312.04127"},{"key":"8_CR9","doi-asserted-by":"publisher","unstructured":"Gehman, S., Gururangan, S., Sap, M., Choi, Y., Smith, N.A.: RealToxicityPrompts: evaluating neural toxic degeneration in language models. In: Cohn, T., He, Y., Liu, Y. (eds.) Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 3356\u20133369. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.301. https:\/\/aclanthology.org\/2020.findings-emnlp.301\/","DOI":"10.18653\/v1\/2020.findings-emnlp.301"},{"key":"8_CR10","doi-asserted-by":"publisher","unstructured":"Goyal, S., et al.: LLMGuard: guarding against unsafe LLM behavior. In: Proceedings of the Thirty-Eighth AAAI Conference on Artificial Intelligence and Thirty-Sixth Conference on Innovative Applications of Artificial Intelligence and Fourteenth Symposium on Educational Advances in Artificial Intelligence, AAAI\u201924\/IAAI\u201924\/EAAI\u201924. AAAI Press (2024). https:\/\/doi.org\/10.1609\/aaai.v38i21.30566","DOI":"10.1609\/aaai.v38i21.30566"},{"key":"8_CR11","unstructured":"Guo, X., Yu, F., Zhang, H., Qin, L., Hu, B.: Cold-attack: jailbreaking LLMs with stealthiness and controllability. In: Proceedings of the 41st International Conference on Machine Learning, ICML 2024. JMLR.org (2024)"},{"key":"8_CR12","doi-asserted-by":"publisher","unstructured":"Hartvigsen, T., Gabriel, S., Palangi, H., Sap, M., Ray, D., Kamar, E.: ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 3309\u20133326. Association for Computational Linguistics, Dublin, Ireland, May 2022. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.234. https:\/\/aclanthology.org\/2022.acl-long.234\/","DOI":"10.18653\/v1\/2022.acl-long.234"},{"key":"8_CR13","unstructured":"He, L., Xia, M., Henderson, P.: What is in your safe data? Identifying benign data that breaks safety (2024). https:\/\/arxiv.org\/abs\/2404.01099"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"Jha, P., Arora, A., Ganesh, V.: LLM stinger: Jailbreaking LLMs using RL fine-tuned LLMs (student abstract). In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a039, pp. 29393\u201329395 (2025)","DOI":"10.1609\/aaai.v39i28.35263"},{"key":"8_CR15","unstructured":"Jia, X., et al.: Improved techniques for optimization-based jailbreaking on large language models (2024). https:\/\/arxiv.org\/abs\/2405.21018"},{"key":"8_CR16","doi-asserted-by":"publisher","unstructured":"Jiang, F., et al.: ArtPrompt: ASCII art-based jailbreak attacks against aligned LLMs. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 15157\u201315173. Association for Computational Linguistics, Bangkok, Thailand, August 2024. https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.809. https:\/\/aclanthology.org\/2024.acl-long.809\/","DOI":"10.18653\/v1\/2024.acl-long.809"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Kang, D., Li, X., Stoica, I., Guestrin, C., Zaharia, M., Hashimoto, T.: Exploiting programmatic behavior of LLMs: dual-use through standard security attacks (2023). https:\/\/arxiv.org\/abs\/2302.05733","DOI":"10.1109\/SPW63631.2024.00018"},{"key":"8_CR18","unstructured":"Lermen, S., Rogers-Smith, C., Ladish, J.: Lora fine-tuning efficiently undoes safety training in llama 2-chat 70B (2024). https:\/\/arxiv.org\/abs\/2310.20624"},{"key":"8_CR19","doi-asserted-by":"publisher","unstructured":"Li, H., et al.: PrivLM-bench: a multi-level privacy evaluation benchmark for language models. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 54\u201373. Association for Computational Linguistics, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.4. https:\/\/aclanthology.org\/2024.acl-long.4\/","DOI":"10.18653\/v1\/2024.acl-long.4"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Li, H., Ye, J., Wu, J., Yan, T., Wang, C., Li, Z.: JailPO: a novel black-box jailbreak framework via preference optimization against aligned LLMs. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a039, pp. 27419\u201327427 (2025)","DOI":"10.1609\/aaai.v39i26.34953"},{"key":"8_CR21","unstructured":"Li, T., et al.: Revisiting jailbreaking for large language models: a representation engineering perspective. In: Proceedings of the 31st International Conference on Computational Linguistics, pp. 3158\u20133178 (2025)"},{"key":"8_CR22","unstructured":"Li, X., Zhou, Z., Zhu, J., Yao, J., Liu, T., Han, B.: DeepInception: hypnotize large language model to be jailbreaker (2024). https:\/\/arxiv.org\/abs\/2311.03191"},{"key":"8_CR23","doi-asserted-by":"publisher","unstructured":"Lin, S., Hilton, J., Evans, O.: TruthfulQA: measuring how models mimic human falsehoods. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 3214\u20133252. Association for Computational Linguistics, Dublin, Ireland, May 2022. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.229. https:\/\/aclanthology.org\/2022.acl-long.229\/","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Liu, S., Chen, J., Ruan, S., Su, H., Yin, Z.: Exploring the robustness of decision-level through adversarial attacks on LLM-based embodied models. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 8120\u20138128 (2024)","DOI":"10.1145\/3664647.3680616"},{"key":"8_CR25","unstructured":"Liu, X., Xu, N., Chen, M., Xiao, C.: AutoDAN: generating stealthy jailbreak prompts on aligned large language models. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=7Jwpw4qKkb"},{"key":"8_CR26","unstructured":"Luo, W., Ma, S., Liu, X., Guo, X., Xiao, C.: JailBreakV-28K: a benchmark for assessing the robustness of multimodal large language models against jailbreak attacks (2024)"},{"key":"8_CR27","unstructured":"Mozes, M., He, X., Kleinberg, B., Griffin, L.D.: Use of LLMs for illicit purposes: threats, prevention measures, and vulnerabilities (2023). https:\/\/arxiv.org\/abs\/2308.12833"},{"key":"8_CR28","doi-asserted-by":"publisher","unstructured":"Nobata, C., Tetreault, J., Thomas, A., Mehdad, Y., Chang, Y.: Abusive language detection in online user content. In: Proceedings of the 25th International Conference on World Wide Web, WWW 2016, pp. 145\u2013153. International World Wide Web Conferences Steering Committee, Republic and Canton of Geneva, CHE (2016). https:\/\/doi.org\/10.1145\/2872427.2883062","DOI":"10.1145\/2872427.2883062"},{"key":"8_CR29","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Proceedings of the 36th International Conference on Neural Information Processing Systems, NIPS 2022. Curran Associates Inc., Red Hook, NY, USA (2022)"},{"key":"8_CR30","unstructured":"Qiu, H., Zhang, S., Li, A., He, H., Lan, Z.: Latent jailbreak: a benchmark for evaluating text safety and output robustness of large language models (2023). https:\/\/arxiv.org\/abs\/2307.08487"},{"key":"8_CR31","unstructured":"Robey, A., Wong, E., Hassani, H., Pappas, G.J.: SmoothLLM: defending large language models against jailbreaking attacks (2024). https:\/\/arxiv.org\/abs\/2310.03684"},{"key":"8_CR32","unstructured":"Shah, R., Feuillade-Montixi, Q., Pour, S., Tagade, A., Casper, S., Rando, J.: Scalable and transferable black-box jailbreaks for language models via persona modulation (2023). https:\/\/arxiv.org\/abs\/2311.03348"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Shen, X., Chen, Z., Backes, M., Shen, Y., Zhang, Y.: \u201cDo anything now\u201d: characterizing and evaluating in-the-wild jailbreak prompts on large language models (2024). https:\/\/arxiv.org\/abs\/2308.03825","DOI":"10.1145\/3658644.3670388"},{"key":"8_CR34","doi-asserted-by":"publisher","unstructured":"Wang, Y., Shi, Z., Bai, A., Hsieh, C.J.: Defending LLMs against jailbreaking attacks via backtranslation. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Findings of the Association for Computational Linguistics: ACL 2024, pp. 16031\u201316046. Association for Computational Linguistics, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.findings-acl.948. https:\/\/aclanthology.org\/2024.findings-acl.948\/","DOI":"10.18653\/v1\/2024.findings-acl.948"},{"key":"8_CR35","unstructured":"Wei, Z., Wang, Y., Li, A., Mo, Y., Wang, Y.: Jailbreak and guard aligned language models with only few in-context demonstrations (2024). https:\/\/arxiv.org\/abs\/2310.06387"},{"key":"8_CR36","doi-asserted-by":"publisher","unstructured":"Wulczyn, E., Thain, N., Dixon, L.: Ex machina: personal attacks seen at scale. In: Proceedings of the 26th International Conference on World Wide Web, WWW 2017, pp. 1391\u20131399. International World Wide Web Conferences Steering Committee, Republic and Canton of Geneva, CHE (2017). https:\/\/doi.org\/10.1145\/3038912.3052591","DOI":"10.1145\/3038912.3052591"},{"key":"8_CR37","doi-asserted-by":"publisher","unstructured":"Xu, Z., Jiang, F., Niu, L., Jia, J., Lin, B.Y., Poovendran, R.: SafeDecoding: defending against jailbreak attacks via safety-aware decoding. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5587\u20135605. Association for Computational Linguistics, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.303. https:\/\/aclanthology.org\/2024.acl-long.303\/","DOI":"10.18653\/v1\/2024.acl-long.303"},{"key":"8_CR38","doi-asserted-by":"publisher","unstructured":"Xue, Y., et al.: Dual intention escape: Penetrating and toxic jailbreak attack against large language models. In: Proceedings of the ACM on Web Conference 2025, WWW 2025, pp. 863\u2013871. Association for Computing Machinery, New York, NY, USA (2025). https:\/\/doi.org\/10.1145\/3696410.3714654","DOI":"10.1145\/3696410.3714654"},{"key":"8_CR39","unstructured":"Yuan, Y., et al.: GPT-4 is too smart to be safe: stealthy chat with LLMs via cipher (2023)"},{"key":"8_CR40","volume-title":"Defending Against Neural Fake News","author":"R Zellers","year":"2019","unstructured":"Zellers, R., et al.: Defending Against Neural Fake News. Curran Associates Inc., Red Hook, NY, USA (2019)"},{"key":"8_CR41","doi-asserted-by":"publisher","unstructured":"Zhan, Q., Fang, R., Bindu, R., Gupta, A., Hashimoto, T., Kang, D.: Removing RLHF protections in GPT-4 via fine-tuning. In: Duh, K., Gomez, H., Bethard, S. (eds.) Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), pp. 681\u2013687. Association for Computational Linguistics, Mexico City, Mexico (2024). https:\/\/doi.org\/10.18653\/v1\/2024.naacl-short.59. https:\/\/aclanthology.org\/2024.naacl-short.59\/","DOI":"10.18653\/v1\/2024.naacl-short.59"},{"key":"8_CR42","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Wei, Z.: Boosting jailbreak attack with momentum. In: ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135 (2025). https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10888812","DOI":"10.1109\/ICASSP49660.2025.10888812"},{"key":"8_CR43","doi-asserted-by":"publisher","unstructured":"Zhang, Z., Yang, J., Ke, P., Mi, F., Wang, H., Huang, M.: Defending large language models against jailbreaking attacks through goal prioritization. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 8865\u20138887. Association for Computational Linguistics, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.481. https:\/\/aclanthology.org\/2024.acl-long.481\/","DOI":"10.18653\/v1\/2024.acl-long.481"},{"key":"8_CR44","unstructured":"Zhou, A., Li, B., Wang, H.: Robust prompt optimization for defending language models against jailbreaking attacks (2024)"},{"key":"8_CR45","unstructured":"Zhou, C., et al.: Lima: less is more for alignment. In: Proceedings of the 37th International Conference on Neural Information Processing Systems, NIPS 2023. Curran Associates Inc., Red Hook, NY, USA (2023)"},{"key":"8_CR46","doi-asserted-by":"publisher","unstructured":"Zhou, Y., et al.: Defending jailbreak prompts via in-context adversarial game. In: Al-Onaizan, Y., Bansal, M., Chen, Y.N. (eds.) Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pp. 20084\u201320105. Association for Computational Linguistics, Miami, Florida, USA (2024). https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.1121. https:\/\/aclanthology.org\/2024.emnlp-main.1121\/","DOI":"10.18653\/v1\/2024.emnlp-main.1121"},{"key":"8_CR47","unstructured":"Zou, A., Wang, Z., Carlini, N., Nasr, M., Kolter, J.Z., Fredrikson, M.: Universal and transferable adversarial attacks on aligned language models (2023). https:\/\/arxiv.org\/abs\/2307.15043"}],"container-title":["Lecture Notes in Computer Science","Evaluation Science and Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-9694-2_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T14:38:08Z","timestamp":1781188688000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-9694-2_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819596935","9789819596942"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-9694-2_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"1 May 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Bench","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Benchmarking, Measuring and Optimization","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chengdu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 December 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 December 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"bench2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.benchcouncil.org\/bench2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}