{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T22:22:36Z","timestamp":1771539756868,"version":"3.50.1"},"reference-count":57,"publisher":"Elsevier BV","issue":"3","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","award":["2024YFB4506200"],"award-info":[{"award-number":["2024YFB4506200"]}],"id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100019081","name":"Science and Technology Program of Hunan Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100019081","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014206","name":"National Key Laboratory Foundation of China","doi-asserted-by":"publisher","award":["2024-KJWPDL-14"],"award-info":[{"award-number":["2024-KJWPDL-14"]}],"id":[{"id":"10.13039\/501100014206","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Processing &amp; Management"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.ipm.2025.104544","type":"journal-article","created":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T18:01:37Z","timestamp":1765562497000},"page":"104544","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"title":["SEAttack: A self-evolving jailbreak attack to induce toxic responses for non-toxic queries in large language models"],"prefix":"10.1016","volume":"63","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0252-6973","authenticated-orcid":false,"given":"Huijun","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6508-5119","authenticated-orcid":false,"given":"Shasha","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5508-5051","authenticated-orcid":false,"given":"Bin","family":"Ji","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4455-3128","authenticated-orcid":false,"given":"Xiaohu","family":"Du","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8695-5591","authenticated-orcid":false,"given":"Xiaopeng","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2258-0854","authenticated-orcid":false,"given":"Jun","family":"Ma\u2020","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5655-6014","authenticated-orcid":false,"given":"Jie","family":"Yu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.ipm.2025.104544_bib0001","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S. et al. (2023). GPT-4 technical report. arXiv preprint arXiv:2303.08774."},{"key":"10.1016\/j.ipm.2025.104544_bib0002","series-title":"ICML 2024 next generation of AI safety workshop","article-title":"Jailbreaking leading safety-aligned LLMs with simple adaptive attacks","author":"Andriushchenko","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0003","unstructured":"Anil, R., Dai, A. M., Firat, O., Johnson, M., Lepikhin, D., Passos, A., Shakeri, S., Taropa, E., Bailey, P., Chen, Z. et al. (2023). PaLM 2 technical report. arXiv preprint arXiv:2305.10403."},{"key":"10.1016\/j.ipm.2025.104544_bib0004","unstructured":"Bai, Y., Jones, A., Ndousse, K., Askell, A., Chen, A., DasSarma, N., Drain, D., Fort, S., Ganguli, D., Henighan, T. et al. (2022a). Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv preprint arXiv:2204.05862."},{"key":"10.1016\/j.ipm.2025.104544_bib0005","unstructured":"Bai, Y., Kadavath, S., Kundu, S., Askell, A., Kernion, J., Jones, A., Chen, A., Goldie, A., Mirhoseini, A., McKinnon, C. et al. (2022b). Constitutional AI: Harmlessness from AI feedback. arXiv preprint arXiv:2212.08073."},{"key":"10.1016\/j.ipm.2025.104544_bib0006","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"issue":"3","key":"10.1016\/j.ipm.2025.104544_bib0007","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3641289","article-title":"A survey on evaluation of large language models","volume":"15","author":"Chang","year":"2024","journal-title":"ACM Transactions on Intelligent Systems and Technology"},{"key":"10.1016\/j.ipm.2025.104544_bib0008","unstructured":"Chen, Z., Jiang, F., Chen, J., Wang, T., Yu, F., Chen, G., Zhang, H., Liang, J., Zhang, C., Zhang, Z. et al. (2023). Phoenix: Democratizing ChatGPT across languages. arXiv preprint arXiv:2304.10453."},{"issue":"240","key":"10.1016\/j.ipm.2025.104544_bib0009","first-page":"1","article-title":"PaLM: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"issue":"70","key":"10.1016\/j.ipm.2025.104544_bib0010","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2024","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.ipm.2025.104544_bib0011","unstructured":"Cui, Y., Yang, Z., & Yao, X. (2023). Efficient and effective text encoding for Chinese Llama and Alpaca. arXiv preprint arXiv:2304.08177."},{"key":"10.1016\/j.ipm.2025.104544_bib0012","doi-asserted-by":"crossref","first-page":"512","DOI":"10.1609\/icwsm.v11i1.14955","article-title":"Automated hate speech detection and the problem of offensive language","volume":"vol. 11","author":"Davidson","year":"2017","journal-title":"Proceedings of the International AAAI Conference on Web and Social Media"},{"key":"10.1016\/j.ipm.2025.104544_bib0013","unstructured":"El-Mhamdi, E.-M., Farhadkhani, S., Guerraoui, R., Gupta, N., Hoang, L.-N., Pinot, R., Rouault, S., & Stephan, J. (2022). On the impossible safety of large AI models. arXiv preprint arXiv:2209.15259."},{"key":"10.1016\/j.ipm.2025.104544_bib0014","unstructured":"Ganguli, D., Lovitt, L., Kernion, J., Askell, A., Bai, Y., Kadavath, S., Mann, B., Perez, E., Schiefer, N., Ndousse, K. et al. (2022). Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858."},{"key":"10.1016\/j.ipm.2025.104544_bib0015","doi-asserted-by":"crossref","unstructured":"Gehman, S., Gururangan, S., Sap, M., Choi, Y., & Smith, N. A. (2020). RealToxicityPrompts: Evaluating neural toxic degeneration in language models. arXiv preprint arXiv:2009.11462.","DOI":"10.18653\/v1\/2020.findings-emnlp.301"},{"key":"10.1016\/j.ipm.2025.104544_bib0016","unstructured":"Glaese, A., McAleese, N., Tr\u0119bacz, M., Aslanides, J., Firoiu, V., Ewalds, T., Rauh, M., Weidinger, L., Chadwick, M., Thacker, P. et al. (2022). Improving alignment of dialogue agents via targeted human judgements. arXiv preprint arXiv:2209.14375."},{"key":"10.1016\/j.ipm.2025.104544_bib0017","unstructured":"Goldstein, J. A., Sastry, G., Musser, M., DiResta, R., Gentzel, M., & Sedova, K. (2023). Generative language models and automated influence operations: Emerging threats and potential mitigations. arXiv preprint arXiv:2301.04246."},{"key":"10.1016\/j.ipm.2025.104544_bib0018","unstructured":"Grattafiori, A., Dubey, A., Jauhri, A., Pandey, A., & Kadian, A., et al. (2024). The Llama 3 herd of models. arXiv preprint arXiv:2407.21783."},{"key":"10.1016\/j.ipm.2025.104544_bib0019","unstructured":"Greshake, K., Abdelnabi, S., Mishra, S., Endres, C., Holz, T., & Fritz, M. (2023). More than you\u2019ve asked for: A comprehensive analysis of novel prompt injection threats to application-integrated large language models. arXiv preprint arXiv:2302.12173."},{"key":"10.1016\/j.ipm.2025.104544_bib0020","series-title":"The eleventh international conference on learning representations","article-title":"Prototypical calibration for few-shot learning of language models","author":"Han","year":"2023"},{"key":"10.1016\/j.ipm.2025.104544_bib0021","series-title":"Proceedings of the 60th annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"3309","article-title":"ToxiGen: A large-scale machine-generated dataset for adversarial and implicit hate speech detection","author":"Hartvigsen","year":"2022"},{"key":"10.1016\/j.ipm.2025.104544_bib0022","unstructured":"Hazell, J. (2023). Large language models can be used to effectively scale spear phishing campaigns. arXiv preprint arXiv:2305.06972."},{"key":"10.1016\/j.ipm.2025.104544_bib0023","unstructured":"Hurst, A., Lerer, A., Goucher, A. P., Perelman, A., Ramesh, A., Clark, A., Ostrow, A. J., Welihinda, A., Hayes, A., Radford, A. et al. (2024). GPT-4o system card. arXiv preprint arXiv:2410.21276."},{"key":"10.1016\/j.ipm.2025.104544_bib0024","series-title":"International conference on machine learning","first-page":"15307","article-title":"Automatically auditing large language models via discrete optimization","author":"Jones","year":"2023"},{"key":"10.1016\/j.ipm.2025.104544_bib0025","series-title":"2024\u202fIEEE security and privacy workshops (SPW)","first-page":"132","article-title":"Exploiting programmatic behavior of LLMs: Dual-use through standard security attacks","author":"Kang","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0026","series-title":"International conference on machine learning","first-page":"17506","article-title":"Pretraining language models with human preferences","author":"Korbak","year":"2023"},{"issue":"1","key":"10.1016\/j.ipm.2025.104544_bib0027","doi-asserted-by":"crossref","first-page":"104","DOI":"10.1017\/XPS.2020.37","article-title":"All the news that\u2019s fit to fabricate: AI-generated text as a tool of media misinformation","volume":"9","author":"Kreps","year":"2022","journal-title":"Journal of Experimental Political Science"},{"issue":"1","key":"10.1016\/j.ipm.2025.104544_bib0028","doi-asserted-by":"crossref","first-page":"159","DOI":"10.2307\/2529310","article-title":"The measurement of observer agreement for categorical data","volume":"33","author":"Landis","year":"1977","journal-title":"Biometrics"},{"key":"10.1016\/j.ipm.2025.104544_bib0029","unstructured":"Li, X., Li, Z., Li, Q., Lee, B., Cui, J., & Hu, X. (2024). Faster-GCG: Efficient discrete optimization jailbreak attacks against aligned large language models. arXiv preprint arXiv:2410.15362."},{"key":"10.1016\/j.ipm.2025.104544_bib0030","series-title":"The twelfth international conference on learning representations","article-title":"AutoDAN: Generating stealthy jailbreak prompts on aligned large language models","author":"Liu","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0031","series-title":"2023\u202fIEEE symposium on security and privacy (SP)","first-page":"346","article-title":"Analyzing leakage of personally identifiable information in language models","author":"Lukas","year":"2023"},{"key":"10.1016\/j.ipm.2025.104544_bib0032","series-title":"The twelfth international conference on learning representations","article-title":"WizardCoder: Empowering code large language models with evol-instruct","author":"Luo","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0033","unstructured":"OpenAI, :., Hurst, A., Lerer, A., Goucher, A. P., & Perelman, A., et al. (2024). GPT-4o system card. arXiv preprint arXiv:2410.21276."},{"key":"10.1016\/j.ipm.2025.104544_bib0034","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104544_bib0035","unstructured":"Pathade, C. (2025). Red teaming the mind of the machine: A systematic evaluation of prompt injection and jailbreak vulnerabilities in LLMs. arXiv preprint arXiv:2505.04806."},{"key":"10.1016\/j.ipm.2025.104544_bib0036","first-page":"79155","article-title":"The refinedweb dataset for falcon LLM: Outperforming curated corpora with web data only","volume":"36","author":"Penedo","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104544_bib0037","series-title":"Proceedings of the 2022 conference on empirical methods in natural language processing","first-page":"3419","article-title":"Red teaming language models with language models","author":"Perez","year":"2022"},{"key":"10.1016\/j.ipm.2025.104544_bib0038","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","first-page":"7595","article-title":"On the challenges of using black-box APIs for toxicity evaluation in research","author":"Pozzobon","year":"2023"},{"key":"10.1016\/j.ipm.2025.104544_bib0039","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104544_bib0040","first-page":"2511","article-title":"Principle-driven self-alignment of language models from scratch with minimal human supervision","volume":"36","author":"Sun","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104544_bib0041","unstructured":"G. Team, Kamath, A., Ferret, J., Pathak, S., & Vieillard, N., et al. (2025). Gemma 3 technical report. arXiv preprint arXiv:2503.19786."},{"key":"10.1016\/j.ipm.2025.104544_bib0042","unstructured":"Q. Team (2025). QwQ-32B: Embracing the power of reinforcement learning. https:\/\/qwenlm. github. io\/blog\/qwq-32b."},{"key":"10.1016\/j.ipm.2025.104544_bib0043","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F. et al. (2023a). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971."},{"key":"10.1016\/j.ipm.2025.104544_bib0044","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S. et al. (2023b). Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288."},{"key":"10.1016\/j.ipm.2025.104544_bib0045","first-page":"80079","article-title":"Jailbroken: How does llm safety training fail?","volume":"36","author":"Wei","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104544_bib0046","series-title":"Forty-first international conference on machine learning","article-title":"Fundamental limitations of alignment in large language models","author":"Wolf","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0047","series-title":"Proceedings of the 26th international conference on world wide web","first-page":"1391","article-title":"Ex machina: Personal attacks seen at scale","author":"Wulczyn","year":"2017"},{"key":"10.1016\/j.ipm.2025.104544_bib0048","unstructured":"Xu, B., Yang, A., Lin, J., Wang, Q., Zhou, C., Zhang, Y., & Mao, Z. (2023a). ExpertPrompting: Instructing large language models to be distinguished experts. arXiv preprint arXiv:2305.14688."},{"key":"10.1016\/j.ipm.2025.104544_bib0049","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","first-page":"6268","article-title":"Baize: An open-source chat model with parameter-efficient tuning on self-chat data","author":"Xu","year":"2023"},{"key":"10.1016\/j.ipm.2025.104544_bib0050","series-title":"Findings of the association for computational linguistics ACL 2024","first-page":"7432","article-title":"A comprehensive study of jailbreak attack versus defense for large language models","author":"Xu","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0051","unstructured":"Yang, A., Li, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Gao, C., Huang, C., Lv, C. et al. (2025). Qwen3 technical report. arXiv preprint arXiv:2505.09388."},{"key":"10.1016\/j.ipm.2025.104544_bib0052","unstructured":"Yang, A., Xiao, B., Wang, B., Zhang, B., Bian, C., Yin, C., Lv, C., Pan, D., Wang, D., Yan, D. et al. (2023). Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305."},{"key":"10.1016\/j.ipm.2025.104544_bib0053","unstructured":"Ying, Z., Zheng, G., Huang, Y., Zhang, D., Zhang, W., Zou, Q., Liu, A., Liu, X., & Tao, D. (2025). Towards understanding the safety boundaries of deepseek models: Evaluation and findings. arXiv preprint arXiv:2503.15092."},{"key":"10.1016\/j.ipm.2025.104544_bib0054","series-title":"The eleventh international conference on learning representations","article-title":"GLM-130B: An open bilingual pre-trained model","author":"Zeng","year":"2023"},{"key":"10.1016\/j.ipm.2025.104544_bib0055","series-title":"The twelfth international conference on learning representations","article-title":"LMSYS-Chat-1M: A large-scale real-world LLM conversation dataset","author":"Zheng","year":"2024"},{"key":"10.1016\/j.ipm.2025.104544_bib0056","first-page":"46595","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","volume":"36","author":"Zheng","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.ipm.2025.104544_bib0057","unstructured":"Zou, A., Wang, Z., Carlini, N., Nasr, M., Kolter, J. Z., & Fredrikson, M. (2023). Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043."}],"container-title":["Information Processing &amp; Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457325004856?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457325004856?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T21:57:11Z","timestamp":1771538231000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0306457325004856"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":57,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["S0306457325004856"],"URL":"https:\/\/doi.org\/10.1016\/j.ipm.2025.104544","relation":{},"ISSN":["0306-4573"],"issn-type":[{"value":"0306-4573","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SEAttack: A self-evolving jailbreak attack to induce toxic responses for non-toxic queries in large language models","name":"articletitle","label":"Article Title"},{"value":"Information Processing & Management","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.ipm.2025.104544","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104544"}}