{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T21:15:03Z","timestamp":1778102103701,"version":"3.51.4"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,10]]},"DOI":"10.1016\/j.neunet.2026.109065","type":"journal-article","created":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T22:22:00Z","timestamp":1777674120000},"page":"109065","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["SecReEvalBench: A real-world scenario-based security resilience benchmark for large language models"],"prefix":"10.1016","volume":"202","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7408-8719","authenticated-orcid":false,"given":"Huining","family":"Cui","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3003-1313","authenticated-orcid":false,"given":"Wei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109065_bib0001","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S. et al. (2023). Gpt-4 technical report. arXiv: 2303.08774."},{"key":"10.1016\/j.neunet.2026.109065_bib0002","unstructured":"Anthropic (2024). Claude 3 model family. Accessed: 2025-05-09 https:\/\/www.anthropic.com\/news\/claude-3-family."},{"key":"10.1016\/j.neunet.2026.109065_bib0003","unstructured":"Bethany, M., Galiopoulos, A., Bethany, E., Karkevandi, M. B., Vishwamitra, N., & Najafirad, P. (2024). Large language model lateral spear phishing: A comparative study in large-scale organizational settings. arXiv: 2401.09727."},{"key":"10.1016\/j.neunet.2026.109065_bib0004","unstructured":"Bhatt, M., Chennabasappa, S., Li, Y., Nikolaidis, C., Song, D., Wan, S., Ahmad, F., Aschermann, C., Chen, Y., Kapil, D. et al. (2024). Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models. arXiv: 2404.13161."},{"key":"10.1016\/j.neunet.2026.109065_bib0005","unstructured":"Bhatt, M., Chennabasappa, S., Nikolaidis, C., Wan, S., Evtimov, I., Gabi, D., Song, D., Ahmad, F., Aschermann, C., Fontana, L. et al. (2023). Purple llama cyberseceval: A secure coding benchmark for language models. arXiv: 2312.04724."},{"key":"10.1016\/j.neunet.2026.109065_bib0006","series-title":"2025 IEEE\/ACM second international conference on AI foundation models and software engineering (forge)","first-page":"93","article-title":"Benchmarking prompt engineering techniques for secure code generation with GPT models","author":"Bruni","year":"2025"},{"key":"10.1016\/j.neunet.2026.109065_bib0007","unstructured":"com, P. (2024). Fine-tuned distilroberta-base for rejection in the output detection. https:\/\/huggingface.co\/ProtectAI\/distilroberta-base-rejection-v1."},{"key":"10.1016\/j.neunet.2026.109065_bib0008","series-title":"Proceedings of the 22nd australasian data science and machine learning conference (ausDM\u201924)","article-title":"SecEval: A security evaluation dataset for large language models","author":"Cui","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0009","unstructured":"Deng, J., Cheng, J., Sun, H., Zhang, Z., & Huang, M. (2023). Towards safer generative language models: A survey on safety risks, evaluations, and improvements. https:\/\/arxiv.org\/abs\/2302.09270."},{"key":"10.1016\/j.neunet.2026.109065_bib0010","series-title":"The thirteenth international conference on learning representations","article-title":"FairMT-bench: Benchmarking fairness for multi-turn dialogue in conversational LLMs","author":"Fan","year":"2025"},{"key":"10.1016\/j.neunet.2026.109065_bib0011","unstructured":"Fedorov, I., Plawiak, K., Wu, L., Elgamal, T., Suda, N., Smith, E., Zhan, H., Chi, J., Hulovatyy, Y., Patel, K., Liu, Z., Zhao, C., Shi, Y., Blankevoort, T., Pasupuleti, M., Soran, B., Coudert, Z. D., Alao, R., Krishnamoorthi, R., & Chandra, V. (2024). Llama guard 3-1b-INT4: Compact and efficient safeguard for human-AI conversations. https:\/\/arxiv.org\/abs\/2411.17713."},{"key":"10.1016\/j.neunet.2026.109065_bib0012","unstructured":"Freiberger, V., Fleig, A., & Buchmann, E. (2025). Prisme: A novel LLM-powered tool for interactive privacy policy assessment. https:\/\/arxiv.org\/abs\/2501.16033."},{"key":"10.1016\/j.neunet.2026.109065_bib0013","unstructured":"Grattafiori, A., Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Vaughan, A. et al. (2024). The llama 3 herd of models. arXiv: 2407.21783."},{"key":"10.1016\/j.neunet.2026.109065_bib0014","series-title":"Proceedings of the 16th ACM workshop on artificial intelligence and security","first-page":"79","article-title":"Not what you\u2019ve signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection","author":"Greshake","year":"2023"},{"key":"10.1016\/j.neunet.2026.109065_bib0015","unstructured":"Guo, D., Yang, D., Zhang, H., Song, J., Zhang, R., Xu, R., Zhu, Q., Ma, S., Wang, P., Bi, X. et al. (2025). DeepSeek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv: 2501.12948."},{"key":"10.1016\/j.neunet.2026.109065_bib0016","doi-asserted-by":"crossref","unstructured":"Hartvigsen, T., Gabriel, S., Palangi, H., Sap, M., Ray, D., & Kamar, E. (2022). Toxigen: A large-scale machine-generated dataset for adversarial and implicit hate speech detection. https:\/\/arxiv.org\/abs\/2203.09509.","DOI":"10.18653\/v1\/2022.acl-long.234"},{"key":"10.1016\/j.neunet.2026.109065_bib0017","unstructured":"Hazell, J. (2023). Spear phishing with large language models. arXiv: 2305.06972."},{"key":"10.1016\/j.neunet.2026.109065_bib0018","unstructured":"Jiang, A. Q., Sablayrolles, A., Mensch, A., Bamford, C., Chaplot, D. S., de las Casas, D., Bressand, F., Lengyel, G., Lample, G., Saulnier, L., Lavaud, L. R., Lachaux, M.-A., Stock, P., Scao, T. L., Lavril, T., Wang, T., Lacroix, T., & Sayed, W. E. (2023). Mistral 7b. https:\/\/arxiv.org\/abs\/2310.06825."},{"key":"10.1016\/j.neunet.2026.109065_bib0019","series-title":"Proceedings of the 28th international conference on evaluation and assessment in software engineering","first-page":"600","article-title":"LLM security guard for code","author":"Kavian","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0020","unstructured":"Kouremetis, M., Dotter, M., Byrne, A., Martin, D., Michalak, E., Russo, G., Threet, M., & Zarrella, G. (2025). Occult: Evaluating large language models for offensive cyber operation capabilities. arXiv: 2502.15797."},{"key":"10.1016\/j.neunet.2026.109065_bib0021","series-title":"The thirteenth international conference on learning representations","article-title":"HarmAug: Effective data augmentation for knowledge distillation of safety guard models","author":"Lee","year":"2025"},{"issue":"6","key":"10.1016\/j.neunet.2026.109065_bib0022","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3402818","article-title":"SafeRoute: Learning to navigate streets safely in an urban environment","volume":"11","author":"Levy","year":"2020","journal-title":"ACM Transactions on Intelligent Systems and Technology"},{"key":"10.1016\/j.neunet.2026.109065_bib0023","doi-asserted-by":"crossref","unstructured":"Li, H., Guo, D., Fan, W., Xu, M., Huang, J., Meng, F., & Song, Y. (2023). Multi-step jailbreaking privacy attacks on chatGPT. https:\/\/arxiv.org\/abs\/2304.05197.","DOI":"10.18653\/v1\/2023.findings-emnlp.272"},{"key":"10.1016\/j.neunet.2026.109065_bib0024","doi-asserted-by":"crossref","unstructured":"Li, H., Hu, W., Jing, H., Chen, Y., Hu, Q., Han, S., Chu, T., Hu, P., & Song, Y. (2025). PrivaCI-bench: Evaluating privacy with contextual integrity and legal compliance. https:\/\/arxiv.org\/abs\/2502.17041.","DOI":"10.18653\/v1\/2025.acl-long.518"},{"key":"10.1016\/j.neunet.2026.109065_bib0025","doi-asserted-by":"crossref","unstructured":"Li, L., Dong, B., Wang, R., Hu, X., Zuo, W., Lin, D., Qiao, Y., & Shao, J. (2024). Salad-bench: A hierarchical and comprehensive safety benchmark for large language models. https:\/\/arxiv.org\/abs\/2402.05044.","DOI":"10.18653\/v1\/2024.findings-acl.235"},{"key":"10.1016\/j.neunet.2026.109065_bib0026","unstructured":"Liu, C., Zhao, F., Qing, L., Kang, Y., Sun, C., Kuang, K., & Wu, F. (2023). Goal-oriented prompt attack and safety evaluation for LLMs. arXiv: 2309.11830."},{"key":"10.1016\/j.neunet.2026.109065_bib0027","series-title":"33rd USENIX security symposium (USENIX security 24)","first-page":"1831","article-title":"Formalizing and benchmarking prompt injection attacks and defenses","author":"Liu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0028","doi-asserted-by":"crossref","first-page":"123032","DOI":"10.52202\/079017-3910","article-title":"SG-Bench: Evaluating LLM safety generalization across diverse tasks and prompt types","volume":"37","author":"Mou","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"7","key":"10.1016\/j.neunet.2026.109065_bib0029","doi-asserted-by":"crossref","first-page":"3062","DOI":"10.1109\/TKDE.2024.3349708","article-title":"Fraud\u2019s bargain attack: Generating adversarial text samples via word manipulation process","volume":"36","author":"Ni","year":"2024","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.neunet.2026.109065_bib0030","series-title":"2025 IEEE\/ACM international workshop on large language models for code (LLM4code)","first-page":"33","article-title":"CWEval: Outcome-driven evaluation on functionality and security of LLM code generation","author":"Peng","year":"2025"},{"key":"10.1016\/j.neunet.2026.109065_bib0031","unstructured":"Perez, F., & Ribeiro, I. (2022). Ignore previous prompt: Attack techniques for language models. https:\/\/arxiv.org\/abs\/2211.09527."},{"key":"10.1016\/j.neunet.2026.109065_bib0032","doi-asserted-by":"crossref","unstructured":"Rodriguez, D., Seymour, W., Alamo, J. M. D., & Such, J. (2025). Towards safer chatbots: A framework for policy compliance evaluation of custom GPTs. https:\/\/arxiv.org\/abs\/2502.01436.","DOI":"10.1016\/j.array.2026.100834"},{"key":"10.1016\/j.neunet.2026.109065_bib0033","series-title":"International symposium on cyber security, cryptology, and machine learning","first-page":"100","article-title":"LLMSecCode: Evaluating large language models for secure coding","author":"Ryd\u00e9n","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0034","series-title":"Proceedings of the 39th IEEE\/ACM international conference on automated software engineering workshops","first-page":"54","article-title":"SALLM: Security assessment of generated code","author":"Siddiq","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0035","unstructured":"Sun, H., Zhang, Z., Deng, J., Cheng, J., & Huang, M. (2023). Safety assessment of chinese large language models. https:\/\/arxiv.org\/abs\/2304.10436."},{"key":"10.1016\/j.neunet.2026.109065_bib0036","unstructured":"Tamkin, A., Brundage, M., Clark, J., & Ganguli, D. (2021). Understanding the capabilities, limitations, and societal impact of large language models. https:\/\/arxiv.org\/abs\/2102.02503."},{"key":"10.1016\/j.neunet.2026.109065_bib0037","unstructured":"G. Team, Mesnard, T., Hardin, C., Dadashi, R., Bhupatiraju, S., Pathak, S., Sifre, L., Rivi\u00e8re, M., Kale, M. S., Love, J. et al. (2024). Gemma: Open models based on gemini research and technology. arXiv: 2403.08295."},{"key":"10.1016\/j.neunet.2026.109065_bib0038","series-title":"2024 IEEE international conference on cyber security and resilience (CSR)","first-page":"296","article-title":"CyberMetric: A benchmark dataset based on retrieval-augmented generation for evaluating LLMs in cybersecurity knowledge","author":"Tihanyi","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0039","series-title":"2024 IEEE symposium on security and privacy (SP)","first-page":"862","article-title":"LLMs cannot reliably identify and reason about security vulnerabilities (yet?): A comprehensive evaluation, framework, and benchmarks","author":"Ullah","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0040","unstructured":"Wan, S., Nikolaidis, C., Song, D., Molnar, D., Crnkovich, J., Grace, J., Bhatt, M., Chennabasappa, S., Whitman, S., Ding, S. et al. (2024). CYBERSECEVAL 3: Advancing the evaluation of cybersecurity risks and capabilities in large language models. arXiv: 2408.01605."},{"key":"10.1016\/j.neunet.2026.109065_bib0041","series-title":"NeurIPS","article-title":"DecodingTrust: A comprehensive assessment of trustworthiness in GPT models","author":"Wang","year":"2023"},{"key":"10.1016\/j.neunet.2026.109065_bib0042","first-page":"80079","article-title":"Jailbroken: How does llm safety training fail?","volume":"36","author":"Wei","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109065_bib0043","unstructured":"Weidinger, L., Mellor, J., Rauh, M., Griffin, C., Uesato, J., Huang, P.-S., Cheng, M., Glaese, M., Balle, B., Kasirzadeh, A., Kenton, Z., Brown, S., Hawkins, W., Stepleton, T., Biles, C., Birhane, A., Haas, J., Rimell, L., Hendricks, L. A., Isaac, W., Legassick, S., Irving, G., & Gabriel, I. (2021). Ethical and social risks of harm from language models. https:\/\/arxiv.org\/abs\/2112.04359."},{"key":"10.1016\/j.neunet.2026.109065_bib0044","unstructured":"Xu, G., Liu, J., Yan, M., Xu, H., Si, J., Zhou, Z., Yi, P., Gao, X., Sang, J., Zhang, R., Zhang, J., Peng, C., Huang, F., & Zhou, J. (2023). CValues: Measuring the values of Chinese large language models from safety to responsibility. https:\/\/arxiv.org\/abs\/2307.09705."},{"key":"10.1016\/j.neunet.2026.109065_bib0045","unstructured":"Yang, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Li, C., Liu, D., Huang, F., Wei, H., Lin, H., Yang, J., Tu, J., Zhang, J., Yang, J., Yang, J., Zhou, J., Lin, J., Dang, K., Lu, K., Bao, K., Yang, K., Yu, L., Li, M., Xue, M., Zhang, P., Zhu, Q., Men, R., Lin, R., Li, T., Xia, T., Ren, X., Ren, X., Fan, Y., Su, Y., Zhang, Y., Wan, Y., Liu, Y., Cui, Z., Zhang, Z., & Qiu, Z. (2024). Qwen2.5 technical report. arXiv: 2412.15115."},{"key":"10.1016\/j.neunet.2026.109065_bib0046","series-title":"Proceedings of the 31st ACM SIGKDD conference on knowledge discovery and data mining v.1","first-page":"1809","article-title":"Benchmarking and defending against indirect prompt injection attacks on large language models","author":"Yi","year":"2025"},{"key":"10.1016\/j.neunet.2026.109065_bib0047","series-title":"Proceedings of the 41st international conference on machine learning","article-title":"RigorLLM: Resilient guardrails for large language models against undesired content","author":"Yuan","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0048","unstructured":"Zhang, H., Huang, J., Mei, K., Yao, Y., Wang, Z., Zhan, C., Wang, H., & Zhang, Y. (2025). Agent security bench (ASB): Formalizing and benchmarking attacks and defenses in LLM-based agents. https:\/\/arxiv.org\/abs\/2410.02644."},{"key":"10.1016\/j.neunet.2026.109065_bib0049","unstructured":"Zhang, M., Pan, X., & Yang, M. (2023). Jade: A linguistics-based safety evaluation platform for large language models. arXiv: 2311.00286."},{"key":"10.1016\/j.neunet.2026.109065_bib0050","series-title":"Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"15537","article-title":"SafetyBench: Evaluating the safety of large language models","author":"Zhang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109065_bib0051","unstructured":"Zhou, W., Wang, X., Xiong, L., Xia, H., Gu, Y., Chai, M., Zhu, F., Huang, C., Dou, S., Xi, Z., Zheng, R., Gao, S., Zou, Y., Yan, H., Le, Y., Wang, R., Li, L., Shao, J., Gui, T., Zhang, Q., & Huang, X. (2024). Easyjailbreak: A unified framework for jailbreaking large language models. https:\/\/arxiv.org\/abs\/2403.12171."},{"key":"10.1016\/j.neunet.2026.109065_bib0052","unstructured":"Zizzo, G., Cornacchia, G., Fraser, K., Hameed, M. Z., Rawat, A., Buesser, B., Purcell, M., Chen, P.-Y., Sattigeri, P., & Varshney, K. (2025). Adversarial prompt evaluation: Systematic benchmarking of guardrails against prompt input attacks on LLMs. arXiv: 2502.15427."},{"key":"10.1016\/j.neunet.2026.109065_bib0053","unstructured":"Zou, A., Wang, Z., Carlini, N., Nasr, M., Kolter, J. Z., & Fredrikson, M. (2023). Universal and transferable adversarial attacks on aligned language models. https:\/\/arxiv.org\/abs\/2307.15043."}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005253?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005253?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T20:53:23Z","timestamp":1778100803000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026005253"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,10]]},"references-count":53,"alternative-id":["S0893608026005253"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109065","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SecReEvalBench: A real-world scenario-based security resilience benchmark for large language models","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109065","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author(s). Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"109065"}}