{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,21]],"date-time":"2025-12-21T10:04:56Z","timestamp":1766311496260,"version":"3.41.0"},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T00:00:00Z","timestamp":1748563200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T00:00:00Z","timestamp":1748563200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Natural Sciences and Engineering Research Council of Canada,Canada"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Inf. Secur."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s10207-025-01066-4","type":"journal-article","created":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T09:10:10Z","timestamp":1748596210000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A Data-centric approach for safe and secure large language models against threatening and toxic content"],"prefix":"10.1007","volume":"24","author":[{"given":"Chaima","family":"Njeh","sequence":"first","affiliation":[]},{"given":"Ha\u00effa","family":"Nakouri","sequence":"additional","affiliation":[]},{"given":"Fehmi","family":"Jaafar","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,30]]},"reference":[{"key":"1066_CR1","doi-asserted-by":"crossref","unstructured":"Caselli, T., Basile, V., Mitrovi\u0107, J., Granitzer, M.: Hatebert: Retraining bert for abusive language detection in english. arXiv preprint arXiv:2010.12472 (2020)","DOI":"10.18653\/v1\/2021.woah-1.3"},{"key":"1066_CR2","unstructured":"Kim, Y., Park, S., Han, Y.-S.: Generalizable implicit hate speech detection using contrastive learning. In: Proceedings of the 29th International Conference on Computational Linguistics, pp. 6667\u20136679 (2022)"},{"key":"1066_CR3","unstructured":"Wang, Y.-S., Chang, Y.: Toxicity detection with generative prompt-based inference. arXiv preprint arXiv:2205.12390 (2022)"},{"key":"1066_CR4","unstructured":"Zhang, T., Luo, H., Chuang, Y.-S., Fang, W., Gaitskell, L., Hartvigsen, T., Wu, X., Fox, D., Meng, H., Glass, J.: Interpretable unified language checking. arXiv preprint arXiv:2304.03728 (2023)"},{"key":"1066_CR5","doi-asserted-by":"publisher","first-page":"21779","DOI":"10.1609\/aaai.v38i19.30178","volume":"38","author":"J Zhang","year":"2024","unstructured":"Zhang, J., Wu, Q., Xu, Y., Cao, C., Du, Z., Psounis, K.: Efficient toxic content detection by bootstrapping and distilling large language models. Proceedings of the AAAI Conference on Artificial Intelligence 38, 21779\u201321787 (2024)","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"1066_CR6","doi-asserted-by":"crossref","unstructured":"Fu, J., Qin, X., Yang, F., Wang, L., Zhang, J., Lin, Q., Chen, Y., Zhang, D., Rajmohan, S., Zhang, Q.: AutoRAG-HP: Automatic Online Hyper-Parameter Tuning for Retrieval-Augmented Generation (2024). https:\/\/arxiv.org\/abs\/2406.19251","DOI":"10.18653\/v1\/2024.findings-emnlp.223"},{"key":"1066_CR7","first-page":"201","volume-title":"International Conference on Hybrid Artificial Intelligence Systems","author":"C Njeh","year":"2024","unstructured":"Njeh, C., Nakouri, H., Jaafar, F.: Enhancing rag-retrieval to improve llms robustness and resilience to hallucinations. In: International Conference on Hybrid Artificial Intelligence Systems, pp. 201\u2013213. Springer (2024)"},{"key":"1066_CR8","unstructured":"Shukor, M., Rame, A., Dancette, C., Cord, M.: Beyond task performance: Evaluating and reducing the flaws of large multimodal models with in-context learning. arXiv preprint arXiv:2310.00647 (2023)"},{"key":"1066_CR9","first-page":"2511","volume":"36","author":"Z Sun","year":"2024","unstructured":"Sun, Z., Shen, Y., Zhou, Q., Zhang, H., Chen, Z., Cox, D., Yang, Y., Gan, C.: Principle-driven self-alignment of language models from scratch with minimal human supervision. Adv. Neural. Inf. Process. Syst. 36, 2511\u20132565 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1066_CR10","doi-asserted-by":"crossref","unstructured":"Huang, J., Gu, S.S., Hou, L., Wu, Y., Wang, X., Yu, H., Han, J.: Large language models can self-improve. arXiv preprint arXiv:2210.11610 (2022)","DOI":"10.18653\/v1\/2023.emnlp-main.67"},{"issue":"12","key":"1066_CR11","doi-asserted-by":"publisher","first-page":"1486","DOI":"10.1038\/s42256-023-00765-8","volume":"5","author":"Y Xie","year":"2023","unstructured":"Xie, Y., Yi, J., Shao, J., Curl, J., Lyu, L., Chen, Q., Xie, X., Wu, F.: Defending chatgpt against jailbreak attack via self-reminders. Nat. Mach. Intell. 5(12), 1486\u20131496 (2023)","journal-title":"Nat. Mach. Intell."},{"key":"1066_CR12","unstructured":"Zhao, J., Deng, Z., Madras, D., Zou, J., Ren, M.: Learning and forgetting unsafe examples in large language models. arXiv preprint arXiv:2312.12736 (2023)"},{"key":"1066_CR13","unstructured":"Gou, Z., Shao, Z., Gong, Y., Shen, Y., Yang, Y., Duan, N., Chen, W.: Critic: Large language models can self-correct with tool-interactive critiquing. arXiv preprint arXiv:2305.11738 (2023)"},{"key":"1066_CR14","unstructured":"Mousavi, S., Guti\u00e9rrez, R.L., Rengarajan, D., Gundecha, V., Babu, A.R., Naug, A., Guillen, A., Sarkar, S.: N-critics: Self-refinement of large language models with ensemble of critics. arXiv preprint arXiv:2310.18679 (2023)"},{"key":"1066_CR15","first-page":"46534","volume":"36","author":"A Madaan","year":"2024","unstructured":"Madaan, A., Tandon, N., Gupta, P., Hallinan, S., Gao, L., Wiegreffe, S., Alon, U., Dziri, N., Prabhumoye, S., Yang, Y., et al.: Self-refine: Iterative refinement with self-feedback. Adv. Neural. Inf. Process. Syst. 36, 46534\u201346594 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1066_CR16","unstructured":"An, S., Ma, Z., Lin, Z., Zheng, N., Lou, J.-G., Chen, W.: Learning from mistakes makes llm better reasoner. arXiv preprint arXiv:2310.20689 (2023)"},{"key":"1066_CR17","unstructured":"Krishna, S.: On the intersection of self-correction and trust in language models. arXiv preprint arXiv:2311.02801 (2023)"},{"key":"1066_CR18","doi-asserted-by":"crossref","unstructured":"Wu, T.-H., Lian, L., Gonzalez, J.E., Li, B., Darrell, T.: Self-correcting llm-controlled diffusion models. arXiv preprint arXiv:2311.16090 (2023)","DOI":"10.1109\/CVPR52733.2024.00605"},{"key":"1066_CR19","doi-asserted-by":"crossref","unstructured":"Pan, L., Saxon, M., Xu, W., Nathani, D., Wang, X., Wang, W.Y.: Automatically correcting large language models: Surveying the landscape of diverse self-correction strategies. arXiv preprint arXiv:2308.03188 (2023)","DOI":"10.1162\/tacl_a_00660"},{"key":"1066_CR20","doi-asserted-by":"crossref","unstructured":"Jiang, D., Zhang, J., Weller, O., Weir, N., Van\u00a0Durme, B., Khashabi, D.: Self-[in] correct: Llms struggle with refining self-generated responses. arXiv preprint arXiv:2404.04298 (2024)","DOI":"10.1609\/aaai.v39i23.34603"},{"key":"1066_CR21","unstructured":"Huang, J., Chen, X., Mishra, S., Zheng, H.S., Yu, A.W., Song, X., Zhou, D.: Large language models cannot self-correct reasoning yet. arXiv preprint arXiv:2310.01798 (2023)"},{"key":"1066_CR22","unstructured":"Li, L., Chen, G., Su, Y., Chen, Z., Zhang, Y., Xing, E., Zhang, K.: Confidence matters: Revisiting intrinsic self-correction capabilities of large language models. arXiv preprint arXiv:2402.12563 (2024)"},{"key":"1066_CR23","doi-asserted-by":"crossref","unstructured":"Shashidhar, S., Chinta, A., Sahai, V., Wang, Z., Ji, H.: Democratizing llms: An exploration of cost-performance trade-offs in self-refined open-source models. arXiv preprint arXiv:2310.07611 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.608"},{"key":"1066_CR24","volume-title":"Model editing as a robust and denoised variant of dpo: A case study on toxicity","author":"R Uppaal","year":"2024","unstructured":"Uppaal, R., Dey, A., He, Y., Zhong, Y., Hu, J.: Model editing as a robust and denoised variant of dpo: A case study on toxicity. Neurips Safe Generative AI Workshop 2024 (2024)"},{"key":"1066_CR25","unstructured":"OpenAI, Achiam, J., et al., S.A.: GPT-4 Technical Report (2024)"},{"key":"1066_CR26","unstructured":"Anil, R., Dai, A.M., et al., O.F.: PaLM 2 Technical Report (2023)"},{"key":"1066_CR27","unstructured":"Jiang, A.Q., Sablayrolles, A., Mensch, A., Bamford, C., Chaplot, D.S., Casas, D.d.l., Bressand, F., Lengyel, G., Lample, G., Saulnier, L., et al.: Mistral 7b. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"1066_CR28","unstructured":"Team, G., Mesnard, T., Hardin, C., Dadashi, R., Bhupatiraju, S., Pathak, S., Sifre, L., Rivi\u00e8re, M., Kale, M.S., Love, J., et al.: Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:2403.08295 (2024)"},{"key":"1066_CR29","doi-asserted-by":"publisher","DOI":"10.1007\/b98835","volume-title":"Principal Component Analysis","author":"IT Jolliffe","year":"2002","unstructured":"Jolliffe, I.T.: Principal Component Analysis. Springer Series in Statistics, Springer, New York (2002). https:\/\/doi.org\/10.1007\/b98835"}],"container-title":["International Journal of Information Security"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10207-025-01066-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10207-025-01066-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10207-025-01066-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T10:11:56Z","timestamp":1750500716000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10207-025-01066-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,30]]},"references-count":29,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1066"],"URL":"https:\/\/doi.org\/10.1007\/s10207-025-01066-4","relation":{},"ISSN":["1615-5262","1615-5270"],"issn-type":[{"type":"print","value":"1615-5262"},{"type":"electronic","value":"1615-5270"}],"subject":[],"published":{"date-parts":[[2025,5,30]]},"assertion":[{"value":"30 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no financial or proprietary interests in any material discussed in this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}},{"value":"We declare that this submission follows the policies as outlined in the Guide for Authors. The current research involves no Human Participants and\/or Animals.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}},{"value":"Code is available upon request.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}}],"article-number":"148"}}