{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T08:20:00Z","timestamp":1770106800970,"version":"3.49.0"},"reference-count":68,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"General Program of National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62471064"],"award-info":[{"award-number":["62471064"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005374","name":"Fundamental Research Funds for Beijing University of Posts and Telecommunications","doi-asserted-by":"publisher","award":["2025AI4S02"],"award-info":[{"award-number":["2025AI4S02"]}],"id":[{"id":"10.13039\/501100005374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans.Inform.Forensic Secur."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tifs.2026.3652843","type":"journal-article","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T21:59:20Z","timestamp":1768255160000},"page":"1409-1423","source":"Crossref","is-referenced-by-count":0,"title":["Refining Positive and Toxic Samples for Dual Safety Self-Alignment of LLMs With Minimal Human Interventions"],"prefix":"10.1109","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-8659-1505","authenticated-orcid":false,"given":"Jingxin","family":"Xu","sequence":"first","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1987-2736","authenticated-orcid":false,"given":"Guoshun","family":"Nan","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9597-8329","authenticated-orcid":false,"given":"Sheng","family":"Guan","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Sicong","family":"Leng","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Nanyang Avenue, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1928-6493","authenticated-orcid":false,"given":"Yilian","family":"Liu","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0557-7381","authenticated-orcid":false,"given":"Zixiao","family":"Wang","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4164-7116","authenticated-orcid":false,"given":"Yuyang","family":"Ma","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Zhili","family":"Zhou","sequence":"additional","affiliation":[{"name":"Guangzhou University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5571-9539","authenticated-orcid":false,"given":"Yanzhao","family":"Hou","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9518-1622","authenticated-orcid":false,"given":"Xiaofeng","family":"Tao","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref2","article-title":"DeepSeek-V3 technical report","volume-title":"arXiv:2412.19437","author":"Liu","year":"2024"},{"key":"ref3","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref4","article-title":"Constitutional AI: Harmlessness from AI feedback","author":"Bai","year":"2022","journal-title":"arXiv:2212.08073"},{"key":"ref5","first-page":"1","article-title":"The ai alignment problem: Why it is hard, and where to start","volume":"4","author":"Yudkowsky","year":"2016","journal-title":"Symbolic Syst. Distinguished Speaker"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-020-09539-2"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"ref8","first-page":"39416","article-title":"Self-alignment of large language models via monopolylogue-based social scene simulation","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Pang"},{"key":"ref9","article-title":"Mixture of insighTful experts (MoTE): The synergy of thought chains and expert mixtures in self-alignment","author":"Liu","year":"2024","journal-title":"arXiv:2405.00557"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3516579"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2025.3560042"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3516554"},{"key":"ref13","first-page":"1","article-title":"Finetuned language models are zero-shot learners","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Wei"},{"key":"ref14","article-title":"Training a helpful and harmless assistant with reinforcement learning from human feedback","author":"Bai","year":"2022","journal-title":"arXiv:2204.05862"},{"key":"ref15","first-page":"1","article-title":"Fine-tuning aligned language models compromises safety, even when users do not intend to!","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Qi"},{"key":"ref16","article-title":"Stanford alpaca: An instruction-following LLaMA model","author":"Taori","year":"2023"},{"key":"ref17","volume-title":"Vicuna: An Open-Source Chatbot Impressing Gpt-4 With 90% Chatgpt Quality","author":"Chiang","year":"2023"},{"key":"ref18","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NIPS)","author":"Ouyang"},{"key":"ref19","first-page":"1","article-title":"BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ji"},{"key":"ref20","first-page":"35413","article-title":"Poisoning language models during instruction tuning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wan"},{"key":"ref21","first-page":"1","article-title":"Openassistant conversations-democratizing large language model alignment","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"K\u00f6pf"},{"key":"ref22","first-page":"1","article-title":"Principle-driven self-alignment of language models from scratch with minimal human supervision","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sun"},{"key":"ref23","article-title":"RAIN: Your language models can align themselves without finetuning","author":"Li","year":"2023","journal-title":"arXiv:2309.07124"},{"key":"ref24","article-title":"RLAIF vs. RLHF: Scaling reinforcement learning from human feedback with AI feedback","author":"Lee","year":"2023","journal-title":"arXiv:2309.00267"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71547-6_4"},{"key":"ref26","article-title":"Spread preference annotation: Direct preference judgment for efficient LLM alignment","author":"Kim","year":"2024","journal-title":"arXiv:2406.04412"},{"key":"ref27","article-title":"Secrets of RLHF in large language models\u2014Part II: Reward modeling","author":"Wang","year":"2024","journal-title":"arXiv:2401.06080"},{"key":"ref28","first-page":"16025","article-title":"Variational Bayesian unlearning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Nguyen"},{"key":"ref29","article-title":"An empirical study of catastrophic forgetting in large language models during continual fine-tuning","author":"Luo","year":"2023","journal-title":"arXiv:2308.08747"},{"key":"ref30","article-title":"A survey of large language models","author":"Zhao","year":"2023","journal-title":"arXiv:2303.18223"},{"key":"ref31","article-title":"On the opportunities and risks of foundation models","author":"Bommasani","year":"2021","journal-title":"arXiv:2108.07258"},{"key":"ref32","first-page":"1","article-title":"Safety-tuned LLaMAs: Lessons from improving the safety of large language models that follow instructions","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Bianchi"},{"key":"ref33","first-page":"1","article-title":"Fine-grained human feedback gives better rewards for language model training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wu"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1197"},{"key":"ref35","first-page":"3008","article-title":"Learning to summarize with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Stiennon"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.427"},{"key":"ref37","first-page":"1","article-title":"Gaining wisdom from setbacks: Aligning large language models via mistake analysis","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Chen"},{"key":"ref38","first-page":"5784","article-title":"Understanding negative samples in instance discriminative self-supervised representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Nozawa"},{"key":"ref39","article-title":"Learning from failure: Integrating negative examples when fine-tuning large language models as agents","author":"Wang","year":"2024","journal-title":"arXiv:2402.11651"},{"key":"ref40","first-page":"6827","article-title":"What makes for good views for contrastive learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Tian"},{"key":"ref41","first-page":"1","article-title":"Neural text degeneration with unlikelihood training","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Welleck"},{"key":"ref42","article-title":"RLAIF-V: Aligning MLLMs through open-source AI feedback for super GPT-4V trustworthiness","author":"Yu","year":"2024","journal-title":"arXiv:2405.17220"},{"key":"ref43","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref44","volume-title":"Openai. GPT-4V(Ision) System Card","year":"2023"},{"key":"ref45","article-title":"Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned","author":"Ganguli","year":"2022","journal-title":"arXiv:2209.07858"},{"key":"ref46","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref47","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lee"},{"key":"ref48","first-page":"41414","article-title":"The wisdom of hindsight makes language models better instruction followers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhang"},{"key":"ref49","volume-title":"Koala: A Dialogue Model for Academic Research","author":"Lab","year":"2023"},{"key":"ref50","first-page":"1","article-title":"GLM-130B: An open bilingual pre-trained model","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zeng"},{"key":"ref51","article-title":"Qwen3 technical report","volume-title":"arXiv:2505.09388","author":"Yang","year":"2025"},{"key":"ref52","volume-title":"Mistral\u2014A Journey Towards Reproducible Language Model Training","author":"Karamcheti","year":"2021"},{"key":"ref53","first-page":"1","article-title":"WizardLM: Empowering large pre-trained language models to follow complex instructions","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Xu"},{"key":"ref54","first-page":"1","article-title":"The curious case of neural text degeneration","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Holtzman"},{"key":"ref55","first-page":"1","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Hu"},{"key":"ref56","article-title":"The Llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref57","first-page":"1","article-title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","author":"Srivastava","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref58","first-page":"1","article-title":"AutoDAN: Generating stealthy jailbreak prompts on aligned large language models","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Liu"},{"key":"ref59","first-page":"35181","article-title":"HarmBench: A standardized evaluation framework for automated red teaming and robust refusal","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Mazeika"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"ref61","first-page":"1","article-title":"Measuring massive multitask language understanding","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Hendrycks"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1454"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.35"},{"key":"ref64","first-page":"1","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Rafailov"},{"key":"ref65","article-title":"KTO: Model alignment as prospect theoretic optimization","author":"Ethayarajh","year":"2024","journal-title":"arXiv:2402.01306"},{"key":"ref66","article-title":"Universal and transferable adversarial attacks on aligned language models","author":"Zou","year":"2023","journal-title":"arXiv:2307.15043"},{"key":"ref67","article-title":"Llama guard 3 vision: Safeguarding human-AI image understanding conversations","author":"Chi","year":"2024","journal-title":"arXiv:2411.10414"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.329"}],"container-title":["IEEE Transactions on Information Forensics and Security"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10206\/11313711\/11345453.pdf?arnumber=11345453","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T20:43:24Z","timestamp":1770065004000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11345453\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":68,"URL":"https:\/\/doi.org\/10.1109\/tifs.2026.3652843","relation":{},"ISSN":["1556-6013","1556-6021"],"issn-type":[{"value":"1556-6013","type":"print"},{"value":"1556-6021","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}