{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T15:03:51Z","timestamp":1781276631817,"version":"3.54.1"},"reference-count":141,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,10]]},"DOI":"10.1016\/j.neunet.2026.108996","type":"journal-article","created":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T06:14:37Z","timestamp":1776233677000},"page":"108996","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Aligning large language models across the lifecycle: A survey on safety-usability trade-offs from pre-training to post-training"],"prefix":"10.1016","volume":"202","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1800-2826","authenticated-orcid":false,"given":"Zhiqiang","family":"Hao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9113-9398","authenticated-orcid":false,"given":"Hongming","family":"Fei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiao","family":"Fu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bin","family":"Luo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.108996_bib0001","series-title":"Proceedings of the 2016\u202fACM SIGSAC conference on computer and communications security","first-page":"308","article-title":"Deep learning with differential privacy","author":"Abadi","year":"2016"},{"key":"10.1016\/j.neunet.2026.108996_bib0002","doi-asserted-by":"crossref","unstructured":"Alexandrov, A., Raychev, V., M\u00fcller, M. N., Zhang, C., Vechev, M., & Toutanova, K. (2024). Mitigating catastrophic forgetting in language transfer via model merging.arXiv: 2407.08699\">.","DOI":"10.18653\/v1\/2024.findings-emnlp.1000"},{"key":"10.1016\/j.neunet.2026.108996_bib0003","unstructured":"Atari, M. et al. (2023). The WEIRDness of AI: How AI-generated text is biased toward western, educated, industrialized, rich, and democratic populations.arXiv: 2306.09222\">."},{"key":"10.1016\/j.neunet.2026.108996_bib0004","unstructured":"Bai, Y., Kadavath, S., Kundu, S., Askell, A., Kernion, J., Jones, A., Chen, A., Goldie, A., Mirhoseini, A., McKinnon, C. et al. (2022a). Constitutional AI: Harmlessness from AI feedback. arXiv: 2212.08073\">."},{"key":"10.1016\/j.neunet.2026.108996_bib0005","unstructured":"Bai, Y. et al. (2022b). Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv: 2204.05862\">."},{"key":"10.1016\/j.neunet.2026.108996_bib0006","series-title":"Proceedings of the 2021\u202fACM conference on fairness, accountability, and transparency","first-page":"610","article-title":"On the dangers of stochastic parrots: Can language models be too big?","author":"Bender","year":"2021"},{"key":"10.1016\/j.neunet.2026.108996_bib0007","unstructured":"Bhardwaj, R., & Poria, S. (2023). Red-teaming large language models using chain of utterances for safety-alignment. arXiv: 2308.09662\">arXiv preprint arXiv: 2308.09662."},{"key":"10.1016\/j.neunet.2026.108996_bib0008","unstructured":"Bianchi, F., Suzgun, M., Attanasio, G., R\u00f6ttger, P., Jurafsky, D., Hashimoto, T., & Zou, J. (2023). Safety-tuned llamas: Lessons from improving the safety of large language models that follow instructions.arXiv: 2309.07875\">arXiv preprint arXiv: 2309.07875."},{"key":"10.1016\/j.neunet.2026.108996_bib0009","series-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: Student research workshop","first-page":"7","article-title":"Identifying and reducing gender bias in word-level language models","author":"Bordia","year":"2019"},{"key":"10.1016\/j.neunet.2026.108996_bib0010","series-title":"2021\u202fIEEE Symposium on security and privacy (SP)","first-page":"141","article-title":"Machine unlearning","author":"Bourtoule","year":"2021"},{"key":"10.1016\/j.neunet.2026.108996_bib0011","series-title":"Findings of the association for computational linguistics: EMNLP 2022","first-page":"3194","article-title":"Hidden in plain sight: The impact of PII redaction on language model pre-training","author":"Caines","year":"2022"},{"key":"10.1016\/j.neunet.2026.108996_bib0012","series-title":"Advances in neural information processing systems","first-page":"61478","article-title":"Are aligned neural networks adversarially aligned?","volume":"vol. 36","author":"Carlini","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0013","series-title":"30th USENIX security symposium (USENIX security 21)","first-page":"1633","article-title":"Extracting training data from large language models","author":"Carlini","year":"2021"},{"key":"10.1016\/j.neunet.2026.108996_bib0014","unstructured":"Casper, S. et al. (2023). Open problems and fundamental limitations of reinforcement learning from human feedback.arXiv: 2307.15217\">arXiv preprint arXiv: 2307.15217."},{"key":"10.1016\/j.neunet.2026.108996_bib0015","unstructured":"Chalkidis, I. (2025). Decoding alignment: A critical survey of LLM development initiatives through value-setting and data-centric lens.arXiv: 2508.16982\">arXiv preprint arXiv: 2508.16982."},{"key":"10.1016\/j.neunet.2026.108996_bib0016","unstructured":"Cheng, P. et al. (2023). Backdoor attacks and countermeasures in natural language processing models: A comprehensive security review. arXiv: 2309.06055\">arXiv preprint arXiv: 2309.06055."},{"key":"10.1016\/j.neunet.2026.108996_bib0017","unstructured":"Chu, J., Liu, Y., Yang, Z., Shen, X., Backes, M., & Zhang, Y. (2025). Jailbreakradar: Comprehensive assessment of jailbreak attacks against LLMs. https:\/\/arxiv.org\/abs\/2402.05668."},{"key":"10.1016\/j.neunet.2026.108996_bib0018","unstructured":"Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R. et al. (2021). Training verifiers to solve math word problems. arXiv: 2110.14168\">arXiv preprint arXiv: 2110.14168."},{"key":"10.1016\/j.neunet.2026.108996_bib0019","series-title":"Findings of the association for computational linguistics: EACL 2023","first-page":"1767","article-title":"Evaluating the ripple effects of model editing","author":"Cohen","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0020","unstructured":"Cui, J., Chiang, W.-L., Stoica, I., & Hsieh, C.-J. (2024). Or-bench: An over-refusal benchmark for large language models. arXiv: 2306.09222\">arXiv preprint arXiv: 2405.20947."},{"key":"10.1016\/j.neunet.2026.108996_bib0021","unstructured":"Dai, T. et al. (2023). Safe RLHF: Safe reinforcement learning from human feedback. arXiv: 2310.12773\">arXiv preprint arXiv: 2310.12773."},{"issue":"6","key":"10.1016\/j.neunet.2026.108996_bib0022","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3712001","article-title":"Security and privacy challenges of large language models: A survey","volume":"57","author":"Das","year":"2025","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.neunet.2026.108996_bib0023","unstructured":"Djuhera, A., Kadhe, S. R., Ahmed, F., Zawad, S., & Boche, H. (2025). SafeMERGE: Preserving safety alignment in fine-tuned large language models via selective layer-wise model merging. arXiv: 2503.17239\">arXiv preprint arXiv: 2503.17239."},{"key":"10.1016\/j.neunet.2026.108996_bib0024","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing","first-page":"343","article-title":"Documenting the english colossal clean crawled corpus","author":"Dodge","year":"2021"},{"key":"10.1016\/j.neunet.2026.108996_bib0025","unstructured":"Du, Y., Kong, S. T., & Srikant, R. (2025). Primal-dual direct preference optimization for constrained LLM alignment.arXiv: 2510.05703\">arXiv preprint arXiv: 2510.05703."},{"key":"10.1016\/j.neunet.2026.108996_bib0026","unstructured":"Duan, J. et al. (2024). Stronger privacy auditing with better canary generation.arXiv: 2405.19365\">arXiv preprint arXiv: 2405.19365."},{"key":"10.1016\/j.neunet.2026.108996_bib0027","unstructured":"Ethayarajh, K., Xu, W., Muennighoff, N., Jurafsky, D., & Kiela, D. (2024). Kto: Model alignment as prospect theoretic optimization. arXiv: 2402.01306\">arXiv preprint arXiv: 2402.01306."},{"key":"10.1016\/j.neunet.2026.108996_bib0028","unstructured":"Farn, H., Su, H., Kumar, S. H., Sahay, S., Chen, S.-T., & Lee, H.-y. (2024). Safeguard fine-tuned LLMs through pre-and post-tuning model merging.arXiv: 2412.19512\">arXiv preprint arXiv: 2412.19512."},{"key":"10.1016\/j.neunet.2026.108996_bib0029","unstructured":"Fu, T., Cai, D., Liu, L., Shi, S., & Yan, R. (2024). Disperse-then-merge: Pushing the limits of instruction tuning via alignment tax reduction. arXiv: 2405.13432\">arXiv preprint arXiv: 2405.13432."},{"key":"10.1016\/j.neunet.2026.108996_bib0030","first-page":"1","article-title":"Bias and fairness in large language models: A survey","author":"Gallegos","year":"2024","journal-title":"Computational Linguistics"},{"key":"10.1016\/j.neunet.2026.108996_bib0031","unstructured":"Ganguli, D., Sharma, A., Lovitt, L., Askell, A., Schiefer, N., & Amodei, D. (2023). The capacity for moral self-correction in large language models. arXiv: 2302.07459\">arXiv preprint arXiv: 2302.07459."},{"key":"10.1016\/j.neunet.2026.108996_bib0032","series-title":"International conference on machine learning","first-page":"10835","article-title":"Scaling laws for reward model overoptimization","author":"Gao","year":"2023"},{"issue":"12","key":"10.1016\/j.neunet.2026.108996_bib0033","doi-asserted-by":"crossref","first-page":"86","DOI":"10.1145\/3458723","article-title":"Datasheets for datasets","volume":"64","author":"Gebru","year":"2021","journal-title":"Communications of the ACM"},{"key":"10.1016\/j.neunet.2026.108996_bib0034","unstructured":"Geng, J., Li, Q., Woisetschlaeger, H., Chen, Z., Cai, F., Wang, Y., Nakov, P., Jacobsen, H.-A., & Karray, F. (2025). A comprehensive survey of machine unlearning techniques for large language models. https:\/\/arxiv.org\/abs\/2503.01854."},{"key":"10.1016\/j.neunet.2026.108996_bib0035","doi-asserted-by":"crossref","unstructured":"Gholami, M., Akbari, M., Hu, C., Masrani, V., Wang, Z. J., & Zhang, Y. (2024). Gold: Generalized knowledge distillation via out-of-distribution-guided language data generation. https:\/\/arxiv.org\/abs\/2403.19754.","DOI":"10.18653\/v1\/2024.findings-naacl.272"},{"key":"10.1016\/j.neunet.2026.108996_bib0036","unstructured":"Goodfellow, I. J., Shlens, J., & Szegedy, C. (2014). Explaining and harnessing adversarial examples. arXiv: 1412.6572\">arXiv preprint arXiv: 1412.6572."},{"key":"10.1016\/j.neunet.2026.108996_bib0037","unstructured":"Guan, M. Y., Joglekar, M., Wallace, E., Jain, S., Barak, B., Helyar, A., Dias, R., Vallone, A., Ren, H., Wei, J. et al. (2024). Deliberative alignment: Reasoning enables safer language models.arXiv: 2412.16339\">arXiv preprint arXiv: 2412.16339."},{"key":"10.1016\/j.neunet.2026.108996_bib0038","series-title":"Proceedings of the 31st ACM joint european software engineering conference and symposium on the foundations of software engineering","first-page":"2082","article-title":"Getting pwn\u2019d by AI: Penetration testing with large language models","author":"Happe","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0039","series-title":"Proceedings of the 60th annual meeting of the association for computational linguistics","first-page":"3309","article-title":"Toxigen: A large-scale machine-generated dataset for adversarial and implicit hate speech detection","author":"Hartvigsen","year":"2022"},{"key":"10.1016\/j.neunet.2026.108996_bib0040","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","first-page":"15888","article-title":"Does localization inform editing? Surprising differences in causality-based localization vs. knowledge editing in language models","author":"Hase","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0041","series-title":"Joint European conference on machine learning and knowledge discovery in databases","first-page":"257","article-title":"Pareto multi-objective alignment for language models","author":"He","year":"2025"},{"key":"10.1016\/j.neunet.2026.108996_bib0042","unstructured":"Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., & Steinhardt, J. (2020). Measuring massive multitask language understanding. arXiv: 2009.03300\">arXiv preprint arXiv: 2009.03300."},{"key":"10.1016\/j.neunet.2026.108996_bib0043","unstructured":"Hinton, G., Vinyals, O., & Dean, J. (2015). Distilling the knowledge in a neural network.arXiv: 1503.02531\">arXiv preprint arXiv: 1503.02531."},{"key":"10.1016\/j.neunet.2026.108996_bib0044","unstructured":"Hong, J. et al. (2024). ORPO: Monolithic preference optimization without reference model.arXiv: 2403.07691\">arXiv preprint arXiv: 2403.07691."},{"key":"10.1016\/j.neunet.2026.108996_bib0045","first-page":"1","article-title":"Weird? Institutions and consumers\u2019 perceptions of artificial intelligence in 31 countries","author":"Howell","year":"2025","journal-title":"AI & Society"},{"issue":"2","key":"10.1016\/j.neunet.2026.108996_bib0046","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"issue":"7","key":"10.1016\/j.neunet.2026.108996_bib0047","doi-asserted-by":"crossref","first-page":"175","DOI":"10.1007\/s10462-024-10824-0","article-title":"A survey of safety and trustworthiness of large language models through the lens of verification and validation","volume":"57","author":"Huang","year":"2024","journal-title":"Artificial Intelligence Review"},{"key":"10.1016\/j.neunet.2026.108996_bib0048","unstructured":"Ivison, H., Wang, Y., Pyatkin, V., Lambert, N., Peters, M., Dasigi, P., Jang, J., Wadden, D., Smith, N. A., Beltagy, I., & Hajishirzi, H. (2023). Camels in a changing climate: Enhancing LM adaptation with tulu 2. https:\/\/arxiv.org\/abs\/2311.10702."},{"key":"10.1016\/j.neunet.2026.108996_bib0049","unstructured":"Jain, N., Chen, J., Somepalli, G., & Goldstein, T. (2023). Towards safe and trustworthy parameter-efficient fine-tuning. arXiv: 2310.18138\">arXiv preprint arXiv: 2310.18138."},{"key":"10.1016\/j.neunet.2026.108996_bib0050","unstructured":"Ji, M., Wu, Y., Wu, Z., Wang, S., Yang, J., Dras, M., & Naseem, U. (2025). A survey on progress in llm alignment from the perspective of reward design. arXiv: 2505.02666\">arXiv preprint arXiv: 2505.02666."},{"key":"10.1016\/j.neunet.2026.108996_bib0051","series-title":"Proceedings of the 2023\u202fACM SIGSAC conference on computer and communications security","first-page":"1331","article-title":"Certified data removal from large language models","author":"Jia","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0052","unstructured":"Jiang, H. et al. (2019). SMART: Robust and efficient fine-tuning for pre-trained natural language models through principled regularized optimization. arXiv: 1911.03437\">arXiv preprint arXiv: 1911.03437."},{"key":"10.1016\/j.neunet.2026.108996_bib0053","unstructured":"Kim, M., Kwak, J. M., Alssum, L., Ghanem, B., Torr, P., Krueger, D., Barez, F., & Bibi, A. (2025). Rethinking safety in llm fine-tuning: An optimization perspective. arXiv: 2508.12531\">arXiv preprint arXiv: 2508.12531."},{"key":"10.1016\/j.neunet.2026.108996_bib0054","series-title":"Proceedings of the national academy of sciences","first-page":"3521","article-title":"Overcoming catastrophic forgetting in neural networks","volume":"114","author":"Kirkpatrick","year":"2017"},{"key":"10.1016\/j.neunet.2026.108996_bib0055","series-title":"International conference on machine learning","first-page":"17838","article-title":"Pretraining language models with human preferences","author":"Korbak","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0056","unstructured":"Lambert, N. (2025). Reinforcement learning from human feedback. https:\/\/arxiv.org\/abs\/2504.12501."},{"key":"10.1016\/j.neunet.2026.108996_bib0057","unstructured":"Lee, H., Phatale, S., Mansoor, H., Mesnard, T., Ferret, J., Lu, K., Bishop, C., Hall, E., Carbune, V., Rastogi, A., & Prakash, S. (2024). Rlaif vs. rlhf: Scaling reinforcement learning from human feedback with ai feedback. https:\/\/arxiv.org\/abs\/2309.00267."},{"key":"10.1016\/j.neunet.2026.108996_bib0058","unstructured":"Lee, S., Han, J., Song, H., Choi, S. J., Lee, H., & Yu, Y. (2025). Kl penalty control via perturbation for direct preference optimization. arXiv: 2502.13177\">arXiv preprint arXiv: 2502.13177."},{"key":"10.1016\/j.neunet.2026.108996_bib0059","unstructured":"Lermen, S., & Rogers-Smith, C. (2023). Lora fine-tuning efficiently undoes safety training in llama 2-chat 70b. arXiv: 2310.20624\">arXiv preprint arXiv: 2310.20624."},{"key":"10.1016\/j.neunet.2026.108996_bib0060","unstructured":"Li, A., Zhang, Y., & Li, Y. (2023a). BadEdit: Backdooring large language models by model editing.arXiv: 2311.00882\">arXiv preprint arXiv: 2311.00882."},{"key":"10.1016\/j.neunet.2026.108996_bib0061","unstructured":"Li, C., Zhang, H., Xu, Y., Xue, H., Ao, X., & He, Q. (2025a). Gradient-adaptive policy optimization: Towards multi-objective alignment of large language models. arXiv: 2507.01915\">arXiv preprint arXiv: 2507.01915."},{"key":"10.1016\/j.neunet.2026.108996_bib0062","unstructured":"Li, H., Li, L., Lu, Z., Wei, X., Li, R., Shao, J., & Sha, L. (2025b). Layer-aware representation filtering: Purifying finetuning data to preserve llm safety alignment. arXiv: 2507.18631\">arXiv preprint arXiv: 2507.18631."},{"key":"10.1016\/j.neunet.2026.108996_bib0063","unstructured":"Li, H. et al. (2023b). Privacy in large language models: Attacks, defenses and future directions. arXiv: 2310.10383\">arXiv preprint arXiv: 2310.10383."},{"key":"10.1016\/j.neunet.2026.108996_bib0064","unstructured":"Li, K., Chen, Y., Vi\u00e9gas, F., & Wattenberg, M. (2025c). When bad data leads to good models. https:\/\/arxiv.org\/abs\/2505.04741."},{"key":"10.1016\/j.neunet.2026.108996_bib0065","unstructured":"Li, S., Liu, N., Cheng, W., Liu, H., & Chen, C. (2023c). BadLoRA: Backdoor attacks on low-rank adaptation for large language models. arXiv: 2311.12497\">arXiv preprint arXiv: 2311.12497."},{"key":"10.1016\/j.neunet.2026.108996_bib0066","unstructured":"Li, Y., Chen, Z., Zhang, Z., & Wang, S. (2024). Stealthy backdoor attacks on knowledge editing in large language models.arXiv: 2403.01239\">arXiv preprint arXiv: 2403.01239."},{"issue":"5","key":"10.1016\/j.neunet.2026.108996_bib0067","first-page":"745","article-title":"Catastrophic forgetting in connectionist networks: A review","volume":"36","author":"Li","year":"2024","journal-title":"Neural Computation"},{"key":"10.1016\/j.neunet.2026.108996_bib0068","doi-asserted-by":"crossref","unstructured":"Lin, S., Hilton, J., & Evans, O. (2022). TruthfulQA: Measuring how models mimic human falsehoods. Proceedings of the 60th annual meeting of the association for computational linguistics, (pp. 3214\u20133252).","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"10.1016\/j.neunet.2026.108996_bib0069","series-title":"Proceedings of the 2022\u202fAAAI\/ACM conference on AI, ethics, and society","first-page":"526","article-title":"The right to be forgotten in the era of large language models","author":"Liu","year":"2022"},{"key":"10.1016\/j.neunet.2026.108996_bib0070","unstructured":"Liu, Y., Zhang, T., Wu, C., Zhang, Y., Zhang, S., Liu, K. et al. (2023). Trustworthy LLMs: A survey and guideline for evaluating, improving and utilizing large language models. arXiv: 2308.05374\">arXiv preprint arXiv: 2308.05374."},{"key":"10.1016\/j.neunet.2026.108996_bib0071","unstructured":"Luo, Y. et al. (2023). An empirical study of catastrophic forgetting in large language models during continual fine-tuning.arXiv: 2308.08747\">."},{"key":"10.1016\/j.neunet.2026.108996_bib0072","unstructured":"Madry, A. et al. (2017). Towards deep learning models resistant to adversarial attacks.arXiv: 1706.06083\">arXiv preprint arXiv: 1706.06083."},{"key":"10.1016\/j.neunet.2026.108996_bib0073","unstructured":"Maini, P., Morcos, A. S., & Rolnick, D. (2024). Understanding catastrophic forgetting and unlearning in large language models. arXiv: 2403.00886\">arXiv preprint arXiv: 2403.00886."},{"key":"10.1016\/j.neunet.2026.108996_bib0074","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"15009","article-title":"A holistic approach to undesired content detection in the real world","volume":"vol. 37","author":"Markov","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0075","series-title":"Advances in neural information processing systems","first-page":"17359","article-title":"Locating and editing factual associations in GPT","volume":"vol. 35","author":"Meng","year":"2022"},{"key":"10.1016\/j.neunet.2026.108996_bib0076","unstructured":"Meng, K., Sharma, A. S., Andonian, A., Belinkov, Y., & Bau, D. (2022b). Mass-editing memory in a transformer. arXiv: 2210.07229\">arXiv preprint arXiv: 2210.07229."},{"key":"10.1016\/j.neunet.2026.108996_bib0077","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"24559","article-title":"Backdoor attacks in the model editing era","author":"Mitchell","year":"2024"},{"issue":"4","key":"10.1016\/j.neunet.2026.108996_bib0078","first-page":"1091","article-title":"Auditing large language models: A three-layered approach","volume":"3","author":"M\u00f6kander","year":"2023","journal-title":"AI and Ethics"},{"key":"10.1016\/j.neunet.2026.108996_bib0079","unstructured":"Nadeem, M., Bethke, A., & Reddy, S. (2020). StereoSet: Measuring stereotypical bias in pretrained language models. arXiv: 2004.09456\">arXiv preprint arXiv: 2004.09456."},{"key":"10.1016\/j.neunet.2026.108996_bib0080","series-title":"Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP)","first-page":"1953","article-title":"CrowS-Pairs: A challenge dataset for measuring social biases in masked language models","author":"Nangia","year":"2020"},{"key":"10.1016\/j.neunet.2026.108996_bib0081","series-title":"Advances in neural information processing systems 35","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","author":"Ouyang","year":"2022"},{"key":"10.1016\/j.neunet.2026.108996_bib0082","unstructured":"Ovsianas, A., Ramapuram, J., Busbridge, D., Dhekane, E. G., & Webb, R. (2022). Elastic weight consolidation improves the robustness of self-supervised learning methods under transfer. https:\/\/arxiv.org\/abs\/2210.16365."},{"key":"10.1016\/j.neunet.2026.108996_bib0083","unstructured":"Pal, A., Bhargava, R., Hinsz, K., Esterhuizen, J., & Bhattacharya, S. (2024). The empirical impact of data sanitization on language models. arXiv: 2411.05978\">arXiv preprint arXiv: 2411.05978."},{"key":"10.1016\/j.neunet.2026.108996_bib0084","unstructured":"Pan, Y., Shi, T., Zhao, J., & Ma, J. W. (2025). Detecting and filtering unsafe training data via data attribution. arXiv: 2502.11411\">arXiv preprint arXiv: 2502.11411."},{"key":"10.1016\/j.neunet.2026.108996_bib0085","series-title":"Findings of the association for computational linguistics: ACL 2022","first-page":"2036","article-title":"BBQ: A hand-built bias benchmark for question answering","author":"Parrish","year":"2022"},{"issue":"11","key":"10.1016\/j.neunet.2026.108996_bib0086","doi-asserted-by":"crossref","DOI":"10.1016\/j.patter.2021.100336","article-title":"Data and its (dis)contents: A survey of dataset development and use in machine learning research","volume":"2","author":"Paullada","year":"2021","journal-title":"Patterns"},{"key":"10.1016\/j.neunet.2026.108996_bib0087","unstructured":"Penedo, G. et al. (2023). The refinedweb dataset for falcon LLM: Outperforming curated corpora with web data, and web data only.arXiv: 2306.01116\">arXiv preprint arXiv: 2306.01116."},{"key":"10.1016\/j.neunet.2026.108996_bib0088","series-title":"Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"14661","article-title":"What do we measure when we measure knowledge editing?","author":"Pinter","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0089","series-title":"47th annual conference of the cognitive science society","first-page":"665","article-title":"Whose values prevail? Bias in large language model value alignment","author":"Qi","year":"2025"},{"key":"10.1016\/j.neunet.2026.108996_bib0090","unstructured":"Qi, X., Chen, M., Chen, Z., Wang, Z., & Wang, X. (2023a). Fine-tuning aligned language models compromises safety, even when users do not intend to. arXiv: 2310.03693\">arXiv preprint arXiv: 2310.03693."},{"key":"10.1016\/j.neunet.2026.108996_bib0091","unstructured":"Qi, X., Zhang, Y., Bai, J., Zhang, S., Liu, K., & Wang, M. (2023b). Visual adversarial examples jailbreak large language models. arXiv: 2306.13213\">arXiv preprint arXiv: 2306.13213."},{"key":"10.1016\/j.neunet.2026.108996_bib0092","series-title":"Proceedings of the 57th annual meeting of the association for computational linguistics","first-page":"1530","article-title":"Reducing gender bias in word-level language models with a gender-equalizing loss function","author":"Qian","year":"2019"},{"issue":"8","key":"10.1016\/j.neunet.2026.108996_bib0093","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"10.1016\/j.neunet.2026.108996_bib0094","series-title":"Advances in neural information processing systems 36","first-page":"3730","article-title":"Direct preference optimization: Your language model is secretly a reward model","author":"Rafailov","year":"2023"},{"issue":"140","key":"10.1016\/j.neunet.2026.108996_bib0095","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.neunet.2026.108996_bib0096","unstructured":"Robey, A., Wong, E., Hassani, H., & Pappas, G. J. (2023). SmoothLLM: Defending large language models against jailbreaking attacks. arXiv: 2310.03684\">arXiv preprint arXiv: 2310.03684."},{"key":"10.1016\/j.neunet.2026.108996_bib0097","series-title":"Proceedings of the 2024 conference of the North American chapter of the association for computational linguistics: Human language technologies","first-page":"5377","article-title":"XSTest: A test suite for identifying exaggerated safety behaviours in large language models","author":"R\u00f6ttger","year":"2024"},{"key":"10.1016\/j.neunet.2026.108996_bib0098","series-title":"International conference on machine learning","first-page":"9455","article-title":"Remember what you want to forget: A framework for machine unlearning","author":"Sekhari","year":"2021"},{"key":"10.1016\/j.neunet.2026.108996_bib0099","unstructured":"Shen, X. et al. (2023a). \u201cDo Anything Now\u201d: Characterizing and evaluating in-the-wild jailbreak prompts on large language models. arXiv: 2308.03825\">arXiv preprint arXiv: 2308.03825."},{"key":"10.1016\/j.neunet.2026.108996_bib0100","unstructured":"Shen, Z. et al. (2023b). Principle-driven self-alignment of language models from scratch with minimal human supervision. arXiv: 2305.03047\">arXiv preprint arXiv: 2305.03047."},{"key":"10.1016\/j.neunet.2026.108996_bib0101","series-title":"2017\u202fIEEE Symposium on security and privacy (SP)","first-page":"3","article-title":"Membership inference attacks against machine learning models","author":"Shokri","year":"2017"},{"issue":"1","key":"10.1016\/j.neunet.2026.108996_bib0102","first-page":"1","article-title":"Large language models for PII detection in educational datasets","volume":"16","author":"Singhal","year":"2024","journal-title":"Journal of Educational Data Mining"},{"key":"10.1016\/j.neunet.2026.108996_bib0103","article-title":"Certified defenses for data poisoning attacks","volume":"30","author":"Steinhardt","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108996_bib0104","unstructured":"Tan, Y., Jiang, Y., Li, Y., Liu, J., Bu, X., Su, W., Yue, X., Zhu, X., & Zheng, B. (2025). Equilibrate RLHF: Towards balancing helpfulness-safety trade-off in large language models. https:\/\/arxiv.org\/abs\/2502.11555."},{"key":"10.1016\/j.neunet.2026.108996_bib0105","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S. et al. (2023). Llama 2: Open foundation and fine-tuned chat models. arXiv: 2307.09288\">arXiv preprint arXiv: 2307.09288."},{"key":"10.1016\/j.neunet.2026.108996_bib0106","unstructured":"Wan, X., Chen, Y., Lin, B. Y., Huang, H. T., & Wang, W. (2023). Poisoning attacks on and defenses for large language models: A survey. arXiv: 2310.16333\">arXiv preprint arXiv: 2310.16333."},{"key":"10.1016\/j.neunet.2026.108996_bib0107","unstructured":"Wang, B., Chen, W., Pei, H., Xie, C., Kang, M., Zhang, C., Xu, C., Xiong, Z., Dutta, R., Schaeffer, R. et al. (2023a). Decodingtrust: A comprehensive assessment of trustworthiness in GPT models. arXiv e-prints, (pp. arXiv\u20132306)."},{"key":"10.1016\/j.neunet.2026.108996_bib0108","unstructured":"Wang, K., Zhang, G., Zhou, Z., Wu, J., Yu, M., Zhao, S., Yin, C., Fu, J., Yan, Y., Luo, H. et al. (2025). A comprehensive survey in LLM (-agent) full stack safety: Data, training and deployment.arXiv: 2504.15585\">arXiv preprint arXiv: 2504.15585."},{"key":"10.1016\/j.neunet.2026.108996_bib0109","unstructured":"Wang, S. et al. (2024). Unique security and privacy threats of large language models: A comprehensive survey. arXiv: 2406.07973\">arXiv preprint arXiv: 2406.07973."},{"key":"10.1016\/j.neunet.2026.108996_bib0110","unstructured":"Wang, Y., Zhong, W., Li, L., Mi, F., Zeng, X., Huang, W., Shang, L., Jiang, X., & Liu, Q. (2023b). Aligning large language models with human: A survey. arXiv: 2307.12966\">arXiv preprint arXiv: 2307.12966."},{"key":"10.1016\/j.neunet.2026.108996_bib0111","unstructured":"Wang, Z., Zhang, Z., Liu, Y., Wang, S., & Zhang, Y. (2023c). ProSafe: A proactive safety-aware framework for large language model fine-tuning. arXiv: 2311.02105\">arXiv preprint arXiv: 2311.02105."},{"key":"10.1016\/j.neunet.2026.108996_bib0112","unstructured":"Waris, H., Khan, S. I., Nawaz, M., & Rawat, D. B. (2023). Unlearning in LLMs: A critical survey and future directions. arXiv: 2311.03348\">arXiv preprint arXiv: 2311.03348."},{"key":"10.1016\/j.neunet.2026.108996_bib0113","unstructured":"Weidinger, L. et al. (2021). Ethical and social risks of harm from language models. arXiv: 2112.04359\">arXiv preprint arXiv: 2112.04359."},{"key":"10.1016\/j.neunet.2026.108996_bib0114","unstructured":"Wenzek, G. et al. (2019). CCNet: Extracting high quality monolingual datasets from web crawl data. arXiv: 1911.00359\">arXiv preprint arXiv: 1911.00359."},{"key":"10.1016\/j.neunet.2026.108996_bib0115","unstructured":"Wu, T., Luo, L., Li, Y.-F., Pan, S., Vu, T.-T., & Haffari, G. (2024a). Continual learning for large language models: A survey. https:\/\/arxiv.org\/abs\/2402.01364."},{"key":"10.1016\/j.neunet.2026.108996_bib0116","unstructured":"Wu, T. et al. (2024b). Continual learning for large language models: A survey. arXiv: 2402.01364\">arXiv preprint arXiv: 2402.01364."},{"key":"10.1016\/j.neunet.2026.108996_bib0117","unstructured":"Wu, Z., Wu, Q., Chen, L., Xu, K., & He, J. (2023). SafePEFT: A framework for safety-assured parameter-efficient fine-tuning of large language models. arXiv: 2310.13575\">arXiv preprint arXiv: 2310.13575."},{"key":"10.1016\/j.neunet.2026.108996_bib0118","unstructured":"Xiang, R., Zhang, J., & Liu, B. (2023). RobustLoRA: A plug-and-play subspace-based defense for backdoor attacks in LoRA. arXiv: 2310.15131\">arXiv preprint arXiv: 2310.15131."},{"key":"10.1016\/j.neunet.2026.108996_bib0119","article-title":"Self-pluralising culture alignment for large language models","author":"Xu","year":"2024","journal-title":"Qeios"},{"issue":"4","key":"10.1016\/j.neunet.2026.108996_bib0120","article-title":"A survey on large language model (LLM) security and privacy: The good, the bad, and the ugly","volume":"3","author":"Yao","year":"2023","journal-title":"High-Confidence Computing"},{"key":"10.1016\/j.neunet.2026.108996_bib0121","unstructured":"Yi, S. et al. (2024). Jailbreak attacks and defenses against large language models: A survey. arXiv: 2407.04295\">arXiv preprint arXiv: 2407.04295."},{"key":"10.1016\/j.neunet.2026.108996_bib0122","unstructured":"Yong, Z.-X., Menghini, C., & Bach, S. H. (2023). Low-resource languages jailbreak GPT-4. arXiv: 2310.02446\">arXiv preprint arXiv: 2310.02446."},{"key":"10.1016\/j.neunet.2026.108996_bib0123","series-title":"Icassp 2025-2025 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"1","article-title":"Bridging the fairness gap: Enhancing pre-trained models with LLM-generated sentences","author":"Yu","year":"2025"},{"key":"10.1016\/j.neunet.2026.108996_bib0124","unstructured":"Zeng, Y., Zhang, Z., Jia, R., & Wang, W. (2024). Defending against weight-poisoning backdoor attacks for parameter-efficient fine-tuning. arXiv: 2401.17330\">arXiv preprint arXiv: 2401.17330."},{"key":"10.1016\/j.neunet.2026.108996_bib0125","series-title":"Findings of the association for computational linguistics: EMNLP 2023","first-page":"1117","article-title":"Poisoning language models during instruction tuning","author":"Zhan","year":"2023"},{"key":"10.1016\/j.neunet.2026.108996_bib0126","series-title":"Proceedings of the 2018\u202fAAAI\/ACM conference on AI, ethics, and society","first-page":"335","article-title":"Mitigating unwanted biases with adversarial learning","author":"Zhang","year":"2018"},{"key":"10.1016\/j.neunet.2026.108996_bib0127","unstructured":"Zhang, T., Liu, Y., Wu, C., Zhang, Y., Zhang, S., Liu, K. et al. (2024a). A survey on knowledge editing for large language models. arXiv: 2401.06202\">arXiv preprint arXiv: 2401.06202."},{"key":"10.1016\/j.neunet.2026.108996_bib0128","unstructured":"Zhang, T., Liu, Y., Zhang, Y., Wu, C., Zhang, S., Liu, K. et al. (2023a). A survey on safety and trustworthiness of large language models. arXiv: 2312.05332\">arXiv preprint arXiv: 2312.05332."},{"key":"10.1016\/j.neunet.2026.108996_bib0129","unstructured":"Zhang, Y., Li, M., Han, W., Yao, Y., Cen, Z., & Zhao, D. (2025). Safety is not only about refusal: Reasoning-enhanced fine-tuning for interpretable LLM safety. arXiv: 2503.05021\">arXiv preprint arXiv: 2503.05021."},{"key":"10.1016\/j.neunet.2026.108996_bib0130","unstructured":"Zhang, Y., Liu, Y., Zhang, T., Wu, C., Zhang, S., Li, Y., & Wang, M. (2024b). A survey of backdoor attacks and defenses in large language models. arXiv: 2405.02102\">arXiv preprint arXiv: 2405.02102."},{"key":"10.1016\/j.neunet.2026.108996_bib0131","unstructured":"Zhang, Z., Lei, L., Wu, L., Sun, R., Huang, Y., Long, C., Liu, X., Lei, X., Tang, J., & Huang, M. (2023b). Safetybench: Evaluating the safety of large language models. arXiv: 2309.07045\">arXiv preprint arXiv: 2309.07045."},{"key":"10.1016\/j.neunet.2026.108996_bib0132","series-title":"Proceedings of the 2024 conference of the North American chapter of the association for computational linguistics: Human language technologies","first-page":"555","article-title":"SaLoRA: A safety-tuned LoRA for large language models","author":"Zhang","year":"2024"},{"key":"10.1016\/j.neunet.2026.108996_bib0133","unstructured":"Zhao, S., Duan, R., Wang, F., Chen, C., Kang, C., Ruan, S., Tao, J., Chen, Y., Xue, H., & Wei, X. (2025a). Jailbreaking multimodal large language models via shuffle inconsistency. arXiv: 2501.04931\">arXiv preprint arXiv: 2501.04931."},{"key":"10.1016\/j.neunet.2026.108996_bib0134","unstructured":"Zhao, X., Cai, W., Shi, T., Huang, D., Lin, L., Mei, S., & Song, D. (2025b). Improving llm safety alignment with dual-objective optimization. arXiv: 2503.03710\">arXiv preprint arXiv: 2503.03710."},{"key":"10.1016\/j.neunet.2026.108996_bib0135","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","volume":"36","author":"Zheng","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108996_bib0136","unstructured":"Zhou, C., Chen, S., Zhang, Y., & Wang, S. (2023). Rethinking the evaluation of unlearning in large language models. arXiv: 2310.09861\">arXiv preprint arXiv: 2310.09861."},{"key":"10.1016\/j.neunet.2026.108996_bib0137","unstructured":"Zhou, W., Bai, S., Mandic, D. P., Zhao, Q., & Chen, B. (2024). Revisiting the adversarial robustness of vision language models: a multimodal perspective. arXiv: 2404.19287\">arXiv preprint arXiv: 2404.19287."},{"key":"10.1016\/j.neunet.2026.108996_bib0138","unstructured":"Zhu, C. et al. (2019). FreeLB: Enhanced adversarial training for natural language understanding. arXiv: 1909.11764\">arXiv preprint arXiv: 1909.11764."},{"key":"10.1016\/j.neunet.2026.108996_bib0139","doi-asserted-by":"crossref","unstructured":"Zhu, J., Yan, L., Wang, S., Yin, D., & Sha, L. (2025). Reasoning-to-defend: Safety-aware reasoning can defend large language models from jailbreaking. arXiv: 2502.12970\">arXiv preprint arXiv: 2502.12970.","DOI":"10.18653\/v1\/2025.emnlp-main.1493"},{"key":"10.1016\/j.neunet.2026.108996_bib0140","unstructured":"Zhuang, S. et al. (2024). Enhancing LLM safety via constrained direct preference optimization. arXiv: 2403.02475\">arXiv preprint arXiv: 2403.02475."},{"key":"10.1016\/j.neunet.2026.108996_bib0141","series-title":"Proceedings of the 57th annual meeting of the association for computational linguistics","first-page":"1550","article-title":"Counterfactual data augmentation for mitigating gender stereotypes in languages with rich morphology","author":"Zmigrod","year":"2019"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026004570?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026004570?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T14:45:46Z","timestamp":1781275546000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026004570"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,10]]},"references-count":141,"alternative-id":["S0893608026004570"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108996","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Aligning large language models across the lifecycle: A survey on safety-usability trade-offs from pre-training to post-training","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108996","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108996"}}