{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T17:44:55Z","timestamp":1742924695682,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":26,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819756681"},{"type":"electronic","value":"9789819756698"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-5669-8_23","type":"book-chapter","created":{"date-parts":[[2024,8,2]],"date-time":"2024-08-02T17:02:31Z","timestamp":1722618151000},"page":"281-290","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cybernetic Sentinels: Unveiling the Impact of Safety Data Selection on Model Security in Supervised Fine-Tuning"],"prefix":"10.1007","author":[{"given":"Ruihui","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongying","family":"He","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuan","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,3]]},"reference":[{"key":"23_CR1","unstructured":"GPT-4 Homepage. https:\/\/openai.com\/research\/gpt-4. Accessed 15 May 2024"},{"key":"23_CR2","unstructured":"Chowdhery, A., et al.: PaLM: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)"},{"key":"23_CR3","unstructured":"Touvron, H., Martin, L., Stone, K., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Bender, E.M., Gebru, T., McMillan-Major, A., Shmitchell, S.: On the dangers of stochastic parrots: can language models be too big? In: Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency, pp. 610\u2013623. ACM, New York (2021)","DOI":"10.1145\/3442188.3445922"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Hutchinson, B., et al.: Social biases in NLP models as barriers for persons with disabilities. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 5491\u20135501, Association for Computational Linguistics (2020)","DOI":"10.18653\/v1\/2020.acl-main.487"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Abid, A., Farooqi, M., Zou, J.: Large language models associate Muslims with violence. Nat. Mach. Intelligence, 3(6), 461\u2013463 (2021)","DOI":"10.1038\/s42256-021-00359-2"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Self-instruct: aligning language models with self-generated instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp. 13484\u201313508, Association for Computational Linguistics, Toronto (2023)","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Zhang, R., Zhang, J., et al.: LLaMA factory: unified efficient fine-tuning of 100+ language models. arXiv preprint arXiv:2403.13372 (2024)","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"23_CR9","unstructured":"Ouyang, L., Wu, J., Jiang, X., et al.: Training language models to follow instructions with human feedback. Adv. Neural Inform. Process. Syst. 35, 27730\u201327744. New Orleans (2022)"},{"key":"23_CR10","unstructured":"Federico, B., et al.: Safety-tuned LLaMAs: lessons from improving the safety of large language models that follow instructions. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"23_CR11","unstructured":"Zhou, C., et al.: LIMA: less is more for alignment. In: Thirty-seventh Conference on Neural Information Processing Systems (2023)"},{"key":"23_CR12","unstructured":"Xu, G., Liu, J., Yan, M., et al.: CValues: measuring the values of Chinese large language models from safety to responsibility. arXiv preprint arXiv:2307.09705 (2023)"},{"key":"23_CR13","unstructured":"Deep, G., et al.: Red teaming language models to reduce harms: methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858 (2022)"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Ge, S., et al.: Mart: improving LLM safety with multi-round automatic red-teaming. arXiv preprint arXiv:2311.07689 (2023)","DOI":"10.18653\/v1\/2024.naacl-long.107"},{"key":"23_CR15","unstructured":"Wei, A., Haghtalab, N., Steinhardt, J.: Jailbroken: how does LLM safety training fail? In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR16","unstructured":"Lin, B.Y., Ravichander, A., Lu, X., et al.: The unlocking spell on base LLMS: rethinking alignment via in-context learning. arXiv preprint arXiv:2312.01552 (2023)"},{"key":"23_CR17","unstructured":"Ji, J., Liu, M., Dai, J., et al.: Beavertails: towards improved safety alignment of llm via a human-preference dataset. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR18","unstructured":"Wang, W., Wei, F., Dong, L., et al.: Minilm: deep self-attention distillation for task-agnostic compression of pre-trained transformers. In: Advances in Neural Information Processing Systems, vol. 33, pp. 5776\u20135788. Vancouver (2020)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Shaikh, O., et al.: On second thought, let\u2019s not think step by step! bias and toxicity in zero-shot reasoning. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp. 4454\u20134470. Association for Computational Linguistics, Toronto (2023)","DOI":"10.18653\/v1\/2023.acl-long.244"},{"key":"23_CR20","unstructured":"Zou, A., Wang, Z., Kolter, J.Z., et al.: Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)"},{"key":"23_CR21","unstructured":"Wang, Y., Li, H., et al.: Do-not-answer: evaluating safeguards in LLMs. In: Findings of the Association for Computational Linguistics, pp. 896\u2013911. Association for Computational Linguistics, Malta (2024)"},{"key":"23_CR22","unstructured":"Gao, L., Tow, J., et al.: A framework for few-shot language model evaluation (2021)"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Bisk, Y., Zellers, R., Gao, J., et al.: Piqa: reasoning about physical commonsense in natural language. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, issue 5, pp. 7432\u20137439, Association for the Advancement of Artificial Intelligence (2020)","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"23_CR24","unstructured":"Clark, C., Lee, K., et al.: BoolQ: exploring the surprising difficulty of natural yes\/no questions. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1, pp. 2924\u20132936. Association for Computational Linguistics, Minneapolis (2019)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Mihaylov, T., Clark, P., et al.: Can a suit of armor conduct electricity? A new dataset for open book question answering. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 2381\u20132391, Association for Computational Linguistics, Brussels (2018)","DOI":"10.18653\/v1\/D18-1260"},{"key":"23_CR26","unstructured":"Hu, E.J., Wallis, P., Allen-Zhu, Z., et al.: LoRA: low-rank adaptation of large language models. In: International Conference on Learning Representations (2021)"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-5669-8_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T19:53:09Z","timestamp":1732564389000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-5669-8_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819756681","9789819756698"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-5669-8_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"3 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tianjin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 August 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/2024\/index.htm","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}