{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T21:13:20Z","timestamp":1772831600966,"version":"3.50.1"},"reference-count":10,"publisher":"Association for Computing Machinery (ACM)","issue":"3","license":[{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":["Commun. ACM"],"published-print":{"date-parts":[[2025,3]]},"abstract":"<jats:p>The better we align AI models with our values, the easier we may make it to realign them with opposing values.<\/jats:p>","DOI":"10.1145\/3705294","type":"journal-article","created":{"date-parts":[[2025,2,5]],"date-time":"2025-02-05T18:09:06Z","timestamp":1738778946000},"page":"24-26","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["The AI Alignment Paradox"],"prefix":"10.1145","volume":"68","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3984-1232","authenticated-orcid":false,"given":"Robert","family":"West","sequence":"first","affiliation":[{"name":"EPFL, School of Computer and Communication Sciences, Lausanne, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9542-9146","authenticated-orcid":false,"given":"Roland","family":"Aydin","sequence":"additional","affiliation":[{"name":"Technische Universit\u00e4t Hamburg, Hamburg, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,2,21]]},"reference":[{"key":"e_1_3_1_2_2","unstructured":"Arditi A. et al. Refusal in language models is mediated by a single direction. In Proceedings of the Annual Conf. on Neural Information Processing Systems. (2024)."},{"key":"e_1_3_1_3_2","unstructured":"Bai Y. et al. Constitutional AI: Harmlessness from AI Feedback. (2022); arXiv:2212.08073"},{"key":"e_1_3_1_4_2","unstructured":"Chu J. et al. Comprehensive Assessment of Jailbreak Attacks Against LLMs. (2024); arXiv:2402.05668"},{"key":"e_1_3_1_5_2","unstructured":"McGuffie K. and Newhouse A. The Radicalization Risks of GPT-3 and Advanced Neural Language Models. (2020); arXiv:2009.06807"},{"key":"e_1_3_1_6_2","unstructured":"Qi X. et al. Fine-tuning aligned language models compromises safety even when users do not intend to! In Proceedings of Intern. Conf. on Learning Representations. (2023)."},{"key":"e_1_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Rimsky N. et al Steering Llama 2 via contrastive activation addition. In Proceedings of the Annual Meeting of the Association for Computational Linguistics. (2024).","DOI":"10.18653\/v1\/2024.acl-long.828"},{"key":"e_1_3_1_8_2","volume-title":"Human Compatible: AI and the Problem of Control","author":"Russell S.","year":"2019","unstructured":"Russell, S. Human Compatible: AI and the Problem of Control. Penguin, U.K. (2019)."},{"key":"e_1_3_1_9_2","unstructured":"Schwinn L. et al. Soft prompt threats: Attacking safety alignment and unlearning in open-source LLMs through the embedding space. In Proceedings of the Annual Conf. on Neural Information Processing Systems. (2024)."},{"key":"e_1_3_1_10_2","unstructured":"Sorensen T. et al. A roadmap to pluralistic alignment. In Proceedings of the Intern. Conf. on Machine Learning. (2024)."},{"key":"e_1_3_1_11_2","unstructured":"Wolf Y. et al. Fundamental Limitations of Alignment in Large Language Models. (2023); arXiv:2304.11082"}],"container-title":["Communications of the ACM"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3705294","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3705294","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3705294"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,21]]},"references-count":10,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["10.1145\/3705294"],"URL":"https:\/\/doi.org\/10.1145\/3705294","relation":{},"ISSN":["0001-0782","1557-7317"],"issn-type":[{"value":"0001-0782","type":"print"},{"value":"1557-7317","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,21]]},"assertion":[{"value":"2024-04-12","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-02-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}