{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:49:01Z","timestamp":1776116941895,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":350,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3630106.3659037","type":"proceedings-article","created":{"date-parts":[[2024,6,5]],"date-time":"2024-06-05T13:14:21Z","timestamp":1717593261000},"page":"2254-2272","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":47,"title":["Black-Box Access is Insufficient for Rigorous AI Audits"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0084-1937","authenticated-orcid":false,"given":"Stephen","family":"Casper","sequence":"first","affiliation":[{"name":"Massachusetts Institute of Technology, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7488-9258","authenticated-orcid":false,"given":"Carson","family":"Ezell","sequence":"additional","affiliation":[{"name":"Harvard University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1646-7171","authenticated-orcid":false,"given":"Charlotte","family":"Siegmann","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2538-6295","authenticated-orcid":false,"given":"Noam","family":"Kolt","sequence":"additional","affiliation":[{"name":"University of Toronto, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6408-2009","authenticated-orcid":false,"given":"Taylor 
Lynn","family":"Curtis","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5552-2961","authenticated-orcid":false,"given":"Benjamin","family":"Bucknall","sequence":"additional","affiliation":[{"name":"Centre for the Governance of AI, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2952-4188","authenticated-orcid":false,"given":"Andreas","family":"Haupt","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8522-4333","authenticated-orcid":false,"given":"Kevin","family":"Wei","sequence":"additional","affiliation":[{"name":"Harvard University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6859-6029","authenticated-orcid":false,"given":"J\u00e9r\u00e9my","family":"Scheurer","sequence":"additional","affiliation":[{"name":"Apollo Research, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8244-3154","authenticated-orcid":false,"given":"Marius","family":"Hobbhahn","sequence":"additional","affiliation":[{"name":"Apollo Research, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2137-6027","authenticated-orcid":false,"given":"Lee","family":"Sharkey","sequence":"additional","affiliation":[{"name":"Apollo Research, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5324-5824","authenticated-orcid":false,"given":"Satyapriya","family":"Krishna","sequence":"additional","affiliation":[{"name":"Harvard University, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6741-2996","authenticated-orcid":false,"given":"Marvin","family":"Von Hagen","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, United States of 
America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1611-5737","authenticated-orcid":false,"given":"Silas","family":"Alberti","sequence":"additional","affiliation":[{"name":"Stanford University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7547-3951","authenticated-orcid":false,"given":"Alan","family":"Chan","sequence":"additional","affiliation":[{"name":"Centre for the Governance of AI, Mila (Quebec AI Institute), Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3791-7890","authenticated-orcid":false,"given":"Qinyi","family":"Sun","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0627-7879","authenticated-orcid":false,"given":"Michael","family":"Gerovitch","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1744-6765","authenticated-orcid":false,"given":"David","family":"Bau","sequence":"additional","affiliation":[{"name":"Northeastern University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7670-7190","authenticated-orcid":false,"given":"Max","family":"Tegmark","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7256-0937","authenticated-orcid":false,"given":"David","family":"Krueger","sequence":"additional","affiliation":[{"name":"University of Cambridge, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6168-4763","authenticated-orcid":false,"given":"Dylan","family":"Hadfield-Menell","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,6,5]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3461702.3462563"},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Machine Learning. 
PMLR, 66\u201388","author":"Abid Abubakar","year":"2022","unstructured":"Abubakar Abid, Mert Yuksekgonul, and James Zou. 2022. Meaningfully debugging model mistakes using conceptual counterfactual explanations. In International Conference on Machine Learning. PMLR, 66\u201388."},{"key":"e_1_3_2_1_3_1","volume-title":"Debugging tests for model explanations. arXiv preprint arXiv:2011.05429","author":"Adebayo Julius","year":"2020","unstructured":"Julius Adebayo, Michael Muelly, Ilaria Liccardi, and Been Kim. 2020. Debugging tests for model explanations. arXiv preprint arXiv:2011.05429 (2020)."},{"key":"e_1_3_2_1_4_1","first-page":"15784","article-title":"Openxai: Towards a transparent evaluation of model explanations","volume":"35","author":"Agarwal Chirag","year":"2022","unstructured":"Chirag Agarwal, Satyapriya Krishna, Eshika Saxena, Martin Pawelczyk, Nari Johnson, Isha Puri, Marinka Zitnik, and Himabindu Lakkaraju. 2022. Openxai: Towards a transparent evaluation of model explanations. Advances in Neural Information Processing Systems 35 (2022), 15784\u201315799.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","unstructured":"AI Safety Summit. 2023. The Bletchley Declaration by Countries Attending the AI Safety Summit. https:\/\/www.gov.uk\/government\/publications\/ai-safety-summit-2023-the-bletchley-declaration\/the-bletchley-declaration-by-countries-attending-the-ai-safety-summit-1-2-november-2023"},{"key":"e_1_3_2_1_6_1","first-page":"2640","volume-title":"Proceedings of the 36th International Conference on Machine Learning. PMLR, 161\u2013170","author":"Aivodji Ulrich","year":"2019","unstructured":"Ulrich Aivodji, Hiromi Arai, Olivier Fortineau, S\u00e9bastien Gambs, Satoshi Hara, and Alain Tapp. 2019. Fairwashing: the risk of rationalization. In Proceedings of the 36th International Conference on Machine Learning. PMLR, 161\u2013170. 
https:\/\/proceedings.mlr.press\/v97\/aivodji19a.html ISSN: 2640-3498."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2807385"},{"key":"e_1_3_2_1_8_1","unstructured":"Guillaume Alain and Yoshua Bengio. 2018. Understanding intermediate layers using linear classifier probes. (2018). arxiv:1610.01644\u00a0[stat.ML]"},{"key":"e_1_3_2_1_9_1","volume-title":"https:\/\/www.jailbreakchat.com\/","author":"Albert Alex","year":"2023","unstructured":"Alex Albert. 2023. Jailbreak Chat. (2023). https:\/\/www.jailbreakchat.com\/"},{"key":"e_1_3_2_1_10_1","volume-title":"Frontier AI regulation: Managing emerging risks to public safety. arXiv preprint arXiv:2307.03718","author":"Anderljung Markus","year":"2023","unstructured":"Markus Anderljung, Joslyn Barnhart, Jade Leung, Anton Korinek, Cullen O\u2019Keefe, Jess Whittlestone, Shahar Avin, Miles Brundage, Justin Bullock, Duncan Cass-Beggs, 2023. Frontier AI regulation: Managing emerging risks to public safety. arXiv preprint arXiv:2307.03718 (2023)."},{"key":"e_1_3_2_1_11_1","unstructured":"Markus Anderljung Everett\u00a0Thornton Smith Joe O\u2019Brien Lisa Soder Benjamin Bucknall Emma Bluemke Jonas Schuett Robert Trager Lacey Strahm and Rumman Chowdhury. 2023. Towards Publicly Accountable Frontier LLMs: Building an External Scrutiny Ecosystem under the ASPIRE Framework. (2023). arxiv:2311.14711\u00a0[cs.CY]"},{"key":"e_1_3_2_1_12_1","volume-title":"Ethics of data and analytics","author":"Angwin Julia","unstructured":"Julia Angwin, Jeff Larson, Surya Mattu, and Lauren Kirchner. 2022. Machine bias. In Ethics of data and analytics. Auerbach Publications, 254\u2013264."},{"key":"e_1_3_2_1_13_1","unstructured":"Anthropic. 2023. Challenges in evaluating AI systems. (2023). https:\/\/www.anthropic.com\/index\/evaluating-ai-systems"},{"key":"e_1_3_2_1_14_1","volume-title":"On the pitfalls of analyzing individual neurons in language models. 
arXiv preprint arXiv:2110.07483","author":"Antverg Omer","year":"2021","unstructured":"Omer Antverg and Yonatan Belinkov. 2021. On the pitfalls of analyzing individual neurons in language models. arXiv preprint arXiv:2110.07483 (2021)."},{"key":"e_1_3_2_1_15_1","unstructured":"Compiled Auditing\u00a0Standard ASA. 2006. Auditing standard ASA 210 terms of audit engagements."},{"key":"e_1_3_2_1_16_1","volume-title":"Feature representation in convolutional neural networks. arXiv preprint arXiv:1507.02313","author":"Athiwaratkun Ben","year":"2015","unstructured":"Ben Athiwaratkun and Keegan Kang. 2015. Feature representation in convolutional neural networks. arXiv preprint arXiv:1507.02313 (2015)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jfineco.2019.06.001"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/coli_a_00422"},{"key":"e_1_3_2_1_19_1","unstructured":"Yoshua Bengio Geoffrey Hinton Andrew Yao Dawn Song Pieter Abbeel Yuval\u00a0Noah Harari Ya-Qin Zhang Lan Xue Shai Shalev-Shwartz Gillian Hadfield [n. d.]. Managing AI Risks in an Era of Rapid Progress. ([n. d.])."},{"key":"e_1_3_2_1_20_1","volume-title":"Managing AI Risks in an Era of Rapid Progress. arXiv preprint arXiv:2310.17688","author":"Bengio Yoshua","year":"2023","unstructured":"Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Yuval\u00a0Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, 2023. Managing AI Risks in an Era of Rapid Progress. arXiv preprint arXiv:2310.17688 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"A survey of black-box adversarial attacks on computer vision models. arXiv preprint arXiv:1912.01667","author":"Bhambri Siddhant","year":"2019","unstructured":"Siddhant Bhambri, Sumanyu Muku, Avinash Tulasi, and Arun\u00a0Balaji Buduru. 2019. A survey of black-box adversarial attacks on computer vision models. 
arXiv preprint arXiv:1912.01667 (2019)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533083"},{"key":"e_1_3_2_1_23_1","volume-title":"Into the LAIONs Den: Investigating Hate in Multimodal Datasets. arXiv preprint arXiv:2311.03449","author":"Birhane Abeba","year":"2023","unstructured":"Abeba Birhane, Vinay Prabhu, Sang Han, Vishnu\u00a0Naresh Boddeti, and Alexandra\u00a0Sasha Luccioni. 2023. Into the LAIONs Den: Investigating Hate in Multimodal Datasets. arXiv preprint arXiv:2311.03449 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Multimodal datasets: misogyny, pornography, and malignant stereotypes. arXiv preprint arXiv:2110.01963","author":"Birhane Abeba","year":"2021","unstructured":"Abeba Birhane, Vinay\u00a0Uday Prabhu, and Emmanuel Kahembwe. 2021. Multimodal datasets: misogyny, pornography, and malignant stereotypes. arXiv preprint arXiv:2110.01963 (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Abeba Birhane Ryan Steed Victor Ojewale Briana Vecchione and Inioluwa\u00a0Deborah Raji. 2024. AI auditing: The Broken Bus on the Road to AI Accountability. arxiv:2401.14462\u00a0[cs.CY]","DOI":"10.1109\/SaTML59370.2024.00037"},{"key":"e_1_3_2_1_26_1","volume-title":"Exploring the Relevance of Data Privacy-Enhancing Technologies for AI Governance Use Cases. (March","author":"Bluemke Emma","year":"2023","unstructured":"Emma Bluemke, Tantum Collins, Ben Garfinkel, and Andrew Trask. 2023. Exploring the Relevance of Data Privacy-Enhancing Technologies for AI Governance Use Cases. (March 2023). https:\/\/arxiv.org\/abs\/2303.08956v2"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1540-6261.2011.01708.x"},{"key":"e_1_3_2_1_28_1","volume-title":"Man is to computer programmer as woman is to homemaker? debiasing word embeddings. 
Advances in neural information processing systems 29","author":"Bolukbasi Tolga","year":"2016","unstructured":"Tolga Bolukbasi, Kai-Wei Chang, James\u00a0Y Zou, Venkatesh Saligrama, and Adam\u00a0T Kalai. 2016. Man is to computer programmer as woman is to homemaker? debiasing word embeddings. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_29_1","volume-title":"The Foundation Model Transparency Index. (Oct","author":"Bommasani Rishi","year":"2023","unstructured":"Rishi Bommasani, Kevin Klyman, Shayne Longpre, Sayash Kapoor, Nestor Maslej, Betty Xiong, Daniel Zhang, and Percy Liang. 2023. The Foundation Model Transparency Index. (Oct. 2023). http:\/\/arxiv.org\/abs\/2310.12941 arXiv:2310.12941 [cs]."},{"key":"e_1_3_2_1_30_1","volume-title":"ChemCrow: Augmenting large-language models with chemistry tools. arXiv preprint arXiv:2304.05376","author":"Bran M","year":"2023","unstructured":"Andres\u00a0M Bran, Sam Cox, Andrew\u00a0D White, and Philippe Schwaller. 2023. ChemCrow: Augmenting large-language models with chemistry tools. arXiv preprint arXiv:2304.05376 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Towards Monosemanticity: Decomposing Language Models With Dictionary Learning. Transformer Circuits Thread","author":"Bricken Trenton","year":"2023","unstructured":"Trenton Bricken, Adly Templeton, Joshua Batson, Brian Chen, Adam Jermyn, Tom Conerly, Nick Turner, Cem Anil, Carson Denison, Amanda Askell, Robert Lasenby, Yifan Wu, Shauna Kravec, Nicholas Schiefer, Tim Maxwell, Nicholas Joseph, Zac Hatfield-Dodds, Alex Tamkin, Karina Nguyen, Brayden McLean, Josiah\u00a0E Burke, Tristan Hume, Shan Carter, Tom Henighan, and Christopher Olah. 2023. Towards Monosemanticity: Decomposing Language Models With Dictionary Learning. Transformer Circuits Thread (2023). 
https:\/\/transformer-circuits.pub\/2023\/monosemantic-features\/index.html."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1177\/2053951720983865"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","unstructured":"Miles Brundage Shahar Avin Jasmine Wang Haydn Belfield Gretchen Krueger Gillian Hadfield Heidy Khlaaf Jingying Yang Helen Toner Ruth Fong Tegan Maharaj Pang\u00a0Wei Koh Sara Hooker Jade Leung Andrew Trask Emma Bluemke Jonathan Lebensold Cullen O\u2019Keefe Mark Koren Th\u00e9o Ryffel J.\u00a0B. Rubinovitz Tamay Besiroglu Federica Carugati Jack Clark Peter Eckersley Sarah de Haas Maritza Johnson Ben Laurie Alex Ingerman Igor Krawczuk Amanda Askell Rosario Cammarota Andrew Lohn David Krueger Charlotte Stix Peter Henderson Logan Graham Carina Prunkl Bianca Martin Elizabeth Seger Noa Zilberman Se\u00e1n\u00a0\u00d3 h\u00c9igeartaigh Frens Kroeger Girish Sastry Rebecca Kagan Adrian Weller Brian Tse Elizabeth Barnes Allan Dafoe Paul Scharre Ariel Herbert-Voss Martijn Rasser Shagun Sodhani Carrick Flynn Thomas\u00a0Krendl Gilbert Lisa Dyer Saif Khan Yoshua Bengio and Markus Anderljung. 2020. Toward Trustworthy AI Development: Mechanisms for Supporting Verifiable Claims. (April 2020). https:\/\/doi.org\/10.48550\/arXiv.2004.07213 arXiv:2004.07213 [cs].","DOI":"10.48550\/arXiv.2004.07213"},{"key":"e_1_3_2_1_34_1","volume-title":"Model Access Requirements. (Oct.","author":"Bucknall S","year":"2023","unstructured":"Benjamin\u00a0S Bucknall and Robert\u00a0F Trager. 2023. Structured Access for Third-Party Research on Frontier AI Models: Investigating Researchers\u2019 Model Access Requirements. (Oct. 2023). 
https:\/\/www.oxfordmartin.ox.ac.uk\/publications\/structured-access-for-third-party-research-on-frontier-ai-models-investigating-researchers-model-access-requirements\/"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.3817520"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/REW57809.2023.00062"},{"key":"e_1_3_2_1_37_1","volume-title":"Conference on fairness, accountability and transparency. PMLR, 77\u201391","author":"Buolamwini Joy","year":"2018","unstructured":"Joy Buolamwini and Timnit Gebru. 2018. Gender shades: Intersectional accuracy disparities in commercial gender classification. In Conference on fairness, accountability and transparency. PMLR, 77\u201391."},{"key":"e_1_3_2_1_38_1","volume-title":"Discovering latent knowledge in language models without supervision. arXiv preprint arXiv:2212.03827","author":"Burns Collin","year":"2022","unstructured":"Collin Burns, Haotian Ye, Dan Klein, and Jacob Steinhardt. 2022. Discovering latent knowledge in language models without supervision. arXiv preprint arXiv:2212.03827 (2022)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.23915\/distill.00024"},{"key":"e_1_3_2_1_40_1","volume-title":"Quantifying memorization across neural language models. arXiv preprint arXiv:2202.07646","author":"Carlini Nicholas","year":"2022","unstructured":"Nicholas Carlini, Daphne Ippolito, Matthew Jagielski, Katherine Lee, Florian Tramer, and Chiyuan Zhang. 2022. Quantifying memorization across neural language models. arXiv preprint arXiv:2202.07646 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Poisoning web-scale training datasets is practical. arXiv preprint arXiv:2302.10149","author":"Carlini Nicholas","year":"2023","unstructured":"Nicholas Carlini, Matthew Jagielski, Christopher\u00a0A Choquette-Choo, Daniel Paleka, Will Pearce, Hyrum Anderson, Andreas Terzis, Kurt Thomas, and Florian Tram\u00e8r. 2023. Poisoning web-scale training datasets is practical. 
arXiv preprint arXiv:2302.10149 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Are aligned neural networks adversarially aligned?arXiv preprint arXiv:2306.15447","author":"Carlini Nicholas","year":"2023","unstructured":"Nicholas Carlini, Milad Nasr, Christopher\u00a0A Choquette-Choo, Matthew Jagielski, Irena Gao, Anas Awadalla, Pang\u00a0Wei Koh, Daphne Ippolito, Katherine Lee, Florian Tramer, 2023. Are aligned neural networks adversarially aligned?arXiv preprint arXiv:2306.15447 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini, Florian Tramer, Eric Wallace, Matthew Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam Roberts, Tom Brown, Dawn Song, Ulfar Erlingsson, 2021. Extracting training data from large language models. In 30th USENIX Security Symposium (USENIX Security 21). 2633\u20132650."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Shan Carter Zan Armstrong Ludwig Schubert Ian Johnson and Chris Olah. 2019. Exploring neural networks with activation atlases. Distill. (2019).","DOI":"10.23915\/distill.00015"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics8080832"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2307.15217"},{"key":"e_1_3_2_1_47_1","volume-title":"Measuring the Success of Diffusion Models at Imitating Human Artists. arXiv preprint arXiv:2307.04028","author":"Casper Stephen","year":"2023","unstructured":"Stephen Casper, Zifan Guo, Shreya Mogulothu, Zachary Marinov, Chinmay Deshpande, Rui-Jie Yew, Zheng Dai, and Dylan Hadfield-Menell. 2023. Measuring the Success of Diffusion Models at Imitating Human Artists. 
arXiv preprint arXiv:2307.04028 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"NeurIPS ML Safety Workshop.","author":"Casper Stephen","year":"2022","unstructured":"Stephen Casper, Kaivalya Hariharan, and Dylan Hadfield-Menell. 2022. Diagnostics for deep neural networks with automated copy\/paste attacks. In NeurIPS ML Safety Workshop."},{"key":"e_1_3_2_1_49_1","volume-title":"Red Teaming with Mind Reading: White-Box Adversarial Policies Against RL Agents. (Oct","author":"Casper Stephen","year":"2023","unstructured":"Stephen Casper, Taylor Killian, Gabriel Kreiman, and Dylan Hadfield-Menell. 2023. Red Teaming with Mind Reading: White-Box Adversarial Policies Against RL Agents. (Oct. 2023). http:\/\/arxiv.org\/abs\/2209.02167 arXiv:2209.02167 [cs]."},{"key":"e_1_3_2_1_50_1","volume-title":"Red Teaming Deep Neural Networks with Feature Synthesis Tools. (Sept","author":"Casper Stephen","year":"2023","unstructured":"Stephen Casper, Yuxiao Li, Jiawei Li, Tong Bu, Kevin Zhang, Kaivalya Hariharan, and Dylan Hadfield-Menell. 2023. Red Teaming Deep Neural Networks with Feature Synthesis Tools. (Sept. 2023). http:\/\/arxiv.org\/abs\/2302.10894 arXiv:2302.10894 [cs]."},{"key":"e_1_3_2_1_51_1","volume-title":"Explore","author":"Casper Stephen","year":"2023","unstructured":"Stephen Casper, Jason Lin, Joe Kwon, Gatlen Culp, and Dylan Hadfield-Menell. 2023. Explore, Establish, Exploit: Red Teaming Language Models from Scratch. arXiv preprint arXiv:2306.09442 (2023)."},{"key":"e_1_3_2_1_52_1","first-page":"33093","article-title":"Robust feature-level adversaries are interpretability tools","volume":"35","author":"Casper Stephen","year":"2022","unstructured":"Stephen Casper, Max Nadeau, Dylan Hadfield-Menell, and Gabriel Kreiman. 2022. Robust feature-level adversaries are interpretability tools. 
Advances in Neural Information Processing Systems 35 (2022), 33093\u201333106.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_53_1","unstructured":"Stephen Casper Lennart Schulze Oam Patel and Dylan Hadfield-Menell. 2024. Defending Against Unforeseen Failure Modes with Latent Adversarial Training. arxiv:2403.05030\u00a0[cs.CR]"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594033"},{"key":"e_1_3_2_1_55_1","volume-title":"Jailbreaking Black Box Large Language Models in Twenty Queries. arXiv preprint arXiv:2310.08419","author":"Chao Patrick","year":"2023","unstructured":"Patrick Chao, Alexander Robey, Edgar Dobriban, Hamed Hassani, George\u00a0J Pappas, and Eric Wong. 2023. Jailbreaking Black Box Large Language Models in Twenty Queries. arXiv preprint arXiv:2310.08419 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"From Text to MITRE Techniques: Exploring the Malicious Use of Large Language Models for Generating Cyber Attack Payloads. arXiv preprint arXiv:2305.15336","author":"Charan PV","year":"2023","unstructured":"PV Charan, Hrushikesh Chunduri, P\u00a0Mohan Anand, and Sandeep\u00a0K Shukla. 2023. From Text to MITRE Techniques: Exploring the Malicious Use of Large Language Models for Generating Cyber Attack Payloads. arXiv preprint arXiv:2305.15336 (2023)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3564284"},{"key":"e_1_3_2_1_58_1","volume-title":"Targeted backdoor attacks on deep learning systems using data poisoning. arXiv preprint arXiv:1712.05526","author":"Chen Xinyun","year":"2017","unstructured":"Xinyun Chen, Chang Liu, Bo Li, Kimberly Lu, and Dawn Song. 2017. Targeted backdoor attacks on deep learning systems using data poisoning. arXiv preprint arXiv:1712.05526 (2017)."},{"key":"e_1_3_2_1_59_1","volume-title":"Fairness testing: A comprehensive survey and analysis of trends. 
arXiv preprint arXiv:2207.10223","author":"Chen Zhenpeng","year":"2022","unstructured":"Zhenpeng Chen, Jie\u00a0M Zhang, Max Hort, Federica Sarro, and Mark Harman. 2022. Fairness testing: A comprehensive survey and analysis of trends. arXiv preprint arXiv:2207.10223 (2022)."},{"key":"e_1_3_2_1_60_1","unstructured":"China Academy of Information and Communications Technology and JD Explore Academy. 2021. White Paper on Trustworthy Artificial Intelligence. https:\/\/cset.georgetown.edu\/publication\/white-paper-on-trustworthy-artificial-intelligence\/"},{"key":"e_1_3_2_1_61_1","unstructured":"Chinese National Information Security Standardization Technical Committee. 2023. Translation: Basic Safety Requirements for Generative Artificial Intelligence Services (Draft for Feedback). https:\/\/cset.georgetown.edu\/publication\/china-safety-requirements-for-generative-ai\/?utm_source=substack&utm_medium=email"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2103.04244"},{"key":"e_1_3_2_1_63_1","unstructured":"Paul Christiano. 2019. Worst-case guarantees. https:\/\/ai-alignment.com\/training-robust-corrigibility-ce0e0a3b9b4d"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","first-page":"113","DOI":"10.3390\/computers10090113","article-title":"Evaluating impact of race in facial recognition across machine learning and deep learning algorithms","volume":"10","author":"Coe James","year":"2021","unstructured":"James Coe and Mustafa Atay. 2021. Evaluating impact of race in facial recognition across machine learning and deep learning algorithms. Computers 10, 9 (2021), 113.","journal-title":"Computers"},{"key":"e_1_3_2_1_65_1","unstructured":"The New York\u00a0Times Company. 2023. The New York Times Company v. OpenAI. https:\/\/nytco-assets.nytimes.com\/2023\/12\/NYT_Complaint_Dec2023.pdf Case e 1:23-cv-11195."},{"key":"e_1_3_2_1_66_1","volume-title":"What you can cram into a single vector: Probing sentence embeddings for linguistic properties. 
arXiv preprint arXiv:1805.01070","author":"Conneau Alexis","year":"2018","unstructured":"Alexis Conneau, German Kruszewski, Guillaume Lample, Lo\u00efc Barrault, and Marco Baroni. 2018. What you can cram into a single vector: Probing sentence embeddings for linguistic properties. arXiv preprint arXiv:1805.01070 (2018)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533213"},{"key":"e_1_3_2_1_68_1","volume-title":"Local Law 144 of","author":"Cumbo Laurie","year":"2021","unstructured":"Laurie Cumbo, Alicka Ampry-Samuel, Helen Rosenthal, Robert Cornegy, Ben Kallos, Adrienne Adams, Farah Louis, Margaret Chin, Fernando Cabrera, Deborah Rose, Vanessa Gibson, Justin Brannan, Carlina Rivera, Mark Levine, Diana Ayala, I.\u00a0Daneek Miller, Stephen Levin, and Inez Barron. 2021. Local Law 144 of 2021. https:\/\/legistar.council.nyc.gov\/LegislationDetail.aspx?ID=4344524&GUID=B051915D-A9AC-451E-81F8-6596032FA3F9&Options=ID%7cText%7c&Search="},{"key":"e_1_3_2_1_69_1","unstructured":"Hoagy Cunningham Aidan Ewart Logan Riggs Robert Huben and Lee Sharkey. 2023. Sparse Autoencoders Find Highly Interpretable Features in Language Models. (2023). arxiv:2309.08600\u00a0[cs.LG]"},{"key":"e_1_3_2_1_70_1","volume-title":"Opportunities and challenges in explainable artificial intelligence (xai): A survey. arXiv preprint arXiv:2006.11371","author":"Das Arun","year":"2020","unstructured":"Arun Das and Paul Rad. 2020. Opportunities and challenges in explainable artificial intelligence (xai): A survey. arXiv preprint arXiv:2006.11371 (2020)."},{"key":"e_1_3_2_1_71_1","volume-title":"AI capabilities can be significantly improved without expensive retraining. (Dec","author":"Davidson Tom","year":"2023","unstructured":"Tom Davidson, Jean-Stanislas Denain, Pablo Villalobos, and Guillem Bas. 2023. AI capabilities can be significantly improved without expensive retraining. (Dec. 2023). 
https:\/\/arxiv.org\/abs\/2312.07413v1"},{"key":"e_1_3_2_1_72_1","volume-title":"Investigating Data Contamination in Modern Benchmarks for Large Language Models. arXiv preprint arXiv:2311.09783","author":"Deng Chunyuan","year":"2023","unstructured":"Chunyuan Deng, Yilun Zhao, Xiangru Tang, Mark Gerstein, and Arman Cohan. 2023. Investigating Data Contamination in Modern Benchmarks for Large Language Models. arXiv preprint arXiv:2311.09783 (2023)."},{"key":"e_1_3_2_1_73_1","volume-title":"Jailbreaker: Automated Jailbreak Across Multiple Large Language Model Chatbots. arXiv preprint arXiv:2307.08715","author":"Deng Gelei","year":"2023","unstructured":"Gelei Deng, Yi Liu, Yuekang Li, Kailong Wang, Ying Zhang, Zefeng Li, Haoyu Wang, Tianwei Zhang, and Yang Liu. 2023. Jailbreaker: Automated Jailbreak Across Multiple Large Language Model Chatbots. arXiv preprint arXiv:2307.08715 (2023)."},{"key":"e_1_3_2_1_74_1","volume-title":"Rlprompt: Optimizing discrete text prompts with reinforcement learning. arXiv preprint arXiv:2205.12548","author":"Deng Mingkai","year":"2022","unstructured":"Mingkai Deng, Jianyu Wang, Cheng-Ping Hsieh, Yihan Wang, Han Guo, Tianmin Shu, Meng Song, Eric\u00a0P Xing, and Zhiting Hu. 2022. Rlprompt: Optimizing discrete text prompts with reinforcement learning. arXiv preprint arXiv:2205.12548 (2022)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445924"},{"key":"e_1_3_2_1_76_1","volume-title":"Hard choices in artificial intelligence: Addressing normative uncertainty through sociotechnical commitments. arXiv preprint arXiv:1911.09005","author":"Dobbe Roel","year":"2019","unstructured":"Roel Dobbe, Thomas\u00a0Krendl Gilbert, and Yonatan Mintz. 2019. Hard choices in artificial intelligence: Addressing normative uncertainty through sociotechnical commitments. arXiv preprint arXiv:1911.09005 (2019)."},{"key":"e_1_3_2_1_77_1","volume-title":"Towards interpretable deep neural networks by leveraging adversarial examples. 
arXiv preprint arXiv:1708.05493","author":"Dong Yinpeng","year":"2017","unstructured":"Yinpeng Dong, Hang Su, Jun Zhu, and Fan Bao. 2017. Towards interpretable deep neural networks by leveraging adversarial examples. arXiv preprint arXiv:1708.05493 (2017)."},{"key":"e_1_3_2_1_78_1","volume-title":"Shortcut learning of large language models in natural language understanding. Communications of the ACM (CACM)","author":"Du Mengnan","year":"2023","unstructured":"Mengnan Du, Fengxiang He, Na Zou, Dacheng Tao, and Xia Hu. 2023. Shortcut learning of large language models in natural language understanding. Communications of the ACM (CACM) (2023)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3561048"},{"key":"e_1_3_2_1_80_1","volume-title":"Hotflip: White-box adversarial examples for text classification. arXiv preprint arXiv:1712.06751","author":"Ebrahimi Javid","year":"2017","unstructured":"Javid Ebrahimi, Anyi Rao, Daniel Lowd, and Dejing Dou. 2017. Hotflip: White-box adversarial examples for text classification. arXiv preprint arXiv:1712.06751 (2017)."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1177\/0093854818811379"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1086\/229939"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"crossref","DOI":"10.7208\/chicago\/9780226400938.001.0001","volume-title":"Working Law: Courts, Corporations, and Symbolic Civil Rights","author":"Edelman B.","year":"2016","unstructured":"Lauren\u00a0B. Edelman. 2016. Working Law: Courts, Corporations, and Symbolic Civil Rights. University of Chicago Press, Chicago, IL. 
https:\/\/press.uchicago.edu\/ucp\/books\/book\/chicago\/W\/bo24550454.html"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00359"},{"key":"e_1_3_2_1_85_1","first-page":"1","article-title":"Laying down harmonised rules on artificial intelligence (Artificial Intelligence Act) and amending certain union legislative acts","volume":"106","author":"European Commission","year":"2021","unstructured":"European Commission. 2021. Laying down harmonised rules on artificial intelligence (Artificial Intelligence Act) and amending certain union legislative acts. Eur Comm 106 (2021), 1\u2013108.","journal-title":"Eur Comm"},{"key":"e_1_3_2_1_86_1","unstructured":"European Union. 2016. General Data Protection Regulation. https:\/\/gdpr-info.eu\/"},{"key":"e_1_3_2_1_87_1","unstructured":"European Union. 2021. Artificial Intelligence Act. https:\/\/eur-lex.europa.eu\/legal-content\/EN\/TXT\/?uri=CELEX%3A52021PC0206"},{"key":"e_1_3_2_1_88_1","unstructured":"European Union. 2022. Digital Markets Act. https:\/\/eur-lex.europa.eu\/legal-content\/EN\/TXT\/?uri=CELEX%3A32022R1925"},{"key":"e_1_3_2_1_89_1","unstructured":"EY. 2019. EY Global Code of Conduct. Online. Retrieved from: https:\/\/assets.ey.com\/content\/dam\/ey-sites\/ey-com\/en_gl\/generic\/EY_Code_of_Conduct.pdf."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1257\/jep.10.3.103"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"crossref","unstructured":"Michael Feffer Anusha Sinha Zachary\u00a0C. Lipton and Hoda Heidari. 2024. Red-Teaming for Generative AI: Silver Bullet or Security Theater?http:\/\/arxiv.org\/abs\/2401.15897 arXiv:2401.15897 [cs].","DOI":"10.1609\/aies.v7i1.31647"},{"key":"e_1_3_2_1_92_1","unstructured":"Jaden Fiotto-Kaufmann Arnab Sen-Sharma Caden Juang David Bau Eric Todd Francesca Lucchetti and Will Brockman. 2023. nnsight. 
https:\/\/nnsight.net\/"},{"key":"e_1_3_2_1_93_1","first-page":"1","article-title":"Bernard Madoff and the solo auditor red flag","volume":"1","author":"Fuerman D","year":"2009","unstructured":"Ross\u00a0D Fuerman. 2009. Bernard Madoff and the solo auditor red flag. Journal of Forensic & Investigative Accounting 1, 1 (2009), 1\u201338.","journal-title":"Journal of Forensic & Investigative Accounting"},{"key":"e_1_3_2_1_94_1","unstructured":"G7. 2023. Hiroshima Process International Code of Conduct for Organizations Developing Advanced AI Systems. https:\/\/digital-strategy.ec.europa.eu\/en\/library\/hiroshima-process-international-code-conduct-advanced-ai-systems"},{"key":"e_1_3_2_1_95_1","volume-title":"Interpreting CLIP\u2019s Image Representation via Text-Based Decomposition. arXiv preprint arXiv:2310.05916","author":"Gandelsman Yossi","year":"2023","unstructured":"Yossi Gandelsman, Alexei\u00a0A Efros, and Jacob Steinhardt. 2023. Interpreting CLIP\u2019s Image Representation via Text-Based Decomposition. arXiv preprint arXiv:2310.05916 (2023)."},{"key":"e_1_3_2_1_96_1","volume-title":"Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858","author":"Ganguli Deep","year":"2022","unstructured":"Deep Ganguli, Liane Lovitt, Jackson Kernion, Amanda Askell, Yuntao Bai, Saurav Kadavath, Ben Mann, Ethan Perez, Nicholas Schiefer, Kamal Ndousse, 2022. Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned. arXiv preprint arXiv:2209.07858 (2022)."},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_2_1_99_1","volume-title":"Dissecting recall of factual associations in auto-regressive language models. 
arXiv preprint arXiv:2304.14767","author":"Geva Mor","year":"2023","unstructured":"Mor Geva, Jasmijn Bastings, Katja Filippova, and Amir Globerson. 2023. Dissecting recall of factual associations in auto-regressive language models. arXiv preprint arXiv:2304.14767 (2023)."},{"key":"e_1_3_2_1_100_1","volume-title":"Transformer feed-forward layers are key-value memories. arXiv preprint arXiv:2012.14913","author":"Geva Mor","year":"2020","unstructured":"Mor Geva, Roei Schuster, Jonathan Berant, and Omer Levy. 2020. Transformer feed-forward layers are key-value memories. arXiv preprint arXiv:2012.14913 (2020)."},{"key":"e_1_3_2_1_101_1","volume-title":"Neuron Shapley: Discovering the Responsible Neurons.","author":"Ghorbani Amirata","year":"2020","unstructured":"Amirata Ghorbani and James Zou. 2020. Neuron Shapley: Discovering the Responsible Neurons. (2020). arxiv:2002.09815\u00a0[stat.ML]"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1016\/S2589-7500(22)00063-2"},{"key":"e_1_3_2_1_103_1","volume-title":"Explaining Explanations: An Overview of Interpretability of Machine Learning. (Feb.","author":"Gilpin H.","year":"2019","unstructured":"Leilani\u00a0H. Gilpin, David Bau, Ben\u00a0Z. Yuan, Ayesha Bajwa, Michael Specter, and Lalana Kagal. 2019. Explaining Explanations: An Overview of Interpretability of Machine Learning. (Feb. 2019). http:\/\/arxiv.org\/abs\/1806.00069 arXiv:1806.00069 [cs, stat]."},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.intacc.2013.04.004"},{"key":"e_1_3_2_1_105_1","volume-title":"Time travel in llms: Tracing data contamination in large language models. arXiv preprint arXiv:2308.08493","author":"Golchin Shahriar","year":"2023","unstructured":"Shahriar Golchin and Mihai Surdeanu. 2023. Time travel in llms: Tracing data contamination in large language models. 
arXiv preprint arXiv:2308.08493 (2023)."},{"key":"e_1_3_2_1_106_1","first-page":"707","article-title":"The Auditor-Firm Conflict of Interests: Its Implications for Independence","volume":"49","author":"Goldman Arieh","year":"1974","unstructured":"Arieh Goldman and Benzion Barlev. 1974. The Auditor-Firm Conflict of Interests: Its Implications for Independence. The Accounting Review 49, 4 (1974), 707\u2013718. https:\/\/www.jstor.org\/stable\/245049 Publisher: American Accounting Association.","journal-title":"The Accounting Review"},{"key":"e_1_3_2_1_107_1","volume-title":"Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572","author":"Goodfellow J","year":"2014","unstructured":"Ian\u00a0J Goodfellow, Jonathon Shlens, and Christian Szegedy. 2014. Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572 (2014)."},{"key":"e_1_3_2_1_108_1","unstructured":"Google. 2021. Consultation on the EU AI Act Proposal. https:\/\/ec.europa.eu\/info\/law\/better-regulation\/have-your-say\/initiatives\/12527-Artificial-intelligence-ethical-and-legal-requirements\/F2662492_en"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533074"},{"key":"e_1_3_2_1_110_1","volume-title":"Backpropagation through the void: Optimizing control variates for black-box gradient estimation. arXiv preprint arXiv:1711.00123","author":"Grathwohl Will","year":"2017","unstructured":"Will Grathwohl, Dami Choi, Yuhuai Wu, Geoffrey Roeder, and David Duvenaud. 2017. Backpropagation through the void: Optimizing control variates for black-box gradient estimation. arXiv preprint arXiv:1711.00123 (2017)."},{"key":"e_1_3_2_1_111_1","first-page":"1","article-title":"Black box algorithms and the rights of individuals: No easy solution to the\" explainability\" problem","volume":"10","author":"Gryz Jarek","year":"2021","unstructured":"Jarek Gryz and Marcin Rojszczak. 2021. 
Black box algorithms and the rights of individuals: No easy solution to the \"explainability\" problem. Internet Policy Review 10, 2 (2021), 1\u201324.","journal-title":"Internet Policy Review"},{"key":"e_1_3_2_1_112_1","volume-title":"Gradient-based adversarial attacks against text transformers. arXiv preprint arXiv:2104.13733","author":"Guo Chuan","year":"2021","unstructured":"Chuan Guo, Alexandre Sablayrolles, Herv\u00e9 J\u00e9gou, and Douwe Kiela. 2021. Gradient-based adversarial attacks against text transformers. arXiv preprint arXiv:2104.13733 (2021)."},{"key":"e_1_3_2_1_113_1","unstructured":"Wes Gurnee and Max Tegmark. 2023. Language Models Represent Space and Time. (2023). arxiv:2310.02207\u00a0[cs.LG]"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.clsr.2023.105871"},{"key":"e_1_3_2_1_115_1","volume-title":"Regulating Gatekeeper AI and Data: Transparency, Access, and Fairness under the DMA, the GDPR, and beyond. (Aug","author":"Hacker Philipp","year":"2023","unstructured":"Philipp Hacker, Johann Cordes, and Janina Rochon. 2023. Regulating Gatekeeper AI and Data: Transparency, Access, and Fairness under the DMA, the GDPR, and beyond. (Aug. 2023). http:\/\/arxiv.org\/abs\/2212.04997 arXiv:2212.04997 [cs]."},{"key":"e_1_3_2_1_116_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-04083-2_17"},{"key":"e_1_3_2_1_117_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCI.2021.3129960"},{"key":"e_1_3_2_1_118_1","volume-title":"Large language models can be used to effectively scale spear phishing campaigns. arXiv preprint arXiv:2305.06972","author":"Hazell Julian","year":"2023","unstructured":"Julian Hazell. 2023. Large language models can be used to effectively scale spear phishing campaigns. arXiv preprint arXiv:2305.06972 (2023)."},{"key":"e_1_3_2_1_119_1","volume-title":"Deberta: Decoding-enhanced bert with disentangled attention. 
arXiv preprint arXiv:2006.03654","author":"He Pengcheng","year":"2020","unstructured":"Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. 2020. Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654 (2020)."},{"key":"e_1_3_2_1_120_1","volume-title":"Foundation models and fair use. arXiv preprint arXiv:2303.15715","author":"Henderson Peter","year":"2023","unstructured":"Peter Henderson, Xuechen Li, Dan Jurafsky, Tatsunori Hashimoto, Mark\u00a0A Lemley, and Percy Liang. 2023. Foundation models and fair use. arXiv preprint arXiv:2303.15715 (2023)."},{"key":"e_1_3_2_1_121_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"crossref","unstructured":"Dan Hendrycks Kevin Zhao Steven Basart Jacob Steinhardt and Dawn Song. 2021. Natural Adversarial Examples. (2021). arxiv:1907.07174\u00a0[cs.LG]","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_123_1","volume-title":"International Conference on Learning Representations.","author":"Hernandez Evan","year":"2021","unstructured":"Evan Hernandez, Sarah Schwettmann, David Bau, Teona Bagashvili, Antonio Torralba, and Jacob Andreas. 2021. Natural language descriptions of deep visual features. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_124_1","doi-asserted-by":"publisher","DOI":"10.1111\/ablj.12134"},{"key":"e_1_3_2_1_125_1","volume-title":"Taxonomy, Challenges, and Open Questions.","author":"Huang Lei","year":"2023","unstructured":"Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. 2023. 
A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions. (2023). arxiv:2311.05232\u00a0[cs.CL]"},{"key":"e_1_3_2_1_126_1","volume-title":"An overview of 11 proposals for building safe advanced ai. arXiv preprint arXiv:2012.07532","author":"Hubinger Evan","year":"2020","unstructured":"Evan Hubinger. 2020. An overview of 11 proposals for building safe advanced ai. arXiv preprint arXiv:2012.07532 (2020)."},{"key":"e_1_3_2_1_127_1","volume-title":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training. arXiv preprint arXiv:2401.05566","author":"Hubinger Evan","year":"2024","unstructured":"Evan Hubinger, Carson Denison, Jesse Mu, Mike Lambert, Meg Tong, Monte MacDiarmid, Tamera Lanham, Daniel\u00a0M Ziegler, Tim Maxwell, Newton Cheng, 2024. Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training. arXiv preprint arXiv:2401.05566 (2024)."},{"key":"e_1_3_2_1_128_1","volume-title":"International conference on machine learning. PMLR, 2137\u20132146","author":"Ilyas Andrew","year":"2018","unstructured":"Andrew Ilyas, Logan Engstrom, Anish Athalye, and Jessy Lin. 2018. Black-box adversarial attacks with limited queries and information. In International conference on machine learning. PMLR, 2137\u20132146."},{"key":"e_1_3_2_1_129_1","unstructured":"International Atomic Energy Agency. 2016. A Day in the Life of a Safeguards Inspector. https:\/\/www.iaea.org\/newscenter\/news\/a-day-in-the-life-of-a-safeguards-inspector Accessed: 2024-04-15."},{"key":"e_1_3_2_1_130_1","unstructured":"International Atomic Energy Agency. 2023. IAEA Safeguards Overview: Comprehensive Safeguards Agreements and Additional Protocols. https:\/\/www.iaea.org\/publications\/factsheets\/iaea-safeguards-overview"},{"key":"e_1_3_2_1_131_1","volume-title":"Stop uploading test data in plain text: Practical strategies for mitigating data contamination by evaluation benchmarks. 
arXiv preprint arXiv:2305.10160","author":"Jacovi Alon","year":"2023","unstructured":"Alon Jacovi, Avi Caciularu, Omer Goldman, and Yoav Goldberg. 2023. Stop uploading test data in plain text: Practical strategies for mitigating data contamination by evaluation benchmarks. arXiv preprint arXiv:2305.10160 (2023)."},{"key":"e_1_3_2_1_132_1","volume-title":"Mechanistically analyzing the effects of fine-tuning on procedurally defined tasks. arXiv preprint arXiv:2311.12786","author":"Jain Samyak","year":"2023","unstructured":"Samyak Jain, Robert Kirk, Ekdeep\u00a0Singh Lubana, Robert\u00a0P Dick, Hidenori Tanaka, Edward Grefenstette, Tim Rockt\u00e4schel, and David\u00a0Scott Krueger. 2023. Mechanistically analyzing the effects of fine-tuning on procedurally defined tasks. arXiv preprint arXiv:2311.12786 (2023)."},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_1_134_1","volume-title":"Smart: Robust and efficient fine-tuning for pre-trained natural language models through principled regularized optimization. arXiv preprint arXiv:1911.03437","author":"Jiang Haoming","year":"2019","unstructured":"Haoming Jiang, Pengcheng He, Weizhu Chen, Xiaodong Liu, Jianfeng Gao, and Tuo Zhao. 2019. Smart: Robust and efficient fine-tuning for pre-trained natural language models through principled regularized optimization. arXiv preprint arXiv:1911.03437 (2019)."},{"key":"e_1_3_2_1_135_1","doi-asserted-by":"crossref","first-page":"1040","DOI":"10.1038\/s41467-023-36583-0","article-title":"Abstract representations emerge naturally in neural networks trained to perform multiple tasks","volume":"14","author":"Johnston W\u00a0Jeffrey","year":"2023","unstructured":"W\u00a0Jeffrey Johnston and Stefano Fusi. 2023. Abstract representations emerge naturally in neural networks trained to perform multiple tasks. 
Nature Communications 14, 1 (2023), 1040.","journal-title":"Nature Communications"},{"key":"e_1_3_2_1_136_1","volume-title":"Automatically Auditing Large Language Models via Discrete Optimization. arXiv preprint arXiv:2303.04381","author":"Jones Erik","year":"2023","unstructured":"Erik Jones, Anca Dragan, Aditi Raghunathan, and Jacob Steinhardt. 2023. Automatically Auditing Large Language Models via Discrete Optimization. arXiv preprint arXiv:2303.04381 (2023)."},{"key":"e_1_3_2_1_137_1","volume-title":"On fairness and interpretability. arXiv preprint arXiv:2106.13271","author":"M Jose","year":"2021","unstructured":"Joemon\u00a0M Jose 2021. On fairness and interpretability. arXiv preprint arXiv:2106.13271 (2021)."},{"key":"e_1_3_2_1_138_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patter.2023.100804"},{"key":"e_1_3_2_1_139_1","volume-title":"Copyright Violations and Large Language Models. arXiv preprint arXiv:2310.13771","author":"Karamolegkou Antonia","year":"2023","unstructured":"Antonia Karamolegkou, Jiaang Li, Li Zhou, and Anders S\u00f8gaard. 2023. Copyright Violations and Large Language Models. arXiv preprint arXiv:2310.13771 (2023)."},{"key":"e_1_3_2_1_140_1","volume-title":"Targeted phishing campaigns using large scale language models. arXiv preprint arXiv:2301.00665","author":"Karanjai Rabimba","year":"2022","unstructured":"Rabimba Karanjai. 2022. Targeted phishing campaigns using large scale language models. arXiv preprint arXiv:2301.00665 (2022)."},{"key":"e_1_3_2_1_141_1","unstructured":"Max Kaufmann Daniel Kang Yi Sun Steven Basart Xuwang Yin Mantas Mazeika Akul Arora Adam Dziedzic Franziska Boenisch Tom Brown Jacob Steinhardt and Dan Hendrycks. 2023. Testing Robustness Against Unforeseen Adversaries. (2023). 
arxiv:1908.08016\u00a0[cs.LG]"},{"key":"e_1_3_2_1_142_1","doi-asserted-by":"publisher","DOI":"10.3390\/jintelligence9030046"},{"key":"e_1_3_2_1_143_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3006051"},{"key":"e_1_3_2_1_144_1","article-title":"A comparative study of white box, black box and grey box testing techniques","volume":"3","author":"Khan Mohd\u00a0Ehmer","year":"2012","unstructured":"Mohd\u00a0Ehmer Khan and Farmeena Khan. 2012. A comparative study of white box, black box and grey box testing techniques. International Journal of Advanced Computer Science and Applications 3, 6 (2012).","journal-title":"International Journal of Advanced Computer Science and Applications"},{"key":"e_1_3_2_1_145_1","volume-title":"How AI Can Be Regulated Like Nuclear Energy. TIME (Oct","author":"Khlaaf Heidy","year":"2023","unstructured":"Heidy Khlaaf. 2023. How AI Can Be Regulated Like Nuclear Energy. TIME (Oct. 2023). https:\/\/time.com\/6327635\/ai-needs-to-be-regulated-like-nuclear-weapons\/"},{"key":"e_1_3_2_1_146_1","first-page":"2640","volume-title":"Proceedings of the 35th International Conference on Machine Learning. PMLR, 2668\u20132677","author":"Kim Been","year":"2018","unstructured":"Been Kim, Martin Wattenberg, Justin Gilmer, Carrie Cai, James Wexler, Fernanda Viegas, and Rory Sayres. 2018. Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors (TCAV). In Proceedings of the 35th International Conference on Machine Learning. PMLR, 2668\u20132677. https:\/\/proceedings.mlr.press\/v80\/kim18d.html ISSN: 2640-3498."},{"key":"e_1_3_2_1_147_1","volume-title":"Evaluating Language-Model Agents on Realistic Autonomous Tasks. https:\/\/evals.alignment.org\/language-model-pilot-report. 
(July","author":"Kinniment Megan","year":"2023","unstructured":"Megan Kinniment, Lucas\u00a0Jun Koba\u00a0Sato, Haoxing Du, Brian Goodrich, Max Hasin, Lawrence Chan, Luke\u00a0Harold Miles, Tao\u00a0R Lin, Hjalmar Wijk, Joel Burget, Aaron Ho, Elizabeth Barnes, and Paul Christiano. 2023. Evaluating Language-Model Agents on Realistic Autonomous Tasks. https:\/\/evals.alignment.org\/language-model-pilot-report. (July 2023)."},{"key":"e_1_3_2_1_148_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-04301-w"},{"key":"e_1_3_2_1_149_1","volume-title":"Risk assessment at AGI companies: A review of popular risk assessment techniques from other safety-critical industries. (July","author":"Koessler Leonie","year":"2023","unstructured":"Leonie Koessler and Jonas Schuett. 2023. Risk assessment at AGI companies: A review of popular risk assessment techniques from other safety-critical industries. (July 2023). https:\/\/arxiv.org\/abs\/2307.08823v1"},{"key":"e_1_3_2_1_150_1","volume-title":"Algorithmic black swans","author":"Kolt Noam","year":"2023","unstructured":"Noam Kolt. 2023. Algorithmic black swans. Washington University Law Review 101 (2023)."},{"key":"e_1_3_2_1_151_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2021.3067225"},{"key":"e_1_3_2_1_152_1","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.448221"},{"key":"e_1_3_2_1_153_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2402.06625"},{"key":"e_1_3_2_1_154_1","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 5830\u20135842","author":"Krishna Satyapriya","year":"2022","unstructured":"Satyapriya Krishna, Rahul Gupta, Apurv Verma, Jwala Dhamala, Yada Pruksachatkun, and Kai-Wei Chang. 2022. Measuring Fairness of Text Classifiers via Prediction Sensitivity. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 
5830\u20135842."},{"key":"e_1_3_2_1_155_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13347-019-00372-9"},{"key":"e_1_3_2_1_156_1","unstructured":"Yilun Kuang and Yash Bharti. [n. d.]. Scale-invariant-Fine-Tuning (SiFT) for Improved Generalization in Classification. ([n. d.])."},{"key":"e_1_3_2_1_157_1","volume-title":"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing. 2251\u20132277","author":"Kumar Sachin","year":"2022","unstructured":"Sachin Kumar, Biswajit Paria, and Yulia Tsvetkov. 2022. Gradient-based constrained sampling from language models. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing. 2251\u20132277."},{"key":"e_1_3_2_1_158_1","volume-title":"Proceedings of the 28th International Joint Conference on Artificial Intelligence. 2779\u20132785","author":"Kumari Nupur","year":"2019","unstructured":"Nupur Kumari, Mayank Singh, Abhishek Sinha, Harshitha Machiraju, Balaji Krishnamurthy, and Vineeth\u00a0N Balasubramanian. 2019. Harnessing the vulnerability of latent layers in adversarially trained models. In Proceedings of the 28th International Joint Conference on Artificial Intelligence. 2779\u20132785."},{"key":"e_1_3_2_1_159_1","volume-title":"Entangled Preferences: The History and Risks of Reinforcement Learning and Human Feedback.","author":"Lambert Nathan","year":"2023","unstructured":"Nathan Lambert, Thomas\u00a0Krendl Gilbert, and Tom Zick. 2023. Entangled Preferences: The History and Risks of Reinforcement Learning and Human Feedback. (2023). arxiv:2310.13595\u00a0[cs.CY]"},{"key":"e_1_3_2_1_160_1","volume-title":"Auditing the AI auditors: A framework for evaluating fairness and bias in high stakes AI predictive models.American Psychologist 78, 1","author":"Landers N","year":"2023","unstructured":"Richard\u00a0N Landers and Tara\u00a0S Behrend. 2023. 
Auditing the AI auditors: A framework for evaluating fairness and bias in high stakes AI predictive models. American Psychologist 78, 1 (2023), 36."},{"key":"e_1_3_2_1_161_1","unstructured":"Jose\u00a0Antonio Lanz. 2023. Stable Diffusion XL v0.9 Leaks Early Generating Raves From Users. https:\/\/decrypt.co\/147612\/stable-diffusion-xl-v0-9-leaks-early-generating-raves-from-users"},{"key":"e_1_3_2_1_162_1","volume-title":"arXiv preprint arXiv:2309.01446","author":"Lapid Raz","year":"2023","unstructured":"Raz Lapid, Ron Langberg, and Moshe Sipper. 2023. Open Sesame! Universal Black Box Jailbreaking of Large Language Models. arXiv preprint arXiv:2309.01446 (2023)."},{"key":"e_1_3_2_1_163_1","doi-asserted-by":"crossref","unstructured":"Seth Lazar and Alondra Nelson. 2023. AI safety on whose terms? 138\u2013138\u00a0pages.","DOI":"10.1126\/science.adi8982"},{"key":"e_1_3_2_1_164_1","volume-title":"A Mechanistic Understanding of Alignment Algorithms: A Case Study on DPO and Toxicity. arXiv preprint arXiv:2401.01967","author":"Lee Andrew","year":"2024","unstructured":"Andrew Lee, Xiaoyan Bai, Itamar Pres, Martin Wattenberg, Jonathan\u00a0K Kummerfeld, and Rada Mihalcea. 2024. A Mechanistic Understanding of Alignment Algorithms: A Case Study on DPO and Toxicity. arXiv preprint arXiv:2401.01967 (2024)."},{"key":"e_1_3_2_1_165_1","doi-asserted-by":"crossref","unstructured":"Sharkey Lee Ghuidhir Cl\u00edodhna\u00a0N\u00ed Dan Braun Scheurer J\u00e9r\u00e9my Mikita Balesni Bushnaq Lucius Stix Charlotte and Marius Hobbhahn. 2023. A causal framework for AI Regulation and Auditing. (2023).","DOI":"10.20944\/preprints202401.1424.v1"},{"key":"e_1_3_2_1_166_1","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4531029"},{"key":"e_1_3_2_1_167_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0165-4101(00)00025-2"},{"key":"e_1_3_2_1_168_1","unstructured":"Simon Lermen Charlie Rogers-Smith and Jeffrey Ladish. 2023. LoRA Fine-tuning Efficiently Undoes Safety Training in Llama 2-Chat 70B. 
(2023). arxiv:2310.20624\u00a0[cs.LG]"},{"key":"e_1_3_2_1_169_1","volume-title":"Multi-step Jailbreaking Privacy Attacks on ChatGPT. arXiv preprint arXiv:2304.05197","author":"Li Haoran","year":"2023","unstructured":"Haoran Li, Dadi Guo, Wei Fan, Mingshi Xu, and Yangqiu Song. 2023. Multi-step Jailbreaking Privacy Attacks on ChatGPT. arXiv preprint arXiv:2304.05197 (2023)."},{"key":"e_1_3_2_1_170_1","volume-title":"Textbugger: Generating adversarial text against real-world applications. arXiv preprint arXiv:1812.05271","author":"Li Jinfeng","year":"2018","unstructured":"Jinfeng Li, Shouling Ji, Tianyu Du, Bo Li, and Ting Wang. 2018. Textbugger: Generating adversarial text against real-world applications. arXiv preprint arXiv:1812.05271 (2018)."},{"key":"e_1_3_2_1_171_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.17022"},{"key":"e_1_3_2_1_172_1","volume-title":"Holistic evaluation of language models. arXiv preprint arXiv:2211.09110","author":"Liang Percy","year":"2022","unstructured":"Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan, Yuhuai Wu, Ananya Kumar, 2022. Holistic evaluation of language models. arXiv preprint arXiv:2211.09110 (2022)."},{"key":"e_1_3_2_1_173_1","doi-asserted-by":"publisher","DOI":"10.3390\/e23010018"},{"key":"e_1_3_2_1_174_1","doi-asserted-by":"crossref","first-page":"160","DOI":"10.1016\/j.jaccpubpol.2009.10.007","article-title":"Social responsibility and corporate reputation: The case of the Arthur Andersen Enron audit failure","volume":"29","author":"Linthicum Cheryl","year":"2010","unstructured":"Cheryl Linthicum, Austin\u00a0L Reitenga, and Juan\u00a0Manuel Sanchez. 2010. Social responsibility and corporate reputation: The case of the Arthur Andersen Enron audit failure. 
Journal of Accounting and Public Policy 29, 2 (2010), 160\u2013176.","journal-title":"Journal of Accounting and Public Policy"},{"key":"e_1_3_2_1_175_1","volume-title":"Character-level White-Box Adversarial Attacks against Transformers via Attachable Subwords Substitution. ArXiv abs\/2210.17004","author":"Liu Aiwei","year":"2022","unstructured":"Aiwei Liu, Honghai Yu, Xuming Hu, Shuang Li, Li Lin, Fukun Ma, Yawen Yang, and Lijie Wen. 2022. Character-level White-Box Adversarial Attacks against Transformers via Attachable Subwords Substitution. ArXiv abs\/2210.17004 (2022). https:\/\/api.semanticscholar.org\/CorpusID:253236900"},{"key":"e_1_3_2_1_176_1","volume-title":"Adversarial training for large neural language models. arXiv preprint arXiv:2004.08994","author":"Liu Xiaodong","year":"2020","unstructured":"Xiaodong Liu, Hao Cheng, Pengcheng He, Weizhu Chen, Yu Wang, Hoifung Poon, and Jianfeng Gao. 2020. Adversarial training for large neural language models. arXiv preprint arXiv:2004.08994 (2020)."},{"key":"e_1_3_2_1_177_1","doi-asserted-by":"publisher","DOI":"10.1016\/S2589-7500(22)00003-6"},{"key":"e_1_3_2_1_178_1","volume-title":"Latent Feature Relation Consistency for Adversarial Robustness. arXiv preprint arXiv:2303.16697","author":"Liu Xingbin","year":"2023","unstructured":"Xingbin Liu, Huafeng Kuang, Hong Liu, Xianming Lin, Yongjian Wu, and Rongrong Ji. 2023. Latent Feature Relation Consistency for Adversarial Robustness. arXiv preprint arXiv:2303.16697 (2023)."},{"key":"e_1_3_2_1_179_1","volume-title":"Delving into transferable adversarial examples and black-box attacks. arXiv preprint arXiv:1611.02770","author":"Liu Yanpei","year":"2016","unstructured":"Yanpei Liu, Xinyun Chen, Chang Liu, and Dawn Song. 2016. Delving into transferable adversarial examples and black-box attacks. arXiv preprint arXiv:1611.02770 (2016)."},{"key":"e_1_3_2_1_180_1","volume-title":"Jailbreaking chatgpt via prompt engineering: An empirical study. 
arXiv preprint arXiv:2305.13860","author":"Liu Yi","year":"2023","unstructured":"Yi Liu, Gelei Deng, Zhengzi Xu, Yuekang Li, Yaowen Zheng, Ying Zhang, Lida Zhao, Tianwei Zhang, and Yang Liu. 2023. Jailbreaking chatgpt via prompt engineering: An empirical study. arXiv preprint arXiv:2305.13860 (2023)."},{"key":"e_1_3_2_1_181_1","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594079"},{"key":"e_1_3_2_1_182_1","volume-title":"What\u2019s in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. arXiv preprint arXiv:2105.02732","author":"Luccioni Alexandra\u00a0Sasha","year":"2021","unstructured":"Alexandra\u00a0Sasha Luccioni and Joseph\u00a0D Viviano. 2021. What\u2019s in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. arXiv preprint arXiv:2105.02732 (2021)."},{"key":"e_1_3_2_1_183_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.acra.2019.09.009"},{"key":"e_1_3_2_1_184_1","unstructured":"Samuel Marks Can Rager Eric\u00a0J. Michaud Yonatan Belinkov David Bau and Aaron Mueller. 2024. Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in Language Models. arxiv:2403.19647\u00a0[cs.LG]"},{"key":"e_1_3_2_1_185_1","unstructured":"Samuel Marks and Max Tegmark. 2023. The Geometry of Truth: Emergent Linear Structure in Large Language Model Representations of True\/False Datasets. (2023). arxiv:2310.06824\u00a0[cs.AI]"},{"key":"e_1_3_2_1_186_1","doi-asserted-by":"publisher","DOI":"10.1287\/orsc.2015.1039"},{"key":"e_1_3_2_1_187_1","volume-title":"Scaling shared model governance via model splitting. arXiv preprint arXiv:1812.05979","author":"Martic Miljan","year":"2018","unstructured":"Miljan Martic, Jan Leike, Andrew Trask, Matteo Hessel, Shane Legg, and Pushmeet Kohli. 2018. Scaling shared model governance via model splitting. arXiv preprint arXiv:1812.05979 (2018)."},{"key":"e_1_3_2_1_188_1","volume-title":"A survey on bias and fairness in machine learning. 
ACM computing surveys (CSUR) 54, 6","author":"Mehrabi Ninareh","year":"2021","unstructured":"Ninareh Mehrabi, Fred Morstatter, Nripsuta Saxena, Kristina Lerman, and Aram Galstyan. 2021. A survey on bias and fairness in machine learning. ACM computing surveys (CSUR) 54, 6 (2021), 1\u201335."},{"key":"e_1_3_2_1_189_1","first-page":"17359","article-title":"Locating and editing factual associations in GPT","volume":"35","author":"Meng Kevin","year":"2022","unstructured":"Kevin Meng, David Bau, Alex Andonian, and Yonatan Belinkov. 2022. Locating and editing factual associations in GPT. Advances in Neural Information Processing Systems 35 (2022), 17359\u201317372.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_190_1","volume-title":"A relationship and not a thing: A relational approach to algorithmic accountability and assessment documentation. arXiv preprint arXiv:2203.01455","author":"Metcalf Jacob","year":"2022","unstructured":"Jacob Metcalf, Emanuel Moss, Ranjit Singh, Emnet Tafese, and Elizabeth\u00a0Anne Watkins. 2022. A relationship and not a thing: A relational approach to algorithmic accountability and assessment documentation. arXiv preprint arXiv:2203.01455 (2022)."},{"key":"e_1_3_2_1_191_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445935"},{"key":"e_1_3_2_1_192_1","unstructured":"METR. 2023. METR. https:\/\/evals.alignment.org\/"},{"key":"e_1_3_2_1_193_1","volume-title":"Explanation in artificial intelligence: Insights from the social sciences. Artificial intelligence 267","author":"Miller Tim","year":"2019","unstructured":"Tim Miller. 2019. Explanation in artificial intelligence: Insights from the social sciences. Artificial intelligence 267 (2019), 1\u201338."},{"key":"e_1_3_2_1_194_1","volume-title":"Taking control: Policies to address extinction risks from advanced AI. arXiv preprint arXiv:2310.20563","author":"Miotti Andrea","year":"2023","unstructured":"Andrea Miotti and Akash Wasil. 2023. 
Taking control: Policies to address extinction risks from advanced AI. arXiv preprint arXiv:2310.20563 (2023)."},{"key":"e_1_3_2_1_195_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287596"},{"key":"e_1_3_2_1_196_1","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1007\/s44206-023-00074-y","article-title":"Auditing of AI: Legal, Ethical and Technical Approaches","volume":"2","author":"M\u00f6kander Jakob","year":"2023","unstructured":"Jakob M\u00f6kander. 2023. Auditing of AI: Legal, Ethical and Technical Approaches. Digital Society 2, 3 (2023), 49.","journal-title":"Digital Society"},{"key":"e_1_3_2_1_197_1","doi-asserted-by":"publisher","DOI":"10.5465\/amr.2006.19379621"},{"key":"e_1_3_2_1_198_1","unstructured":"Christopher\u00a0A Mouton Caleb Lucas and Ella Guest. 2023. The Operational Risks of AI in Large-Scale Biological Attacks: A Red-Team Approach. (2023)."},{"key":"e_1_3_2_1_199_1","first-page":"17153","article-title":"Compositional explanations of neurons","volume":"33","author":"Mu Jesse","year":"2020","unstructured":"Jesse Mu and Jacob Andreas. 2020. Compositional explanations of neurons. Advances in Neural Information Processing Systems 33 (2020), 17153\u201317163.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_200_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-021-09557-8"},{"key":"e_1_3_2_1_201_1","doi-asserted-by":"publisher","DOI":"10.1007\/s43681-023-00289-2"},{"key":"e_1_3_2_1_202_1","doi-asserted-by":"publisher","DOI":"10.1007\/s43681-023-00289-2"},{"key":"e_1_3_2_1_203_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2311.10538"},{"key":"e_1_3_2_1_204_1","unstructured":"Neel Nanda Lawrence Chan Tom Lieberum Jess Smith and Jacob Steinhardt. 2023. Progress measures for grokking via mechanistic interpretability. (2023). arxiv:2301.05217\u00a0[cs.LG]"},{"key":"e_1_3_2_1_205_1","unstructured":"Arvind Narayanan and Sayash Kapoor. 2023. Evaluating LLMs is a minefield. 
https:\/\/www.cs.princeton.edu\/~arvindn\/talks\/evaluating_llms_minefield\/#\/8"},{"key":"e_1_3_2_1_206_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2311.17035"},{"key":"e_1_3_2_1_207_1","unstructured":"National Institute for Standards and Technology. 2023. Request for Information (RFI) Related to NIST\u2019s Assignments Under Sections 4.1 4.5 and 11 of the Executive Order Concerning Artificial Intelligence (Sections 4.1 4.5 and 11). https:\/\/www.federalregister.gov\/documents\/2023\/12\/21\/2023-28232\/request-for-information-rfi-related-to-nists-assignments-under-sections-41-45-and-11-of-the"},{"key":"e_1_3_2_1_208_1","volume-title":"Translation: Chinese Expert Group Offers \u2019Governance Principles","author":"National New Generation Artificial Intelligence Governance Expert Committee.","year":"2019","unstructured":"National New Generation Artificial Intelligence Governance Expert Committee. 2019. Translation: Chinese Expert Group Offers \u2019Governance Principles\u2019 for \u2019Responsible AI\u2019. https:\/\/digichina.stanford.edu\/work\/translation-chinese-expert-group-offers-governance-principles-for-responsible-ai\/"},{"key":"e_1_3_2_1_209_1","unstructured":"National New Generation Artificial Intelligence Governance Specialist Committee. 2021. \"Ethical Norms for New Generation Artificial Intelligence\" Released. https:\/\/cset.georgetown.edu\/publication\/ethical-norms-for-new-generation-artificial-intelligence-released\/"},{"key":"e_1_3_2_1_211_1","volume-title":"Translation: Artificial Intelligence Law, Model Law v. 1.0 (Expert Suggestion Draft) \u2013","author":"Ng Kwan\u00a0Yee","year":"2023","unstructured":"Kwan\u00a0Yee Ng, Jason Zhou, Ben Murphy, Rogier Creemers, and Hunter Dorwart. 2023. Translation: Artificial Intelligence Law, Model Law v. 1.0 (Expert Suggestion Draft) \u2013 Aug. 2023. (Aug. 2023). 
https:\/\/digichina.stanford.edu\/work\/translation-artificial-intelligence-law-model-law-v-1-0-expert-suggestion-draft-aug-2023\/"},{"key":"e_1_3_2_1_212_1","volume-title":"The alignment problem from a deep learning perspective. arXiv preprint arXiv:2209.00626","author":"Ngo Richard","year":"2022","unstructured":"Richard Ngo, Lawrence Chan, and S\u00f6ren Mindermann. 2022. The alignment problem from a deep learning perspective. arXiv preprint arXiv:2209.00626 (2022)."},{"key":"e_1_3_2_1_213_1","first-page":"85","article-title":"Sticky Regulations","volume":"85","author":"Nielson L","year":"2018","unstructured":"Aaron\u00a0L Nielson. 2018. Sticky Regulations. U. Chi. L. Rev. 85 (2018), 85.","journal-title":"U. Chi. L. Rev."},{"key":"e_1_3_2_1_214_1","unstructured":"OECD. 2019. Recommendation of the Council on Artificial Intelligence. https:\/\/legalinstruments.oecd.org\/en\/instruments\/OECD-LEGAL-0449"},{"key":"e_1_3_2_1_215_1","unstructured":"Electronic\u00a0Code of Federal\u00a0Regulations. 2023. Regulation M. Code of Federal Regulations. https:\/\/www.ecfr.gov\/current\/title-17\/chapter-II\/part-242\/subject-group-ECFR3dd95cf4d3f6730 17 CFR Part 242."},{"key":"e_1_3_2_1_216_1","unstructured":"Office of Science and Technology Policy. 2022. Notice and Explanation. https:\/\/www.whitehouse.gov\/ostp\/ai-bill-of-rights\/notice-and-explanation\/"},{"key":"e_1_3_2_1_217_1","unstructured":"Office of the President of the United States. 2023. Executive Order on the Safe Secure and Trustworthy Development and Use of Artificial Intelligence. https:\/\/www.whitehouse.gov\/briefing-room\/presidential-actions\/2023\/10\/30\/executive-order-on-the-safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence\/"},{"key":"e_1_3_2_1_218_1","volume-title":"Towards AI Accountability Infrastructure: Gaps and Opportunities in AI Audit Tooling. 
arXiv preprint arXiv:2402.17861","author":"Ojewale Victor","year":"2024","unstructured":"Victor Ojewale, Ryan Steed, Briana Vecchione, Abeba Birhane, and Inioluwa\u00a0Deborah Raji. 2024. Towards AI Accountability Infrastructure: Gaps and Opportunities in AI Audit Tooling. arXiv preprint arXiv:2402.17861 (2024)."},{"key":"e_1_3_2_1_219_1","doi-asserted-by":"publisher","DOI":"10.5465\/amr.1991.4279002"},{"key":"e_1_3_2_1_220_1","unstructured":"A.J. Oneal. 2023. Chat GPT \"DAN\" (and other \"Jailbreaks\"). https:\/\/gist.github.com\/coolaj86\/6f4f7b30129b0251f61fa7baaa881516."},{"key":"e_1_3_2_1_221_1","unstructured":"OpenAI. 2023. GPT-3.5 Turbo fine-tuning and API updates. https:\/\/openai.com\/blog\/gpt-3-5-turbo-fine-tuning-and-api-updates"},{"key":"e_1_3_2_1_223_1","unstructured":"OpenAI. 2023. OpenAI Preparedness Challenge. https:\/\/openai.com\/form\/preparedness-challenge"},{"key":"e_1_3_2_1_224_1","unstructured":"OpenAI. 2023. OpenAI Red Teaming Network. https:\/\/openai.com\/blog\/red-teaming-network"},{"key":"e_1_3_2_1_225_1","volume-title":"How to audit an AI model owned by someone else (part 1). OpenMined Blog (June","year":"2023","unstructured":"Openmined. 2023. How to audit an AI model owned by someone else (part 1). OpenMined Blog (June 2023). https:\/\/blog.openmined.org\/ai-audit-part-1\/"},{"key":"e_1_3_2_1_226_1","doi-asserted-by":"crossref","first-page":"667","DOI":"10.1587\/transinf.2021EDP7161","article-title":"Latent Space Virtual Adversarial Training for Supervised and Semi-Supervised Learning","volume":"105","author":"Osada Genki","year":"2022","unstructured":"Genki Osada, Budrul Ahsan, Revoti\u00a0Prasad Bora, and Takashi Nishide. 2022. Latent Space Virtual Adversarial Training for Supervised and Semi-Supervised Learning. 
IEICE TRANSACTIONS on Information and Systems 105, 3 (2022), 667\u2013678.","journal-title":"IEICE TRANSACTIONS on Information and Systems"},{"key":"e_1_3_2_1_227_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence, Vol.\u00a036","author":"Pan Lin","year":"2022","unstructured":"Lin Pan, Chung-Wei Hang, Avirup Sil, and Saloni Potdar. 2022. Improved text classification via contrastive adversarial training. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol.\u00a036. 11130\u201311138."},{"key":"e_1_3_2_1_228_1","volume-title":"Technical report on the cleverhans v2. 1.0 adversarial examples library. arXiv preprint arXiv:1610.00768","author":"Papernot Nicolas","year":"2016","unstructured":"Nicolas Papernot, Fartash Faghri, Nicholas Carlini, Ian Goodfellow, Reuben Feinman, Alexey Kurakin, Cihang Xie, Yash Sharma, Tom Brown, Aurko Roy, 2016. Technical report on the cleverhans v2. 1.0 adversarial examples library. arXiv preprint arXiv:1610.00768 (2016)."},{"key":"e_1_3_2_1_229_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7758\u20137767","author":"Park Geon\u00a0Yeong","year":"2021","unstructured":"Geon\u00a0Yeong Park and Sang\u00a0Wan Lee. 2021. Reliably fast adversarial training via latent adversarial perturbation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 7758\u20137767."},{"key":"e_1_3_2_1_230_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2304.03442"},{"key":"e_1_3_2_1_231_1","unstructured":"Peter\u00a0S. Park Simon Goldstein Aidan O\u2019Gara Michael Chen and Dan Hendrycks. 2023. AI Deception: A Survey of Examples Risks and Potential Solutions. (2023). arxiv:2308.14752\u00a0[cs.CY]"},{"key":"e_1_3_2_1_232_1","volume-title":"Sarbanes-Oxley Act of","author":"PCAOB.","year":"2002","unstructured":"PCAOB. 2002. Sarbanes-Oxley Act of 2002. 
https:\/\/pcaobus.org\/About\/History\/Documents\/PDFs\/Sarbanes_Oxley_Act_of_2002.pdf Public Law 107-204, 116 Stat. 745."},{"key":"e_1_3_2_1_233_1","volume-title":"Red teaming language models with language models. arXiv preprint arXiv:2202.03286","author":"Perez Ethan","year":"2022","unstructured":"Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. 2022. Red teaming language models with language models. arXiv preprint arXiv:2202.03286 (2022)."},{"key":"e_1_3_2_1_234_1","volume-title":"Discovering Language Model Behaviors with Model-Written Evaluations. arXiv preprint arXiv:2212.09251","author":"Perez Ethan","year":"2022","unstructured":"Ethan Perez, Sam Ringer, Kamil\u0117 Luko\u0161i\u016bt\u0117, Karina Nguyen, Edwin Chen, Scott Heiner, Craig Pettit, Catherine Olsson, Sandipan Kundu, Saurav Kadavath, 2022. Discovering Language Model Behaviors with Model-Written Evaluations. arXiv preprint arXiv:2212.09251 (2022)."},{"key":"e_1_3_2_1_235_1","volume-title":"Model Artificial Intelligence Governance Framework","author":"Personal Data Protection Commission Singapore. 2020.","unstructured":"Personal Data Protection Commission Singapore. 2020. Model Artificial Intelligence Governance Framework, Second Edition. https:\/\/www.pdpc.gov.sg\/-\/media\/Files\/PDPC\/PDF-Files\/Resource-for-Organisation\/AI\/SGModelAIGovFramework2.pdf"},{"key":"e_1_3_2_1_238_1","volume-title":"Artificial Intelligence in Medicine","author":"Ploug Thomas","unstructured":"Thomas Ploug and S\u00f8ren Holm. 2021. Right to Contest AI Diagnostics: Defining Transparency and Explainability Requirements from a Patient\u2019s Perspective. In Artificial Intelligence in Medicine. Springer, 1\u201312."},{"key":"e_1_3_2_1_239_1","volume-title":"On the Challenges of Using Black-Box APIs for Toxicity Evaluation in Research. 
arXiv preprint arXiv:2304.12397","author":"Pozzobon Luiza","year":"2023","unstructured":"Luiza Pozzobon, Beyza Ermis, Patrick Lewis, and Sara Hooker. 2023. On the Challenges of Using Black-Box APIs for Toxicity Evaluation in Research. arXiv preprint arXiv:2304.12397 (2023)."},{"key":"e_1_3_2_1_240_1","volume-title":"Grips: Gradient-free, edit-based instruction search for prompting large language models. arXiv preprint arXiv:2203.07281","author":"Prasad Archiki","year":"2022","unstructured":"Archiki Prasad, Peter Hase, Xiang Zhou, and Mohit Bansal. 2022. Grips: Gradient-free, edit-based instruction search for prompting large language models. arXiv preprint arXiv:2203.07281 (2022)."},{"key":"e_1_3_2_1_241_1","volume-title":"Visual Adversarial Examples Jailbreak Large Language Models. arXiv preprint arXiv:2306.13213","author":"Qi Xiangyu","year":"2023","unstructured":"Xiangyu Qi, Kaixuan Huang, Ashwinee Panda, Mengdi Wang, and Prateek Mittal. 2023. Visual Adversarial Examples Jailbreak Large Language Models. arXiv preprint arXiv:2306.13213 (2023)."},{"key":"e_1_3_2_1_242_1","volume-title":"Even When Users Do Not Intend To!arXiv preprint arXiv:2310.03693","author":"Qi Xiangyu","year":"2023","unstructured":"Xiangyu Qi, Yi Zeng, Tinghao Xie, Pin-Yu Chen, Ruoxi Jia, Prateek Mittal, and Peter Henderson. 2023. Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To!arXiv preprint arXiv:2310.03693 (2023)."},{"key":"e_1_3_2_1_243_1","volume-title":"Towards Speeding up Adversarial Training in Latent Spaces. arXiv preprint arXiv:2102.00662","author":"Qian Yaguan","year":"2021","unstructured":"Yaguan Qian, Qiqi Shao, Tengteng Yao, Bin Wang, Shouling Ji, Shaoning Zeng, Zhaoquan Gu, and Wassim Swaileh. 2021. Towards Speeding up Adversarial Training in Latent Spaces. arXiv preprint arXiv:2102.00662 (2021)."},{"key":"e_1_3_2_1_244_1","volume-title":"Unsafe diffusion: On the generation of unsafe images and hateful memes from text-to-image models. 
arXiv preprint arXiv:2305.13873","author":"Qu Yiting","year":"2023","unstructured":"Yiting Qu, Xinyue Shen, Xinlei He, Michael Backes, Savvas Zannettou, and Yang Zhang. 2023. Unsafe diffusion: On the generation of unsafe images and hateful memes from text-to-image models. arXiv preprint arXiv:2305.13873 (2023)."},{"key":"e_1_3_2_1_245_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372828"},{"key":"e_1_3_2_1_246_1","volume-title":"\u201cFour-Fifths Rule","author":"Raghavan Manish","unstructured":"Manish Raghavan and Pauline Kim. 2023. Limitations of the \u201cFour-Fifths Rule\u201d and Statistical Parity Tests for Measuring Fairness. https:\/\/openreview.net\/forum?id=M2aNjwX4Ec&referrer=%5Bthe%20profile%20of%20Manish%20Raghavan%5D(%2Fprofile%3Fid%3D~Manish_Raghavan1)"},{"key":"e_1_3_2_1_247_1","unstructured":"Inioluwa\u00a0Deborah Raji. 2022. The Anatomy of AI Audits: Form Process and Consequences. (2022)."},{"key":"e_1_3_2_1_248_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306618.3314244"},{"key":"e_1_3_2_1_249_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571151"},{"key":"e_1_3_2_1_250_1","doi-asserted-by":"publisher","DOI":"10.1145\/3375627.3375820"},{"key":"e_1_3_2_1_251_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372873"},{"key":"e_1_3_2_1_252_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514094.3534181"},{"key":"e_1_3_2_1_253_1","volume-title":"Red-teaming the stable diffusion safety filter. arXiv preprint arXiv:2210.04610","author":"Rando Javier","year":"2022","unstructured":"Javier Rando, Daniel Paleka, David Lindner, Lennart Heim, and Florian Tram\u00e8r. 2022. Red-teaming the stable diffusion safety filter. arXiv preprint arXiv:2210.04610 (2022)."},{"key":"e_1_3_2_1_254_1","unstructured":"Javier Rando and Florian Tram\u00e8r. 2023. Universal Jailbreak Backdoors from Poisoned Human Feedback. (2023). 
arxiv:2311.14455\u00a0[cs.AI]"},{"key":"e_1_3_2_1_255_1","unstructured":"Abhinav Rao Sachin Vashistha Atharva Naik Somak Aditya and Monojit Choudhury. 2023. Tricking LLMs into Disobedience: Understanding Analyzing and Preventing Jailbreaks. (2023). arxiv:2305.14965\u00a0[cs.CL]"},{"key":"e_1_3_2_1_256_1","volume-title":"2023 IEEE Conference on Secure and Trustworthy Machine Learning (SaTML). IEEE, 464\u2013483","author":"R\u00e4uker Tilman","year":"2023","unstructured":"Tilman R\u00e4uker, Anson Ho, Stephen Casper, and Dylan Hadfield-Menell. 2023. Toward transparent ai: A survey on interpreting the inner structures of deep neural networks. In 2023 IEEE Conference on Secure and Trustworthy Machine Learning (SaTML). IEEE, 464\u2013483."},{"key":"e_1_3_2_1_257_1","volume-title":"Probing the probing paradigm: Does probing accuracy entail task relevance?arXiv preprint arXiv:2005.00719","author":"Ravichander Abhilasha","year":"2020","unstructured":"Abhilasha Ravichander, Yonatan Belinkov, and Eduard Hovy. 2020. Probing the probing paradigm: Does probing accuracy entail task relevance?arXiv preprint arXiv:2005.00719 (2020)."},{"key":"e_1_3_2_1_258_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1103"},{"key":"e_1_3_2_1_259_1","volume-title":"Model-agnostic interpretability of machine learning. arXiv preprint arXiv:1606.05386","author":"Ribeiro Marco\u00a0Tulio","year":"2016","unstructured":"Marco\u00a0Tulio Ribeiro, Sameer Singh, and Carlos Guestrin. 2016. Model-agnostic interpretability of machine learning. arXiv preprint arXiv:1606.05386 (2016)."},{"key":"e_1_3_2_1_260_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178876.3186143"},{"key":"e_1_3_2_1_261_1","volume-title":"Copyright in Generative AI training: Balancing Fair Use through Standardization and Transparency. Available at SSRN 4579322","author":"Rodriguez\u00a0Maffioli Daniel","year":"2023","unstructured":"Daniel Rodriguez\u00a0Maffioli. 2023. 
Copyright in Generative AI training: Balancing Fair Use through Standardization and Transparency. Available at SSRN 4579322 (2023)."},{"key":"e_1_3_2_1_262_1","volume-title":"The New York Times is suing OpenAI and Microsoft for copyright infringement. The Verge (Dec","author":"Roth Emma","year":"2023","unstructured":"Emma Roth. 2023. The New York Times is suing OpenAI and Microsoft for copyright infringement. The Verge (Dec. 2023). https:\/\/www.theverge.com\/2023\/12\/27\/24016212\/new-york-times-openai-microsoft-lawsuit-copyright-infringement"},{"key":"e_1_3_2_1_263_1","volume-title":"Token-Modification Adversarial Attacks for Natural Language Processing: A Survey. ArXiv abs\/2103.00676","author":"Roth Tom","year":"2021","unstructured":"Tom Roth, Yansong Gao, Alsharif Abuadbba, Surya Nepal, and Wei Liu. 2021. Token-Modification Adversarial Attacks for Natural Language Processing: A Survey. ArXiv abs\/2103.00676 (2021). https:\/\/api.semanticscholar.org\/CorpusID:232075640"},{"key":"e_1_3_2_1_264_1","first-page":"26","article-title":"Please stop explaining black box models for high stakes decisions","volume":"1050","author":"Rudin Cynthia","year":"2018","unstructured":"Cynthia Rudin. 2018. Please stop explaining black box models for high stakes decisions. Stat 1050 (2018), 26.","journal-title":"Stat"},{"key":"e_1_3_2_1_265_1","volume-title":"Weighted Token-Level Virtual Adversarial Training in Text Classification. In 2022 3rd International Conference on Pattern Recognition and Machine Learning (PRML). IEEE, 117\u2013123","author":"Sae-Lim Teerapong","year":"2022","unstructured":"Teerapong Sae-Lim and Suronapee Phoomvuthisarn. 2022. Weighted Token-Level Virtual Adversarial Training in Text Classification. In 2022 3rd International Conference on Pattern Recognition and Machine Learning (PRML). 
IEEE, 117\u2013123."},{"key":"e_1_3_2_1_266_1","volume-title":"Artificial intelligence and biological misuse: Differentiating risks of language models and biological design tools. arXiv preprint arXiv:2306.13952","author":"Sandbrink B","year":"2023","unstructured":"Jonas\u00a0B Sandbrink. 2023. Artificial intelligence and biological misuse: Differentiating risks of language models and biological design tools. arXiv preprint arXiv:2306.13952 (2023)."},{"key":"e_1_3_2_1_267_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence, Vol.\u00a032","author":"Sankaranarayanan Swami","year":"2018","unstructured":"Swami Sankaranarayanan, Arpit Jain, Rama Chellappa, and Ser\u00a0Nam Lim. 2018. Regularizing deep networks using efficient layerwise adversarial training. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol.\u00a032."},{"key":"e_1_3_2_1_268_1","volume-title":"Whose opinions do language models reflect?arXiv preprint arXiv:2303.17548","author":"Santurkar Shibani","year":"2023","unstructured":"Shibani Santurkar, Esin Durmus, Faisal Ladhak, Cinoo Lee, Percy Liang, and Tatsunori Hashimoto. 2023. Whose opinions do language models reflect?arXiv preprint arXiv:2303.17548 (2023)."},{"key":"e_1_3_2_1_269_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00499"},{"key":"e_1_3_2_1_270_1","volume-title":"Are Emergent Abilities of Large Language Models a Mirage?","author":"Schaeffer Rylan","year":"2023","unstructured":"Rylan Schaeffer, Brando Miranda, and Sanmi Koyejo. 2023. Are Emergent Abilities of Large Language Models a Mirage? (2023). arxiv:2304.15004\u00a0[cs.AI]"},{"key":"e_1_3_2_1_271_1","volume-title":"Technical Report: Large Language Models can Strategically Deceive their Users when Put Under Pressure. arXiv preprint arXiv:2311.07590","author":"Scheurer J\u00e9r\u00e9my","year":"2023","unstructured":"J\u00e9r\u00e9my Scheurer, Mikita Balesni, and Marius Hobbhahn. 2023. 
Technical Report: Large Language Models can Strategically Deceive their Users when Put Under Pressure. arXiv preprint arXiv:2311.07590 (2023)."},{"key":"e_1_3_2_1_272_1","volume-title":"Three lines of defense against risks from AI. arXiv preprint arXiv:2212.08364","author":"Schuett Jonas","year":"2022","unstructured":"Jonas Schuett. 2022. Three lines of defense against risks from AI. arXiv preprint arXiv:2212.08364 (2022)."},{"key":"e_1_3_2_1_273_1","volume-title":"AGI labs need an internal audit function. (May","author":"Schuett Jonas","year":"2023","unstructured":"Jonas Schuett. 2023. AGI labs need an internal audit function. (May 2023). https:\/\/arxiv.org\/abs\/2305.17038v1"},{"key":"e_1_3_2_1_274_1","volume-title":"Towards best practices in AGI safety and governance: A survey of expert opinion. arXiv preprint arXiv:2305.07153","author":"Schuett Jonas","year":"2023","unstructured":"Jonas Schuett, Noemi Dreksler, Markus Anderljung, David McCaffary, Lennart Heim, Emma Bluemke, and Ben Garfinkel. 2023. Towards best practices in AGI safety and governance: A survey of expert opinion. arXiv preprint arXiv:2305.07153 (2023)."},{"key":"e_1_3_2_1_275_1","unstructured":"Leo Schwinn David Dobre Stephan G\u00fcnnemann and Gauthier Gidel. 2023. Adversarial Attacks and Defenses in Large Language Models: Old and New Threats. (2023). arxiv:2310.19737\u00a0[cs.AI]"},{"key":"e_1_3_2_1_276_1","doi-asserted-by":"crossref","unstructured":"Elizabeth Seger Noemi Dreksler Richard Moulange Emily Dardaman Jonas Schuett K Wei Christoph Winter Mackenzie Arnold Se\u00e1n \u00d3\u00a0h\u00c9igeartaigh Anton Korinek 2023. Open-Sourcing Highly Capable Foundation Models: An Evaluation of Risks Benefits and Alternative Methods for Pursuing Open-Source Objectives. (2023).","DOI":"10.2139\/ssrn.4596436"},{"key":"e_1_3_2_1_277_1","unstructured":"Rusheb Shah Quentin Feuillade-Montixi Soroush Pour Arush Tagade Stephen Casper and Javier Rando. 2023. 
Scalable and Transferable Black-Box Jailbreaks for Language Models via Persona Modulation. (2023). arxiv:2311.03348\u00a0[cs.CL]"},{"key":"e_1_3_2_1_278_1","volume-title":"Representation Bias in Data: A Survey on Identification and Resolution Techniques. Comput. Surveys","author":"Shahbazi Nima","year":"2023","unstructured":"Nima Shahbazi, Yin Lin, Abolfazl Asudeh, and HV Jagadish. 2023. Representation Bias in Data: A Survey on Identification and Resolution Techniques. Comput. Surveys (2023)."},{"key":"e_1_3_2_1_279_1","doi-asserted-by":"crossref","unstructured":"Lee Sharkey Cl\u00edodhna\u00a0N\u00ed Ghuidhir Dan Braun J\u00e9r\u00e9my Scheurer Mikita Balesni Lucius Bushnaq Charlotte Stix and Marius Hobbhahn. 2024. A Causal Framework for AI Regulation and Auditing. (2024).","DOI":"10.20944\/preprints202401.1424.v1"},{"key":"e_1_3_2_1_280_1","unstructured":"Mrinank Sharma Meg Tong Tomasz Korbak David Duvenaud Amanda Askell Samuel\u00a0R. Bowman Newton Cheng Esin Durmus Zac Hatfield-Dodds Scott\u00a0R. Johnston Shauna Kravec Timothy Maxwell Sam McCandlish Kamal Ndousse Oliver Rausch Nicholas Schiefer Da Yan Miranda Zhang and Ethan Perez. 2023. Towards Understanding Sycophancy in Language Models. (2023). arxiv:2310.13548\u00a0[cs.CL]"},{"key":"e_1_3_2_1_281_1","volume-title":"Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks. arXiv preprint arXiv:2310.10844","author":"Shayegani Erfan","year":"2023","unstructured":"Erfan Shayegani, Md\u00a0Abdullah\u00a0Al Mamun, Yu Fu, Pedram Zaree, Yue Dong, and Nael Abu-Ghazaleh. 2023. Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks. arXiv preprint arXiv:2310.10844 (2023)."},{"key":"e_1_3_2_1_282_1","volume-title":"Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models. arXiv preprint arXiv:2308.03825","author":"Shen Xinyue","year":"2023","unstructured":"Xinyue Shen, Zeyuan Chen, Michael Backes, Yun Shen, and Yang Zhang. 2023. 
\" Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models. arXiv preprint arXiv:2308.03825 (2023)."},{"key":"e_1_3_2_1_283_1","doi-asserted-by":"crossref","unstructured":"Toby Shevlane. 2022. Structured access: an emerging paradigm for safe AI deployment. (2022). arxiv:2201.05159\u00a0[cs.AI]","DOI":"10.1093\/oxfordhb\/9780197579329.013.39"},{"key":"e_1_3_2_1_284_1","volume-title":"Model evaluation for extreme risks. arXiv preprint arXiv:2305.15324","author":"Shevlane Toby","year":"2023","unstructured":"Toby Shevlane, Sebastian Farquhar, Ben Garfinkel, Mary Phuong, Jess Whittlestone, Jade Leung, Daniel Kokotajlo, Nahema Marchal, Markus Anderljung, Noam Kolt, 2023. Model evaluation for extreme risks. arXiv preprint arXiv:2305.15324 (2023)."},{"key":"e_1_3_2_1_285_1","volume-title":"Detecting pretraining data from large language models. arXiv preprint arXiv:2310.16789","author":"Shi Weijia","year":"2023","unstructured":"Weijia Shi, Anirudh Ajith, Mengzhou Xia, Yangsibo Huang, Daogao Liu, Terra Blevins, Danqi Chen, and Luke Zettlemoyer. 2023. Detecting pretraining data from large language models. arXiv preprint arXiv:2310.16789 (2023)."},{"key":"e_1_3_2_1_286_1","volume-title":"Toward Human Readable Prompt Tuning: Kubrick\u2019s The Shining is a good movie, and a good prompt too?arXiv preprint arXiv:2212.10539","author":"Shi Weijia","year":"2022","unstructured":"Weijia Shi, Xiaochuang Han, Hila Gonen, Ari Holtzman, Yulia Tsvetkov, and Luke Zettlemoyer. 2022. Toward Human Readable Prompt Tuning: Kubrick\u2019s The Shining is a good movie, and a good prompt too?arXiv preprint arXiv:2212.10539 (2022)."},{"key":"e_1_3_2_1_287_1","volume-title":"Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980","author":"Shin Taylor","year":"2020","unstructured":"Taylor Shin, Yasaman Razeghi, Robert\u00a0L Logan\u00a0IV, Eric Wallace, and Sameer Singh. 2020. 
Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980 (2020)."},{"key":"e_1_3_2_1_288_1","volume-title":"Multiplicity as an AI Governance Principle. Available at SSRN 4444354","author":"Shur-Ofry Michal","year":"2023","unstructured":"Michal Shur-Ofry. 2023. Multiplicity as an AI Governance Principle. Available at SSRN 4444354 (2023)."},{"key":"e_1_3_2_1_289_1","volume-title":"Large language models encode clinical knowledge. arXiv preprint arXiv:2212.13138","author":"Singhal Karan","year":"2022","unstructured":"Karan Singhal, Shekoofeh Azizi, Tao Tu, S\u00a0Sara Mahdavi, Jason Wei, Hyung\u00a0Won Chung, Nathan Scales, Ajay Tanwani, Heather Cole-Lewis, Stephen Pfohl, 2022. Large language models encode clinical knowledge. arXiv preprint arXiv:2212.13138 (2022)."},{"key":"e_1_3_2_1_290_1","doi-asserted-by":"publisher","DOI":"10.1145\/3375627.3375830"},{"key":"e_1_3_2_1_291_1","volume-title":"Identifying and Mitigating Privacy Risks Stemming from Language Models: A Survey. arXiv preprint arXiv:2310.01424","author":"Smith Victoria","year":"2023","unstructured":"Victoria Smith, Ali\u00a0Shahin Shamsabadi, Carolyn Ashurst, and Adrian Weller. 2023. Identifying and Mitigating Privacy Risks Stemming from Language Models: A Survey. arXiv preprint arXiv:2310.01424 (2023)."},{"key":"e_1_3_2_1_292_1","volume-title":"Can large language models democratize access to dual-use biotechnology?arXiv preprint arXiv:2306.03809","author":"Soice H","year":"2023","unstructured":"Emily\u00a0H Soice, Rafael Rocha, Kimberlee Cordova, Michael Specter, and Kevin\u00a0M Esvelt. 2023. Can large language models democratize access to dual-use biotechnology?arXiv preprint arXiv:2306.03809 (2023)."},{"key":"e_1_3_2_1_293_1","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3593981"},{"key":"e_1_3_2_1_294_1","volume-title":"Evaluating the Social Impact of Generative AI Systems in Systems and Society. 
arXiv preprint arXiv:2306.05949","author":"Solaiman Irene","year":"2023","unstructured":"Irene Solaiman, Zeerak Talat, William Agnew, Lama Ahmad, Dylan Baker, Su\u00a0Lin Blodgett, Hal Daum\u00e9\u00a0III, Jesse Dodge, Ellie Evans, Sara Hooker, 2023. Evaluating the Social Impact of Generative AI Systems in Systems and Society. arXiv preprint arXiv:2306.05949 (2023)."},{"key":"e_1_3_2_1_295_1","volume-title":"Universal adversarial attacks with natural triggers for text classification. arXiv preprint arXiv:2005.00174","author":"Song Liwei","year":"2020","unstructured":"Liwei Song, Xinwei Yu, Hsuan-Tung Peng, and Karthik Narasimhan. 2020. Universal adversarial attacks with natural triggers for text classification. arXiv preprint arXiv:2005.00174 (2020)."},{"key":"e_1_3_2_1_296_1","volume-title":"A Roadmap to Pluralistic Alignment. arXiv preprint arXiv:2402.05070","author":"Sorensen Taylor","year":"2024","unstructured":"Taylor Sorensen, Jared Moore, Jillian Fisher, Mitchell Gordon, Niloofar Mireshghallah, Christopher\u00a0Michael Rytting, Andre Ye, Liwei Jiang, Ximing Lu, Nouha Dziri, 2024. A Roadmap to Pluralistic Alignment. arXiv preprint arXiv:2402.05070 (2024)."},{"key":"e_1_3_2_1_297_1","volume-title":"Abubakar Abid, Adam Fisch, Adam\u00a0R Brown, Adam Santoro, Aditya Gupta","author":"Srivastava Aarohi","year":"2022","unstructured":"Aarohi Srivastava, Abhinav Rastogi, Abhishek Rao, Abu Awal\u00a0Md Shoeb, Abubakar Abid, Adam Fisch, Adam\u00a0R Brown, Adam Santoro, Aditya Gupta, Adri\u00e0 Garriga-Alonso, 2022. Beyond the imitation game: Quantifying and extrapolating the capabilities of language models. arXiv preprint arXiv:2206.04615 (2022)."},{"key":"e_1_3_2_1_298_1","unstructured":"Huaman Sun Jiaxin Pei Minje Choi and David Jurgens. 2023. Aligning with Whom? Large Language Models Have Gender and Racial Biases in Subjective NLP Tasks. (2023). 
arxiv:2311.09730\u00a0[cs.CL]"},{"key":"e_1_3_2_1_299_1","unstructured":"Lichao Sun Yue Huang Haoran Wang Siyuan Wu Qihui Zhang Chujie Gao Yixin Huang Wenhan Lyu Yixuan Zhang Xiner Li Zhengliang Liu Yixin Liu Yijue Wang Zhikun Zhang Bhavya Kailkhura Caiming Xiong Chao Zhang Chaowei Xiao Chunyuan Li Eric Xing Furong Huang Hao Liu Heng Ji Hongyi Wang Huan Zhang Huaxiu Yao Manolis Kellis Marinka Zitnik Meng Jiang Mohit Bansal James Zou Jian Pei Jian Liu Jianfeng Gao Jiawei Han Jieyu Zhao Jiliang Tang Jindong Wang John Mitchell Kai Shu Kaidi Xu Kai-Wei Chang Lifang He Lifu Huang Michael Backes Neil\u00a0Zhenqiang Gong Philip\u00a0S. Yu Pin-Yu Chen Quanquan Gu Ran Xu Rex Ying Shuiwang Ji Suman Jana Tianlong Chen Tianming Liu Tianyi Zhou Willian Wang Xiang Li Xiangliang Zhang Xiao Wang Xing Xie Xun Chen Xuyu Wang Yan Liu Yanfang Ye Yinzhi Cao and Yue Zhao. 2024. TrustLLM: Trustworthiness in Large Language Models. arxiv:2401.05561\u00a0[cs.CL]"},{"key":"e_1_3_2_1_300_1","volume-title":"Do Large Language Models Show Decision Heuristics Similar to Humans? A Case Study Using GPT-3.5. arXiv preprint arXiv:2305.04400","author":"Suri Gaurav","year":"2023","unstructured":"Gaurav Suri, Lily\u00a0R Slater, Ali Ziaee, and Morgan Nguyen. 2023. Do Large Language Models Show Decision Heuristics Similar to Humans? A Case Study Using GPT-3.5. arXiv preprint arXiv:2305.04400 (2023)."},{"key":"e_1_3_2_1_301_1","volume-title":"Using Large Language Models for Cybersecurity Capture-The-Flag Challenges and Certification Questions. arXiv preprint arXiv:2308.10443","author":"Tann Wesley","year":"2023","unstructured":"Wesley Tann, Yuancheng Liu, Jun\u00a0Heng Sim, Choon\u00a0Meng Seah, and Ee-Chien Chang. 2023. Using Large Language Models for Cybersecurity Capture-The-Flag Challenges and Certification Questions. arXiv preprint arXiv:2308.10443 (2023)."},{"key":"e_1_3_2_1_302_1","unstructured":"Yan Tao Olga Viberg Ryan\u00a0S. Baker and Rene\u00a0F. Kizilcec. 2023. 
Auditing and Mitigating Cultural Bias in LLMs. (2023). arxiv:2311.14096\u00a0[cs.CL]"},{"key":"e_1_3_2_1_303_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2309.01933"},{"key":"e_1_3_2_1_304_1","unstructured":"David Thiel. 2023. Identifying and Eliminating CSAM in Generative ML Training Data and Models. (2023)."},{"key":"e_1_3_2_1_305_1","unstructured":"David Thiel Melissa Stroebel and Rebecca Portnoff. 2023. Generative ML and CSAM: Implications and Mitigations. (2023)."},{"key":"e_1_3_2_1_306_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian\u00a0Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit\u00a0Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric\u00a0Michael Smith Ranjan Subramanian Xiaoqing\u00a0Ellen Tan Binh Tang Ross Taylor Adina Williams Jian\u00a0Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. (2023). arxiv:2307.09288\u00a0[cs.CL]"},{"key":"e_1_3_2_1_307_1","volume-title":"Se\u00e1n\u00a0\u00d3 h\u00c9igeartaigh","author":"Trager Robert","year":"2023","unstructured":"Robert Trager, Ben Harack, Anka Reuel, Allison Carnegie, Lennart Heim, Lewis Ho, Sarah Kreps, Ranjit Lall, Owen Larter, Se\u00e1n\u00a0\u00d3 h\u00c9igeartaigh, 2023. 
International governance of civilian AI: A jurisdictional certification approach. arXiv preprint arXiv:2308.15514 (2023)."},{"key":"e_1_3_2_1_308_1","unstructured":"Yu-Lin Tsai Chia-Yi Hsu Chulin Xie Chih-Hsun Lin Jia-You Chen Bo Li Pin-Yu Chen Chia-Mu Yu and Chun-Ying Huang. 2023. Ring-A-Bell! How Reliable are Concept Removal Methods for Diffusion Models?arXiv preprint arXiv:2310.10012 (2023)."},{"key":"e_1_3_2_1_309_1","volume-title":"Activation addition: Steering language models without optimization. arXiv preprint arXiv:2308.10248","author":"Turner Alex","year":"2023","unstructured":"Alex Turner, Lisa Thiergart, David Udell, Gavin Leech, Ulisse Mini, and Monte MacDiarmid. 2023. Activation addition: Steering language models without optimization. arXiv preprint arXiv:2308.10248 (2023)."},{"key":"e_1_3_2_1_310_1","unstructured":"Miles Turpin Julian Michael Ethan Perez and Samuel\u00a0R. Bowman. 2023. Language Models Don\u2019t Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting. (2023). arxiv:2305.04388\u00a0[cs.CL]"},{"key":"e_1_3_2_1_312_1","unstructured":"United Nations. 2022. Principles for the ethical use of artificial intelligence in the United Nations system. https:\/\/unsceb.org\/sites\/default\/files\/2023-03\/CEB_2022_2_Add.1%20%28AI%20ethics%20principles%29.pdf"},{"key":"e_1_3_2_1_313_1","unstructured":"United States National Science Foundation. 2023. National Deep Inference Facility for Very Large Language Models (NDIF). (2023)."},{"key":"e_1_3_2_1_314_1","doi-asserted-by":"publisher","unstructured":"U.S. Department of Commerce and National Institute of Standards and Technology. 2023. AI Risk Management Framework: AI RMF (1.0). https:\/\/doi.org\/10.6028\/NIST.AI.100-1","DOI":"10.6028\/NIST.AI.100-1"},{"key":"e_1_3_2_1_315_1","volume-title":"van\u00a0den Brom","author":"E.","year":"2022","unstructured":"H.\u00a0E. van\u00a0den Brom. 2022. On-site Inspection and Legal Certainty. SSRN Electronic Journal (2022). 
https:\/\/api.semanticscholar.org\/CorpusID:249326468"},{"key":"e_1_3_2_1_316_1","first-page":"4","article-title":"The unexpected benefits of Sarbanes-Oxley","volume":"84","author":"Wagner Stephen","year":"2006","unstructured":"Stephen Wagner and Lee Dittmar. 2006. The unexpected benefits of Sarbanes-Oxley. Harvard Business Review 84, 4 (April 2006), 133\u2013140; 150.","journal-title":"Harvard Business Review"},{"key":"e_1_3_2_1_317_1","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.3339372"},{"key":"e_1_3_2_1_318_1","volume-title":"Universal adversarial triggers for attacking and analyzing NLP. arXiv preprint arXiv:1908.07125","author":"Wallace Eric","year":"2019","unstructured":"Eric Wallace, Shi Feng, Nikhil Kandpal, Matt Gardner, and Sameer Singh. 2019. Universal adversarial triggers for attacking and analyzing NLP. arXiv preprint arXiv:1908.07125 (2019)."},{"key":"e_1_3_2_1_319_1","unstructured":"Alexander Wan Eric Wallace Sheng Shen and Dan Klein. 2023. Poisoning Language Models During Instruction Tuning. (2023). arxiv:2305.00944\u00a0[cs.CL]"},{"key":"e_1_3_2_1_320_1","volume-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Wang Alex","year":"2018","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel\u00a0R Bowman. 2018. GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)."},{"key":"e_1_3_2_1_321_1","unstructured":"Jiongxiao Wang Junlin Wu Muhao Chen Yevgeniy Vorobeychik and Chaowei Xiao. 2023. On the Exploitability of Reinforcement Learning with Human Feedback for Large Language Models. (2023). arxiv:2311.09641\u00a0[cs.AI]"},{"key":"e_1_3_2_1_322_1","unstructured":"Song Wang Yaochen Zhu Haochen Liu Zaiyi Zheng Chen Chen and Jundong Li. 2023. Knowledge Editing for Large Language Models: A Survey. (2023). 
arxiv:2310.16218\u00a0[cs.CL]"},{"key":"e_1_3_2_1_323_1","unstructured":"Tony\u00a0T. Wang Adam Gleave Tom Tseng Kellin Pelrine Nora Belrose Joseph Miller Michael\u00a0D. Dennis Yawen Duan Viktor Pogrebniak Sergey Levine and Stuart Russell. 2023. Adversarial Policies Beat Superhuman Go AIs. (2023). arxiv:2211.00241\u00a0[cs.LG]"},{"key":"e_1_3_2_1_324_1","volume-title":"Proceedings of the 2021 AAAI\/ACM Conference on AI, Ethics, and Society. 1010\u20131022","author":"Watkins Elizabeth\u00a0Anne","year":"2021","unstructured":"Elizabeth\u00a0Anne Watkins, Emanuel Moss, Jacob Metcalf, Ranjit Singh, and Madeleine\u00a0Clare Elish. 2021. Governing algorithmic systems with impact assessments: Six observations. In Proceedings of the 2021 AAAI\/ACM Conference on AI, Ethics, and Society. 1010\u20131022."},{"key":"e_1_3_2_1_325_1","volume-title":"Jailbroken: How does llm safety training fail?arXiv preprint arXiv:2307.02483","author":"Wei Alexander","year":"2023","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. 2023. Jailbroken: How does llm safety training fail?arXiv preprint arXiv:2307.02483 (2023)."},{"key":"e_1_3_2_1_326_1","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc\u00a0V Le, Denny Zhou, 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems 35 (2022), 24824\u201324837.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_327_1","volume-title":"Ethical and social risks of harm from language models. arXiv preprint arXiv:2112.04359","author":"Weidinger Laura","year":"2021","unstructured":"Laura Weidinger, John Mellor, Maribeth Rauh, Conor Griffin, Jonathan Uesato, Po-Sen Huang, Myra Cheng, Mia Glaese, Borja Balle, Atoosa Kasirzadeh, 2021. 
Ethical and social risks of harm from language models. arXiv preprint arXiv:2112.04359 (2021)."},{"key":"e_1_3_2_1_328_1","volume-title":"Sociotechnical Safety Evaluation of Generative AI Systems. (Oct","author":"Weidinger Laura","year":"2023","unstructured":"Laura Weidinger, Maribeth Rauh, Nahema Marchal, Arianna Manzini, Lisa\u00a0Anne Hendricks, Juan Mateos-Garcia, Stevie Bergman, Jackie Kay, Conor Griffin, Ben Bariach, Iason Gabriel, Verena Rieser, and William Isaac. 2023. Sociotechnical Safety Evaluation of Generative AI Systems. (Oct. 2023). http:\/\/arxiv.org\/abs\/2310.11986 arXiv:2310.11986 [cs]."},{"key":"e_1_3_2_1_329_1","volume-title":"Hard prompts made easy: Gradient-based discrete optimization for prompt tuning and discovery. arXiv preprint arXiv:2302.03668","author":"Wen Yuxin","year":"2023","unstructured":"Yuxin Wen, Neel Jain, John Kirchenbauer, Micah Goldblum, Jonas Geiping, and Tom Goldstein. 2023. Hard prompts made easy: Gradient-based discrete optimization for prompt tuning and discovery. arXiv preprint arXiv:2302.03668 (2023)."},{"key":"e_1_3_2_1_330_1","doi-asserted-by":"publisher","DOI":"10.1111\/papa.12187"},{"key":"e_1_3_2_1_331_1","doi-asserted-by":"publisher","DOI":"10.1257\/jep.24.2.211"},{"key":"e_1_3_2_1_332_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3372833"},{"key":"e_1_3_2_1_333_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544549.3583747"},{"key":"e_1_3_2_1_334_1","volume-title":"BackdoorBench: A Comprehensive Benchmark of Backdoor Learning. arXiv preprint arXiv:2206.12654","author":"Wu Baoyuan","year":"2022","unstructured":"Baoyuan Wu, Hongrui Chen, Mingda Zhang, Zihao Zhu, Shaokui Wei, Danni Yuan, Chao Shen, and Hongyuan Zha. 2022. BackdoorBench: A Comprehensive Benchmark of Backdoor Learning. 
arXiv preprint arXiv:2206.12654 (2022)."},{"key":"e_1_3_2_1_335_1","volume-title":"DEPN: Detecting and Editing Privacy Neurons in Pretrained Language Models.","author":"Wu Xinwei","year":"2023","unstructured":"Xinwei Wu, Junzhuo Li, Minghui Xu, Weilong Dong, Shuangzhi Wu, Chao Bian, and Deyi Xiong. 2023. DEPN: Detecting and Editing Privacy Neurons in Pretrained Language Models. (2023). arxiv:2310.20138\u00a0[cs.CR]"},{"key":"e_1_3_2_1_336_1","volume-title":"Shadow Alignment: The Ease of Subverting Safely-Aligned Language Models.","author":"Yang Xianjun","year":"2023","unstructured":"Xianjun Yang, Xiao Wang, Qi Zhang, Linda Petzold, William\u00a0Yang Wang, Xun Zhao, and Dahua Lin. 2023. Shadow Alignment: The Ease of Subverting Safely-Aligned Language Models. (2023). arxiv:2310.02949\u00a0[cs.CL]"},{"key":"e_1_3_2_1_337_1","volume-title":"A Survey on Large Language Model (LLM) Security and Privacy: The Good, the Bad, and the Ugly. arXiv preprint arXiv:2312.02003","author":"Yao Yifan","year":"2023","unstructured":"Yifan Yao, Jinhao Duan, Kaidi Xu, Yuanfang Cai, Eric Sun, and Yue Zhang. 2023. A Survey on Large Language Model (LLM) Security and Privacy: The Good, the Bad, and the Ugly. arXiv preprint arXiv:2312.02003 (2023)."},{"key":"e_1_3_2_1_338_1","volume-title":"Proceedings of the 2022 AAAI\/ACM Conference on AI, Ethics, and Society. 823\u2013830","author":"Yew Rui-Jie","year":"2022","unstructured":"Rui-Jie Yew and Dylan Hadfield-Menell. 2022. A Penalty Default Approach to Preemptive Harm Disclosure and Mitigation for AI Systems. In Proceedings of the 2022 AAAI\/ACM Conference on AI, Ethics, and Society. 823\u2013830."},{"key":"e_1_3_2_1_339_1","unstructured":"Zheng-Xin Yong Cristina Menghini and Stephen\u00a0H. Bach. 2023. Low-Resource Languages Jailbreak GPT-4. (2023). arxiv:2310.02446\u00a0[cs.CL]"},{"key":"e_1_3_2_1_340_1","volume-title":"GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts. 
arXiv preprint arXiv:2309.10253","author":"Yu Jiahao","year":"2023","unstructured":"Jiahao Yu, Xingwei Lin, and Xinyu Xing. 2023. GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts. arXiv preprint arXiv:2309.10253 (2023)."},{"key":"e_1_3_2_1_341_1","volume-title":"Post-hoc concept bottleneck models. arXiv preprint arXiv:2205.15480","author":"Yuksekgonul Mert","year":"2022","unstructured":"Mert Yuksekgonul, Maggie Wang, and James Zou. 2022. Post-hoc concept bottleneck models. arXiv preprint arXiv:2205.15480 (2022)."},{"key":"e_1_3_2_1_342_1","unstructured":"Qiusi Zhan Richard Fang Rohan Bindu Akul Gupta Tatsunori Hashimoto and Daniel Kang. 2023. Removing RLHF Protections in GPT-4 via Fine-Tuning. (2023). arxiv:2311.05553\u00a0[cs.CL]"},{"key":"e_1_3_2_1_343_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.accinf.2022.100572"},{"key":"e_1_3_2_1_344_1","volume-title":"Adversarial Machine Learning in Latent Representations of Neural Networks. arXiv preprint arXiv:2309.17401","author":"Zhang Milin","year":"2023","unstructured":"Milin Zhang, Mohammad Abdi, and Francesco Restuccia. 2023. Adversarial Machine Learning in Latent Representations of Neural Networks. arXiv preprint arXiv:2309.17401 (2023)."},{"key":"e_1_3_2_1_345_1","volume-title":"Adversarial Attacks on Deep Learning Models in Natural Language Processing: A Survey. arXiv: Computation and Language","author":"Zhang W.","year":"2019","unstructured":"W. Zhang, Quan.Z Sheng, Ahoud Abdulrahmn\u00a0F. Alhazmi, and Chenliang Li. 2019. Adversarial Attacks on Deep Learning Models in Natural Language Processing: A Survey. arXiv: Computation and Language (2019). 
https:\/\/api.semanticscholar.org\/CorpusID:260428188"},{"key":"e_1_3_2_1_346_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3374217","article-title":"Adversarial attacks on deep-learning models in natural language processing: A survey","volume":"11","author":"Zhang Wei\u00a0Emma","year":"2020","unstructured":"Wei\u00a0Emma Zhang, Quan\u00a0Z Sheng, Ahoud Alhazmi, and Chenliang Li. 2020. Adversarial attacks on deep-learning models in natural language processing: A survey. ACM Transactions on Intelligent Systems and Technology (TIST) 11, 3 (2020), 1\u201341.","journal-title":"ACM Transactions on Intelligent Systems and Technology (TIST)"},{"key":"e_1_3_2_1_347_1","volume-title":"Explainability for large language models: A survey. ACM Transactions on Intelligent Systems and Technology","author":"Zhao Haiyan","year":"2023","unstructured":"Haiyan Zhao, Hanjie Chen, Fan Yang, Ninghao Liu, Huiqi Deng, Hengyi Cai, Shuaiqiang Wang, Dawei Yin, and Mengnan Du. 2023. Explainability for large language models: A survey. ACM Transactions on Intelligent Systems and Technology (2023)."},{"key":"e_1_3_2_1_348_1","unstructured":"Ziqian Zhong Ziming Liu Max Tegmark and Jacob Andreas. 2023. The Clock and the Pizza: Two Stories in Mechanistic Explanation of Neural Networks. (2023). arxiv:2306.17844\u00a0[cs.LG]"},{"key":"e_1_3_2_1_349_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2307.13854"},{"key":"e_1_3_2_1_350_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 452\u2013467","author":"Zhou Wen","year":"2018","unstructured":"Wen Zhou, Xin Hou, Yongjun Chen, Mengyun Tang, Xiangqi Huang, Xiang Gan, and Yong Yang. 2018. Transferable adversarial perturbations. In Proceedings of the European Conference on Computer Vision (ECCV). 452\u2013467."},{"key":"e_1_3_2_1_351_1","volume-title":"Latent adversarial defence with boundary-guided generation. 
arXiv preprint arXiv:1907.07001","author":"Zhou Xiaowei","year":"2019","unstructured":"Xiaowei Zhou, Ivor\u00a0W Tsang, and Jie Yin. 2019. Latent adversarial defence with boundary-guided generation. arXiv preprint arXiv:1907.07001 (2019)."},{"key":"e_1_3_2_1_352_1","volume-title":"Freelb: Enhanced adversarial training for natural language understanding. arXiv preprint arXiv:1909.11764","author":"Zhu Chen","year":"2019","unstructured":"Chen Zhu, Yu Cheng, Zhe Gan, Siqi Sun, Tom Goldstein, and Jingjing Liu. 2019. Freelb: Enhanced adversarial training for natural language understanding. arXiv preprint arXiv:1909.11764 (2019)."},{"key":"e_1_3_2_1_353_1","unstructured":"Daniel\u00a0M. Ziegler Seraphina Nix Lawrence Chan Tim Bauman Peter Schmidt-Nielsen Tao Lin Adam Scherlis Noa Nabeshima Ben Weinstein-Raun Daniel de Haas Buck Shlegeris and Nate Thomas. 2022. Adversarial Training for High-Stakes Reliability. (2022). arxiv:2205.01663\u00a0[cs.LG]"},{"key":"e_1_3_2_1_354_1","volume-title":"Representation engineering: A top-down approach to ai transparency. arXiv preprint arXiv:2310.01405","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Long Phan, Sarah Chen, James Campbell, Phillip Guo, Richard Ren, Alexander Pan, Xuwang Yin, Mantas Mazeika, Ann-Kathrin Dombrowski, 2023. Representation engineering: A top-down approach to ai transparency. 
arXiv preprint arXiv:2310.01405 (2023)."},{"key":"e_1_3_2_1_355_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2307.15043"}],"event":{"name":"FAccT '24: The 2024 ACM Conference on Fairness, Accountability, and Transparency","location":"Rio de Janeiro Brazil","acronym":"FAccT '24"},"container-title":["The 2024 ACM Conference on Fairness, Accountability, and Transparency"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3630106.3659037","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3630106.3659037","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:57:07Z","timestamp":1750291027000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3630106.3659037"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":350,"alternative-id":["10.1145\/3630106.3659037","10.1145\/3630106"],"URL":"https:\/\/doi.org\/10.1145\/3630106.3659037","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-06-05","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}