{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T18:31:49Z","timestamp":1776277909536,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":92,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"European Commission","doi-asserted-by":"publisher","award":["101057917"],"award-info":[{"award-number":["101057917"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1145\/3658644.3670388","type":"proceedings-article","created":{"date-parts":[[2024,12,9]],"date-time":"2024-12-09T12:19:20Z","timestamp":1733746760000},"page":"1671-1685","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":145,"title":["\"Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9954-587X","authenticated-orcid":false,"given":"Xinyue","family":"Shen","sequence":"first","affiliation":[{"name":"CISPA Helmholtz Center for Information Security, Saarbrucken, Saarland, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4504-7108","authenticated-orcid":false,"given":"Zeyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"CISPA Helmholtz Center for Information Security, Saarbrucken, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7130-9211","authenticated-orcid":false,"given":"Michael","family":"Backes","sequence":"additional","affiliation":[{"name":"CISPA Helmholtz Center for Information Security, Saarbrucken, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5215-2976","authenticated-orcid":false,"given":"Yun","family":"Shen","sequence":"additional","affiliation":[{"name":"NetApp, Bristol, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3612-7348","authenticated-orcid":false,"given":"Yang","family":"Zhang","sequence":"additional","affiliation":[{"name":"CISPA Helmholtz Center for Information Security, Saarbrucken, Germany"}]}],"member":"320","published-online":{"date-parts":[[2024,12,9]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"A pro-innovation approach to AI regulation. https:\/\/assets.publishing.service.gov. uk\/government\/uploads\/system\/uploads\/attachment_data\/file\/1146542\/a_proinnovation_ approach_to_AI_regulation.pdf."},{"key":"e_1_3_2_1_2_1","unstructured":"AIPRM. https:\/\/www.aiprm.com\/."},{"key":"e_1_3_2_1_3_1","unstructured":"Awesome ChatGPT Prompts. https:\/\/huggingface.co\/datasets\/fka\/awesomechatgpt-prompts."},{"key":"e_1_3_2_1_4_1","unstructured":"ChatGPT. https:\/\/chat.openai.com\/chat."},{"key":"e_1_3_2_1_5_1","unstructured":"Disboard. https:\/\/disboard.org\/."},{"key":"e_1_3_2_1_6_1","unstructured":"Discord. https:\/\/en.wikipedia.org\/wiki\/Discord."},{"key":"e_1_3_2_1_7_1","unstructured":"FlowGPT. https:\/\/flowgpt.com\/."},{"key":"e_1_3_2_1_8_1","unstructured":"General Data Protection Regulation (GDPR). https:\/\/gdpr-info.eu\/."},{"key":"e_1_3_2_1_9_1","unstructured":"JailbreakChat. https:\/\/www.jailbreakchat.com."},{"key":"e_1_3_2_1_10_1","unstructured":"Measures for the Management of Generative Artificial Intelligence Services. http:\/\/www.cac.gov.cn\/2023-07\/13\/c_1690898327029107.htm."},{"key":"e_1_3_2_1_11_1","unstructured":"The Artificial Intelligence Act. https:\/\/artificialintelligenceact.eu\/."},{"key":"e_1_3_2_1_12_1","volume-title":"Maria Korobeynikova, and Fabrizio Gilardi. Open-Source Large Language Models Outperform Crowd Workers and Approach ChatGPT in Text-Annotation Tasks. CoRR abs\/2307.02179","author":"Alizadeh Meysam","year":"2023","unstructured":"Meysam Alizadeh, Ma\u00ebl Kubli, Zeynab Samei, Shirin Dehghani, Juan Diego Bermeo, Maria Korobeynikova, and Fabrizio Gilardi. Open-Source Large Language Models Outperform Crowd Workers and Approach ChatGPT in Text-Annotation Tasks. CoRR abs\/2307.02179, 2023."},{"key":"e_1_3_2_1_13_1","first-page":"769","volume-title":"Bagdasaryan and Vitaly Shmatikov. Spinning Language Models: Risks of Propaganda-As-A-Service and Countermeasures. In IEEE Symposium on Security and Privacy (S&P)","author":"Eugene","year":"2022","unstructured":"Eugene Bagdasaryan and Vitaly Shmatikov. Spinning Language Models: Risks of Propaganda-As-A-Service and Countermeasures. In IEEE Symposium on Security and Privacy (S&P), pages 769--786. IEEE, 2022."},{"key":"e_1_3_2_1_14_1","volume-title":"Multimodal Evaluation of ChatGPT on Reasoning, Hallucination, and Interactivity. CoRR abs\/2302.04023","author":"Bang Yejin","year":"2023","unstructured":"Yejin Bang, Samuel Cahyawijaya, Nayeon Lee, Wenliang Dai, Dan Su, Bryan Wilie, Holy Lovenia, Ziwei Ji, Tiezheng Yu, Willy Chung, Quyet V. Do, Yan Xu, and Pascale Fung. A Multitask, Multilingual, Multimodal Evaluation of ChatGPT on Reasoning, Hallucination, and Interactivity. CoRR abs\/2302.04023, 2023."},{"key":"e_1_3_2_1_15_1","first-page":"830","volume-title":"Jeremy Blackburn. The Pushshift Reddit Dataset. In International Conference on Web and Social Media (ICWSM)","author":"Baumgartner Jason","year":"2020","unstructured":"Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. The Pushshift Reddit Dataset. In International Conference on Web and Social Media (ICWSM), pages 830--839. AAAI, 2020."},{"key":"e_1_3_2_1_16_1","first-page":"2566","volume-title":"IEEE Symposium on Security and Privacy (S&P)","author":"Bitaab Marzieh","year":"2023","unstructured":"Marzieh Bitaab, Haehyun Cho, Adam Oest, Zhuoer Lyu,WeiWang, Jorij Abraham, Ruoyu Wang, Tiffany Bao, Yan Shoshitaishvili, and Adam Doup\u00e9. Beyond Phish: Toward Detecting Fraudulent e-CommerceWebsites at Scale. In IEEE Symposium on Security and Privacy (S&P), pages 2566--2583. IEEE, 2023."},{"key":"e_1_3_2_1_17_1","first-page":"1987","volume-title":"Nicolas Papernot. Bad Characters: Imperceptible NLP Attacks. In IEEE Symposium on Security and Privacy (S&P)","author":"Boucher Nicholas","year":"2022","unstructured":"Nicholas Boucher, Ilia Shumailov, Ross Anderson, and Nicolas Papernot. Bad Characters: Imperceptible NLP Attacks. In IEEE Symposium on Security and Privacy (S&P), pages 1987--2004. IEEE, 2022."},{"key":"e_1_3_2_1_18_1","first-page":"2633","volume-title":"USENIX Security Symposium (USENIX Security)","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini, Florian Tram\u00e8r, Eric Wallace, Matthew Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam Roberts, Tom B. Brown, Dawn Song, \u00dalfar Erlingsson, Alina Oprea, and Colin Raffel. Extracting Training Data from Large Language Models. In USENIX Security Symposium (USENIX Security), pages 2633--2650. USENIX, 2021."},{"key":"e_1_3_2_1_19_1","volume-title":"April","year":"2023","unstructured":"Checkpoint. OPWNAI : Cybercriminals Starting To Use ChatGPT. https:\/\/research.checkpoint.com\/2023\/opwnai-cybercriminals-starting-touse-chatgpt\/#single-post, April 2023."},{"key":"e_1_3_2_1_20_1","first-page":"554","volume-title":"Yang Zhang. BadNL: Backdoor Attacks Against NLP Models with Semantic-preserving Improvements. In Annual Computer Security Applications Conference (ACSAC)","author":"Chen Xiaoyi","year":"2021","unstructured":"Xiaoyi Chen, Ahmed Salem, Michael Backes, Shiqing Ma, Qingni Shen, Zhonghai Wu, and Yang Zhang. BadNL: Backdoor Attacks Against NLP Models with Semantic-preserving Improvements. In Annual Computer Security Applications Conference (ACSAC), pages 554--569. ACSAC, 2021."},{"key":"e_1_3_2_1_21_1","first-page":"352","volume-title":"Kai-Wei Chang. PLUE: Language Understanding Evaluation Benchmark for Privacy Policies in English. In Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Chi Jianfeng","year":"2023","unstructured":"Jianfeng Chi, Wasi Uddin Ahmad, Yuan Tian, and Kai-Wei Chang. PLUE: Language Understanding Evaluation Benchmark for Privacy Policies in English. In Annual Meeting of the Association for Computational Linguistics (ACL), pages 352--365. ACL, 2023."},{"key":"e_1_3_2_1_22_1","volume-title":"March","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality, March 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Free dolly: Introducing the worlds first truly open instruction-tuned llm","author":"Conover Mike","year":"2023","unstructured":"Mike Conover, Matt Hayes, Ankit Mathur, Jianwei Xie, Jun Wan, Sam Shah, Ali Ghodsi, PatrickWendell, Matei Zaharia, and Reynold Xin. Free dolly: Introducing the worlds first truly open instruction-tuned llm, 2023."},{"key":"e_1_3_2_1_24_1","volume-title":"Jailbreaker: Automated Jailbreak Across Multiple Large Language Model Chatbots. CoRR abs\/2307.08715","author":"Deng Gelei","year":"2023","unstructured":"Gelei Deng, Yi Liu, Yuekang Li, Kailong Wang, Ying Zhang, Zefeng Li, Haoyu Wang, Tianwei Zhang, and Yang Liu. Jailbreaker: Automated Jailbreak Across Multiple Large Language Model Chatbots. CoRR abs\/2307.08715, 2023."},{"key":"e_1_3_2_1_25_1","unstructured":"Discord. BreakGPT. https:\/\/disboard.org\/server\/1090300946568986810."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11135-014-0003-1"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigDataSecurity-HPSC-IDS58521.2023.00046"},{"key":"e_1_3_2_1_28_1","unstructured":"FlowGPT. Paraphrase a text. https:\/\/flowgpt.com\/p\/paraphrase-a-text."},{"key":"e_1_3_2_1_29_1","unstructured":"Google. AI ACROSS GOOGLE: PaLM 2. https:\/\/ai.google\/discover\/palm2\/."},{"key":"e_1_3_2_1_30_1","volume-title":"More than youve asked for: A Comprehensive Analysis of Novel Prompt Injection Threats to Application-Integrated Large Language Models. CoRR abs\/2302.12173","author":"Greshake Kai","year":"2023","unstructured":"Kai Greshake, Sahar Abdelnabi, Shailesh Mishra, Christoph Endres, Thorsten Holz, and Mario Fritz. More than youve asked for: A Comprehensive Analysis of Novel Prompt Injection Threats to Application-Integrated Large Language Models. CoRR abs\/2302.12173, 2023."},{"key":"e_1_3_2_1_31_1","volume-title":"Large Language Models Can Be Used To Effectively Scale Spear Phishing Campaigns. CoRR abs\/2305.06972","author":"Hazell Julian","year":"2023","unstructured":"Julian Hazell. Large Language Models Can Be Used To Effectively Scale Spear Phishing Campaigns. CoRR abs\/2305.06972, 2023."},{"key":"e_1_3_2_1_32_1","volume-title":"MGTBench: Benchmarking Machine-Generated Text Detection. CoRR abs\/2303.14822","author":"He Xinlei","year":"2023","unstructured":"Xinlei He, Xinyue Shen, Zeyuan Chen, Michael Backes, and Yang Zhang. MGTBench: Benchmarking Machine-Generated Text Detection. CoRR abs\/2303.14822, 2023."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1170"},{"key":"e_1_3_2_1_34_1","volume-title":"Is ChatGPT A Good Translator A Preliminary Study. CoRR abs\/2301.08745","author":"Jiao Wenxiang","year":"2023","unstructured":"Wenxiang Jiao, Wenxuan Wang, Jen-tse Huang, Xing Wang, and Zhaopeng Tu. Is ChatGPT A Good Translator A Preliminary Study. CoRR abs\/2301.08745, 2023."},{"key":"e_1_3_2_1_35_1","unstructured":"Jigsaw. Perspective API. https:\/\/www.perspectiveapi.com."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6311"},{"key":"e_1_3_2_1_37_1","volume-title":"Exploiting Programmatic Behavior of LLMs: Dual-Use Through Standard Security Attacks. CoRR abs\/2302.05733","author":"Kang Daniel","year":"2023","unstructured":"Daniel Kang, Xuechen Li, Ion Stoica, Carlos Guestrin, Matei Zaharia, and Tatsunori Hashimoto. Exploiting Programmatic Behavior of LLMs: Dual-Use Through Standard Security Attacks. CoRR abs\/2302.05733, 2023."},{"key":"e_1_3_2_1_38_1","volume-title":"On the Reliability of Watermarks for Large Language Models. CoRR abs\/2306.04634","author":"Kirchenbauer John","year":"2023","unstructured":"John Kirchenbauer, Jonas Geiping, YuxinWen, Manli Shu, Khalid Saifullah, Kezhi Kong, Kasun Fernando, Aniruddha Saha, Micah Goldblum, and Tom Goldstein. On the Reliability of Watermarks for Large Language Models. CoRR abs\/2306.04634, 2023."},{"key":"e_1_3_2_1_39_1","volume-title":"Multi-step Jailbreaking Privacy Attacks on ChatGPT. CoRR abs\/2304.05197","author":"Li Haoran","year":"2023","unstructured":"Haoran Li, Dadi Guo, Wei Fan, Mingshi Xu, and Yangqiu Song. Multi-step Jailbreaking Privacy Attacks on ChatGPT. CoRR abs\/2304.05197, 2023."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460120.3484587"},{"key":"e_1_3_2_1_41_1","volume-title":"Malla: Demystifying Realworld Large Language Model Integrated Malicious Services. CoRR abs\/2401.03315","author":"Lin Zilong","year":"2024","unstructured":"Zilong Lin, Jian Cui, Xiaojing Liao, and XiaoFengWang. Malla: Demystifying Realworld Large Language Model Integrated Malicious Services. CoRR abs\/2401.03315, 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"ACM Computing Surveys","author":"Liu Pengfei","year":"2023","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing. ACM Computing Surveys, 2023."},{"key":"e_1_3_2_1_43_1","volume-title":"Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study. CoRR abs\/2305.13860","author":"Liu Yi","year":"2023","unstructured":"Yi Liu, Gelei Deng, Zhengzi Xu, Yuekang Li, Yaowen Zheng, Ying Zhang, Lida Zhao, Tianwei Zhang, and Yang Liu. Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study. CoRR abs\/2305.13860, 2023."},{"key":"e_1_3_2_1_44_1","first-page":"346","volume-title":"Santiago Zanella B\u00e9guelin. Analyzing Leakage of Personally Identifiable Information in Language Models. In IEEE Symposium on Security and Privacy (S&P)","author":"Lukas Nils","year":"2023","unstructured":"Nils Lukas, Ahmed Salem, Robert Sim, Shruti Tople, Lukas Wutschitz, and Santiago Zanella B\u00e9guelin. Analyzing Leakage of Personally Identifiable Information in Language Models. In IEEE Symposium on Security and Privacy (S&P), pages 346--363. IEEE, 2023."},{"key":"e_1_3_2_1_45_1","volume-title":"A Holistic Approach to Undesired Content Detection in the Real World. CoRR abs\/208.03274","author":"Markov Todor","year":"2022","unstructured":"Todor Markov, Chong Zhang, Sandhini Agarwal, Tyna Eloundou, Teddy Lee, Steven Adler, Angela Jiang, and Lilian Weng. A Holistic Approach to Undesired Content Detection in the Real World. CoRR abs\/208.03274, 2022."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.00861"},{"key":"e_1_3_2_1_47_1","first-page":"88","volume-title":"International Conference on Intelligent Systems Design and Applications (ISDA)","author":"Meo Pasquale De","year":"2011","unstructured":"Pasquale De Meo, Emilio Ferrara, Giacomo Fiumara, and Alessandro Provetti. Generalized Louvain method for community detection in large networks. In International Conference on Intelligent Systems Design and Applications (ISDA), pages 88--93. IEEE, 2011."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.759"},{"key":"e_1_3_2_1_49_1","first-page":"1669","volume-title":"Gang Wang. DeepPhish: Understanding User Trust Towards Artificially Generated Profiles in Online Social Networks. In USENIX Security Symposium (USENIX Security)","author":"Mink Jaron","year":"2022","unstructured":"Jaron Mink, Licheng Luo, Nat\u00e3 M. Barbosa, Olivia Figueira, YangWang, and Gang Wang. DeepPhish: Understanding User Trust Towards Artificially Generated Profiles in Online Social Networks. In USENIX Security Symposium (USENIX Security), pages 1669--1686. USENIX, 2022."},{"key":"e_1_3_2_1_50_1","first-page":"8332","volume-title":"Reza Shokri. Quantifying Privacy Risks of Masked Language Models Using Membership Inference Attacks. In Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"Mireshghallah Fatemehsadat","year":"2022","unstructured":"Fatemehsadat Mireshghallah, Kartik Goyal, Archit Uniyal, Taylor Berg-Kirkpatrick, and Reza Shokri. Quantifying Privacy Risks of Masked Language Models Using Membership Inference Attacks. In Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 8332--8347. ACL, 2022."},{"key":"e_1_3_2_1_51_1","first-page":"452","volume-title":"International Conference onWeb and Social Media (ICWSM)","author":"Mittos Alexandros","year":"2020","unstructured":"Alexandros Mittos, Savvas Zannettou, Jeremy Blackburn, and Emiliano De Cristofaro. AndWe Will Fight For Our Race! A Measurement Study of Genetic Testing Conversations on Reddit and 4chan. In International Conference onWeb and Social Media (ICWSM), pages 452--463. AAAI, 2020."},{"key":"e_1_3_2_1_52_1","unstructured":"NIST. AI Risk Management Framework. https:\/\/www.nist.gov\/itl\/ai-riskmanagement-framework."},{"key":"e_1_3_2_1_53_1","unstructured":"NVIDIA. NeMo-Guardrails. https:\/\/github.com\/NVIDIA\/NeMo-Guardrails."},{"key":"e_1_3_2_1_54_1","unstructured":"OpenAI. ChatGPT can nowsee hear and speak. https:\/\/openai.com\/blog\/chatgptcan-now-see-hear-and-speak."},{"key":"e_1_3_2_1_55_1","unstructured":"OpenAI. Function calling and other API updates. https:\/\/openai.com\/blog\/ function-calling-and-other-api-updates."},{"key":"e_1_3_2_1_56_1","unstructured":"OpenAI. Moderation Endpoint. https:\/\/platform.openai.com\/docs\/guides\/ moderation\/overview."},{"key":"e_1_3_2_1_57_1","unstructured":"OpenAI. New models and developer products announced at DevDay. https:\/\/ openai.com\/blog\/new-models-and-developer-products-announced-at-devday."},{"key":"e_1_3_2_1_58_1","unstructured":"OpenAI. Pricing. https:\/\/openai.com\/pricing."},{"key":"e_1_3_2_1_59_1","unstructured":"OpenAI. Usage policies. https:\/\/openai.com\/policies\/usage-policies."},{"key":"e_1_3_2_1_60_1","unstructured":"OpenAI. GPT-4 Technical Report. CoRR abs\/2303.08774 2023."},{"key":"e_1_3_2_1_61_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS). NeurIPS","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, PeterWelinder, Paul F. Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. In Annual Conference on Neural Information Processing Systems (NeurIPS). NeurIPS, 2022."},{"key":"e_1_3_2_1_62_1","volume-title":"or not to ChatGPT: That is the question! CoRR abs\/2304.01487","author":"Pegoraro Alessandro","year":"2023","unstructured":"Alessandro Pegoraro, Kavita Kumari, Hossein Fereidooni, and Ahmad-Reza Sadeghi. To ChatGPT, or not to ChatGPT: That is the question! CoRR abs\/2304.01487, 2023."},{"key":"e_1_3_2_1_63_1","volume-title":"International Conference on Machine Learning (ICML). JMLR","author":"Pei Kexin","year":"2023","unstructured":"Kexin Pei, David Bieber, Kensen Shi, Charles Sutton, and Pengcheng Yin. Can Large Language Models Reason about Program Invariants? In International Conference on Machine Learning (ICML). JMLR, 2023."},{"key":"e_1_3_2_1_64_1","volume-title":"Ignore Previous Prompt: Attack Techniques For Language Models. CoRR abs\/2211.09527","author":"Perez F\u00e1bio","year":"2022","unstructured":"F\u00e1bio Perez and Ian Ribeiro. Ignore Previous Prompt: Attack Techniques For Language Models. CoRR abs\/2211.09527, 2022."},{"key":"e_1_3_2_1_65_1","volume-title":"Yang Zhang. Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes From Text-To-Image Models. In ACM SIGSAC Conference on Computer and Communications Security (CCS). ACM","author":"Qu Yiting","year":"2023","unstructured":"Yiting Qu, Xinyue Shen, Xinlei He, Michael Backes, Savvas Zannettou, and Yang Zhang. Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes From Text-To-Image Models. In ACM SIGSAC Conference on Computer and Communications Security (CCS). ACM, 2023."},{"key":"e_1_3_2_1_66_1","unstructured":"Reddit. r\/ChatGPTJailbreak. https:\/\/www.reddit.com\/r\/ChatGPTJailbreak\/."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_68_1","first-page":"4902","volume-title":"Sameer Singh. Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Ribeiro Marco T\u00falio","year":"2020","unstructured":"Marco T\u00falio Ribeiro, Tongshuang Wu, Carlos Guestrin, and Sameer Singh. Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In Annual Meeting of the Association for Computational Linguistics (ACL), pages 4902--4912. ACL, 2020."},{"key":"e_1_3_2_1_69_1","volume-title":"Ethical research standards in a world of big data. F1000Research","author":"Rivers Caitlin M.","year":"2014","unstructured":"Caitlin M. Rivers and Bryan L. Lewis. Ethical research standards in a world of big data. F1000Research, 2014."},{"key":"e_1_3_2_1_70_1","first-page":"4454","volume-title":"Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Shaikh Omar","year":"2023","unstructured":"Omar Shaikh, Hongxin Zhang, William Held, Michael Bernstein, and Diyi Yang. On Second Thought, Lets Not Think Step by Step! Bias and Toxicity in Zero-Shot Reasoning. In Annual Meeting of the Association for Computational Linguistics (ACL), pages 4454--4470. ACL, 2023."},{"key":"e_1_3_2_1_71_1","volume-title":"Do Anything Now: Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models. CoRR abs\/2308.03825","author":"Shen Xinyue","year":"2023","unstructured":"Xinyue Shen, Zeyuan Chen, Michael Backes, Yun Shen, and Yang Zhang. Do Anything Now: Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models. CoRR abs\/2308.03825, 2023."},{"key":"e_1_3_2_1_72_1","volume-title":"ChatGPT We Trust? Measuring and Characterizing the Reliability of ChatGPT. CoRR abs\/2304.08979","author":"Shen Xinyue","year":"2023","unstructured":"Xinyue Shen, Zeyuan Chen, Michael Backes, and Yang Zhang. In ChatGPT We Trust? Measuring and Characterizing the Reliability of ChatGPT. CoRR abs\/2304.08979, 2023."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3548606.3560599"},{"key":"e_1_3_2_1_74_1","unstructured":"The White House. Blueprint for an AI Bill of Rights. https:\/\/www.whitehouse. gov\/ostp\/ai-bill-of-rights\/."},{"key":"e_1_3_2_1_75_1","first-page":"479","volume-title":"Conference of the European Association for Machine Translation (EAMT)","author":"Tiedemann J\u00f6rg","year":"2020","unstructured":"J\u00f6rg Tiedemann and Santhosh Thottingal. OPUS-MT - Building open translation services for the World. In Conference of the European Association for Machine Translation (EAMT), pages 479--480. European Association for Machine Translation, 2020."},{"key":"e_1_3_2_1_76_1","unstructured":"Together. OpenChatKit. https:\/\/github.com\/togethercomputer\/OpenChatKit."},{"key":"e_1_3_2_1_77_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR abs\/2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. LLaMA: Open and Efficient Foundation Language Models. CoRR abs\/2302.13971, 2023."},{"key":"e_1_3_2_1_78_1","volume-title":"ACM SIGSAC Conference on Computer and Communications Security (CCS). ACM","author":"Tram\u00e8r Florian","year":"2022","unstructured":"Florian Tram\u00e8r, Reza Shokri, Ayrton San Joaquin, Hoang Le, Matthew Jagielski, Sanghyun Hong, and Nicholas Carlini. Truth Serum: Poisoning Machine Learning Models to Reveal Their Secrets. In ACM SIGSAC Conference on Computer and Communications Security (CCS). ACM, 2022."},{"key":"e_1_3_2_1_79_1","first-page":"5998","volume-title":"Annual Conference on Neural Information Processing Systems (NIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is All you Need. In Annual Conference on Neural Information Processing Systems (NIPS), pages 5998--6008. NIPS, 2017."},{"key":"e_1_3_2_1_80_1","volume-title":"DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. CoRR abs\/2306.11698","author":"Wang Boxin","year":"2023","unstructured":"Boxin Wang, Weixin Chen, Hengzhi Pei, Chulin Xie, Mintong Kang, Chenhui Zhang, Chejian Xu, Zidi Xiong, Ritik Dutta, Rylan Schaeffer, Sang T. Truong, Simran Arora, Mantas Mazeika, Dan Hendrycks, Zinan Lin, Yu Cheng, Sanmi Koyejo, Dawn Song, and Bo Li. DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. CoRR abs\/2306.11698, 2023."},{"key":"e_1_3_2_1_81_1","first-page":"516","volume-title":"Duen Horng Chau. WizMap: Scalable Interactive Visualization for Exploring Large Machine Learning Embeddings. In Annual Meeting of the Association for Computational Linguistics (ACL)","author":"Wang Zijie","year":"2023","unstructured":"Zijie J.Wang, Fred Hohman, and Duen Horng Chau. WizMap: Scalable Interactive Visualization for Exploring Large Machine Learning Embeddings. In Annual Meeting of the Association for Computational Linguistics (ACL), pages 516--523. ACL, 2023."},{"key":"e_1_3_2_1_82_1","volume-title":"Jailbroken: How Does LLM Safety Training Fail? CoRR abs\/2307.02483","author":"Wei Alexander","year":"2023","unstructured":"Alexander Wei, Nika Haghtalab, and Jacob Steinhardt. Jailbroken: How Does LLM Safety Training Fail? CoRR abs\/2307.02483, 2023."},{"key":"e_1_3_2_1_83_1","unstructured":"Wikipedia. Spearmans rank correlation coefficient. https:\/\/en.wikipedia.org\/ wiki\/Spearman?s_rank_correlation_coefficient."},{"key":"e_1_3_2_1_84_1","volume-title":"Bo Li. Detecting AI Trojans Using Meta Neural Analysis. In IEEE Symposium on Security and Privacy (S&P). IEEE","author":"Xu Xiaojun","year":"2021","unstructured":"Xiaojun Xu, Qi Wang, Huichen Li, Nikita Borisov, Carl A. Gunter, and Bo Li. Detecting AI Trojans Using Meta Neural Analysis. In IEEE Symposium on Security and Privacy (S&P). IEEE, 2021."},{"key":"e_1_3_2_1_85_1","volume-title":"Chaowei Xiao. CodeIPPrompt: Intellectual Property Infringement Assessment of Code Language Models. In International Conference on Machine Learning (ICML). JMLR","author":"Yu Zhiyuan","year":"2023","unstructured":"Zhiyuan Yu, Yuhao Wu, Ning Zhang, Chenguang Wang, Yevgeniy Vorobeychik, and Chaowei Xiao. CodeIPPrompt: Intellectual Property Infringement Assessment of Code Language Models. In International Conference on Machine Learning (ICML). JMLR, 2023."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v14i1.7343"},{"key":"e_1_3_2_1_87_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zeng Aohan","year":"2023","unstructured":"Aohan Zeng, Xiao Liu, Zhengxiao Du, Zihan Wang, Hanyu Lai, Ming Ding, Zhuoyi Yang, Yifan Xu, Wendi Zheng, Xiao Xia, Weng Lam Tam, Zixuan Ma, Yufei Xue, Jidong Zhai,Wenguang Chen, Zhiyuan Liu, Peng Zhang, Yuxiao Dong, and Jie Tang. GLM-130B: An Open Bilingual Pre-trained Model. In International Conference on Learning Representations (ICLR), 2023."},{"key":"e_1_3_2_1_88_1","volume-title":"How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to Challenge AI Safety by Humanizing LLMs. CoRR abs\/2401.06373","author":"Zeng Yi","year":"2024","unstructured":"Yi Zeng, Hongpeng Lin, Jingwen Zhang, Diyi Yang, Ruoxi Jia, and Weiyan Shi. How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to Challenge AI Safety by Humanizing LLMs. CoRR abs\/2401.06373, 2024."},{"key":"e_1_3_2_1_89_1","first-page":"1","volume-title":"Munmun De Choudhury. Synthetic Lies: Understanding AI-Generated Misinformation and Evaluating Algorithmic and Human Solutions. In Annual ACM Conference on Human Factors in Computing Systems (CHI)","author":"Zhou Jiawei","year":"2023","unstructured":"Jiawei Zhou, Yixuan Zhang, Qianni Luo, Andrea G. Parker, and Munmun De Choudhury. Synthetic Lies: Understanding AI-Generated Misinformation and Evaluating Algorithmic and Human Solutions. In Annual ACM Conference on Human Factors in Computing Systems (CHI), pages 436:1--436:20. ACM, 2023."},{"key":"e_1_3_2_1_90_1","volume-title":"Pan Hui, and Gareth Tyson. Can ChatGPT Reproduce Human-Generated Labels? A Study of Social Computing Tasks. CoRR abs\/2304.10145","author":"Zhu Yiming","year":"2023","unstructured":"Yiming Zhu, Peixian Zhang, Ehsan ul Haq, Pan Hui, and Gareth Tyson. Can ChatGPT Reproduce Human-Generated Labels? A Study of Social Computing Tasks. CoRR abs\/2304.10145, 2023."},{"key":"e_1_3_2_1_91_1","volume-title":"Universal and Transferable Adversarial Attacks on Aligned Language Models. CoRR abs\/2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, J. Zico Kolter, and Matt Fredrikson. Universal and Transferable Adversarial Attacks on Aligned Language Models. CoRR abs\/2307.15043, 2023."},{"key":"e_1_3_2_1_92_1","unstructured":"Zvi. Jailbreaking ChatGPT on Release Day. https:\/\/www.lesswrong.com\/posts\/ RYcoJdvmoBbi5Nax7\/jailbreaking-chatgpt-on-release-day."}],"event":{"name":"CCS '24: ACM SIGSAC Conference on Computer and Communications Security","location":"Salt Lake City UT USA","acronym":"CCS '24","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 2024 on ACM SIGSAC Conference on Computer and Communications Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3658644.3670388","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3658644.3670388","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T06:11:53Z","timestamp":1755843113000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3658644.3670388"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":92,"alternative-id":["10.1145\/3658644.3670388","10.1145\/3658644"],"URL":"https:\/\/doi.org\/10.1145\/3658644.3670388","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2024-12-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}