{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T17:44:24Z","timestamp":1778694264608,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,11,19]],"date-time":"2023-11-19T00:00:00Z","timestamp":1700352000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,11,19]]},"DOI":"10.1145\/3689217.3690619","type":"proceedings-article","created":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T18:22:01Z","timestamp":1732040521000},"page":"35-46","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Adversarial Attacks to Multi-Modal Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3525-7442","authenticated-orcid":false,"given":"Zhihao","family":"Dou","sequence":"first","affiliation":[{"name":"Duke University, Durham, NC, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6799-8841","authenticated-orcid":false,"given":"Xin","family":"Hu","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Kashiwa, Chiba Prefecture, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3245-2728","authenticated-orcid":false,"given":"Haibo","family":"Yang","sequence":"additional","affiliation":[{"name":"Rochester Institute of Technology, Rochester, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0146-5101","authenticated-orcid":false,"given":"Zhuqing","family":"Liu","sequence":"additional","affiliation":[{"name":"University of North Texas, Denton, TX, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1365-3911","authenticated-orcid":false,"given":"Minghong","family":"Fang","sequence":"additional","affiliation":[{"name":"University of Louisville, Louisville, KY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,11,19]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. BindDiffusion: One diffusion model to bind them all. https:\/\/github.com\/sailsg\/BindDiffusion\/tree\/main"},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. Stable Audio software. https:\/\/www.stableaudio.com\/"},{"key":"e_1_3_2_1_3_1","volume-title":"Andrew Senior, Oriol Vinyals, and Andrew Zisserman.","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018. Deep audio-visual speech recognition. In TPAMI."},{"key":"e_1_3_2_1_4_1","volume-title":"Poisoning and backdooring contrastive learning. arXiv preprint arXiv:2106.09667","author":"Carlini Nicholas","year":"2021","unstructured":"Nicholas Carlini and Andreas Terzis. 2021. Poisoning and backdooring contrastive learning. arXiv preprint arXiv:2106.09667 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Mehdi Cherti Romain Beaumont Ross Wightman Mitchell Wortsman Gabriel Ilharco Cade Gordon Christoph Schuhmann Ludwig Schmidt and Jenia Jitsev. 2023. Reproducible scaling laws for contrastive language-image learning. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_2_1_6_1","unstructured":"Adam Coates Andrew Ng and Honglak Lee. 2011. An analysis of single-layer networks in unsupervised feature learning. In AISTATS."},{"key":"e_1_3_2_1_7_1","volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng,Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR."},{"key":"e_1_3_2_1_8_1","volume-title":"Audio Set: An ontology and human-labeled dataset for audio events. In ICASSP.","author":"Gemmeke Jort F.","year":"2017","unstructured":"Jort F. Gemmeke, Daniel P. W. Ellis, Dylan Freedman, Aren Jansen, and Marvin Ritter. 2017. Audio Set: An ontology and human-labeled dataset for audio events. In ICASSP."},{"key":"e_1_3_2_1_9_1","volume-title":"Armand Joulin, and Ishan Misra.","author":"Girdhar Rohit","year":"2023","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2023. Imagebind: One embedding space to bind them all. In CVPR."},{"key":"e_1_3_2_1_10_1","volume-title":"Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572","author":"Goodfellow Ian J","year":"2014","unstructured":"Ian J Goodfellow, Jonathon Shlens, and Christian Szegedy. 2014. Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572 (2014)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Andrey Guzhov Federico Raue J\u00f6rn Hees and Andreas Dengel. 2021. Esresne t-fbsp: Learning robust time-frequency transformation of audio. In IJCNN.","DOI":"10.1109\/IJCNN52387.2021.9533654"},{"key":"e_1_3_2_1_12_1","volume-title":"Audioclip: Extending clip to image, text and audio. In ICASSP.","author":"Guzhov Andrey","year":"2022","unstructured":"Andrey Guzhov, Federico Raue, J\u00f6rn Hees, and Andreas Dengel. 2022. Audioclip: Extending clip to image, text and audio. In ICASSP."},{"key":"e_1_3_2_1_13_1","volume-title":"Backdooring Multimodal Learning. In IEEE Symposium on Security and Privacy.","author":"Han Xingshuo","year":"2023","unstructured":"Xingshuo Han, Yutong Wu, Qingjie Zhang, Yuan Zhou, Yuan Xu, Han Qiu, Guowen Xu, and Tianwei Zhang. 2023. Backdooring Multimodal Learning. In IEEE Symposium on Security and Privacy."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19982"},{"key":"e_1_3_2_1_15_1","volume-title":"What makes multi-modal learning better than single (provably). NeurIPS","author":"Huang Yu","year":"2021","unstructured":"Yu Huang, Chenzhuang Du, Zihui Xue, Xuanyao Chen, Hang Zhao, and Longbo Huang. 2021. What makes multi-modal learning better than single (provably). NeurIPS (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"Winclip: Zero-\/few-shot anomaly classification and segmentation. In CVPR.","author":"Jeong Jongheon","year":"2023","unstructured":"Jongheon Jeong, Yang Zou, Taewan Kim, Dongqing Zhang, Avinash Ravichandran, and Onkar Dabeer. 2023. Winclip: Zero-\/few-shot anomaly classification and segmentation. In CVPR."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP46214.2022.9833644"},{"key":"e_1_3_2_1_18_1","volume-title":"Audiocaps: Generating captions for audios in the wild. In NAACL.","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In NAACL."},{"key":"e_1_3_2_1_19_1","volume-title":"Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430","author":"Liang Paul Pu","year":"2022","unstructured":"Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency. 2022. Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430 (2022)."},{"key":"e_1_3_2_1_20_1","unstructured":"Yen-Ju Lu Zhong-QiuWang ShinjiWatanabe Alexander Richard Cheng Yu and Yu Tsao. 2022. Conditional diffusion probabilistic model for speech enhancement. In ICASSP."},{"key":"e_1_3_2_1_21_1","unstructured":"Weili Nie Brandon Guo Yujia Huang Chaowei Xiao Arash Vahdat and Anima Anandkumar. 2022. Diffusion models for adversarial purification. In ICML."},{"key":"e_1_3_2_1_22_1","volume-title":"Visual Adversarial Examples Jailbreak Large Language Models. arXiv preprint arXiv:2306.13213","author":"Qi Xiangyu","year":"2023","unstructured":"Xiangyu Qi, Kaixuan Huang, Ashwinee Panda, MengdiWang, and Prateek Mittal. 2023. Visual Adversarial Examples Jailbreak Large Language Models. arXiv preprint arXiv:2306.13213 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/323648.323771"},{"key":"e_1_3_2_1_26_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Hernan Aguirre, and Kiyoshi Tanaka.","author":"Tan Wei Ren","year":"2019","unstructured":"Wei Ren Tan, Chee Seng Chan, Hernan Aguirre, and Kiyoshi Tanaka. 2019. Improved ArtGAN for Conditional Synthesis of Natural Image and Artwork. IEEE Transactions on Image Processing (2019)."},{"key":"e_1_3_2_1_29_1","unstructured":"Zhipeng Wei Jingjing Chen Zuxuan Wu and Yu-Gang Jiang. 2022. Cross-modal transferable adversarial attacks from images to videos. In CVPR."},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Machine Learning. PMLR, 39299--39313","author":"Yang Ziqing","year":"2023","unstructured":"Ziqing Yang, Xinlei He, Zheng Li, Michael Backes, Mathias Humbert, Pascal Berrang, and Yang Zhang. 2023. Data poisoning attacks against multimodal encoders. In International Conference on Machine Learning. PMLR, 39299--39313."},{"key":"e_1_3_2_1_31_1","volume-title":"Adversarial Illusions in Multi-Modal Embeddings. In USENIX Security Symposium.","author":"Zhang Tingwei","year":"2024","unstructured":"Tingwei Zhang, Rishi Jha, Eugene Bagdasaryan, and Vitaly Shmatikov. 2024. Adversarial Illusions in Multi-Modal Embeddings. In USENIX Security Symposium."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Yinjie Zhang Geyang Xiao Bin Bai Zhiyu Wang Caijun Sun and Yonggang Tu. 2022. An Optimized Transfer Attack Framework Towards Multi-Modal Machine Learning. In DOCS.","DOI":"10.1109\/DOCS55193.2022.9967734"},{"key":"e_1_3_2_1_33_1","unstructured":"Bin Zhu Bin Lin Munan Ning Yang Yan Jiaxi Cui HongFa Wang Yatian Pang Wenhao Jiang Junwu Zhang Zongwei Li et al. 2023. LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment. arXiv preprint arXiv:2310.01852 (2023)."}],"event":{"name":"CCS '24: ACM SIGSAC Conference on Computer and Communications Security","location":"Salt Lake City UT USA","acronym":"CCS '24","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 1st ACM Workshop on Large AI Systems and Models with Privacy and Safety Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689217.3690619","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689217.3690619","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:12:33Z","timestamp":1755972753000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689217.3690619"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,19]]},"references-count":33,"alternative-id":["10.1145\/3689217.3690619","10.1145\/3689217"],"URL":"https:\/\/doi.org\/10.1145\/3689217.3690619","relation":{},"subject":[],"published":{"date-parts":[[2023,11,19]]},"assertion":[{"value":"2024-11-19","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}