{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T14:09:11Z","timestamp":1768313351152,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755715","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"5010-5019","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Transfer Attack for Bad and Good: Explain and Boost Adversarial Transferability across Multimodal Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3246-6636","authenticated-orcid":false,"given":"Hao","family":"Cheng","sequence":"first","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and The Hong Kong University of Science and Technology, HongKong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1505-1316","authenticated-orcid":false,"given":"Erjia","family":"Xiao","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8532-6895","authenticated-orcid":false,"given":"Jiayan","family":"Yang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6546-8136","authenticated-orcid":false,"given":"Jinhao","family":"Duan","sequence":"additional","affiliation":[{"name":"University of North Carolina at Chapel Hill, Chapel Hill, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7986-2209","authenticated-orcid":false,"given":"Yichi","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4338-4414","authenticated-orcid":false,"given":"Jiahang","family":"Cao","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8277-1898","authenticated-orcid":false,"given":"Qiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8379-4915","authenticated-orcid":false,"given":"Le","family":"Yang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4437-0671","authenticated-orcid":false,"given":"Kaidi","family":"Xu","sequence":"additional","affiliation":[{"name":"Drexel University, Philadelphia, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0574-0129","authenticated-orcid":false,"given":"Jindong","family":"Gu","sequence":"additional","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0792-8974","authenticated-orcid":false,"given":"Renjing","family":"Xu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Defense-Prefix for Preventing Typographic Attacks on CLIP. ICCV Workshop on Adversarial Robustness In the Real World","author":"Azuma Hiroki","year":"2023","unstructured":"Hiroki Azuma and Yusuke Matsui. 2023. Defense-Prefix for Preventing Typographic Attacks on CLIP. ICCV Workshop on Adversarial Robustness In the Real World (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"European Conference on Computer Vision. Springer, 179-196","author":"Cheng Hao","year":"2024","unstructured":"Hao Cheng, Erjia Xiao, Jindong Gu, Le Yang, Jinhao Duan, Jize Zhang, Jiahang Cao, Kaidi Xu, and Renjing Xu. 2024a. Unveiling typographic deceptions: Insights of the typographic vulnerability in large vision-language models. In European Conference on Computer Vision. Springer, 179-196."},{"key":"e_1_3_2_1_3_1","volume-title":"Exploring Typographic Visual Prompts Injection Threats in Cross-Modality Generation Models. arXiv preprint arXiv:2503.11519","author":"Cheng Hao","year":"2025","unstructured":"Hao Cheng, Erjia Xiao, Yichi Wang, Kaidi Xu, Mengshu Sun, Jindong Gu, and Renjing Xu. 2025a. Exploring Typographic Visual Prompts Injection Threats in Cross-Modality Generation Models. arXiv preprint arXiv:2503.11519 (2025)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00285"},{"key":"e_1_3_2_1_5_1","unstructured":"Hao Cheng Erjia Xiao Chengyuan Yu Zhao Yao Jiahang Cao Qiang Zhang Jiaxu Wang Mengshu Sun Kaidi Xu Jindong Gu et al. 2024b. Manipulation facing threats: Evaluating physical vulnerabilities in end-to-end vision language action models. arXiv preprint arXiv:2409.13174 (2024)."},{"key":"e_1_3_2_1_6_1","unstructured":"CNN-Business. 2024. 'There are no guardrails.' This mom believes an AI chatbot is responsible for her son's suicide. In https:\/\/edition.cnn.com\/2024\/10\/30\/tech\/teen-suicide-character-ai-lawsuit\/index.html."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02325"},{"key":"e_1_3_2_1_8_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv:2305.06500 [cs.CV]"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00444"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612070"},{"key":"e_1_3_2_1_11_1","unstructured":"Jindong Gu Xiaojun Jia Pau de Jorge Wenqain Yu Xinwei Liu Avery Ma Yuan Xun Anjun Hu Ashkan Khakzar Zhijiang Li et al. 2023. A survey on transferability of adversarial examples across deep neural networks. arXiv preprint arXiv:2310.17626 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Sa-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation. arXiv preprint arXiv:2312.04913","author":"He Bangyan","year":"2023","unstructured":"Bangyan He, Xiaojun Jia, Siyuan Liang, Tianrui Lou, Yang Liu, and Xiaochun Cao. 2023. Sa-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation. arXiv preprint arXiv:2312.04913 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00021"},{"key":"e_1_3_2_1_15_1","volume-title":"As Firm As Their Foundations: Can open-sourced foundation models be used to create adversarial examples for downstream tasks? arXiv preprint arXiv:2403.12693","author":"Hu Anjun","year":"2024","unstructured":"Anjun Hu, Jindong Gu, Francesco Pinto, Konstantinos Kamnitsas, and Philip Torr. 2024. As Firm As Their Foundations: Can open-sourced foundation models be used to create adversarial examples for downstream tasks? arXiv preprint arXiv:2403.12693 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Catastrophic jailbreak of open-source llms via exploiting generation. arXiv preprint arXiv:2310.06987","author":"Huang Yangsibo","year":"2023","unstructured":"Yangsibo Huang, Samyak Gupta, Mengzhou Xia, Kai Li, and Danqi Chen. 2023. Catastrophic jailbreak of open-source llms via exploiting generation. arXiv preprint arXiv:2310.06987 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Nesterov accelerated gradient and scale invariance for adversarial attacks. arXiv preprint arXiv:1908.06281","author":"Lin Jiadong","year":"2019","unstructured":"Jiadong Lin, Chuanbiao Song, Kun He, Liwei Wang, and John E Hopcroft. 2019. Nesterov accelerated gradient and scale invariance for adversarial attacks. arXiv preprint arXiv:1908.06281 (2019)."},{"key":"e_1_3_2_1_19_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_1_20_1","volume-title":"Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge.","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, and Yong Jae Lee. 2024b. Llava-next: Improved reasoning, ocr, and world knowledge."},{"key":"e_1_3_2_1_22_1","volume-title":"European Conference on Computer Vision. Springer, 93-109","author":"Liu Runtao","year":"2024","unstructured":"Runtao Liu, Ashkan Khakzar, Jindong Gu, Qifeng Chen, Philip Torr, and Fabio Pizzati. 2024a. Latent guard: a safety framework for text-to-image generation. In European Conference on Computer Vision. Springer, 93-109."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-15-9129-7_32"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"e_1_3_2_1_25_1","volume-title":"European Conference on Computer Vision. Springer, 74-92","author":"Ma Avery","year":"2024","unstructured":"Avery Ma, Amir-massoud Farahmand, Yangchen Pan, Philip Torr, and Jindong Gu. 2024. Improving adversarial transferability via model alignment. In European Conference on Computer Vision. Springer, 74-92."},{"key":"e_1_3_2_1_26_1","volume-title":"Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083","author":"Madry Aleksander","year":"2017","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2017. Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083 (2017)."},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Learning Representations.","author":"Madry Aleksander","year":"2018","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2018. Towards Deep Learning Models Resistant to Adversarial Attacks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_28_1","first-page":"37","article-title":"English WordNet: A new open-source wordnet for English","volume":"28","author":"McCrae John P","year":"2020","unstructured":"John P McCrae, Ewa Rudnicka, and Francis Bond. 2020. English WordNet: A new open-source wordnet for English. K Lexical News, Vol. 28 (2020), 37-44.","journal-title":"K Lexical News"},{"key":"e_1_3_2_1_29_1","volume-title":"PLPHP: Per-Layer Per-Head Vision Token Pruning for Efficient Large Vision-Language Models. arXiv preprint arXiv:2502.14504","author":"Meng Yu","year":"2025","unstructured":"Yu Meng, Kaiyuan Li, Chenran Huang, Chen Gao, Xinlei Chen, Yong Li, and Xiaoping Zhang. 2025. PLPHP: Per-Layer Per-Head Vision Token Pruning for Efficient Large Vision-Language Models. arXiv preprint arXiv:2502.14504 (2025)."},{"key":"e_1_3_2_1_30_1","volume-title":"Understanding and evaluating hallucinations in 3d visual language models. arXiv preprint arXiv:2502.15888","author":"Peng Ruiying","year":"2025","unstructured":"Ruiying Peng, Kaiyuan Li, Weichen Zhang, Chen Gao, Xinlei Chen, and Yong Li. 2025. Understanding and evaluating hallucinations in 3d visual language models. arXiv preprint arXiv:2502.15888 (2025)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i19.30150"},{"key":"e_1_3_2_1_32_1","volume-title":"Multilingual large language model: A survey of resources, taxonomy and frontiers. arXiv preprint arXiv:2404.04925","author":"Qin Libo","year":"2024","unstructured":"Libo Qin, Qiguang Chen, Yuhang Zhou, Zhi Chen, Yinghui Li, Lizi Liao, Min Li, Wanxiang Che, and Philip S Yu. 2024. Multilingual large language model: A survey of resources, taxonomy and frontiers. arXiv preprint arXiv:2404.04925 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"The Thirteenth International Conference on Learning Representations (ICLR)","author":"Schaeffer Rylan","year":"2025","unstructured":"Rylan Schaeffer, Dan Valentine, Luke Bailey, James Chua, Zane Durante, Cristobal Eyzaguirre, Joe Benton, Brando Miranda, Henry Sleight, Tony Tong Wang, et al., 2025. Failures to Find Transferable Image Jailbreaks Between Vision-Language Models. In The Thirteenth International Conference on Learning Representations (ICLR) 2025."},{"key":"e_1_3_2_1_34_1","volume-title":"Francesco Croce, and Matthias Hein.","author":"Schlarmann Christian","year":"2024","unstructured":"Christian Schlarmann, Naman Deep Singh, Francesco Croce, and Matthias Hein. 2024. Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings for Robust Large Vision-Language Models. ICML (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_36_1","volume-title":"A tutorial on principal component analysis. arXiv preprint arXiv:1404.1100","author":"Shlens Jonathon","year":"2014","unstructured":"Jonathon Shlens. 2014. A tutorial on principal component analysis. arXiv preprint arXiv:1404.1100 (2014)."},{"key":"e_1_3_2_1_37_1","volume-title":"Universal adversarial triggers for attacking and analyzing NLP. arXiv preprint arXiv:1908.07125","author":"Wallace Eric","year":"2019","unstructured":"Eric Wallace, Shi Feng, Nikhil Kandpal, Matt Gardner, and Sameer Singh. 2019. Universal adversarial triggers for attacking and analyzing NLP. arXiv preprint arXiv:1908.07125 (2019)."},{"key":"e_1_3_2_1_38_1","volume-title":"From LLMs to MLLMs: Exploring the Landscape of Multimodal Jailbreaking. arXiv preprint arXiv:2406.14859","author":"Wang Siyuan","year":"2024","unstructured":"Siyuan Wang, Zhuohan Long, Zhihao Fan, and Zhongyu Wei. 2024. From LLMs to MLLMs: Exploring the Landscape of Multimodal Jailbreaking. arXiv preprint arXiv:2406.14859 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00196"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01585"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00425"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00891"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00284"},{"key":"e_1_3_2_1_44_1","volume-title":"Defending jailbreak attack in vlms via cross-modality information detector. arXiv preprint arXiv:2407.21659","author":"Xu Yue","year":"2024","unstructured":"Yue Xu, Xiuyuan Qi, Zhan Qin, and Wenjie Wang. 2024. Defending jailbreak attack in vlms via cross-modality information detector. arXiv preprint arXiv:2407.21659 (2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28499"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/SaTML64287.2025.00049"},{"key":"e_1_3_2_1_47_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. Findings of the Association for Computational Linguistics","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024. Mm-llms: Recent advances in multimodal large language models. Findings of the Association for Computational Linguistics (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00790"},{"key":"e_1_3_2_1_49_1","volume-title":"Text: Does Point Cloud Boost Spatial Reasoning of Large Language Models? arXiv preprint arXiv:2504.04540","author":"Zhang Weichen","year":"2025","unstructured":"Weichen Zhang, Ruiying Peng, Chen Gao, Jianjie Fang, Xin Zeng, Kaiyuan Li, Ziyou Wang, Jinqiang Cui, Xin Wang, Xinlei Chen, et al., 2025. The Point, the Vision and the Text: Does Point Cloud Boost Spatial Reasoning of Large Language Models? arXiv preprint arXiv:2504.04540 (2025)."},{"key":"e_1_3_2_1_50_1","volume-title":"Ngai-Man Man Cheung, and Min Lin","author":"Zhao Yunqing","year":"2024","unstructured":"Yunqing Zhao, Tianyu Pang, Chao Du, Xiao Yang, Chongxuan Li, Ngai-Man Man Cheung, and Min Lin. 2024b. On evaluating adversarial robustness of large vision-language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02303"},{"key":"e_1_3_2_1_52_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J Zico Kolter, and Matt Fredrikson. 2023. Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755715","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:38:21Z","timestamp":1765309101000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755715"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":53,"alternative-id":["10.1145\/3746027.3755715","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755715","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}