{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T15:21:57Z","timestamp":1780413717305,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","funder":[{"name":"the Outstanding Research Project of Shen Yuan Honors College, BUAA","award":["230123206"],"award-info":[{"award-number":["230123206"]}]},{"name":"the National Natural Science Foundation of China, State Key Laboratory of Complex & Critical Software Environment (CCSE)","award":["62206009"],"award-info":[{"award-number":["62206009"]}]},{"name":"Aeronautical Science Fund, the Fundamental Research Funds for the Central Universities","award":["20230017051001"],"award-info":[{"award-number":["20230017051001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755211","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"10955-10964","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Manipulating Multimodal Agents via Cross-Modal Prompt Injection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8296-7633","authenticated-orcid":false,"given":"Le","family":"Wang","sequence":"first","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7393-7362","authenticated-orcid":false,"given":"Zonghao","family":"Ying","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9874-6828","authenticated-orcid":false,"given":"Tianyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China and Shen Yuan Honors College, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6154-0233","authenticated-orcid":false,"given":"Siyuan","family":"Liang","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0042-9045","authenticated-orcid":false,"given":"Shengshan","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2523-1089","authenticated-orcid":false,"given":"Mingchuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Information Engineering, Henan University of Science and Technology, Luoyang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4224-1318","authenticated-orcid":false,"given":"Aishan","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8425-4195","authenticated-orcid":false,"given":"Xianglong","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Complex &amp; Critical Software Environment, Beihang University, Beijing, China, Zhongguancun Laboratory, Beijing, China, and Institute of Dataspace, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"ACM Workshop on Artificial Intelligence and Security, AISec.","author":"Abdelnabi Sahar","year":"2023","unstructured":"Sahar Abdelnabi, Kai Greshake, Shailesh Mishra, Christoph Endres, Thorsten Holz, and Mario Fritz. 2023. Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In ACM Workshop on Artificial Intelligence and Security, AISec."},{"key":"e_1_3_2_1_2_1","first-page":"i","volume":"202","author":"Abdin Marah","unstructured":"Marah Abdin, Jyoti Aneja, and et al. Hany Awadalla. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arXiv preprint arXiv:2404.14219 (2024).","journal-title":"Hany Awadalla."},{"key":"e_1_3_2_1_3_1","volume-title":"Attacking Multimodal OS Agents with Malicious Image Patches. arXiv preprint arXiv:2503.10809","author":"Aichberger Lukas","year":"2025","unstructured":"Lukas Aichberger, Alasdair Paren, Yarin Gal, Philip Torr, and Adel Bibi. 2025. Attacking Multimodal OS Agents with Malicious Image Patches. arXiv preprint arXiv:2503.10809 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"BUILD WITH CLAUDE: PDF support. https:\/\/docs.anthropic.com\/en\/docs\/build-with-claude\/pdf-support","year":"2024","unstructured":"Antropic. 2024. BUILD WITH CLAUDE: PDF support. https:\/\/docs.anthropic.com\/en\/docs\/build-with-claude\/pdf-support"},{"key":"e_1_3_2_1_5_1","volume-title":"Abusing Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs. arXiv preprint arXiv:2307.10490","author":"Bagdasaryan Eugene","year":"2023","unstructured":"Eugene Bagdasaryan, Tsung-Yin Hsieh, Ben Nassi, and Vitaly Shmatikov. 2023. Abusing Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs. arXiv preprint arXiv:2307.10490 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"James Tanner, Quan Vuong, Anna Walling, Haohuan Wang, and Ury Zhilinsky.","author":"Black Kevin","year":"2024","unstructured":"Kevin Black, Noah Brown, Danny Driess, Adnan Esmail, Michael Equi, Chelsea Finn, Niccolo Fusai, Lachy Groom, Karol Hausman, Brian Ichter, Szymon Jakubczak, Tim Jones, Liyiming Ke, Sergey Levine, Adrian Li-Bell, Mohith Mothukuri, Suraj Nair, Karl Pertsch, Lucy Xiaoyang Shi, James Tanner, Quan Vuong, Anna Walling, Haohuan Wang, and Ury Zhilinsky. 2024. \u03c0_0: A Vision-Language-Action Flow Model for General Robot Control. arXiv preprint arXiv:2410.24164 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"On Evaluating Adversarial Robustness. arXiv preprint arXiv:1902.06705","author":"Carlini Nicholas","year":"2019","unstructured":"Nicholas Carlini, Anish Athalye, Nicolas Papernot, Wieland Brendel, Jonas Rauber, Dimitris Tsipras, Ian Goodfellow, Aleksander Madry, and Alexey Kurakin. 2019. On Evaluating Adversarial Robustness. arXiv preprint arXiv:1902.06705 (2019)."},{"key":"e_1_3_2_1_8_1","volume-title":"Rethinking Model Ensemble in Transfer-based Adversarial Attacks. In International Conference on Learning Representations.","author":"Chen Huanran","year":"2024","unstructured":"Huanran Chen, Yichi Zhang, Yinpeng Dong, Xiao Yang, Hang Su, and Jun Zhu. 2024b. Rethinking Model Ensemble in Transfer-based Adversarial Attacks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2024a. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_10_1","volume-title":"A Fast, Strong and Open Vision Language Assistant for Mobile Devices. arXiv preprint arXiv:2312.16886","author":"Chu Xiangxiang","year":"2023","unstructured":"Xiangxiang Chu, Limeng Qiao, Xinyang Lin, Shuang Xu, Yang Yang, Yiming Hu, Fei Wei, Xinyu Zhang, Bo Zhang, Xiaolin Wei, and Chunhua Shen. 2023. MobileVLM: A Fast, Strong and Open Vision Language Assistant for Mobile Devices. arXiv preprint arXiv:2312.16886 (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02325"},{"key":"e_1_3_2_1_12_1","unstructured":"Wenliang Dai Junnan Li Dongxu Li Anthony Tiong Junqi Zhao Weisheng Wang Boyang Li Pascale Fung and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_13_1","volume-title":"Navigating the Risks: A Survey of Security, Privacy, and Ethics Threats in LLM-Based Agents. arXiv preprint arXiv:2411.09523","author":"Gan Yuyou","year":"2024","unstructured":"Yuyou Gan, Yong Yang, Zhe Ma, Ping He, Rui Zeng, Yiming Wang, Qingming Li, Chunyi Zhou, Songze Li, Ting Wang, Yunjun Gao, Yingcai Wu, and Shouling Ji. 2024. Navigating the Risks: A Survey of Security, Privacy, and Ethics Threats in LLM-Based Agents. arXiv preprint arXiv:2411.09523 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34568"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01852"},{"key":"e_1_3_2_1_17_1","volume-title":"Empirical Analysis of Large Vision-Language Models against Goal Hijacking via Visual Prompt Injection. arXiv preprint arXiv:2408.03554","author":"Kimura Subaru","year":"2024","unstructured":"Subaru Kimura, Ryota Tanaka, Shumpei Miyawaki, Jun Suzuki, and Keisuke Sakaguchi. 2024. Empirical Analysis of Large Vision-Language Models against Goal Hijacking via Visual Prompt Injection. arXiv preprint arXiv:2408.03554 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_19_1","volume-title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods. arXiv preprint arXiv:2412.05579","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qian Dong, Junjie Chen, Huixue Su, Yujia Zhou, Qingyao Ai, Ziyi Ye, and Yiqun Liu. 2024a. LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods. arXiv preprint arXiv:2412.05579 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"European Conference on Computer Vision.","author":"Li Yifan","year":"2024","unstructured":"Yifan Li, Hangyu Guo, Kun Zhou, Wayne Xin Zhao, and Ji-Rong Wen. 2024b. Images are Achilles' Heel of Alignment: Exploiting Visual Vulnerabilities for Jailbreaking Multimodal Large Language Models. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_21_1","unstructured":"Victor Weixin Liang Yuhui Zhang Yongchan Kwon Serena Yeung and James Y Zou. 2022. Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Representation Learning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_22_1","volume-title":"AGENTSAFE: Benchmarking the Safety of Embodied Agents on Hazardous Instructions. arXiv preprint arXiv:2506.14697","author":"Liu Aishan","year":"2025","unstructured":"Aishan Liu, Zonghao Ying, Le Wang, Junjie Mu, Jinyang Guo, Jiakai Wang, Yuqing Ma, Siyuan Liang, Mingchuan Zhang, Xianglong Liu, and Dacheng Tao. 2025. AGENTSAFE: Benchmarking the Safety of Embodied Agents on Hazardous Instructions. arXiv preprint arXiv:2506.14697 (2025)."},{"key":"e_1_3_2_1_23_1","volume-title":"Compromising Embodied Agents with Contextual Backdoor Attacks. arXiv preprint arXiv:2408.02882","author":"Liu Aishan","year":"2024","unstructured":"Aishan Liu, Yuguang Zhou, Xianglong Liu, Tianyuan Zhang, Siyuan Liang, Jiakai Wang, Yanjun Pu, Tianlin Li, Junqi Zhang, Wenbo Zhou, Qing Guo, and Dacheng Tao. 2024d. Compromising Embodied Agents with Contextual Backdoor Attacks. arXiv preprint arXiv:2408.02882 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_25_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023b. Visual Instruction Tuning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_26_1","volume-title":"Automatic and Universal Prompt Injection Attacks against Large Language Modals. arXiv preprint arXiv:2403.04957","author":"Liu Xiaogeng","year":"2024","unstructured":"Xiaogeng Liu, Zhiyuan Yu, Yizhe Zhang, Ning Zhang, and Chaowei Xiao. 2024c. Automatic and Universal Prompt Injection Attacks against Large Language Modals. arXiv preprint arXiv:2403.04957 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Prompt Injection attack against LLM-integrated Applications. arXiv preprint arXiv:2306.05499","author":"Liu Yi","year":"2023","unstructured":"Yi Liu, Gelei Deng, Yuekang Li, Kailong Wang, Tianwei Zhang, Yepang Liu, Haoyu Wang, Yan Zheng, and Yang Liu. 2023a. Prompt Injection attack against LLM-integrated Applications. arXiv preprint arXiv:2306.05499 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Formalizing and Benchmarking Prompt Injection Attacks and Defenses. In USENIX Security Symposium.","author":"Liu Yupei","year":"2024","unstructured":"Yupei Liu, Yuqi Jia, Runpeng Geng, Jinyuan Jia, and Neil Zhenqiang Gong. 2024a. Formalizing and Benchmarking Prompt Injection Attacks and Defenses. In USENIX Security Symposium."},{"key":"e_1_3_2_1_29_1","volume-title":"Dolphins: Multimodal Language Model for Driving. In European Conference on Computer Vision.","author":"Ma Yingzi","year":"2024","unstructured":"Yingzi Ma, Yulong Cao, Jiachen Sun, Marco Pavone, and Chaowei Xiao. 2024. Dolphins: Multimodal Language Model for Driving. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_30_1","unstructured":"Mistral AI team. 2023. Mistral 7B. https:\/\/mistral.ai\/news\/announcing-mistral-7b"},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_32_1","unstructured":"OWASP. 2023. OWASP Top 10 for LLM Applications. https:\/\/owasp.org\/www-project-top-10-for-large-language-model-applications\/assets\/PDF\/OWASP-Top-10-for-LLMs-2023-v1_1.pdf"},{"key":"e_1_3_2_1_33_1","volume-title":"Song-Chun Zhu1, and Jianfeng Gao2.","author":"Pan Lu","year":"2023","unstructured":"Lu Pan, Peng Baolin, Cheng Hao, Galley Michel, Chang Kai-Wei, Ying Nian Wu, Song-Chun Zhu1, and Jianfeng Gao2. 2023. Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_34_1","volume-title":"Autonomous Workflow for Multimodal Fine-Grained Training Assistants Towards Mixed Reality. arXiv preprint arXiv:2405.13034","author":"Pei Jiahuan","year":"2024","unstructured":"Jiahuan Pei, Irene Viola, Haochen Huang, Junxiao Wang, Moonisa Ahsan, Fanghua Ye, Jiang Yiming, Yao Sai, Di Wang, Zhumin Chen, Pengjie Ren, and Pablo Cesar. 2024. Autonomous Workflow for Multimodal Fine-Grained Training Assistants Towards Mixed Reality. arXiv preprint arXiv:2405.13034 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"arXiv preprint arXiv:2412.15115","author":"Team Qwen","year":"2024","unstructured":"Qwen Team. 2024. Qwen2.5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_37_1","volume-title":"CoEDIT: Text Editing by Task-Specific Instruction Tuning. arXiv preprint arXiv:2305.09857","author":"Raheja Vipul","year":"2023","unstructured":"Vipul Raheja, Dhruv Kumar, Ryan Koo, and Dongyeop Kang. 2023. CoEDIT: Text Editing by Task-Specific Instruction Tuning. arXiv preprint arXiv:2305.09857 (2023)."},{"key":"e_1_3_2_1_38_1","unstructured":"AgileX Robotics. 2025. AgileX Robotics. https:\/\/global.agilex.ai\/pages\/limo"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision.","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. Highresolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF International Conference on Computer Vision."},{"key":"e_1_3_2_1_40_1","unstructured":"Sander Schulhoff. 2024. Sandwitch defense. https:\/\/learnprompting.org\/docs\/prompt_hacking\/defensive_measures\/sandwich_defense"},{"key":"e_1_3_2_1_41_1","volume-title":"International Conference on Learning Representations.","author":"Shayegani Erfan","unstructured":"Erfan Shayegani, Yue Dong, and Nael B. Abu-Ghazaleh. 2024. Jailbreak in pieces: Compositional Adversarial Attacks on Multi-Modal Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_42_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations.","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, *Azalia Mirhoseini, *Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658644.3690291"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1170"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_2_1_46_1","volume-title":"Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90% ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/","author":"Team The Vicuna","year":"2023","unstructured":"The Vicuna Team. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90% ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_1_47_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_48_1","unstructured":"W3C. 2011. HTML5. https:\/\/www.w3.org\/TR\/2011\/WD-html5-20110405\/"},{"key":"e_1_3_2_1_49_1","volume-title":"OpenVLA: An Open-Source Vision-Language-Action Model. arXiv preprint arXiv:2404.13208","author":"Wallace Eric","year":"2024","unstructured":"Eric Wallace, Kai Xiao, Reimar Leike, Lilian Weng, Johannes Heidecke, and Alex Beutel. 2024a. OpenVLA: An Open-Source Vision-Language-Action Model. arXiv preprint arXiv:2404.13208 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions. arXiv preprint arXiv:2406.09246","author":"Wallace Eric","year":"2024","unstructured":"Eric Wallace, Kai Xiao, Reimar Leike, Lilian Weng, Johannes Heidecke, and Alex Beutel. 2024b. The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions. arXiv preprint arXiv:2406.09246 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024a. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"CogVLM: Visual Expert for Pretrained Language Models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, Jiazheng Xu, Bin Xu, Juanzi Li, Yuxiao Dong, Ming Ding, and Jie Tang. 2024b. CogVLM: Visual Expert for Pretrained Language Models. arXiv preprint arXiv:2311.03079 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral Planning States for Autonomous Driving. arXiv preprint arXiv:2312.09245","author":"Wang Wenhai","year":"2023","unstructured":"Wenhai Wang, Jiangwei Xie, ChuanYang Hu, Haoming Zou, Jianan Fan, Wenwen Tong, Yang Wen, Silei Wu, Hanming Deng, Zhiqi Li, Hao Tian, Lewei Lu, Xizhou Zhu, Xiaogang Wang, Yu Qiao, and Jifeng Dai. 2023. DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral Planning States for Autonomous Driving. arXiv preprint arXiv:2312.09245 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"TrojanRobot: Physical-World Backdoor Attacks Against VLM-based Robotic Manipulation. arXiv preprint arXiv:2411.11683","author":"Wang Xianlong","year":"2025","unstructured":"Xianlong Wang, Hewen Pan, Hangtao Zhang, Minghui Li, Shengshan Hu, Ziqi Zhou, Lulu Xue, Peijin Guo, Yichen Wang, Wei Wan, Aishan Liu, and Leo Yu Zhang. 2025. TrojanRobot: Physical-World Backdoor Attacks Against VLM-based Robotic Manipulation. arXiv preprint arXiv:2411.11683 (2025)."},{"key":"e_1_3_2_1_55_1","volume-title":"Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models. arXiv preprint arXiv:2303.04671","author":"Wu Chenfei","year":"2023","unstructured":"Chenfei Wu, Shengming Yin, Weizhen Qi, Xiaodong Wang, Zecheng Tang, and Nan Duan. 2023. Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models. arXiv preprint arXiv:2303.04671 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Ruslan Salakhutdinov, Daniel Fried, and Aditi Raghunathan.","author":"Wu Chen Henry","year":"2024","unstructured":"Chen Henry Wu, Rishi Shah, Jing Yu Koh, Ruslan Salakhutdinov, Daniel Fried, and Aditi Raghunathan. 2024. Dissecting Adversarial Robustness of Multimodal LM Agents. arXiv preprint arXiv:2406.12814 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"Large Multimodal Agents: A Survey. arXiv preprint arXiv:2402.15116","author":"Xie Junlin","year":"2024","unstructured":"Junlin Xie, Zhihong Chen, Ruifei Zhang, Xiang Wan, and Guanbin Li. 2024. Large Multimodal Agents: A Survey. arXiv preprint arXiv:2402.15116 (2024)."},{"key":"e_1_3_2_1_58_1","volume-title":"MiniCPM-V: A GPT-4V Level MLLM on Your Phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, Qianyu Chen, Huarong Zhou, Zhensheng Zou, Haoye Zhang, Shengding Hu, Zhi Zheng, Jie Zhou, Jie Cai, Xu Han, Guoyang Zeng, Dahai Li, Zhiyuan Liu, and Maosong Sun. 2024. MiniCPM-V: A GPT-4V Level MLLM on Your Phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models. arXiv preprint arXiv:2312.14197","author":"Yi Jingwei","year":"2025","unstructured":"Jingwei Yi, Yueqi Xie, Bin Zhu, Emre Kiciman, Guangzhong Sun, Xing Xie, and Fangzhao Wu. 2025. Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models. arXiv preprint arXiv:2312.14197 (2025)."},{"key":"e_1_3_2_1_60_1","volume-title":"SafeBench: A Safety Evaluation Framework for Multimodal Large Language Models. arXiv preprint arXiv:2410.18927","author":"Ying Zonghao","year":"2024","unstructured":"Zonghao Ying, Aishan Liu, Siyuan Liang, Lei Huang, Jinyang Guo, Wenbo Zhou, Xianglong Liu, and Dacheng Tao. 2024. SafeBench: A Safety Evaluation Framework for Multimodal Large Language Models. arXiv preprint arXiv:2410.18927 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Sigmoid Loss for Language Image Pre-Training. In International Conference on Computer Vision.","author":"Zhai Xiaohua","year":"2023","unstructured":"Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, and Lucas Beyer. 2023. Sigmoid Loss for Language Image Pre-Training. In International Conference on Computer Vision."},{"key":"e_1_3_2_1_62_1","volume-title":"InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. arXiv preprint arXiv:2403.02691","author":"Zhan Qiusi","year":"2024","unstructured":"Qiusi Zhan, Zhixiang Liang, Zifan Ying, and Daniel Kang. 2024. InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. arXiv preprint arXiv:2403.02691 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"BadRobot: Jailbreaking Embodied LLMs in the Physical World. arXiv preprint arXiv:2407.20242","author":"Zhang Hangtao","year":"2025","unstructured":"Hangtao Zhang, Chenyu Zhu, Xianlong Wang, Ziqi Zhou, Changgan Yin, Minghui Li, Lulu Xue, Yichen Wang, Shengshan Hu, Aishan Liu, Peijin Guo, and Leo Yu Zhang. 2025d. BadRobot: Jailbreaking Embodied LLMs in the Physical World. arXiv preprint arXiv:2407.20242 (2025)."},{"key":"e_1_3_2_1_64_1","unstructured":"Kejia Zhang Keda Tao Jiasheng Tang and Huan Wang. 2025a. Poison as Cure: Visual Noise for Mitigating Object Hallucinations in LVMs. Article arXiv:2501.19164. arXiv:2501.19164"},{"key":"e_1_3_2_1_65_1","volume-title":"JailGuard: A Universal Detection Framework for LLM Prompt-based Attacks. arXiv preprint arXiv:2312.10766","author":"Zhang Xiaoyu","year":"2025","unstructured":"Xiaoyu Zhang, Cen Zhang, Tianlin Li, Yihao Huang, Xiaojun Jia, Ming Hu, Jie Zhang, Yang Liu, Shiqing Ma, and Chao Shen. 2025c. JailGuard: A Universal Detection Framework for LLM Prompt-based Attacks. arXiv preprint arXiv:2312.10766 (2025)."},{"key":"e_1_3_2_1_66_1","volume-title":"Meta Prompting for AI Systems. arXiv preprint arXiv:2311.11482","author":"Zhang Yifan","year":"2025","unstructured":"Yifan Zhang, Yang Yuan, and Andrew Chi-Chih Yao. 2025b. Meta Prompting for AI Systems. arXiv preprint arXiv:2311.11482 (2025)."},{"key":"e_1_3_2_1_67_1","unstructured":"Yunqing Zhao Tianyu Pang Chao Du Xiao Yang Chongxuan LI Ngai-Man (Man) Cheung and Min Lin. 2023. On Evaluating Adversarial Robustness of Large Vision-Language Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_68_1","volume-title":"Universal and Transferable Adversarial Attacks on Aligned Language Models. arXiv preprint arXiv:2307.15043","author":"Zou Andy","year":"2023","unstructured":"Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr, J. Zico Kolter, and Matt Fredrikson. 2023. Universal and Transferable Adversarial Attacks on Aligned Language Models. arXiv preprint arXiv:2307.15043 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755211","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:25Z","timestamp":1765339825000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755211"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":68,"alternative-id":["10.1145\/3746027.3755211","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755211","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}