{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:50:20Z","timestamp":1775580620544,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276245"],"award-info":[{"award-number":["62276245"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680779","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"1072-1081","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Break the Visual Perception: Adversarial Attacks Targeting Encoded Visual Tokens of Large Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2205-2100","authenticated-orcid":false,"given":"Yubo","family":"Wang","sequence":"first","affiliation":[{"name":"State Key Laboratory of Cognitive Intelligence, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7588-4264","authenticated-orcid":false,"given":"Chaohu","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Cognitive Intelligence, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2092-9031","authenticated-orcid":false,"given":"Yanqiu","family":"Qu","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3789-9705","authenticated-orcid":false,"given":"Haoyu","family":"Cao","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3987-2431","authenticated-orcid":false,"given":"Deqiang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Tencent YouTu Lab, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0227-3793","authenticated-orcid":false,"given":"Linli","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Cognitive Intelligence, University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Controlled Caption Generation for Images Through Adversarial Attacks. CoRR","author":"Aafaq Nayyer","year":"2021","unstructured":"Nayyer Aafaq, Naveed Akhtar, Wei Liu, Mubarak Shah, and Ajmal Mian. 2021. Controlled Caption Generation for Images Through Adversarial Attacks. CoRR, Vol. abs\/2107.03050 (2021). showeprint[arXiv]2107.03050 https:\/\/arxiv.org\/abs\/2107.03050"},{"key":"e_1_3_2_1_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390","author":"Awadalla Anas","year":"2023","unstructured":"Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, et al. 2023. Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond.","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023","author":"Carlini Nicholas","year":"2023","unstructured":"Nicholas Carlini, Milad Nasr, Christopher A. Choquette-Choo, Matthew Jagielski, Irena Gao, Pang Wei Koh, Daphne Ippolito, Florian Tram\u00e8r, and Ludwig Schmidt. 2023. Are aligned neural networks adversarially aligned?. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023, Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/c1f0b856a35986348ab3414177266f75-Abstract-Conference.html"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2017.49"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3128572.3140448"},{"key":"e_1_3_2_1_8_1","volume-title":"Young Kyun Jang, and Ser-Nam Lim","author":"Cui Xuanimng","year":"2023","unstructured":"Xuanimng Cui, Alejandro Aparcedo, Young Kyun Jang, and Ser-Nam Lim. 2023. On the Robustness of Large Multimodal Models Against Image Adversarial Attacks. arXiv preprint arXiv:2312.03777 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2305.06500"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00957"},{"key":"e_1_3_2_1_11_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3--7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_12_1","volume-title":"Hotflip: White-box adversarial examples for text classification. arXiv preprint arXiv:1712.06751","author":"Ebrahimi Javid","year":"2017","unstructured":"Javid Ebrahimi, Anyi Rao, Daniel Lowd, and Dejing Dou. 2017. Hotflip: White-box adversarial examples for text classification. arXiv preprint arXiv:1712.06751 (2017)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.15010"},{"key":"e_1_3_2_1_15_1","volume-title":"3rd International Conference on Learning Representations, ICLR","author":"Goodfellow Ian J.","year":"2015","unstructured":"Ian J. Goodfellow, Jonathon Shlens, and Christian Szegedy. 2015. Explaining and Harnessing Adversarial Examples. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.6572"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_17_1","volume-title":"5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24--26, 2017, Workshop Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=HJGU3Rodl","author":"Kurakin Alexey","year":"2017","unstructured":"Alexey Kurakin, Ian J. Goodfellow, and Samy Bengio. 2017. Adversarial examples in the physical world. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24--26, 2017, Workshop Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=HJGU3Rodl"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2305.03726"},{"key":"e_1_3_2_1_19_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning, ICML 2023, 23--29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730--19742. https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_2_1_22_1","volume-title":"HRVDA: High-Resolution Visual Document Assistant. arxiv: 2404.06918 [cs.CV]","author":"Liu Chaohu","year":"2024","unstructured":"Chaohu Liu, Kun Yin, Haoyu Cao, Xinghua Jiang, Xin Li, Yinsong Liu, Deqiang Jiang, Xing Sun, and Linli Xu. 2024. HRVDA: High-Resolution Visual Document Assistant. arxiv: 2404.06918 [cs.CV]"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2304.08485"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1982.1056489"},{"key":"e_1_3_2_1_25_1","volume-title":"6th International Conference on Learning Representations, ICLR","author":"Madry Aleksander","year":"2018","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2018. Towards Deep Learning Models Resistant to Adversarial Attacks. In 6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, April 30 - May 3, 2018, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=rJzIBfZAb"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00774"},{"key":"e_1_3_2_1_27_1","volume-title":"Goodfellow","author":"Papernot Nicolas","year":"2016","unstructured":"Nicolas Papernot, Patrick D. McDaniel, and Ian J. Goodfellow. 2016. Transferability in Machine Learning: from Phenomena to Black-Box Attacks using Adversarial Samples. CoRR, Vol. abs\/1605.07277 (2016). showeprint[arXiv]1605.07277 http:\/\/arxiv.org\/abs\/1605.07277"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V38I19.30150"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763. http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/0377-0427(87)90125-7"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00395"},{"key":"e_1_3_2_1_32_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Shayegani Erfan","year":"2023","unstructured":"Erfan Shayegani, Yue Dong, and Nael Abu-Ghazaleh. 2023. Jailbreak in pieces: Compositional adversarial attacks on multi-modal language models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_33_1","volume-title":"Human-Adversarial Visual Question Answering. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Sheng Sasha","year":"2021","unstructured":"Sasha Sheng, Amanpreet Singh, Vedanuj Goswami, Jose Alberto Lopez Magana, Tristan Thrush, Wojciech Galuba, Devi Parikh, and Douwe Kiela. 2021. Human-Adversarial Visual Question Answering. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6--14, 2021, virtual, Marc'Aurelio Ranzato, Alina Beygelzimer, Yann N. Dauphin, Percy Liang, and Jennifer Wortman Vaughan (Eds.). 20346--20359. https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/aa97d584861474f4097cf13ccb5325da-Abstract.html"},{"key":"e_1_3_2_1_34_1","volume-title":"2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14--16, 2014, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1312","author":"Szegedy Christian","year":"2014","unstructured":"Christian Szegedy, Wojciech Zaremba, Ilya Sutskever, Joan Bruna, Dumitru Erhan, Ian J. Goodfellow, and Rob Fergus. 2014. Intriguing properties of neural networks. In 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14--16, 2014, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1312.6199"},{"key":"e_1_3_2_1_35_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_36_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4--9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 5998--6008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_37_1","volume-title":"mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration. arXiv preprint arXiv:2311.04257","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Jiabo Ye, Ming Yan, Haowei Liu, Qi Qian, Ji Zhang, Fei Huang, and Jingren Zhou. 2023. mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration. arXiv preprint arXiv:2311.04257 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2306.13549"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102842"},{"key":"e_1_3_2_1_41_1","volume-title":"On Evaluating Adversarial Robustness of Large Vision-Language Models. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023","author":"Zhao Yunqing","year":"2023","unstructured":"Yunqing Zhao, Tianyu Pang, Chao Du, Xiao Yang, Chongxuan Li, Ngai-Man Cheung, and Min Lin. 2023. On Evaluating Adversarial Robustness of Large Vision-Language Models. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023, Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/a97b58c4f7551053b0512f92244b0810-Abstract-Conference.html"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2304.10592"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680779","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680779","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:42Z","timestamp":1750294662000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680779"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":42,"alternative-id":["10.1145\/3664647.3680779","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680779","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}