{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,4]],"date-time":"2026-02-04T18:24:43Z","timestamp":1770229483910,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","funder":[{"name":"Natural Science Foundation of China","award":["72434005"],"award-info":[{"award-number":["72434005"]}]},{"name":"Natural Science Foundation of China","award":["72225011"],"award-info":[{"award-number":["72225011"]}]},{"name":"Natural Science Foundation of China","award":["L242400108"],"award-info":[{"award-number":["L242400108"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733324","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"550-559","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Enhancing Adversarial Robustness of Vision-Language Models through Low-Rank Adaptation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4898-6918","authenticated-orcid":false,"given":"Yuheng","family":"Ji","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China and School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9894-0062","authenticated-orcid":false,"given":"Yue","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Data Science, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6280-5381","authenticated-orcid":false,"given":"Zhicheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China and School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0632-0413","authenticated-orcid":false,"given":"Zhao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9892-1626","authenticated-orcid":false,"given":"Yuting","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4209-6695","authenticated-orcid":false,"given":"Xiaoshuai","family":"Hao","sequence":"additional","affiliation":[{"name":"Beijing Academy of Artificial Intelligence, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1180-6942","authenticated-orcid":false,"given":"Gang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0602-077X","authenticated-orcid":false,"given":"Xingwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0405-5458","authenticated-orcid":false,"given":"Xiaolong","family":"Zheng","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China and School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Feature purification: How adversarial training performs robust deep learning","author":"Allen-Zhu Zeyuan","unstructured":"Zeyuan Allen-Zhu and Yuanzhi Li. 2022. Feature purification: How adversarial training performs robust deep learning. In FOCS. IEEE, 977--988."},{"key":"e_1_3_2_1_3_1","volume-title":"Square attack: a query-efficient black-box adversarial attack via random search","author":"Andriushchenko Maksym","unstructured":"Maksym Andriushchenko, Francesco Croce, Nicolas Flammarion, and Matthias Hein. 2020. Square attack: a query-efficient black-box adversarial attack via random search. In ECCV. Springer, 484--501."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV. 5803--5812.","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_5_1","volume-title":"Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274","author":"Bahng Hyojin","year":"2022","unstructured":"Hyojin Bahng, Ali Jahanian, Swami Sankaranarayanan, and Phillip Isola. 2022. Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 (2022)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Max Bain Arsha Nagrani G\u00fcl Varol and Andrew Zisserman. 2021. Frozen in time: A joint video and image encoder for end-to-end retrieval. In ICCV. 1728--1738.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_7_1","volume-title":"One transformer fits all distributions in multi-modal diffusion at scale. arXiv preprint arXiv:2303.06555","author":"Bao Fan","year":"2023","unstructured":"Fan Bao, Shen Nie, Kaiwen Xue, Chongxuan Li, Shi Pu, Yaole Wang, Gang Yue, Yue Cao, Hang Su, and Jun Zhu. 2023. One transformer fits all distributions in multi-modal diffusion at scale. arXiv preprint arXiv:2303.06555 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"CLAP: Contrastive Learning with Augmented Prompts for Robustness on Pretrained Vision-Language Models. arXiv preprint arXiv:2311.16445","author":"Cai Yichao","year":"2023","unstructured":"Yichao Cai, Yuhang Liu, Zhen Zhang, and Javen Qinfeng Shi. 2023. CLAP: Contrastive Learning with Augmented Prompts for Robustness on Pretrained Vision-Language Models. arXiv preprint arXiv:2311.16445 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3128572.3140448"},{"key":"e_1_3_2_1_10_1","volume-title":"Qlora: Efficient finetuning of quantized llms. arXiv preprint arXiv:2305.14314","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2023. Qlora: Efficient finetuning of quantized llms. arXiv preprint arXiv:2305.14314 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"TASAR: Transfer-based Attack on Skeletal Action Recognition. arXiv preprint arXiv:2409.02483","author":"Diao Yunfeng","year":"2024","unstructured":"Yunfeng Diao, Baiqi Wu, Ruixuan Zhang, Ajian Liu, Xingxing Wei, Meng Wang, and He Wang. 2024. TASAR: Transfer-based Attack on Skeletal Action Recognition. arXiv preprint arXiv:2409.02483 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Nanyi Fei Zhiwu Lu Yizhao Gao Guoxing Yang Yuqi Huo Jingyuan Wen Haoyu Lu Ruihua Song Xin Gao Tao Xiang et al. 2022. Towards artificial general intelligence via a multimodal foundation model. Nature Communications (2022) 3094.","DOI":"10.1038\/s41467-022-30761-2"},{"key":"e_1_3_2_1_13_1","volume-title":"Intelligent driving intelligence test for autonomous vehicles with naturalistic and adversarial environment. Nature communications","author":"Feng Shuo","year":"2021","unstructured":"Shuo Feng, Xintao Yan, Haowei Sun, Yiheng Feng, and Henry X Liu. 2021. Intelligent driving intelligence test for autonomous vehicles with naturalistic and adversarial environment. Nature communications (2021), 748."},{"key":"e_1_3_2_1_14_1","volume-title":"Andrew L Beam, and Isaac S Kohane","author":"Finlayson Samuel G","year":"2019","unstructured":"Samuel G Finlayson, John D Bowers, Joichi Ito, Jonathan L Zittrain, Andrew L Beam, and Isaac S Kohane. 2019. Adversarial attacks on medical machine learning. Science (2019), 1287--1289."},{"key":"e_1_3_2_1_15_1","volume-title":"Large-scale adversarial training for vision-and-language representation learning. NeurIPS","author":"Gan Zhe","year":"2020","unstructured":"Zhe Gan, Yen-Chun Chen, Linjie Li, Chen Zhu, Yu Cheng, and Jingjing Liu. 2020. Large-scale adversarial training for vision-and-language representation learning. NeurIPS (2020), 6616--6628."},{"key":"e_1_3_2_1_16_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. International Journal of Computer Vision","author":"Gao Peng","year":"2023","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2023. Clip-adapter: Better vision-language models with feature adapters. International Journal of Computer Vision (2023), 581--595."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-09274-4"},{"key":"e_1_3_2_1_18_1","volume-title":"Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572","author":"Goodfellow Ian J","year":"2014","unstructured":"Ian J Goodfellow, Jonathon Shlens, and Christian Szegedy. 2014. Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572 (2014)."},{"key":"e_1_3_2_1_19_1","volume-title":"Uncertainty-aware alignment network for cross-domain video-text retrieval. NeurIPS","author":"Hao Xiaoshuai","year":"2024","unstructured":"Xiaoshuai Hao and Wanqian Zhang. 2024. Uncertainty-aware alignment network for cross-domain video-text retrieval. NeurIPS (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Xiaoshuai Hao Wanqian Zhang Dayan Wu Fei Zhu and Bo Li. 2022. Listen and look: Multi-modal aggregation and co-attention network for video-audio retrieval. In ICME. 1--6.","DOI":"10.1109\/ICME52920.2022.9859647"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Xiaoshuai Hao Wanqian Zhang Dayan Wu Fei Zhu and Bo Li. 2023. Dual alignment unsupervised domain adaptation for video-text retrieval. In CVPR. 18962--18972.","DOI":"10.1109\/CVPR52729.2023.01818"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Xiaoshuai Hao Yucan Zhou Dayan Wu Wanqian Zhang Bo Li and Weiping Wang. 2021. Multi-feature graph attention network for cross-modal video-text retrieval. In ICMR. 135--143.","DOI":"10.1145\/3460426.3463608"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Xiaoshuai Hao Yucan Zhou Dayan Wu Wanqian Zhang Bo Li Weiping Wang and Dan Meng. 2021. What matters: Attentive and relational feature aggregation network for video-text retrieval. In ICME. 1--6.","DOI":"10.1109\/ICME51207.2021.9428325"},{"key":"e_1_3_2_1_24_1","volume-title":"Mixgen: A new multi-modal data augmentation. In WACV. 379--389.","author":"Hao Xiaoshuai","year":"2023","unstructured":"Xiaoshuai Hao, Yi Zhu, Srikar Appalaraju, Aston Zhang, Wanqian Zhang, Bo Li, and Mu Li. 2023. Mixgen: A new multi-modal data augmentation. In WACV. 379--389."},{"key":"e_1_3_2_1_25_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Menglin Jia Luming Tang Bor-Chun Chen Claire Cardie Serge Belongie Bharath Hariharan and Ser-Nam Lim. 2022. Visual prompt tuning. In ECCV. 709--727.","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_27_1","volume-title":"Adversarial machine learning at scale. arXiv preprint arXiv:1611.01236","author":"Kurakin Alexey","year":"2016","unstructured":"Alexey Kurakin, Ian Goodfellow, and Samy Bengio. 2016. Adversarial machine learning at scale. arXiv preprint arXiv:1611.01236 (2016)."},{"key":"e_1_3_2_1_28_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. 12888--12900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. 12888--12900."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Ninghao Liu Hongxia Yang and Xia Hu. 2018. Adversarial detection with model interpretation. In KDD. 1803--1811.","DOI":"10.1145\/3219819.3220027"},{"key":"e_1_3_2_1_31_1","volume-title":"Adversarial training for large neural language models. arXiv preprint arXiv:2004.08994","author":"Liu Xiaodong","year":"2020","unstructured":"Xiaodong Liu, Hao Cheng, Pengcheng He, Weizhu Chen, Yu Wang, Hoifung Poon, and Jianfeng Gao. 2020. Adversarial training for large neural language models. arXiv preprint arXiv:2004.08994 (2020)."},{"key":"e_1_3_2_1_32_1","volume-title":"GPT understands, too. AI Open","author":"Liu Xiao","year":"2023","unstructured":"Xiao Liu, Yanan Zheng, Zhengxiao Du, Ming Ding, Yujie Qian, Zhilin Yang, and Jie Tang. 2023. GPT understands, too. AI Open (2023), 208--215."},{"key":"e_1_3_2_1_33_1","volume-title":"GuardReasoner: Towards Reasoning-based LLM Safeguards. arXiv preprint arXiv:2501.18492","author":"Liu Yue","year":"2025","unstructured":"Yue Liu, Hongcheng Gao, Shengfang Zhai, Jun Xia, TianyiWu, Zhiwei Xue, Yulin Chen, Kenji Kawaguchi, Jiaheng Zhang, and Bryan Hooi. 2025. GuardReasoner: Towards Reasoning-based LLM Safeguards. arXiv preprint arXiv:2501.18492 (2025)."},{"key":"e_1_3_2_1_34_1","volume-title":"Flipattack: Jailbreak llms via flipping. arXiv preprint arXiv:2410.02832","author":"Liu Yue","year":"2024","unstructured":"Yue Liu, Xiaoxin He, Miao Xiong, Jinlan Fu, Shumin Deng, and Bryan Hooi. 2024. Flipattack: Jailbreak llms via flipping. arXiv preprint arXiv:2410.02832 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"AFLoRA: Adaptive Freezing of Low Rank Adaptation in Parameter Efficient Fine-Tuning of Large Models. arXiv preprint arXiv:2403.13269","author":"Liu Zeyu","year":"2024","unstructured":"Zeyu Liu, Souvik Kundu, Anni Li, JunruiWan, Lianghao Jiang, and Peter Anthony Beerel. 2024. AFLoRA: Adaptive Freezing of Low Rank Adaptation in Parameter Efficient Fine-Tuning of Large Models. arXiv preprint arXiv:2403.13269 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"andWei Zhan","author":"Lu Haoyu","year":"2023","unstructured":"Haoyu Lu, Mingyu Ding, Yuqi Huo, Guoxing Yang, Zhiwu Lu, Masayoshi Tomizuka, andWei Zhan. 2023. UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling. arXiv preprint arXiv:2302.06605 (2023)."},{"key":"e_1_3_2_1_37_1","unstructured":"Yuning Lu Jianzhuang Liu Yonggang Zhang Yajing Liu and Xinmei Tian. 2022. Prompt distribution learning. In CVPR. 5206--5215."},{"key":"e_1_3_2_1_38_1","volume-title":"Understanding adversarial attacks on deep learning based medical image analysis systems. Pattern Recognition","author":"Ma Xingjun","year":"2021","unstructured":"Xingjun Ma, Yuhao Niu, Lin Gu, Yisen Wang, Yitian Zhao, James Bailey, and Feng Lu. 2021. Understanding adversarial attacks on deep learning based medical image analysis systems. Pattern Recognition (2021), 107332."},{"key":"e_1_3_2_1_39_1","volume-title":"Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083","author":"Madry Aleksander","year":"2017","unstructured":"Aleksander Madry. 2017. Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083 (2017)."},{"key":"e_1_3_2_1_40_1","volume-title":"Understanding zero-shot adversarial robustness for large-scale models. arXiv preprint arXiv:2212.07016","author":"Mao Chengzhi","year":"2022","unstructured":"Chengzhi Mao, Scott Geng, Junfeng Yang, Xin Wang, and Carl Vondrick. 2022. Understanding zero-shot adversarial robustness for large-scale models. arXiv preprint arXiv:2212.07016 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"On detecting adversarial perturbations. arXiv preprint arXiv:1702.04267","author":"Metzen Jan Hendrik","year":"2017","unstructured":"Jan Hendrik Metzen, Tim Genewein, Volker Fischer, and Bastian Bischoff. 2017. On detecting adversarial perturbations. arXiv preprint arXiv:1702.04267 (2017)."},{"key":"e_1_3_2_1_42_1","volume-title":"Jake Grigsby, Di Jin, and Yanjun Qi.","author":"Morris John","year":"2020","unstructured":"John Morris, Eli Lifland, Jin Yong Yoo, Jake Grigsby, Di Jin, and Yanjun Qi. 2020. TextAttack: A Framework for Adversarial Attacks, Data Augmentation, and Adversarial Training in NLP. In EMNLP. 119--126."},{"key":"e_1_3_2_1_43_1","volume-title":"LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning. arXiv preprint arXiv:2403.17919","author":"Pan Rui","year":"2024","unstructured":"Rui Pan, Xiang Liu, Shizhe Diao, Renjie Pi, Jipeng Zhang, Chi Han, and Tong Zhang. 2024. LISA: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning. arXiv preprint arXiv:2403.17919 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Bag of tricks for adversarial training. arXiv preprint arXiv:2010.00467","author":"Pang Tianyu","year":"2020","unstructured":"Tianyu Pang, Xiao Yang, Yinpeng Dong, Hang Su, and Jun Zhu. 2020. Bag of tricks for adversarial training. arXiv preprint arXiv:2010.00467 (2020)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Jing Pei Lei Deng Sen Song Mingguo Zhao Youhui Zhang ShuangWu Guanrui Wang Zhe Zou Zhenzhi Wu Wei He et al. 2019. Towards artificial general intelligence with hybrid Tianjic chip architecture. Nature (2019) 106--111.","DOI":"10.1038\/s41586-019-1424-8"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer LiweiWang Chris M Cervantes Juan C Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In ICCV. 2641--2649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_47_1","volume-title":"BiLoRA: A Bi-level Optimization Framework for Overfitting-Resilient Low-Rank Adaptation of Large Pre-trained Models. arXiv preprint arXiv:2403.13037","author":"Qiang Rushi","year":"2024","unstructured":"Rushi Qiang, Ruiyi Zhang, and Pengtao Xie. 2024. BiLoRA: A Bi-level Optimization Framework for Overfitting-Resilient Low-Rank Adaptation of Large Pre-trained Models. arXiv preprint arXiv:2403.13037 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR. 10684--10695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2976749.2978392"},{"key":"e_1_3_2_1_50_1","volume-title":"Intriguing properties of neural networks. arXiv preprint arXiv:1312.6199","author":"Szegedy Christian","year":"2013","unstructured":"Christian Szegedy, Wojciech Zaremba, Ilya Sutskever, Joan Bruna, Dumitru Erhan, Ian Goodfellow, and Rob Fergus. 2013. Intriguing properties of neural networks. arXiv preprint arXiv:1312.6199 (2013)."},{"key":"e_1_3_2_1_51_1","volume-title":"Re-identification of individuals in genomic datasets using public face images. Science advances","author":"Venkatesaramani Rajagopal","year":"2021","unstructured":"Rajagopal Venkatesaramani, Bradley A Malin, and Yevgeniy Vorobeychik. 2021. Re-identification of individuals in genomic datasets using public face images. Science advances (2021), eabg3296."},{"key":"e_1_3_2_1_52_1","volume-title":"Generalizing to unseen domains via adversarial data augmentation. NeurIPS","author":"Volpi Riccardo","year":"2018","unstructured":"Riccardo Volpi, Hongseok Namkoong, Ozan Sener, John C Duchi, Vittorio Murino, and Silvio Savarese. 2018. Generalizing to unseen domains via adversarial data augmentation. NeurIPS (2018), 5339--5349."},{"key":"e_1_3_2_1_53_1","unstructured":"Haixin Wang Xinlong Yang Jianlong Chang Dian Jin Jinan Sun Shikun Zhang Xiao Luo and Qi Tian. 2023. Parameter-efficient Tuning of Large-scale Multimodal Foundation Model. In NeurIPS. 15752--15774."},{"key":"e_1_3_2_1_54_1","volume-title":"LoRA Meets Dropout under a Unified Framework. arXiv preprint arXiv:2403.00812","author":"Wang Sheng","year":"2024","unstructured":"Sheng Wang, Liheng Chen, Jiyue Jiang, Boyang Xue, Lingpeng Kong, and Chuan Wu. 2024. LoRA Meets Dropout under a Unified Framework. arXiv preprint arXiv:2403.00812 (2024)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Yu-Xiong Wang Deva Ramanan and Martial Hebert. 2017. Growing a brain: Fine-tuning by increasing model capacity. In CVPR. 2471--2480.","DOI":"10.1109\/CVPR.2017.323"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Xingxing Wei Jun Zhu Sha Yuan and Hang Su. 2019. Sparse adversarial perturbations for videos. In AAAI. 8973--8980.","DOI":"10.1609\/aaai.v33i01.33018973"},{"key":"e_1_3_2_1_57_1","volume-title":"Mike Li, Simon Kornblith, Rebecca Roelofs, Raphael Gontijo Lopes, Hannaneh Hajishirzi, Ali Farhadi, Hongseok Namkoong, et al.","author":"Wortsman Mitchell","year":"2022","unstructured":"Mitchell Wortsman, Gabriel Ilharco, Jong Wook Kim, Mike Li, Simon Kornblith, Rebecca Roelofs, Raphael Gontijo Lopes, Hannaneh Hajishirzi, Ali Farhadi, Hongseok Namkoong, et al. 2022. Robust fine-tuning of zero-shot models. In CVPR. 7959--7971."},{"key":"e_1_3_2_1_58_1","volume-title":"Transferring textual knowledge for visual recognition. arXiv preprint arXiv:2207.01297","author":"Wu Wenhao","year":"2022","unstructured":"Wenhao Wu, Zhun Sun, and Wanli Ouyang. 2022. Transferring textual knowledge for visual recognition. arXiv preprint arXiv:2207.01297 (2022)."},{"key":"e_1_3_2_1_59_1","volume-title":"Class-aware visual prompt tuning for vision-language pre-trained model. arXiv preprint arXiv:2208.08340","author":"Xing Yinghui","year":"2022","unstructured":"Yinghui Xing, QiruiWu, De Cheng, Shizhou Zhang, Guoqiang Liang, and Yanning Zhang. 2022. Class-aware visual prompt tuning for vision-language pre-trained model. arXiv preprint arXiv:2208.08340 (2022)."},{"key":"e_1_3_2_1_60_1","volume-title":"Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296.","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296."},{"key":"e_1_3_2_1_61_1","volume-title":"AutoLoRa: An Automated Robust Fine-Tuning Framework. In The Twelfth International Conference on Learning Representations.","author":"Xu Xilie","unstructured":"Xilie Xu, Jingfeng Zhang, and Mohan Kankanhalli. [n. d.]. AutoLoRa: An Automated Robust Fine-Tuning Framework. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_62_1","volume-title":"Fulllora-at: Efficiently boosting the robustness of pretrained vision transformers. arXiv preprint arXiv:2401.01752","author":"Yuan Zheng","year":"2024","unstructured":"Zheng Yuan, Jie Zhang, and Shiguang Shan. 2024. Fulllora-at: Efficiently boosting the robustness of pretrained vision transformers. arXiv preprint arXiv:2401.01752 (2024)."},{"key":"e_1_3_2_1_63_1","volume-title":"Glipv2: Unifying localization and vision-language understanding. NeurIPS","author":"Zhang Haotian","year":"2022","unstructured":"Haotian Zhang, Pengchuan Zhang, Xiaowei Hu, Yen-Chun Chen, Liunian Li, Xiyang Dai, Lijuan Wang, Lu Yuan, Jenq-Neng Hwang, and Jianfeng Gao. 2022. Glipv2: Unifying localization and vision-language understanding. NeurIPS (2022), 36067--36080."},{"key":"e_1_3_2_1_64_1","volume-title":"Adversarial prompt tuning for vision-language models. arXiv preprint arXiv:2311.11261","author":"Zhang Jiaming","year":"2023","unstructured":"Jiaming Zhang, Xingjun Ma, Xin Wang, Lingyu Qiu, Jiaqi Wang, Yu-Gang Jiang, and Jitao Sang. 2023. Adversarial prompt tuning for vision-language models. arXiv preprint arXiv:2311.11261 (2023)."},{"key":"e_1_3_2_1_65_1","volume-title":"Adaptive budget allocation for parameterefficient fine-tuning. arXiv preprint arXiv:2303.10512","author":"Zhang Qingru","year":"2023","unstructured":"Qingru Zhang, Minshuo Chen, Alexander Bukharin, Pengcheng He, Yu Cheng, Weizhu Chen, and Tuo Zhao. 2023. Adaptive budget allocation for parameterefficient fine-tuning. arXiv preprint arXiv:2303.10512 (2023)."},{"key":"e_1_3_2_1_66_1","volume-title":"Qi Alfred Chen, and Z Morley Mao","author":"Zhang Qingzhao","year":"2022","unstructured":"Qingzhao Zhang, Shengtuo Hu, Jiachen Sun, Qi Alfred Chen, and Z Morley Mao. 2022. On adversarial robustness of trajectory prediction for autonomous vehicles. In CVPR. 15159--15168."},{"key":"e_1_3_2_1_67_1","volume-title":"Galore: Memory-efficient llm training by gradient low-rank projection. arXiv preprint arXiv:2403.03507","author":"Zhao Jiawei","year":"2024","unstructured":"Jiawei Zhao, Zhenyu Zhang, Beidi Chen, Zhangyang Wang, Anima Anandkumar, and Yuandong Tian. 2024. Galore: Memory-efficient llm training by gradient low-rank projection. arXiv preprint arXiv:2403.03507 (2024)."},{"key":"e_1_3_2_1_68_1","volume-title":"On evaluating adversarial robustness of large visionlanguage models. arXiv preprint arXiv:2305.16934","author":"Zhao Yunqing","year":"2023","unstructured":"Yunqing Zhao, Tianyu Pang, Chao Du, Xiao Yang, Chongxuan Li, Ngai-Man Cheung, and Min Lin. 2023. On evaluating adversarial robustness of large visionlanguage models. arXiv preprint arXiv:2305.16934 (2023)."},{"key":"e_1_3_2_1_69_1","volume-title":"Multi-LoRA Composition for Image Generation. arXiv preprint arXiv:2402.16843","author":"Zhong Ming","year":"2024","unstructured":"Ming Zhong, Yelong Shen, Shuohang Wang, Yadong Lu, Yizhu Jiao, Siru Ouyang, Donghan Yu, Jiawei Han, and Weizhu Chen. 2024. Multi-LoRA Composition for Image Generation. arXiv preprint arXiv:2402.16843 (2024)."},{"key":"e_1_3_2_1_70_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022. Learning to prompt for vision-language models. International Journal of Computer Vision (2022), 2337--2348."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","location":"Chicago IL USA","acronym":"ICMR '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733324","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:09:55Z","timestamp":1755749395000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733324"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":70,"alternative-id":["10.1145\/3731715.3733324","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733324","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}