{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:45:53Z","timestamp":1772909153340,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,6]]},"DOI":"10.1145\/3709026.3709059","type":"proceedings-article","created":{"date-parts":[[2025,2,15]],"date-time":"2025-02-15T10:05:41Z","timestamp":1739613941000},"page":"212-220","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["On Enhancing Adversarial Robustness of Large Pre-trained Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-5117-0187","authenticated-orcid":false,"given":"Jie","family":"Luo","sequence":"first","affiliation":[{"name":"The 723 institute of CSSC, Yangzhou, Jiangsu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1507-6927","authenticated-orcid":false,"given":"Lingfeng","family":"Kong","sequence":"additional","affiliation":[{"name":"The 723 institute of CSSC, Yangzhou, Jiangsu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,2,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"B.\u00a0Zi S.\u00a0Zhao X.\u00a0Ma and Y.\u00a0Jiang \u201cRevisiting adversarial robustness distillation: Robust soft labels make student better \u201d in 2021 IEEE\/CVF International Conference on Computer Vision ICCV 2021 Montreal QC Canada October 10-17 2021.\u00a0\u00a0\u00a0IEEE 2021 pp. 16\u00a0423\u201316\u00a0432.","DOI":"10.1109\/ICCV48922.2021.01613"},{"key":"e_1_3_3_1_3_2","unstructured":"A.\u00a0Radford J.\u00a0W. Kim C.\u00a0Hallacy A.\u00a0Ramesh G.\u00a0Goh S.\u00a0Agarwal G.\u00a0Sastry A.\u00a0Askell P.\u00a0Mishkin J.\u00a0Clark et\u00a0al. \u201cLearning transferable visual models from natural language supervision \u201d in International conference on machine learning.\u00a0\u00a0\u00a0PMLR 2021 pp. 8748\u20138763."},{"key":"e_1_3_3_1_4_2","unstructured":"C.\u00a0Mao S.\u00a0Geng J.\u00a0Yang X.\u00a0Wang and C.\u00a0Vondrick \u201cUnderstanding zero-shot adversarial robustness for large-scale models \u201d in ICLR.\u00a0\u00a0\u00a0OpenReview.net 2023."},{"key":"e_1_3_3_1_5_2","unstructured":"C.\u00a0Jia Y.\u00a0Yang Y.\u00a0Xia Y.-T. Chen Z.\u00a0Parekh H.\u00a0Pham Q.\u00a0Le Y.-H. Sung Z.\u00a0Li and T.\u00a0Duerig \u201cScaling up visual and vision-language representation learning with noisy text supervision \u201d in International conference on machine learning.\u00a0\u00a0\u00a0PMLR 2021 pp. 4904\u20134916."},{"key":"e_1_3_3_1_6_2","unstructured":"A.\u00a0Madry A.\u00a0Makelov L.\u00a0Schmidt D.\u00a0Tsipras and A.\u00a0Vladu \u201cTowards deep learning models resistant to adversarial attacks \u201d in ICLR (Poster).\u00a0\u00a0\u00a0OpenReview.net 2018."},{"key":"e_1_3_3_1_7_2","unstructured":"F.\u00a0Croce and M.\u00a0Hein \u201cReliable evaluation of adversarial robustness with an ensemble of diverse parameter-free attacks \u201d in ICML ser. Proceedings of Machine Learning Research vol. 119.\u00a0\u00a0\u00a0PMLR 2020 pp. 2206\u20132216."},{"key":"e_1_3_3_1_8_2","unstructured":"G.\u00a0Luo Y.\u00a0Zhou T.\u00a0Ren S.\u00a0Chen X.\u00a0Sun and R.\u00a0Ji \u201cCheap and quick: Efficient vision-language instruction tuning for large language models \u201d CoRR vol. abs\/2305.15023 2023."},{"key":"e_1_3_3_1_9_2","unstructured":"J.\u00a0Li D.\u00a0Li S.\u00a0Savarese and S.\u00a0C.\u00a0H. Hoi \u201cBLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models \u201d in ICML ser. Proceedings of Machine Learning Research vol. 202.\u00a0\u00a0\u00a0PMLR 2023 pp. 19\u00a0730\u201319\u00a0742."},{"key":"e_1_3_3_1_10_2","unstructured":"D.\u00a0Zhu J.\u00a0Chen X.\u00a0Shen X.\u00a0Li and M.\u00a0Elhoseiny \u201cMinigpt-4: Enhancing vision-language understanding with advanced large language models \u201d CoRR vol. abs\/2304.10592 2023."},{"key":"e_1_3_3_1_11_2","unstructured":"H.\u00a0Liu C.\u00a0Li Q.\u00a0Wu and Y.\u00a0J. Lee \u201cVisual instruction tuning \u201d CoRR vol. abs\/2304.08485 2023."},{"key":"e_1_3_3_1_12_2","unstructured":"Y.\u00a0Zhao T.\u00a0Pang C.\u00a0Du X.\u00a0Yang C.\u00a0Li N.\u00a0Cheung and M.\u00a0Lin \u201cOn evaluating adversarial robustness of large vision-language models \u201d CoRR vol. abs\/2305.16934 2023."},{"key":"e_1_3_3_1_13_2","unstructured":"Y.\u00a0Wang W.\u00a0Hu Y.\u00a0Dong and R.\u00a0Hong \u201cExploring transferability of multimodal adversarial samples for vision-language pre-training models with contrastive learning \u201d CoRR vol. abs\/2308.12636 2023."},{"key":"e_1_3_3_1_14_2","unstructured":"Y.\u00a0Dong H.\u00a0Chen J.\u00a0Chen Z.\u00a0Fang X.\u00a0Yang Y.\u00a0Zhang Y.\u00a0Tian H.\u00a0Su and J.\u00a0Zhu \u201cHow robust is google\u2019s bard to adversarial image attacks?\u201d CoRR vol. abs\/2309.11751 2023."},{"key":"e_1_3_3_1_15_2","unstructured":"H.\u00a0Zhang Y.\u00a0Yu J.\u00a0Jiao E.\u00a0P. Xing L.\u00a0E. Ghaoui and M.\u00a0I. Jordan \u201cTheoretically principled trade-off between robustness and accuracy \u201d in ICML ser. Proceedings of Machine Learning Research vol.\u00a097.\u00a0\u00a0\u00a0PMLR 2019 pp. 7472\u20137482."},{"key":"e_1_3_3_1_16_2","unstructured":"Y.\u00a0Zhai S.\u00a0Tong X.\u00a0Li M.\u00a0Cai Q.\u00a0Qu Y.\u00a0J. Lee and Y.\u00a0Ma \u201cInvestigating the catastrophic forgetting in multimodal large language models \u201d arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.10313 2023."},{"key":"e_1_3_3_1_17_2","unstructured":"D.\u00a0Tsipras S.\u00a0Santurkar L.\u00a0Engstrom A.\u00a0Turner and A.\u00a0Madry \u201cRobustness may be at odds with accuracy \u201d arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1805.12152 2018."},{"key":"e_1_3_3_1_18_2","unstructured":"P.\u00a0Lu S.\u00a0Mishra T.\u00a0Xia L.\u00a0Qiu K.-W. Chang S.-C. Zhu O.\u00a0Tafjord P.\u00a0Clark and A.\u00a0Kalyan \u201cLearn to explain: Multimodal reasoning via thought chains for science question answering \u201d Advances in Neural Information Processing Systems vol.\u00a035 pp. 2507\u20132521 2022."},{"key":"e_1_3_3_1_19_2","unstructured":"R.\u00a0Mokady A.\u00a0Hertz and A.\u00a0H. Bermano \u201cClipcap: Clip prefix for image captioning \u201d arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.09734 2021."},{"key":"e_1_3_3_1_20_2","unstructured":"A.\u00a0Radford J.\u00a0Wu R.\u00a0Child D.\u00a0Luan D.\u00a0Amodei I.\u00a0Sutskever et\u00a0al. \u201cLanguage models are unsupervised multitask learners \u201d OpenAI blog vol.\u00a01 no.\u00a08 p.\u00a09 2019."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"B.\u00a0Huang M.\u00a0Chen Y.\u00a0Wang J.\u00a0Lu M.\u00a0Cheng and W.\u00a0Wang \u201cBoosting accuracy and robustness of student models via adaptive adversarial distillation \u201d in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition 2023 pp. 24\u00a0668\u201324\u00a0677.","DOI":"10.1109\/CVPR52729.2023.02363"},{"key":"e_1_3_3_1_22_2","unstructured":"A.\u00a0Krizhevsky G.\u00a0Hinton et\u00a0al. \u201cLearning multiple layers of features from tiny images \u201d 2009."},{"key":"e_1_3_3_1_23_2","unstructured":"A.\u00a0Coates A.\u00a0Ng and H.\u00a0Lee \u201cAn analysis of single-layer networks in unsupervised feature learning \u201d in Proceedings of the fourteenth international conference on artificial intelligence and statistics.\u00a0\u00a0\u00a0JMLR Workshop and Conference Proceedings 2011 pp. 215\u2013223."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"L.\u00a0Bossard M.\u00a0Guillaumin and L.\u00a0Van\u00a0Gool \u201cFood-101\u2013mining discriminative components with random forests \u201d in Computer Vision\u2013ECCV 2014: 13th European Conference Zurich Switzerland September 6-12 2014 Proceedings Part VI 13.\u00a0\u00a0\u00a0Springer 2014 pp. 446\u2013461.","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"M.-E. Nilsback and A.\u00a0Zisserman \u201cAutomated flower classification over a large number of classes \u201d in 2008 Sixth Indian conference on computer vision graphics & image processing.\u00a0\u00a0\u00a0IEEE 2008 pp. 722\u2013729.","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_3_1_26_2","unstructured":"S.\u00a0Maji E.\u00a0Rahtu J.\u00a0Kannala M.\u00a0Blaschko and A.\u00a0Vedaldi \u201cFine-grained visual classification of aircraft \u201d arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1306.5151 2013."},{"key":"e_1_3_3_1_27_2","unstructured":"X.\u00a0Chen H.\u00a0Fang T.-Y. Lin R.\u00a0Vedantam S.\u00a0Gupta P.\u00a0Doll\u00e1r and C.\u00a0L. Zitnick \u201cMicrosoft coco captions: Data collection and evaluation server \u201d arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1504.00325 2015."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"T.-Y. Lin M.\u00a0Maire S.\u00a0Belongie J.\u00a0Hays P.\u00a0Perona D.\u00a0Ramanan P.\u00a0Doll\u00e1r and C.\u00a0L. Zitnick \u201cMicrosoft coco: Common objects in context \u201d in Computer Vision\u2013ECCV 2014: 13th European Conference Zurich Switzerland September 6-12 2014 Proceedings Part V 13.\u00a0\u00a0\u00a0Springer 2014 pp. 740\u2013755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"A.\u00a0Karpathy and L.\u00a0Fei-Fei \u201cDeep visual-semantic alignments for generating image descriptions \u201d in Proceedings of the IEEE conference on computer vision and pattern recognition 2015 pp. 3128\u20133137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"K.\u00a0Papineni S.\u00a0Roukos T.\u00a0Ward and W.-J. Zhu \u201cBleu: a method for automatic evaluation of machine translation \u201d in Proceedings of the 40th annual meeting of the Association for Computational Linguistics 2002 pp. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_3_1_31_2","unstructured":"S.\u00a0Banerjee and A.\u00a0Lavie \u201cMeteor: An automatic metric for mt evaluation with improved correlation with human judgments \u201d in Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization 2005 pp. 65\u201372."},{"key":"e_1_3_3_1_32_2","unstructured":"C.-Y. Lin \u201cRouge: A package for automatic evaluation of summaries \u201d in Text summarization branches out 2004 pp. 74\u201381."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"R.\u00a0Vedantam C.\u00a0Lawrence\u00a0Zitnick and D.\u00a0Parikh \u201cCider: Consensus-based image description evaluation \u201d in Proceedings of the IEEE conference on computer vision and pattern recognition 2015 pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_1_34_2","unstructured":"G.\u00a0E. Hinton O.\u00a0Vinyals and J.\u00a0Dean \u201cDistilling the knowledge in a neural network \u201d CoRR vol. abs\/1503.02531 2015."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"J.\u00a0Yim D.\u00a0Joo J.\u00a0Bae and J.\u00a0Kim \u201cA gift from knowledge distillation: Fast optimization network minimization and transfer learning \u201d in CVPR.\u00a0\u00a0\u00a0IEEE Computer Society 2017 pp. 7130\u20137138.","DOI":"10.1109\/CVPR.2017.754"},{"key":"e_1_3_3_1_36_2","unstructured":"A.\u00a0Romero N.\u00a0Ballas S.\u00a0E. Kahou A.\u00a0Chassang C.\u00a0Gatta and Y.\u00a0Bengio \u201cFitnets: Hints for thin deep nets \u201d in ICLR (Poster) 2015."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"W.\u00a0Park D.\u00a0Kim Y.\u00a0Lu and M.\u00a0Cho \u201cRelational knowledge distillation \u201d in CVPR.\u00a0\u00a0\u00a0Computer Vision Foundation \/ IEEE 2019 pp. 3967\u20133976.","DOI":"10.1109\/CVPR.2019.00409"}],"event":{"name":"CSAI 2024: 2024 8th International Conference on Computer Science and Artificial Intelligence (CSAI)","location":"Beijing China","acronym":"CSAI 2024"},"container-title":["Proceedings of the 2024 8th International Conference on Computer Science and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3709026.3709059","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3709026.3709059","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:31Z","timestamp":1750295851000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3709026.3709059"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"references-count":36,"alternative-id":["10.1145\/3709026.3709059","10.1145\/3709026"],"URL":"https:\/\/doi.org\/10.1145\/3709026.3709059","relation":{},"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"2025-02-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}