{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,31]],"date-time":"2026-07-31T15:25:52Z","timestamp":1785511552636,"version":"3.56.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Hubei Province Key Research and Development Technology Special Innovation Project","award":["No.2021BAA032"],"award-info":[{"award-number":["No.2021BAA032"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.U20A20177 & No.62202186"],"award-info":[{"award-number":["No.U20A20177 & No.62202186"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612454","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"6311-6320","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":64,"title":["AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal Contrastive Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6785-7306","authenticated-orcid":false,"given":"Ziqi","family":"Zhou","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0042-9045","authenticated-orcid":false,"given":"Shengshan","family":"Hu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1735-2024","authenticated-orcid":false,"given":"Minghui","family":"Li","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6805-6401","authenticated-orcid":false,"given":"Hangtao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0551-1200","authenticated-orcid":false,"given":"Yechao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3934-7605","authenticated-orcid":false,"given":"Hai","family":"Jin","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3600270.3600358"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682430"},{"key":"e_1_3_2_1_3_1","volume-title":"Adversarial patch. arXiv preprint arXiv:1712.09665","author":"Brown Tom B.","year":"2017","unstructured":"Tom B. Brown, Dandelion Man\u00e9, Aurko Roy, Mart\u00edn Abadi, and Justin Gilmer. 2017. Adversarial patch. arXiv preprint arXiv:1712.09665 (2017)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495883"},{"key":"e_1_3_2_1_5_1","volume-title":"Defensive distillation is not robust to adversarial examples. arXiv preprint arXiv:1607.04311","author":"Carlini Nicholas","year":"2016","unstructured":"Nicholas Carlini and David Wagner. 2016. Defensive distillation is not robust to adversarial examples. arXiv preprint arXiv:1607.04311 (2016)."},{"key":"e_1_3_2_1_6_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2019","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2019. Uniter: Learning universal image-text representations. CoRR (2019)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 14th International Conference on Artificial Intelligence and Statistics (AISTATS'11). JMLR Workshop and Conference Proceedings, 215--223","author":"Coates Adam","year":"2011","unstructured":"Adam Coates, Andrew Ng, and Honglak Lee. 2011. An analysis of single-layer networks in unsupervised feature learning. In Proceedings of the 14th International Conference on Artificial Intelligence and Statistics (AISTATS'11). JMLR Workshop and Conference Proceedings, 215--223."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3548606.3559355"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP40778.2020.9191288"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/3540261.3541904"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00473"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_15_1","volume-title":"Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572","author":"Goodfellow Ian J.","year":"2014","unstructured":"Ian J. Goodfellow, Jonathon Shlens, and Christian Szegedy. 2014. Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572 (2014)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/SPW.2018.00015"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01459"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25166"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475396"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548272"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP46214.2022.9833644"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML'18)","author":"Karmon Danny","year":"2018","unstructured":"Danny Karmon, Daniel Zoran, and Yoav Goldberg. 2018. Lavan: Localized and visible adversarial noise. In Proceedings of the International Conference on Machine Learning (ICML'18). PMLR, 2507--2515."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475692"},{"key":"e_1_3_2_1_24_1","unstructured":"Alex Krizhevsky and Geoffrey Hinton. 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the 35th International Conference on Neural Information Processing Systems (NeurIPS'21)","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. In Proceedings of the 35th International Conference on Neural Information Processing Systems (NeurIPS'21). 9694--9705."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612254"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33011028"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 31st USENIX Security Symposium (USENIX Security'22)","author":"Liu Hongbin","year":"2022","unstructured":"Hongbin Liu, Jinyuan Jia, and Neil Zhenqiang Gong. 2022. PoisonedEncoder: Poisoning the Unlabeled Pre-training Data in Contrastive Learning. In Proceedings of the 31st USENIX Security Symposium (USENIX Security'22). 3629--3645."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460120.3484749"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS'19)","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS'19)."},{"key":"e_1_3_2_1_32_1","volume-title":"Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083","author":"Madry Aleksander","year":"2017","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2017. Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083 (2017)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.17"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the British Machine Vision Conference (BMVC'17)","author":"Mopuri Konda Reddy","unstructured":"Konda Reddy Mopuri, Utsav Garg, and R. Venkatesh Babu. 2017. Fast Feature Fool: A data independent approach to universal adversarial perturbations. In Proceedings of the British Machine Vision Conference (BMVC'17). BMVA Press."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00034"},{"key":"e_1_3_2_1_36_1","volume-title":"How Adversarial Robustness Transfers from Pre-training to Downstream Tasks. arXiv preprint arXiv:2208.03835","author":"Nern Laura Fee","year":"2022","unstructured":"Laura Fee Nern and Yash Sharma. 2022. How Adversarial Robustness Transfers from Pre-training to Downstream Tasks. arXiv preprint arXiv:2208.03835 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2852503"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML'21)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning (ICML'21). PMLR, 8748--8763."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the North American Chapter of the Association for Computational Linguistics - Human Language Technologies Workshop (NAACL-HLTW'10)","author":"Rashtchian Cyrus","year":"2010","unstructured":"Cyrus Rashtchian, Peter Young, Micah Hodosh, and Julia Hockenmaier. 2010. Collecting image annotations using amazon's mechanical turk. In Proceedings of the North American Chapter of the Association for Computational Linguistics - Human Language Technologies Workshop (NAACL-HLTW'10). 139--147."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6017"},{"key":"e_1_3_2_1_44_1","volume-title":"Universal adversarial attacks with natural triggers for text classification. arXiv preprint arXiv:2005.00174","author":"Song Liwei","year":"2020","unstructured":"Liwei Song, Xinwei Yu, Hsuan-Tung Peng, and Karthik Narasimhan. 2020. Universal adversarial attacks with natural triggers for text classification. arXiv preprint arXiv:2005.00174 (2020)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2012.02.016"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.77"},{"key":"e_1_3_2_1_47_1","volume-title":"Adversarial Patch Attacks against Aerial Imagery Object Detectors. Neurocomputing","author":"Tang Guijian","year":"2023","unstructured":"Guijian Tang, Tingsong Jiang, Weien Zhou, Chao Li, Wen Yao, and Yong Zhao. 2023. Adversarial Patch Attacks against Aerial Imagery Object Detectors. Neurocomputing (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454814"},{"key":"e_1_3_2_1_49_1","volume-title":"Universal adversarial triggers for attacking and analyzing NLP. arXiv preprint arXiv:1908.07125","author":"Wallace Eric","year":"2019","unstructured":"Eric Wallace, Shi Feng, Nikhil Kandpal, Matt Gardner, and Sameer Singh. 2019. Universal adversarial triggers for attacking and analyzing NLP. arXiv preprint arXiv:1908.07125 (2019)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/GlobalSIP.2018.8646578"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548003"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_11"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00692"},{"key":"e_1_3_2_1_54_1","volume-title":"A Comprehensive Empirical Study of Vision-Language Pre-trained Model for Supervised Cross-Modal Retrieval. arXiv preprint arXiv:2201.02772","author":"Zeng Zhixiong","year":"2022","unstructured":"Zhixiong Zeng and Wenji Mao. 2022. A Comprehensive Empirical Study of Vision-Language Pre-trained Model for Supervised Cross-Modal Retrieval. arXiv preprint arXiv:2201.02772 (2022)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00401"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_1"},{"key":"e_1_3_2_1_58_1","volume-title":"To prune, or not to prune: exploring the efficacy of pruning for model compression. arXiv preprint arXiv:1710.01878","author":"Zhu Michael","year":"2017","unstructured":"Michael Zhu and Suyog Gupta. 2017. To prune, or not to prune: exploring the efficacy of pruning for model compression. arXiv preprint arXiv:1710.01878 (2017)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.5121\/ijcsit.2012.4304"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612454","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612454","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:08:54Z","timestamp":1755821334000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612454"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":59,"alternative-id":["10.1145\/3581783.3612454","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612454","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}