{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:47:58Z","timestamp":1755802078743,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["JZ2024HGTB0256"],"award-info":[{"award-number":["JZ2024HGTB0256"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Open Project of Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, Anhui University","award":["MMC202412"],"award-info":[{"award-number":["MMC202412"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306098"],"award-info":[{"award-number":["62306098"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733367","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"779-787","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Joint Adversarial Purification: Mitigating the Threat of Multimodal Adversarial Examples"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7058-8791","authenticated-orcid":false,"given":"Qin","family":"Li","sequence":"first","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5621-6310","authenticated-orcid":false,"given":"Youze","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0639-2012","authenticated-orcid":false,"given":"Wenbo","family":"Hu","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5461-3986","authenticated-orcid":false,"given":"Richang","family":"Hong","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_2_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"e_1_3_2_1_4_1","volume-title":"Diffusion Posterior Sampling for General Noisy Inverse Problems. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=OnD9zGAGT0k","author":"Chung Hyungjin","year":"2023","unstructured":"Hyungjin Chung, Jeongsol Kim, Michael Thompson Mccann, Marc Louis Klasky, and Jong Chul Ye. 2023. Diffusion Posterior Sampling for General Noisy Inverse Problems. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=OnD9zGAGT0k"},{"key":"e_1_3_2_1_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_6_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems, Vol. 34 (2021), 8780--8794."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 7597--7607","author":"Diao Yunfeng","year":"2021","unstructured":"Yunfeng Diao, Tianjia Shao, Yong-Liang Yang, Kun Zhou, and He Wang. 2021. BASAR: Black-box attack on skeletal action recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 7597--7607."},{"key":"e_1_3_2_1_8_1","volume-title":"TASAR: Transfer-based Attack on Skeletal Action Recognition. In The International Conference on Learning Representations (ICLR).","author":"Diao Yunfeng","year":"2025","unstructured":"Yunfeng Diao, Baiqi Wu, Ruixuan Zhang, Ajian Liu, Xingxing Wei, Meng Wang, and He Wang. 2025. TASAR: Transfer-based Attack on Skeletal Action Recognition. In The International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_9_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"Improving Adversarial Transferability of Visual-Language Pre-training Models through Collaborative Multimodal Interaction. arXiv preprint arXiv:2403.10883","author":"Fu Jiyuan","year":"2024","unstructured":"Jiyuan Fu, Zhaoyu Chen, Kaixun Jiang, Haijing Guo, Jiafeng Wang, Shuyong Gao, and Wenqiang Zhang. 2024. Improving Adversarial Transferability of Visual-Language Pre-training Models through Collaborative Multimodal Interaction. arXiv preprint arXiv:2403.10883 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Sa-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation. arXiv preprint arXiv:2312.04913","author":"He Bangyan","year":"2023","unstructured":"Bangyan He, Xiaojun Jia, Siyuan Liang, Tianrui Lou, Yang Liu, and Xiaochun Cao. 2023. Sa-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation. arXiv preprint arXiv:2312.04913 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_14_1","volume-title":"Elucidating the design space of diffusion-based generative models. Advances in neural information processing systems","author":"Karras Tero","year":"2022","unstructured":"Tero Karras, Miika Aittala, Timo Aila, and Samuli Laine. 2022. Elucidating the design space of diffusion-based generative models. Advances in neural information processing systems, Vol. 35 (2022), 26565--26577."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.141"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475692"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475431"},{"key":"e_1_3_2_1_18_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3217449"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00037"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"e_1_3_2_1_22_1","volume-title":"Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083","author":"Madry Aleksander","year":"2017","unstructured":"Aleksander Madry. 2017. Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083 (2017)."},{"key":"e_1_3_2_1_23_1","volume-title":"Diffusion models for adversarial purification. arXiv preprint arXiv:2205.07460","author":"Nie Weili","year":"2022","unstructured":"Weili Nie, Brandon Guo, Yujia Huang, Chaowei Xiao, Arash Vahdat, and Anima Anandkumar. 2022. Diffusion models for adversarial purification. arXiv preprint arXiv:2205.07460 (2022)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_25_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Learning Representations.","author":"Samangouei Pouya","year":"2018","unstructured":"Pouya Samangouei, Maya Kabkab, and Rama Chellappa. 2018. Defense-GAN: Protecting classifiers against adversarial attacks using generative models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","volume-title":"Online adversarial purification based on self-supervision. arXiv preprint arXiv:2101.09387","author":"Shi Changhao","year":"2021","unstructured":"Changhao Shi, Chester Holtz, and Gal Mishne. 2021a. Online adversarial purification based on self-supervision. arXiv preprint arXiv:2101.09387 (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475637"},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=PxTIG12RRHS","author":"Song Yang","year":"2021","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2021. Score-Based Generative Modeling through Stochastic Differential Equations. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=PxTIG12RRHS"},{"key":"e_1_3_2_1_30_1","volume-title":"Guided diffusion model for adversarial purification. arXiv preprint arXiv:2205.14969","author":"Wang Jinyi","year":"2022","unstructured":"Jinyi Wang, Zhaoyang Lyu, Dahua Lin, Bo Dai, and Hongfei Fu. 2022. Guided diffusion model for adversarial purification. arXiv preprint arXiv:2205.14969 (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00196"},{"key":"e_1_3_2_1_32_1","volume-title":"Exploring transferability of multimodal adversarial samples for vision-language pre-training models with contrastive learning. arXiv preprint arXiv:2308.12636","author":"Wang Youze","year":"2023","unstructured":"Youze Wang, Wenbo Hu, Yinpeng Dong, Hanwang Zhang, Hang Su, and Richang Hong. 2023b. Exploring transferability of multimodal adversarial samples for vision-language pre-training models with contrastive learning. arXiv preprint arXiv:2308.12636 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3345167"},{"key":"e_1_3_2_1_34_1","volume-title":"ICLR 2025 Workshop on Building Trust in Language Models and Applications. https:\/\/openreview.net\/forum?id=9obhyu9csa","author":"Wang Youze","year":"2025","unstructured":"Youze Wang, Wenbo Hu, Qin Li, and Richang Hong. 2025. Boosting Adversarial Robustness of Vision-Language Pre-training Models against Multimodal Adversarial attacks. In ICLR 2025 Workshop on Building Trust in Language Models and Applications. https:\/\/openreview.net\/forum?id=9obhyu9csa"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00586"},{"key":"e_1_3_2_1_36_1","volume-title":"Guided diffusion model for adversarial purification from random noise. arXiv preprint arXiv:2206.10875","author":"Wu Quanlin","year":"2022","unstructured":"Quanlin Wu, Hang Ye, and Yuntian Gu. 2022. Guided diffusion model for adversarial purification from random noise. arXiv preprint arXiv:2206.10875 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706 (2019)."},{"key":"e_1_3_2_1_38_1","volume-title":"Feature Squeezing: Detecting Adversarial Examples in Deep Neural Networks. arXiv preprint arXiv:1704.01155","author":"Xu Weilin","year":"2017","unstructured":"Weilin Xu, David Evans, and Yanjun Qi. 2017. Feature Squeezing: Detecting Adversarial Examples in Deep Neural Networks. arXiv preprint arXiv:1704.01155 (2017)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 12062--12072","author":"Yoon Jongmin","year":"2021","unstructured":"Jongmin Yoon, Sung Ju Hwang, and Juho Lee. 2021. Adversarial purification with score-based generative models. In International Conference on Machine Learning. PMLR, 12062--12072."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733367","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:06:09Z","timestamp":1755749169000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733367"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":42,"alternative-id":["10.1145\/3731715.3733367","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733367","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}