{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T06:08:47Z","timestamp":1774937327930,"version":"3.50.1"},"reference-count":75,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62425603"],"award-info":[{"award-number":["62425603"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005145","name":"Basic Research Program of Jiangsu Province","doi-asserted-by":"publisher","award":["BK20240011"],"award-info":[{"award-number":["BK20240011"]}],"id":[{"id":"10.13039\/501100005145","id-type":"DOI","asserted-by":"publisher"}]},{"name":"AI Singapore Program under the National Research Foundation Singapore through AISG","award":["AISG3-GV-2023-011"],"award-info":[{"award-number":["AISG3-GV-2023-011"]}]},{"name":"Lee Kong Chian Fellowships"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans.Inform.Forensic Secur."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tifs.2025.3574976","type":"journal-article","created":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T17:13:10Z","timestamp":1748538790000},"page":"5663-5677","source":"Crossref","is-referenced-by-count":23,"title":["Modality-Specific Interactive Attack for Vision-Language Pre-Training Models"],"prefix":"10.1109","volume":"20","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2617-9665","authenticated-orcid":false,"given":"Haiqi","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6973-8121","authenticated-orcid":false,"given":"Hao","family":"Tang","sequence":"additional","affiliation":[{"name":"Centre for Smart Health, The Hong Kong Polytechnic University, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6249-5596","authenticated-orcid":false,"given":"Yanpeng","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3802-4644","authenticated-orcid":false,"given":"Shengfeng","family":"He","sequence":"additional","affiliation":[{"name":"School of Computing and Information Systems, Singapore Management University, Bras Basah, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5341-5985","authenticated-orcid":false,"given":"Zechao","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01836"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"ref4","first-page":"21480","article-title":"When does contrastive learning preserve adversarial robustness from pretraining to finetuning?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Fan"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"ref6","first-page":"52936","article-title":"VLATTACK: Multimodal adversarial attacks on vision-language tasks via pre-trained models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yin"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/SaTML54575.2023.00030"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00754"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2020.3036801"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2019.2945198"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2998275"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612454"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/60"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/109"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_25"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.06083"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.500"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00957"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00427"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_42"},{"key":"ref24","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3132068"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2852750"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref29","first-page":"1","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Dosovitskiy"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3441947"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3511621"},{"key":"ref33","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"34","author":"Li"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02218"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"ref36","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"139","author":"Radford"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108792"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289753"},{"key":"ref39","first-page":"1","article-title":"Explaining and harnessing adversarial examples","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Goodfellow"},{"key":"ref40","first-page":"1","article-title":"Nesterov accelerated gradient and scale invariance for adversarial attacks","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Lin"},{"key":"ref41","first-page":"1","article-title":"Intriguing properties of neural networks","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Szegedy"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_1"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2023.3346211"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3156809"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2023.3266702"},{"key":"ref46","first-page":"1","article-title":"On adversarial examples for character-level neural machine translation","volume-title":"Proc. Int. Conf. Comput. Linguistics","author":"Ebrahimi"},{"key":"ref47","first-page":"1","article-title":"Adversarial GLUE: A multi-task benchmark for robustness evaluation of language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2023.3245984"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3455775"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3350376"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3226905"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23138"},{"key":"ref53","first-page":"10791","article-title":"Cross-modal learning with adversarial samples","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00222"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403084"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3559758"},{"key":"ref57","article-title":"OT-attack: Enhancing adversarial transferability of vision-language models via optimal transport optimization","author":"Han","year":"2023","journal-title":"arXiv:2312.04403"},{"key":"ref58","article-title":"SA-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation","author":"He","year":"2023","journal-title":"arXiv:2312.04913"},{"key":"ref59","article-title":"Improving adversarial transferability of vision-language pre-training models through collaborative multimodal interaction","author":"Fu","year":"2024","journal-title":"arXiv:2403.10883"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00284"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.01.071"},{"key":"ref63","first-page":"1","article-title":"Transferable adversarial attack based on integrated gradients","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Huang"},{"key":"ref64","first-page":"3319","article-title":"Axiomatic attribution for deep networks","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"70","author":"Sundararajan"},{"key":"ref65","first-page":"23296","article-title":"Intriguing properties of vision transformers","volume-title":"Proc. NeurIPS","author":"Naseer"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/759"},{"key":"ref69","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref70","article-title":"Qwen technical report","volume-title":"arXiv:2309.16609","author":"Bai","year":"2023"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3617592"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01837"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"ref74","first-page":"1","article-title":"Exploring diverse in-context configurations for image captioning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"}],"container-title":["IEEE Transactions on Information Forensics and Security"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10206\/10810755\/11018132.pdf?arnumber=11018132","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T17:50:19Z","timestamp":1749837019000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11018132\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/tifs.2025.3574976","relation":{},"ISSN":["1556-6013","1556-6021"],"issn-type":[{"value":"1556-6013","type":"print"},{"value":"1556-6021","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}