{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T08:41:08Z","timestamp":1770799268517,"version":"3.50.0"},"reference-count":60,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFB3108100"],"award-info":[{"award-number":["2024YFB3108100"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476250"],"award-info":[{"award-number":["62476250"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23B2054"],"award-info":[{"award-number":["U23B2054"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62506339"],"award-info":[{"award-number":["62506339"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472335"],"award-info":[{"award-number":["62472335"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372137"],"award-info":[{"award-number":["62372137"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LZ22F020007"],"award-info":[{"award-number":["LZ22F020007"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LQN25F010018"],"award-info":[{"award-number":["LQN25F010018"]}]},{"DOI":"10.13039\/501100002481","name":"Postgraduate Research and Innovation Foundation of Zhejiang University of Science and Technology","doi-asserted-by":"publisher","award":["2024yjskc19"],"award-info":[{"award-number":["2024yjskc19"]}],"id":[{"id":"10.13039\/501100002481","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Image Process."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tip.2026.3651982","type":"journal-article","created":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T21:01:28Z","timestamp":1769634088000},"page":"1082-1095","source":"Crossref","is-referenced-by-count":0,"title":["Individual and Common Attack: Enhancing Transferability in VLP Models Through Modal Feature Exploitation"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4056-9755","authenticated-orcid":false,"given":"Yaguan","family":"Qian","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence and Information Engineering, Zhejiang University of Science and Technology, Hangzhou, China"}]},{"given":"Yaxin","family":"Kong","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Information Engineering, Zhejiang University of Science and Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1703-6090","authenticated-orcid":false,"given":"Qiqi","family":"Bao","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Information Engineering, Zhejiang University of Science and Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7546-852X","authenticated-orcid":false,"given":"Zhaoquan","family":"Gu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}]},{"given":"Bin","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang Key Laboratory of Artificial Intelligence of Things (AIoT) Network and Data Security, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4268-372X","authenticated-orcid":false,"given":"Shouling","family":"Ji","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"given":"Jianping","family":"Zhang","sequence":"additional","affiliation":[{"name":"Meta AI, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0791-189X","authenticated-orcid":false,"given":"Zhen","family":"Lei","sequence":"additional","affiliation":[{"name":"Center for Biometrics and Security Research and the National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Using left and right brains together: Towards vision and language planning","author":"Cen","year":"2024","journal-title":"arXiv:2402.10534"},{"key":"ref2","first-page":"32897","article-title":"VLMo: Unified vision-language pre-training with mixture-of-modality-experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_25"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3378918"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28359"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/IJCB62174.2024.10744523"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00416"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3430508"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01836"},{"key":"ref12","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"ref18","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"139","author":"Radford"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00427"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"ref24","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"Xu","year":"2015","journal-title":"arXiv:1502.03044"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.05.080"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612454"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00401"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3686835"},{"key":"ref30","article-title":"Adversarial robustness for visual grounding of multimodal large language models","author":"Gao","year":"2024","journal-title":"arXiv:2405.09981"},{"key":"ref31","article-title":"Semantic-aligned adversarial evolution triangle for high-transferability vision-language attack","author":"Jia","year":"2024","journal-title":"arXiv:2411.02669"},{"key":"ref32","first-page":"54111","article-title":"On evaluating adversarial robustness of large vision-language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhao"},{"key":"ref33","first-page":"52936","article-title":"VLATTACK: Multimodal adversarial attacks on vision-language tasks via pre-trained models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yin"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657781"},{"key":"ref35","first-page":"12116","article-title":"Do vision transformers see like convolutional neural networks?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Raghu"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6311"},{"key":"ref38","article-title":"TextBugger: Generating adversarial text against real-world applications","author":"Li","year":"2018","journal-title":"arXiv:1812.05271"},{"key":"ref39","article-title":"Adversarial GLUE: A multi-task benchmark for robustness evaluation of language models","author":"Wang","year":"2021","journal-title":"arXiv:2111.02840"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.500"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_34"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00444"},{"key":"ref43","article-title":"Nesterov accelerated gradient and scale invariance for adversarial attacks","author":"Lin","year":"2019","journal-title":"arXiv:1908.06281"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00196"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00289"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00173"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3368112"},{"key":"ref48","article-title":"An image is worth 16$\\times$\n16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv:2010.11929"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.06083"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref53","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. ACL Workshop Intrinsic Extrinsic Eval. Measures Mach. Transl. Summarization","author":"Banerjee"},{"key":"ref54","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Proc. Workshop Text Summarization Branches Out","author":"Lin"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref56","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref57","article-title":"PandaGPT: One model to instruction-follow them all","author":"Su","year":"2023","journal-title":"arXiv:2305.16355"},{"key":"ref58","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref59","first-page":"18378","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. 12th Int. Conf. Learn. Represent.","author":"Zhu"},{"key":"ref60","first-page":"49250","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/83\/11355710\/11364100.pdf?arnumber=11364100","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T21:10:21Z","timestamp":1770671421000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11364100\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/tip.2026.3651982","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}