{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T00:19:01Z","timestamp":1758845941344,"version":"3.44.0"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T00:00:00Z","timestamp":1757289600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T00:00:00Z","timestamp":1757289600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Complex Intell. Syst."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s40747-025-02036-y","type":"journal-article","created":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T15:10:54Z","timestamp":1757344254000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["JDA-attack: leveraging joint multimodal data augmentation to enhance adversarial transferability of vision-language pre-training models"],"prefix":"10.1007","volume":"11","author":[{"given":"Xujie","family":"Ren","sequence":"first","affiliation":[]},{"given":"Caikun","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Guoqiang","family":"Han","sequence":"additional","affiliation":[]},{"given":"Bin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zepeng","family":"Fan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,8]]},"reference":[{"key":"2036_CR1","first-page":"32897","volume":"35","author":"H Bao","year":"2022","unstructured":"Bao H, Wang W, Dong L et al (2022) Vlmo: unified vision-language pre-training with mixture-of-modality-experts. Adv Neural Inf Process Syst 35:32897\u201332912","journal-title":"Adv Neural Inf Process Syst"},{"key":"2036_CR2","doi-asserted-by":"crossref","unstructured":"Carlini N, Wagner D (2017) Towards evaluating the robustness of neural networks. In: 2017 ieee symposium on security and privacy (sp). IEEE. pp 39\u201357","DOI":"10.1109\/SP.2017.49"},{"issue":"1","key":"2036_CR3","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"FL Chen","year":"2023","unstructured":"Chen FL, Zhang DZ, Han ML et al (2023) Vlp: a survey on vision-language pre-training. Mach Intell Res 20(1):38\u201356","journal-title":"Mach Intell Res"},{"key":"2036_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110809","volume":"156","author":"H Cheng","year":"2024","unstructured":"Cheng H, Ye H, Zhou X et al (2024) Vision-language pre-training via modal interaction. Pattern Recogn 156:110809","journal-title":"Pattern Recogn"},{"key":"2036_CR5","doi-asserted-by":"publisher","first-page":"5856","DOI":"10.1109\/TIP.2022.3202366","volume":"31","author":"Y Deng","year":"2022","unstructured":"Deng Y, Karam LJ (2022) Frequency-tuned universal adversarial attacks on texture recognition. IEEE Trans Image Process 31:5856\u20135868","journal-title":"IEEE Trans Image Process"},{"key":"2036_CR6","doi-asserted-by":"crossref","unstructured":"Dong Y, Liao F, Pang T, et\u00a0al (2018) Boosting adversarial attacks with momentum. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 9185\u20139193","DOI":"10.1109\/CVPR.2018.00957"},{"key":"2036_CR7","unstructured":"Fu J, Chen Z, Jiang K, et\u00a0al (2024) Improving adversarial transferability of visual-language pre-training models through collaborative multimodal interaction. arXiv preprint arXiv:2403.10883. Accessed 10 Dec 2024"},{"key":"2036_CR8","doi-asserted-by":"crossref","unstructured":"Ganeshan A, BS V, Babu RV (2019) Fda: Feature disruptive attack. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp 8069\u20138079","DOI":"10.1109\/ICCV.2019.00816"},{"key":"2036_CR9","first-page":"442","volume-title":"European Conference on Computer Vision","author":"S Gao","year":"2024","unstructured":"Gao S, Jia X, Ren X et al (2024) Boosting transferability in vision-language attacks via diversification along the intersection region of adversarial trajectory. European Conference on Computer Vision. Springer, Cham, pp 442\u2013460"},{"key":"2036_CR10","doi-asserted-by":"crossref","unstructured":"Guo C, Sablayrolles A, J\u00e9gou H, et\u00a0al (2021) Gradient-based adversarial attacks against text transformers. arXiv preprint arXiv:2104.13733. Accessed 10 Dec 2024","DOI":"10.18653\/v1\/2021.emnlp-main.464"},{"key":"2036_CR11","unstructured":"Han D, Jia X, Bai Y, et\u00a0al (2023) Ot-attack: Enhancing adversarial transferability of vision-language models via optimal transport optimization. arXiv preprint arXiv:2312.04403. Accessed 10 Dec 2024"},{"key":"2036_CR12","doi-asserted-by":"crossref","unstructured":"Hao X, Zhu Y, Appalaraju S, et\u00a0al (2023) Mixgen: a new multi-modal data augmentation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision. pp 379\u2013389","DOI":"10.1109\/WACVW58289.2023.00042"},{"key":"2036_CR13","unstructured":"He B, Jia X, Liang S, et\u00a0al (2023) Sa-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation. arXiv preprint arXiv:2312.04913. Accessed 10 Dec 2024"},{"key":"2036_CR14","doi-asserted-by":"crossref","unstructured":"Hessel J, Holtzman A, Forbes M, et\u00a0al (2021) Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718. Accessed 10 Dec 2024","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"2036_CR15","unstructured":"Kim W, Son B, Kim I (2021) Vilt: vision-and-language transformer without convolution or region supervision. In: International conference on machine learning, PMLR. pp 5583\u20135594"},{"key":"2036_CR16","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1201\/9781351251389-8","volume-title":"Artificial intelligence safety and security","author":"A Kurakin","year":"2018","unstructured":"Kurakin A, Goodfellow IJ, Bengio S (2018) Adversarial examples in the physical world. Artificial intelligence safety and security. Chapman and Hall, CRC, pp 99\u2013112"},{"key":"2036_CR17","doi-asserted-by":"crossref","unstructured":"Lei C, Luo S, Liu Y, et\u00a0al (2021) Understanding chinese video and language via contrastive multimodal pre-training. In: Proceedings of the 29th ACM International Conference on Multimedia. pp 2567\u20132576","DOI":"10.1145\/3474085.3475431"},{"key":"2036_CR18","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li J, Selvaraju R, Gotmare A et al (2021) Align before fuse: vision and language representation learning with momentum distillation. Adv Neural Inf Process Syst 34:9694\u20139705","journal-title":"Adv Neural Inf Process Syst"},{"key":"2036_CR19","unstructured":"Li J, Li D, Xiong C, et\u00a0al (2022) Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International conference on machine learning, PMLR. pp 12888\u201312900"},{"key":"2036_CR20","doi-asserted-by":"crossref","unstructured":"Li L, Ma R, Guo Q, et\u00a0al (2020) Bert-attack: Adversarial attack against bert using bert. arXiv preprint arXiv:2004.09984. Accessed 10 Dec 2024","DOI":"10.18653\/v1\/2020.emnlp-main.500"},{"key":"2036_CR21","unstructured":"Lin J, Song C, He K, et\u00a0al (2019) Nesterov accelerated gradient and scale invariance for adversarial attacks. arXiv preprint arXiv:1908.06281. Accessed 10 Dec 2024"},{"key":"2036_CR22","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, et\u00a0al (2014) Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, Springer, Cham. pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2036_CR23","first-page":"549","volume-title":"European conference on computer vision","author":"Y Long","year":"2022","unstructured":"Long Y, Zhang Q, Zeng B et al (2022) Frequency domain model augmentation for adversarial attack. European conference on computer vision. Springer, Cham, pp 549\u2013566"},{"key":"2036_CR24","doi-asserted-by":"crossref","unstructured":"Lu D, Wang Z, Wang T, et\u00a0al (2023) Set-level guidance attack: Boosting adversarial transferability of vision-language pre-training models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp 102\u2013111","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"2036_CR25","unstructured":"Madry A, Makelov A, Schmidt L, et\u00a0al (2017) Towards deep learning models resistant to adversarial attacks. arXiv preprint arXiv:1706.06083. Accessed 10 Dec 2024"},{"key":"2036_CR26","doi-asserted-by":"crossref","unstructured":"Plummer BA, Wang L, Cervantes CM, et\u00a0al (2015) Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision. pp 2641\u20132649","DOI":"10.1109\/ICCV.2015.303"},{"key":"2036_CR27","unstructured":"Radford A, Kim JW, Hallacy C, et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR. pp 8748\u20138763"},{"key":"2036_CR28","first-page":"429","volume-title":"European conference on computer vision","author":"S Rao","year":"2020","unstructured":"Rao S, Stutz D, Schiele B (2020) Adversarial training against location-optimized adversarial patches. European conference on computer vision. Springer, Cham, pp 429\u2013448"},{"key":"2036_CR29","doi-asserted-by":"publisher","first-page":"68633","DOI":"10.1109\/ACCESS.2022.3185748","volume":"10","author":"YE Seyyar","year":"2022","unstructured":"Seyyar YE, Yavuz AG, \u00dcnver HM (2022) An attack detection framework based on bert and deep learning. IEEE Access 10:68633\u201368644","journal-title":"IEEE Access"},{"key":"2036_CR30","doi-asserted-by":"crossref","unstructured":"Wang H, Dong K, Zhu Z, et\u00a0al (2024) Transferable multimodal attack on vision-language pre-training models. In: 2024 IEEE Symposium on Security and Privacy (SP). IEEE Computer Society. pp 102\u2013102","DOI":"10.1109\/SP54263.2024.00102"},{"key":"2036_CR31","doi-asserted-by":"crossref","unstructured":"Wang X, He K (2021) Enhancing the transferability of adversarial attacks through variance tuning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 1924\u20131933","DOI":"10.1109\/CVPR46437.2021.00196"},{"key":"2036_CR32","doi-asserted-by":"crossref","unstructured":"Wang X, He X, Wang J, et\u00a0al (2021) Admix: Enhancing the transferability of adversarial attacks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp 16158\u201316167","DOI":"10.1109\/ICCV48922.2021.01585"},{"key":"2036_CR33","doi-asserted-by":"crossref","unstructured":"Wang X, Zhang Z, Zhang J (2023) Structure invariant transformation for better adversarial transferability. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp 4607\u20134619","DOI":"10.1109\/ICCV51070.2023.00425"},{"key":"2036_CR34","unstructured":"Wang Y, Hu W, Dong Y, et\u00a0al (2023) Exploring transferability of multimodal adversarial samples for vision-language pre-training models with contrastive learning. arXiv preprint arXiv:2308.12636. Accessed 10 Dec 2024"},{"key":"2036_CR35","doi-asserted-by":"crossref","unstructured":"Wei J, Zou K (2019) Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196. Accessed 10 Dec 2024","DOI":"10.18653\/v1\/D19-1670"},{"key":"2036_CR36","doi-asserted-by":"crossref","unstructured":"Wu W, Su Y, Chen X, et\u00a0al (2020) Boosting the transferability of adversarial samples via attention. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 1161\u20131170","DOI":"10.1109\/CVPR42600.2020.00124"},{"key":"2036_CR37","doi-asserted-by":"crossref","unstructured":"Xie C, Zhang Z, Zhou Y, et\u00a0al (2019) Improving transferability of adversarial examples with input diversity. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 2730\u20132739","DOI":"10.1109\/CVPR.2019.00284"},{"key":"2036_CR38","doi-asserted-by":"crossref","unstructured":"Yang J, Duan J, Tran S, et\u00a0al (2022) Vision-language pre-training with triple contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp 15671\u201315680","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"2036_CR39","unstructured":"Yin Z, Ye M, Zhang T, et\u00a0al (2024) Vlattack: Multimodal adversarial attacks on vision-language tasks via pre-trained models. Adv Neural Inf Process Syst. 36:52936\u201352956"},{"key":"2036_CR40","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part II 14","author":"L Yu","year":"2016","unstructured":"Yu L, Poirson P, Yang S et al (2016) Modeling context in referring expressions. Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part II 14. Springer, Cham, pp 69\u201385"},{"key":"2036_CR41","unstructured":"Yuan L, Zhang Y, Chen Y, et\u00a0al (2021) Bridge the gap between cv and nlp! a gradient-based textual adversarial attack framework. arXiv preprint arXiv:2110.15317. Accessed 10 Dec 2024"},{"key":"2036_CR42","doi-asserted-by":"crossref","unstructured":"Zhang J, Yi Q, Sang J (2022) Towards adversarial attack on vision-language pre-training models. In: Proceedings of the 30th ACM International Conference on Multimedia. pp 5005\u20135013","DOI":"10.1145\/3503161.3547801"},{"key":"2036_CR43","doi-asserted-by":"crossref","unstructured":"Zhang J, Huang J, Jin S, et\u00a0al (2024) Vision-language models for vision tasks: a survey. IEEE Trans Pattern Anal Mach Intell 46(8):5625\u20135644","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"2036_CR44","doi-asserted-by":"crossref","unstructured":"Zhang PF, Huang Z, Bai G (2024) Universal adversarial perturbations for vision-language pre-trained models. In: Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval. pp 862\u2013871","DOI":"10.1145\/3626772.3657781"},{"key":"2036_CR45","doi-asserted-by":"crossref","unstructured":"Zheng H, Deng X, Jiang W, et\u00a0al (2024) A unified understanding of adversarial vulnerability regarding unimodal models and vision-language pre-training models. In: Proceedings of the 32nd ACM International Conference on Multimedia. pp 18\u201327","DOI":"10.1145\/3664647.3681184"},{"key":"2036_CR46","doi-asserted-by":"crossref","unstructured":"Zhou Z, Hu S, Li M, et\u00a0al (2023) Advclip: Downstream-agnostic adversarial examples in multimodal contrastive learning. In: Proceedings of the 31st ACM International Conference on Multimedia. pp 6311\u20136320","DOI":"10.1145\/3581783.3612454"},{"issue":"5","key":"2036_CR47","doi-asserted-by":"publisher","first-page":"6051","DOI":"10.1007\/s40747-023-01060-0","volume":"9","author":"P Zhu","year":"2023","unstructured":"Zhu P, Hong J, Li X et al (2023) Sgma: a novel adversarial attack approach with improved transferability. Complex Intell Syst 9(5):6051\u20136063","journal-title":"Complex Intell Syst"},{"key":"2036_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.cose.2023.103674","volume":"139","author":"P Zhu","year":"2024","unstructured":"Zhu P, Fan Z, Guo S et al (2024) Improving adversarial transferability through hybrid augmentation. Comput Secur 139:103674","journal-title":"Comput Secur"},{"key":"2036_CR49","unstructured":"Zhu P, Pan Z, Liu Y, et\u00a0al (2024) A general black-box adversarial attack on graph-based fake news detectors. arXiv preprint arXiv:2404.15744. Accessed 10 Dec 2024"},{"key":"2036_CR50","doi-asserted-by":"crossref","unstructured":"Zhu P, Pan Z, Tang K, et\u00a0al (2024) Node injection attack based on label propagation against graph neural network. IEEE Trans Comput Soc Syst 11(5):5858\u20135870","DOI":"10.1109\/TCSS.2024.3395794"}],"container-title":["Complex &amp; Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-02036-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40747-025-02036-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-02036-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,25]],"date-time":"2025-09-25T13:34:58Z","timestamp":1758807298000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40747-025-02036-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":50,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["2036"],"URL":"https:\/\/doi.org\/10.1007\/s40747-025-02036-y","relation":{},"ISSN":["2199-4536","2198-6053"],"issn-type":[{"type":"print","value":"2199-4536"},{"type":"electronic","value":"2198-6053"}],"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"18 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 September 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval\/Consent"}}],"article-number":"444"}}