{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T13:45:16Z","timestamp":1776347116253,"version":"3.51.2"},"reference-count":45,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100005145","name":"Basic Research Program of Jiangsu Province","doi-asserted-by":"publisher","award":["BK20240011"],"award-info":[{"award-number":["BK20240011"]}],"id":[{"id":"10.13039\/501100005145","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62425603"],"award-info":[{"award-number":["62425603"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113571","type":"journal-article","created":{"date-parts":[[2026,3,29]],"date-time":"2026-03-29T07:42:04Z","timestamp":1774770124000},"page":"113571","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PB","title":["Trajectory-enhanced transferable attacks for vision-language pre-trained models"],"prefix":"10.1016","volume":"179","author":[{"given":"Haiqi","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Ziqiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Zechao","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113571_bib0001","series-title":"Proc. IEEE\/CVF Int. Conf. Comput. 
Vis.","first-page":"3158","article-title":"ViLTA: enhancing vision-language pre-training through textual augmentation","author":"Wang","year":"2023"},{"issue":"12","key":"10.1016\/j.patcog.2026.113571_bib0002","doi-asserted-by":"crossref","first-page":"9904","DOI":"10.1109\/TPAMI.2021.3132068","article-title":"CTNet: context-based tandem network for semantic segmentation","volume":"44","author":"Li","year":"2021","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113571_bib0003","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109009","article-title":"Robust physical-world attacks on face recognition","volume":"133","author":"Zheng","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113571_bib0004","series-title":"Proc. Int. Conf. on Learn. Repr.","article-title":"Intriguing properties of neural networks","author":"Szegedy","year":"2014"},{"issue":"3","key":"10.1016\/j.patcog.2026.113571_bib0005","doi-asserted-by":"crossref","first-page":"1958","DOI":"10.1109\/TPAMI.2024.3511621","article-title":"Divide-and-conquer: confluent triple-flow network for RGB-T salient object detection","volume":"47","author":"Tang","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2","key":"10.1016\/j.patcog.2026.113571_bib0006","doi-asserted-by":"crossref","first-page":"2198","DOI":"10.1109\/TDSC.2025.3625576","article-title":"Gradient pruning interactive attack for vision-language pre-training models","volume":"23","author":"Zhang","year":"2026","journal-title":"IEEE Trans. Dependable Secur. Comput."},{"key":"10.1016\/j.patcog.2026.113571_bib0007","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"4312","article-title":"Evading defenses to transferable adversarial examples by translation-invariant attacks","author":"Dong","year":"2019"},{"key":"10.1016\/j.patcog.2026.113571_bib0008","series-title":"Proc. ACM Int. Conf. Multimedia","first-page":"5005","article-title":"Towards adversarial attack on vision-language pre-training models","author":"Zhang","year":"2022"},{"key":"10.1016\/j.patcog.2026.113571_bib0009","series-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","first-page":"102","article-title":"Set-level guidance attack: boosting adversarial transferability of vision-language pre-training models","author":"Lu","year":"2023"},{"key":"10.1016\/j.patcog.2026.113571_bib0010","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107332","article-title":"Understanding adversarial attacks on deep learning based medical image analysis systems","volume":"110","author":"Ma","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113571_bib0011","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.108792","article-title":"Learning attention-guided pyramidal features for few-shot fine-grained recognition","volume":"130","author":"Tang","year":"2022","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113571_bib0012","doi-asserted-by":"crossref","first-page":"5663","DOI":"10.1109\/TIFS.2025.3574976","article-title":"Modality-specific interactive attack for vision-language pre-training models","volume":"20","author":"Zhang","year":"2025","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"10.1016\/j.patcog.2026.113571_bib0013","series-title":"Proc. Int. Conf. Mach. 
Learn.","article-title":"Robust CLIP: unsupervised adversarial fine-tuning of vision embeddings for robust large vision-language models","author":"Schlarmann","year":"2024"},{"key":"10.1016\/j.patcog.2026.113571_bib0014","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.108527","article-title":"Hyperspherical class prototypes for adversarial robustness","volume":"125","author":"Mygdalis","year":"2022","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113571_bib0015","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"442","article-title":"Boosting transferability in vision-language attacks via diversification along the intersection region of adversarial trajectory","author":"Gao","year":"2025"},{"key":"10.1016\/j.patcog.2026.113571_bib0016","series-title":"Proc. Int. Conf. Mach. Learn.","first-page":"12888","article-title":"BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.patcog.2026.113571_bib0017","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"15671","article-title":"Vision-language pre-training with triple contrastive learning","author":"Yang","year":"2022"},{"key":"10.1016\/j.patcog.2026.113571_bib0018","series-title":"Proc. Int. Conf. Mach. Learn.","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113571_bib0019","series-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","first-page":"2641","article-title":"Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models","author":"Plummer","year":"2015"},{"key":"10.1016\/j.patcog.2026.113571_bib0020","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"740","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014"},{"issue":"9","key":"10.1016\/j.patcog.2026.113571_bib0021","doi-asserted-by":"crossref","first-page":"2070","DOI":"10.1109\/TPAMI.2018.2852750","article-title":"Deep collaborative embedding for social image understanding","volume":"41","author":"Li","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113571_bib0022","unstructured":"J. Achiam, S. Adler, S. Agarwal, L. Ahmad, I. Akkaya, F.L. Aleman, D. Almeida, J. Altenschmidt, S. Altman, S. Anadkat, et al., GPT-4 technical report,(2023). arXiv: 2303.08774."},{"key":"10.1016\/j.patcog.2026.113571_bib0023","unstructured":"J. Bai, S. Bai, Y. Chu, Z. Cui, K. Dang, X. Deng, Y. Fan, W. Ge, Y. Han, F. Huang, et al., Qwen technical report,(2023). arXiv: 2309.16609."},{"key":"10.1016\/j.patcog.2026.113571_bib0024","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"23242","article-title":"Position-guided text prompt for vision-language pre-training","author":"Wang","year":"2023"},{"key":"10.1016\/j.patcog.2026.113571_bib0025","series-title":"Proc. Int. Conf. on Learn. Repr.","article-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.patcog.2026.113571_bib0026","series-title":"Proc. Neural Inf. Process. 
Syst.","first-page":"9694","article-title":"Align before fuse: vision and language representation learning with momentum distillation","author":"Li","year":"2021"},{"key":"10.1016\/j.patcog.2026.113571_bib0027","series-title":"Proc. Neural Inf. Process. Syst.","article-title":"VLATTACK : multimodal adversarial attacks on vision-language tasks via pre-trained models","author":"Yin","year":"2023"},{"key":"10.1016\/j.patcog.2026.113571_bib0028","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"5184","article-title":"ViSTA: vision and scene text aggregation for cross-modal retrieval","author":"Cheng","year":"2022"},{"key":"10.1016\/j.patcog.2026.113571_bib0029","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"2668","article-title":"Towards transferable adversarial attacks on vision transformers","volume":"36","author":"Wei","year":"2022"},{"key":"10.1016\/j.patcog.2026.113571_bib0030","doi-asserted-by":"crossref","first-page":"2098","DOI":"10.1109\/TIFS.2023.3346211","article-title":"A robust open-Set multi-instance learning for defending adversarial attacks in digital image","volume":"19","author":"Uddin","year":"2023","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"10.1016\/j.patcog.2026.113571_bib0031","doi-asserted-by":"crossref","first-page":"2398","DOI":"10.1109\/TIFS.2024.3350376","article-title":"FastTextDodger: decision-based adversarial attack against black-box NLP models with extremely high efficiency","volume":"19","author":"Hu","year":"2024","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"10.1016\/j.patcog.2026.113571_bib0032","series-title":"Proc. Int. Conf. on Learn. Repr.","article-title":"Explaining and harnessing adversarial examples","author":"Goodfellow","year":"2015"},{"key":"10.1016\/j.patcog.2026.113571_bib0033","series-title":"Proc. Int. Conf. on Learn. Repr.","article-title":"Towards deep learning models resistant to adversarial attacks","author":"Madry","year":"2018"},{"key":"10.1016\/j.patcog.2026.113571_bib0034","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"2730","article-title":"Improving transferability of adversarial examples with input diversity","author":"Xie","year":"2019"},{"key":"10.1016\/j.patcog.2026.113571_bib0035","series-title":"Proc. Int. Conf. on Learn. Repr.","article-title":"Nesterov accelerated gradient and scale invariance for adversarial attacks","author":"Lin","year":"2020"},{"key":"10.1016\/j.patcog.2026.113571_bib0036","series-title":"Proc. Conf. on Empir. Meth. in Natu. Lang. Proc.","first-page":"6193","article-title":"Bert-attack: adversarial attack against bert using bert","author":"Li","year":"2020"},{"key":"10.1016\/j.patcog.2026.113571_bib0037","unstructured":"J. Fu, Z. Chen, K. Jiang, H. Guo, J. Wang, S. Gao, W. Zhang, Improving adversarial transferability of visual-language pre-training models through collaborative multimodal interaction, (2024). arXiv: 2403.10883."},{"key":"10.1016\/j.patcog.2026.113571_bib0038","series-title":"Proc. ACM Int. Conf. 
Multimedia","first-page":"18","article-title":"A unified understanding of adversarial vulnerability regarding unimodal models and vision-language pre-training models","author":"Zheng","year":"2024"},{"issue":"10","key":"10.1016\/j.patcog.2026.113571_bib0039","doi-asserted-by":"crossref","first-page":"8489","DOI":"10.1109\/TPAMI.2025.3581476","article-title":"Semantic-aligned adversarial evolution triangle for high-Transferability vision-language attack","volume":"47","author":"Jia","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113571_bib0040","doi-asserted-by":"crossref","first-page":"190","DOI":"10.1016\/j.neucom.2023.01.071","article-title":"TextGuise: adaptive adversarial example attacks on text classification model","volume":"529","author":"Chang","year":"2023","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2026.113571_bib0041","series-title":"Proc. Conf. on Empi. Meth. in Natural Lang. Proc.","first-page":"1532","article-title":"Glove: global vectors for word representation","author":"Pennington","year":"2014"},{"key":"10.1016\/j.patcog.2026.113571_bib0042","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"69","article-title":"Modeling context in referring expressions","author":"Yu","year":"2016"},{"key":"10.1016\/j.patcog.2026.113571_bib0043","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.patcog.2026.113571_bib0044","unstructured":"C. Guo, M. Rana, M. Cisse, L. Van Der Maaten, Countering adversarial images using input transformations,(2017). arXiv: 1711.00117."},{"key":"10.1016\/j.patcog.2026.113571_bib0045","series-title":"Proc. IEEE\/CVF Int. Conf. on Comput. Vis. and Pattern Recognit.","first-page":"262","article-title":"A self-supervised approach for adversarial robustness","author":"Naseer","year":"2020"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005376?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005376?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T12:52:00Z","timestamp":1776343920000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326005376"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":45,"alternative-id":["S0031320326005376"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113571","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Trajectory-enhanced transferable attacks for vision-language pre-trained models","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113571","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. 
All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113571"}}
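The record above is a Crossref REST API "work" message ("message-type":"work"). Below is a minimal sketch of how such a record can be retrieved and a few bibliographic fields read out; it assumes network access to the public api.crossref.org endpoint, the field names follow the record above, and the printed one-line citation format is purely illustrative.

import json
import urllib.request

# DOI taken from the record above; the public Crossref REST API
# (https://api.crossref.org/works/{doi}) returns the same
# {"status": "ok", "message": {...}} envelope shown here.
DOI = "10.1016/j.patcog.2026.113571"

with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]          # payload sits under "message"

title = work["title"][0]                       # Crossref stores titles as arrays
journal = work["container-title"][0]
authors = ", ".join(f"{a['given']} {a['family']}" for a in work.get("author", []))
year = work["issued"]["date-parts"][0][0]      # e.g. [[2026, 11]] -> 2026

# Illustrative formatting only, not a prescribed Crossref output.
print(f"{authors} ({year}). {title}. {journal}. https://doi.org/{work['DOI']}")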