{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T17:33:56Z","timestamp":1772127236368,"version":"3.50.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T00:00:00Z","timestamp":1768608000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T00:00:00Z","timestamp":1768608000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2144772"],"award-info":[{"award-number":["2144772"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-026-21339-x","type":"journal-article","created":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T09:31:25Z","timestamp":1768642285000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Robust defense strategies for multimodal contrastive learning: efficient fine-tuning against backdoor attacks"],"prefix":"10.1007","volume":"85","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7369-752X","authenticated-orcid":false,"given":"Md. Iqbal","family":"Hossain","sequence":"first","affiliation":[]},{"given":"Afia","family":"Sajeeda","sequence":"additional","affiliation":[]},{"given":"Neeresh Kumar","family":"Perla","sequence":"additional","affiliation":[]},{"given":"Ming","family":"Shao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,17]]},"reference":[{"issue":"8","key":"21339_CR1","doi-asserted-by":"publisher","first-page":"5625","DOI":"10.1109\/TPAMI.2024.3369699","volume":"46","author":"J Zhang","year":"2024","unstructured":"Zhang J, Huang J, Jin S, Lu S (2024) Vision-language models for vision tasks: A survey. IEEE Trans Pattern Anal Mach Intell 46(8):5625\u20135644. https:\/\/doi.org\/10.1109\/TPAMI.2024.3369699","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21339_CR2","unstructured":"Lauren\u00e7on H, Tronchon L, Cord M, Sanh V (2024) What matters when building vision-language models?. https:\/\/arxiv.org\/abs\/2405.02246"},{"key":"21339_CR3","unstructured":"Bordes F, Pang RY, Ajay A, Li AC, Bardes A, Petryk S, Ma\u00f1as O, Lin Z, Mahmoud A, Jayaraman B, Ibrahim M, Hall M, Xiong Y, Lebensold J, Ross C, Jayakumar S, Guo C, Bouchacourt D, Al-Tahan H, Padthe K, Sharma V, Xu H, Tan XE, Richards M, Lavoie S, Astolfi P, Hemmat RA, Chen J, Tirumala K, Assouel R, Moayeri M, Talattof A, Chaudhuri K, Liu Z, Chen X, Garrido Q, Ullrich K, Agrawal A, Saenko K, Celikyilmaz A, Chandra V (2024) An Introduction to Vision-Language Modeling. https:\/\/arxiv.org\/abs\/2405.17247"},{"key":"21339_CR4","unstructured":"Du Y, Konyushkova K, Denil M, Raju A, Landon J, Hill F, Freitas N, Cabi S (2023) Vision-Language Models as Success Detectors. https:\/\/arxiv.org\/abs\/2303.07280"},{"key":"21339_CR5","unstructured":"Radford A et al (2021) Learning transferable visual models from natural language supervision. arXiv arXiv:2103.00020"},{"key":"21339_CR6","unstructured":"Jia C, Yang Y, Xia Y, Chen Y-T, Parekh Z, Pham H, Le QV, Sung Y, Li Z, Duerig T (2021) Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"},{"key":"21339_CR7","doi-asserted-by":"crossref","unstructured":"Li Y, Fan H, Hu R, Feichtenhofer C, He K (2023) Scaling Language-Image Pre-training via Masking. https:\/\/arxiv.org\/abs\/2212.00794","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"21339_CR8","unstructured":"Li J, Li D, Xiong C, Hoi S (2022) BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. https:\/\/arxiv.org\/abs\/2201.12086"},{"key":"21339_CR9","unstructured":"Pham H, Dai Z, Ghiasi G, Kawaguchi K, Liu H, Yu AW, Yu J, Chen Y-T, Luong M-T, Wu Y et al (2021) Combined scaling for open-vocabulary image classification. arXiv preprint arXiv:2111.10050"},{"key":"21339_CR10","doi-asserted-by":"publisher","unstructured":"Yuhas BP, Goldstein MH, Sejnowski TJ (1989) Integration of acoustic and visual speech signals using neural networks. IEEE Commun Mag 27(11):65\u201371. https:\/\/doi.org\/10.1109\/35.41402","DOI":"10.1109\/35.41402"},{"key":"21339_CR11","unstructured":"Ngiam J, Khosla A, Kim M, Nam J, Lee H, Ng AY (2011) Multimodal deep learning. In: International conference on machine learning"},{"key":"21339_CR12","unstructured":"Srivastava N, Salakhutdinov RR (2012) Multimodal learning with deep boltzmann machines. In: Pereira F, Burges CJ, Bottou L, Weinberger KQ (eds) Advances in neural information processing systems, pp 2231\u20132239. Curran Associates, Inc., Red Hook, NY, USA"},{"key":"21339_CR13","doi-asserted-by":"crossref","unstructured":"Yuan X, Lin Z, Kuen J, Zhang J, Wang Y, Maire M, Kale A, Faieta B (2021) Multimodal Contrastive Training for Visual Representation Learning. https:\/\/arxiv.org\/abs\/2104.12836","DOI":"10.1109\/CVPR46437.2021.00692"},{"key":"21339_CR14","unstructured":"Chen T, Kornblith S, Norouzi M, Hinton G (2020) A Simple Framework for Contrastive Learning of Visual Representations. https:\/\/arxiv.org\/abs\/2002.05709"},{"key":"21339_CR15","doi-asserted-by":"crossref","unstructured":"Bansal H, Singhi N, Yang Y, Yin F, Grover A, Chang K-W (2023) Cleanclip: Mitigating data poisoning attacks in multimodal contrastive learning. arXiv arXiv:2303.03323","DOI":"10.1109\/ICCV51070.2023.00017"},{"key":"21339_CR16","doi-asserted-by":"publisher","unstructured":"Zhang J, Yi Q, Sang J (2022) Towards adversarial attack on vision-language pre-training models. In: Proceedings of the 30th ACM international conference on multimedia, pp 5005\u20135013. ACM, Lisboa, Portugal. https:\/\/doi.org\/10.1145\/3503161.3547801","DOI":"10.1145\/3503161.3547801"},{"key":"21339_CR17","doi-asserted-by":"crossref","unstructured":"Zhang J, Yi Q, Sang J (2022) Towards Adversarial Attack on Vision-Language Pre-training Models","DOI":"10.1145\/3503161.3547801"},{"key":"21339_CR18","unstructured":"Gao Y, Doan BG, Zhang Z, Ma S, Zhang J, Fu A, Nepal S, Kim H (2020) Backdoor Attacks and Countermeasures on Deep Learning: A Comprehensive Review. https:\/\/arxiv.org\/abs\/2007.10760"},{"key":"21339_CR19","unstructured":"Doan K, Lao Y, Li P (2021) Backdoor attack with imperceptible input and latent modification. In: Ranzato M, Beygelzimer A, Dauphin Y, Liang PS, Vaughan JW (eds) Advances in neural information processing systems, vol 34, pp 18944\u201318957. Curran Associates, Inc."},{"key":"21339_CR20","unstructured":"Khaddaj A, Leclerc G, Makelov A, Georgiev K, Salman H, Ilyas A, Madry A (2023) Rethinking backdoor attacks. In: Krause A, Brunskill E, Cho K, Engelhardt B, Sabato S, Scarlett J (eds) Proceedings of the 40th international conference on machine learning. Proceedings of Machine Learning Research, vol 202, pp 16216\u201316236. PMLR. https:\/\/proceedings.mlr.press\/v202\/khaddaj23a.html"},{"key":"21339_CR21","unstructured":"Li Y, Zhai T, Wu B, Jiang Y, Li Z, Xia S (2021) Rethinking the Trigger of Backdoor Attack. https:\/\/arxiv.org\/abs\/2004.04692"},{"key":"21339_CR22","doi-asserted-by":"publisher","unstructured":"Kawaguchi K, Bengio Y, Kaelbling L (2022) Generalization in Deep Learning, pp 112\u2013148. Cambridge University Press. http:\/\/dx.doi.org\/10.1017\/9781009025096.003https:\/\/doi.org\/10.1017\/9781009025096.003","DOI":"10.1017\/9781009025096.003"},{"key":"21339_CR23","doi-asserted-by":"publisher","unstructured":"Vlachas K, Tatsis K, Agathos K, Brink AR, Chatzi E (2021) A local basis approximation approach for nonlinear parametric model order reduction. J Sound Vibr 502:116055. https:\/\/doi.org\/10.1016\/j.jsv.2021.116055","DOI":"10.1016\/j.jsv.2021.116055"},{"key":"21339_CR24","doi-asserted-by":"publisher","unstructured":"Yanowitz SD, Bruckstein AM (1989) A new method for image segmentation. Comput Vision Graph Image Process 46(1):82\u201395. https:\/\/doi.org\/10.1016\/S0734-189X(89)80017-9","DOI":"10.1016\/S0734-189X(89)80017-9"},{"key":"21339_CR25","unstructured":"Zhao X, Ding W, An Y, Du Y, Yu T, Li M, Tang M, Wang J (2023) Fast Segment Anything. https:\/\/arxiv.org\/abs\/2306.12156"},{"key":"21339_CR26","doi-asserted-by":"crossref","unstructured":"Kirillov A, Mintun E, Ravi N, Mao H, Rolland C, Gustafson L, Xiao T, Whitehead S, Berg AC, Lo W-Y, Doll\u00e1r P, Girshick R (2023) Segment Anything","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"21339_CR27","doi-asserted-by":"crossref","unstructured":"Li X, Yin X, Li C, Zhang P, Hu X, Zhang L, Wang L, Hu H, Dong L, Wei F, Choi Y, Gao J (2020) Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Proceedings of european conference on computer vision","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"21339_CR28","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, et al (2021) Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020"},{"key":"21339_CR29","unstructured":"Jia C, Yang Y, Xia Y, Chen Y-T, Parekh Z, Pham H, Le Q, Sung Y-H, Li Z, Duerig T (2021) Scaling up visual and vision-language representation learning with noisy text supervision. In: International conference on machine learning, pp 4904\u20134916. PMLR"},{"key":"21339_CR30","doi-asserted-by":"publisher","unstructured":"Pham H et al (2023) Combined scaling for zero-shot transfer learning. arXiv arXiv:2111.10050https:\/\/doi.org\/10.48550\/arXiv.2111.10050","DOI":"10.48550\/arXiv.2111.10050"},{"key":"21339_CR31","unstructured":"Biggio B, Nelson B, Laskov P (2012) Poisoning attacks against support vector machines. In: Proceedings of the 29th international coference on international conference on machine learning. ICML\u201912, pp 1467\u20131474. Omnipress, Madison, WI, USA"},{"key":"21339_CR32","unstructured":"Gu T, Dolan-Gavitt B, Garg S (2017) Badnets: Identifying vulnerabilities in the machine learning model supply chain. arXiv preprint arXiv:1708.06733"},{"key":"21339_CR33","doi-asserted-by":"crossref","unstructured":"Liu Y, Ma S, Aafer Y, Lee W-C, Zhai J, Wang W, Zhang X (2018) Trojaning attack on neural networks. In: Proceedings of the 25th annual network and distributed system security symposium (NDSS)","DOI":"10.14722\/ndss.2018.23291"},{"key":"21339_CR34","unstructured":"Liu Y, Cheng L, Lin X, Huang W, Yuan B (2021) Trojannet: Embedding hidden trojan horse models within neural network. In: Proceedings of the 28th annual network and distributed system security symposium (NDSS)"},{"key":"21339_CR35","unstructured":"Nguyen TA, Tran AT (2021) Wanet - imperceptible warping-based backdoor attack. In: International conference on learning representations"},{"key":"21339_CR36","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R (2018) Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proc. of the annual meeting of the association of computational linguistics (ACL)","DOI":"10.18653\/v1\/P18-1238"},{"key":"21339_CR37","unstructured":"Li Y, Liang F, Zhao L, Cui Y, Ouyang W, Shao J, Yu F, Yan J (2022) Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. In: International conference on learning representations"},{"key":"21339_CR38","unstructured":"Goel S, Bansal H, Bhatia S, Rossi RA, Vinay V, Grover A (2022) Cyclip: Cyclic contrastive language-image pretraining. In: Oh AH, Agarwal A, Belgrave D, Cho K (eds) Advances in neural information processing systems"},{"key":"21339_CR39","unstructured":"Carlini N, Terzis A (2022) Poisoning and backdooring contrastive learning. In: International conference on learning representations"},{"key":"21339_CR40","unstructured":"He K, Zhang X, Ren S, Sun J (2015) Deep Residual Learning for Image Recognition. https:\/\/arxiv.org\/abs\/1512.03385"},{"key":"21339_CR41","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2021) An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"21339_CR42","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2023) Attention Is All You Need. https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"21339_CR43","unstructured":"Loshchilov I, Hutter F (2019) Decoupled Weight Decay Regularization. https:\/\/arxiv.org\/abs\/1711.05101"},{"key":"21339_CR44","doi-asserted-by":"crossref","unstructured":"Cubuk ED, Zoph B, Mane D, Vasudevan V, Le QV (2019) Autoaugment: Learning augmentation strategies from data. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2019.00020"},{"key":"21339_CR45","doi-asserted-by":"crossref","unstructured":"Wei J, Zou K (2019) Eda: Easy data augmentation techniques for boosting performance on text classification tasks. In: Proc. of the annual meeting of the association of computational linguistics (ACL)","DOI":"10.18653\/v1\/D19-1670"},{"key":"21339_CR46","doi-asserted-by":"publisher","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 248\u2013255. IEEE, Miami, FL, USA. https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"21339_CR47","unstructured":"Krizhevsky A, Nair V, Hinton G (2009) Cifar-10: A dataset for image recognition and machine learning. Canadian Institute for Advanced Research (CIFAR)"},{"key":"21339_CR48","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J (2014) From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Trans Assoc Comput Linguistics (TACL) 2:67\u201378","journal-title":"Trans Assoc Comput Linguistics (TACL)"},{"key":"21339_CR49","unstructured":"Goodfellow I, Shlens J, Szegedy C (2015) Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21339-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-026-21339-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21339-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T09:32:08Z","timestamp":1768642328000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-026-21339-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,17]]},"references-count":49,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2026,1]]}},"alternative-id":["21339"],"URL":"https:\/\/doi.org\/10.1007\/s11042-026-21339-x","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,17]]},"assertion":[{"value":"21 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 July 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable. No human\/animal subjects were involved.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to Participate"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to Publish"}},{"value":"The authors declare that they have no competing interests.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}],"article-number":"5"}}