{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T05:04:05Z","timestamp":1748581445602,"version":"3.40.4"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Key Research and Development Program of China,China","award":["2018YFB1700902","2018YFB1700902"],"award-info":[{"award-number":["2018YFB1700902","2018YFB1700902"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s00530-025-01707-7","type":"journal-article","created":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T16:06:27Z","timestamp":1740153987000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["UMPA: Unified multi-modal prompt with adapter for vision-language models"],"prefix":"10.1007","volume":"31","author":[{"given":"Zhengwei","family":"Jin","sequence":"first","affiliation":[]},{"given":"Yun","family":"Wei","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,21]]},"reference":[{"key":"1707_CR1","unstructured":"Radford. A., Kim. J. W., Hallacy. C., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. 8748\u20138763. PMLR (2021)"},{"key":"1707_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2024.01.012","volume":"119","author":"J Xing","year":"2024","unstructured":"Xing, J., Liu, J., Wang, J., et al.: A survey of efficient fine-tuning methods for vision-language models \u2014 prompt and adapter. Comput. Graph. Graph. 119, 103885 (2024)","journal-title":"Comput. Graph. Graph."},{"issue":"2","key":"1707_CR3","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","volume":"132","author":"P Gao","year":"2024","unstructured":"Gao, P., Geng, S., Zhang, R., et al.: CLIP-adapter: better vision-language models with feature adapters. Int. J. Comput. VisionComput. Vision 132(2), 581\u2013595 (2024)","journal-title":"Int. J. Comput. VisionComput. Vision"},{"key":"1707_CR4","doi-asserted-by":"publisher","first-page":"493","DOI":"10.1007\/978-3-031-19833-5_29","volume-title":"Computer Vision \u2013 ECCV 2022","author":"R Zhang","year":"2022","unstructured":"Zhang, R., Zhang, W., Fang, R., et al.: Tip-adapter: training-free adaption of\u00a0CLIP for few-shot classification. In: Avidan, S., Brostow, G., Ciss\u00e9, M., et al. (eds.) Computer Vision \u2013 ECCV 2022, pp. 493\u2013510. Springer Nature Switzerland, Cham (2022)"},{"key":"1707_CR5","unstructured":"Pantazis. O., Brostow. G., Jones. K., et al.: SVL-adapter: self-supervised adapter for vision-language pretrained models. arXiv (2022)"},{"key":"1707_CR6","doi-asserted-by":"crossref","unstructured":"Hu. Z., Lan. Y., Wang, L., et al.: LLM-adapters: an adapter family for parameter-efficient fine-tuning of large language models. arXiv (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.319"},{"issue":"9","key":"1707_CR7","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., et al.: Learning to prompt for vision-language models. Int. J. Comput. VisionComput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. VisionComput. Vision"},{"key":"1707_CR8","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C. C., et al.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"1707_CR9","first-page":"14274","volume":"35","author":"M Shu","year":"2022","unstructured":"Shu, M., Nie, W., Huang, D.A., et al.: Test-time prompt tuning for zero-shot generalization in vision-language models. Adv. Neural. Inf. Process. Syst. 35, 14274\u201314289 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1707_CR10","doi-asserted-by":"crossref","unstructured":"Lu, Y., Liu, J., Zhang, Y., et al. Prompt distribution learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5206\u20135215 (2022)","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"1707_CR11","unstructured":"Huang, T., Chu, J., Wei, F.: Unsupervised prompt learning for vision-language models. arXiv (2022)"},{"issue":"6","key":"1707_CR12","first-page":"5390","volume":"38","author":"H Wang","year":"2024","unstructured":"Wang, H., Liu, F., Jiao, L., et al.: ViLT-CLIP: video and language tuning CLIP with multimodal prompt learning and scenario-guided optimization. Proc AAAI Conf Artif Intell 38(6), 5390\u20135400 (2024)","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"1707_CR13","unstructured":"Hu, E. J., Shen, Y., Wallis, P., et al.: LORA: Low-rank adaptation of large language models. arXiv e-prints, arXiv: 2106.09685 (2021)"},{"key":"1707_CR14","doi-asserted-by":"crossref","unstructured":"Qiu, X., Feng. H., Wang, Y., et al.: Progressive multi-modal conditional prompt tuning. In: Proceedings of the 2024 International Conference on Multimedia Retrieval. 46-54 (2024)","DOI":"10.1145\/3652583.3658049"},{"key":"1707_CR15","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A. G., et al.: Masked-attention mask transformer for universal image segmentation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1280\u20131289. New Orleans, LA, USA: IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"1707_CR16","unstructured":"Jia, C., Yang, Y., Xia, Y., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the 38th International Conference on Machine Learning. 4904\u20134916. PMLR (2021)"},{"key":"1707_CR17","doi-asserted-by":"crossref","unstructured":"Zhai, X., Wang, X., Mustafa, B., et al.: LiT: zero-shot transfer with locked-image text tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 18123\u201318133 (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"1707_CR18","unstructured":"Yao, L., Huang, R., Hou, L., et al.: FILIP: fine-grained interactive language-image pre-training. arXiv (2021)"},{"key":"1707_CR19","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. arXiv (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"1707_CR20","unstructured":"Li, B., Weinberger, K. Q., Belongie, S., et al.: Language-driven Semantic Segmentation. arXiv (2022)"},{"key":"1707_CR21","doi-asserted-by":"crossref","unstructured":"Li, X. L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. arXiv (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"1707_CR22","doi-asserted-by":"crossref","unstructured":"Lee, D., Song, S., Suh, J., et al.: Read-only prompt optimization for vision-language few-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 1401\u20131411 (2023)","DOI":"10.1109\/ICCV51070.2023.00135"},{"key":"1707_CR23","doi-asserted-by":"crossref","unstructured":"Khattak, M. U., Rasheed, H., Maaz, M., et al.: MaPLe: multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"1707_CR24","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1007\/978-3-031-19827-4_41","volume-title":"Computer Vision \u2013 ECCV 2022","author":"M Jia","year":"2022","unstructured":"Jia, M., Tang, L., Chen, B.C., et al.: Visual prompt tuning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., et al. (eds.) Computer Vision \u2013 ECCV 2022, pp. 709\u2013727. Springer Nature Switzerland, Cham (2022)"},{"key":"1707_CR25","doi-asserted-by":"crossref","unstructured":"Zhu, B., Niu, Y., Han, Y., et al.: Prompt-aligned gradient for prompt tuning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 15659\u201315669 (2023)","DOI":"10.1109\/ICCV51070.2023.01435"},{"key":"1707_CR26","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C.: Visual-language prompt tuning with knowledge-guided context optimization. arXiv (2023)","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"1707_CR27","doi-asserted-by":"publisher","first-page":"2056","DOI":"10.1109\/TMM.2023.3291588","volume":"26","author":"Y Xing","year":"2024","unstructured":"Xing, Y., Wu, Q., Cheng, D., et al.: Dual modality prompt tuning for vision-language pre-trained model. IEEE Trans. Multimed. 26, 2056\u20132068 (2024)","journal-title":"IEEE Trans. Multimed."},{"key":"1707_CR28","doi-asserted-by":"crossref","unstructured":"Sung, Y. L., Cho, J., Bansal, M.: VL-adapter: parameter-efficient transfer learning for vision-and-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5227\u20135237 (2022)","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"1707_CR29","unstructured":"Houlsby, N., Giurgiu, A., Jastrzebski, S., et al.: Parameter-efficient transfer learning for NLP. In: Proceedings of the 36th International Conference on Machine Learning. 2790\u20132799. PMLR (2019)"},{"key":"1707_CR30","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv (2021)"},{"key":"1707_CR31","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems: Vol. 30. Curran Associates, Inc. (2017)"},{"key":"1707_CR32","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., et al.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition. 248\u2013255. Miami, FL: IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1707_CR33","unstructured":"Recht, B., Roelofs, R., Schmidt, L., et al.: Do ImageNet classifiers generalize to ImageNet?. In: Proceedings of the 36th International Conference on Machine Learning. 5389\u20135400. PMLR (2019)"},{"key":"1707_CR34","unstructured":"Wang, H., Ge, S., Lipton, Z., et al.: Learning robust global representations by penalizing local predictive power. In: Advances in Neural Information Processing Systems: Vol. 32. Curran Associates, Inc. (2019)"},{"key":"1707_CR35","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., et al.: Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 15262\u201315271 (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"1707_CR36","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Basart, S., Mu, N., et al.: The many faces of robustness: a critical analysis of out-of-distribution generalization. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). 8320\u20138329. Montreal, QC, Canada: IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"1707_CR37","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: an incremental bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop. 178\u2013178 (2004)"},{"key":"1707_CR38","doi-asserted-by":"crossref","unstructured":"Parkhi, O. M., Vedaldi, A., Zisserman, A., et al.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition. 3498\u20133505 (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"1707_CR39","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., et al.: 3D Object Representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops. 554\u2013561 (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"1707_CR40","doi-asserted-by":"crossref","unstructured":"Nilsback, M. E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing. 722\u2013729 (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"1707_CR41","doi-asserted-by":"publisher","first-page":"446","DOI":"10.1007\/978-3-319-10599-4_29","volume-title":"Computer Vision \u2013 ECCV 2014","author":"L Bossard","year":"2014","unstructured":"Bossard, L., Guillaumin, M., Van Gool, L.: Food-101 \u2013 mining discriminative components with random forests. In: Fleet, D., Pajdla, T., Schiele, B., et al. (eds.) Computer Vision \u2013 ECCV 2014, pp. 446\u2013461. Springer International Publishing, Cham (2014)"},{"key":"1707_CR42","unstructured":"Maji, S., Rahtu, E., Kannala, J., et al.: Fine-grained visual classification of aircraft. arXiv (2013)"},{"key":"1707_CR43","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K. A., et al.: SUN database: large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition. 3485\u20133492. San Francisco, CA, USA: IEEE (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"1707_CR44","unstructured":"Soomro, K., Zamir, A. R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv (2012)"},{"key":"1707_CR45","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., et al.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 3606\u20133613 (2014)","DOI":"10.1109\/CVPR.2014.461"},{"issue":"7","key":"1707_CR46","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., et al.: EuroSAT: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE J. Sel. Top. Appl. Earth Observ. Remote Sens. 12(7), 2217\u20132226 (2019)","journal-title":"IEEE J. Sel. Top. Appl. Earth Observ. Remote Sens."},{"key":"1707_CR47","unstructured":"Kumar, A., Raghunathan, A., Jones, R., et al.: Fine-tuning can distort pretrained features and underperform out-of-distribution. arXiv preprint arXiv:2202.10054 (2022)"},{"key":"1707_CR48","doi-asserted-by":"crossref","unstructured":"Wortsman, M., Ilharco, G., Kim, J. W., et al.: Robust fine-tuning of zero-shot models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 7959\u20137971 (2022)","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"1707_CR49","doi-asserted-by":"crossref","unstructured":"Goyal, S., Kumar, A., Garg, S., et al.: Finetune like you pretrain: improved finetuning of zero-shot vision models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 19338\u201319347 (2023)","DOI":"10.1109\/CVPR52729.2023.01853"},{"key":"1707_CR50","doi-asserted-by":"crossref","unstructured":"Silva-Rodriguez, J., Hajimiri, S., Ben Ayed, I., et al.: A closer look at the few-shot adaptation of large vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 23681\u201323690 (2024)","DOI":"10.1109\/CVPR52733.2024.02235"},{"key":"1707_CR51","unstructured":"Liu, M., Li, B., Yu, Y.: Fully fine-tuned CLIP models are efficient few-shot learners. arXiv preprint arXiv:2407.04003 (2024)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01707-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01707-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01707-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T19:34:41Z","timestamp":1745264081000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01707-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,21]]},"references-count":51,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["1707"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01707-7","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,2,21]]},"assertion":[{"value":"18 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"125"}}