{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T17:10:31Z","timestamp":1772039431579,"version":"3.50.1"},"reference-count":59,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T00:00:00Z","timestamp":1751846400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T00:00:00Z","timestamp":1751846400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFF0600605"],"award-info":[{"award-number":["2021YFF0600605"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01878-3","type":"journal-article","created":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T09:04:10Z","timestamp":1751879050000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Parameter-efficient transfer learning of prompts and adapters on vision-language models"],"prefix":"10.1007","volume":"31","author":[{"given":"Ai","family":"Jian","sequence":"first","affiliation":[]},{"given":"Yun","family":"Wei","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,7]]},"reference":[{"issue":"9","key":"1878_CR1","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"1878_CR2","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"1878_CR3","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C.: Tcp: Textual-based class-aware prompt tuning for visual-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23438\u201323448 (2024)","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"1878_CR4","unstructured":"Bahng, H., Jahanian, A., Sankaranarayanan, S., Isola, P.: Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 (2022)"},{"key":"1878_CR5","unstructured":"Wu, J., Li, X., Wei, C., Wang, H., Yuille, A., Zhou, Y., Xie, C.: Unleashing the power of visual prompting at the pixel level. arXiv preprint arXiv:2212.10556 (2022)"},{"key":"1878_CR6","unstructured":"Peng, G., Shijie, G., Renrui, Z., Teli, M., Rongyao, F., Yongfeng, Z., Hongsheng, L., Clip-adapter, Q.Y.: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.045443 (2021)"},{"key":"1878_CR7","doi-asserted-by":"crossref","unstructured":"Zhang, R., Zhang, W., Fang, R., Gao, P., Li, K., Dai, J., Qiao, Y., Li, H.: Tip-adapter: Training-free adaption of clip for few-shot classification. In: European Conference on Computer Vision, pp. 493\u2013510 (2022). Springer","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"1878_CR8","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"1878_CR9","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: Multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"1878_CR10","doi-asserted-by":"crossref","unstructured":"Xing, Y., Wu, Q., Cheng, D., Zhang, S., Liang, G., Wang, P., Zhang, Y.: Dual modality prompt tuning for vision-language pre-trained model. IEEE Transactions on Multimedia (2023)","DOI":"10.1109\/TMM.2023.3291588"},{"key":"1878_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111726","volume":"294","author":"Y Jia","year":"2024","unstructured":"Jia, Y., Ye, X., Liu, Y., Guo, S.: Multi-modal recursive prompt learning with mixup embedding for generalization recognition. Knowl.-Based Syst. 294, 111726 (2024)","journal-title":"Knowl.-Based Syst."},{"key":"1878_CR12","doi-asserted-by":"crossref","unstructured":"Guo, Z., Zhang, R., Qiu, L., Ma, X., Miao, X., He, X., Cui, B.: Calip: Zero-shot enhancement of clip with parameter-free attention. Proceedings of the AAAI Conference on Artificial Intelligence 37, 746\u2013754 (2023)","DOI":"10.1609\/aaai.v37i1.25152"},{"key":"1878_CR13","doi-asserted-by":"crossref","unstructured":"Chowdhury, S., Nag, S., Manocha, D.: Apollo: Unified adapter and prompt learning for vision language models. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 10173\u201310187 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.629"},{"key":"1878_CR14","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916 (2021). PMLR"},{"key":"1878_CR15","unstructured":"Yuan, L., Chen, D., Chen, Y.-L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., Li, C., et al.: Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"1878_CR16","unstructured":"Beyer, L., Steiner, A., Pinto, A.S., Kolesnikov, A., Wang, X., Salz, D., Neumann, M., Alabdulmohsin, I., Tschannen, M., Bugliarello, E., et al.: Paligemma: A versatile 3b vlm for transfer. arXiv preprint arXiv:2407.07726 (2024)"},{"key":"1878_CR17","unstructured":"Yao, H., Zhang, R., Yu, L., Xu, C.: Sep: Self-enhanced prompt tuning for visual-language model. arXiv preprint arXiv:2405.15549 (2024)"},{"key":"1878_CR18","doi-asserted-by":"crossref","unstructured":"Eichenberg, C., Black, S., Weinbach, S., Parcalabescu, L., Frank, A.: Magma\u2013multimodal augmentation of generative models through adapter-based finetuning. arXiv preprint arXiv:2112.05253 (2021)","DOI":"10.18653\/v1\/2022.findings-emnlp.179"},{"key":"1878_CR19","unstructured":"Qiu, L., Zhang, R., Guo, Z., Zeng, Z., Guo, Z., Li, Y., Zhang, G.: Vt-clip: Enhancing vision-language models with visual-guided texts. arXiv preprint arXiv:2112.02399 (2021)"},{"key":"1878_CR20","unstructured":"Koh, J.Y., Salakhutdinov, R., Fried, D.: Grounding language models to images for multimodal inputs and outputs. In: International Conference on Machine Learning, pp. 17283\u201317300 (2023). PMLR"},{"key":"1878_CR21","doi-asserted-by":"crossref","unstructured":"Guo, Z., Dong, B., Ji, Z., Bai, J., Guo, Y., Zuo, W.: Texts as images in prompt tuning for multi-label image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2808\u20132817 (2023)","DOI":"10.1109\/CVPR52729.2023.00275"},{"key":"1878_CR22","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Lei, Y., Zhang, B., Liu, L., Liu, Y.: Zegclip: Towards adapting clip for zero-shot semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11175\u201311185 (2023)","DOI":"10.1109\/CVPR52729.2023.01075"},{"issue":"2","key":"1878_CR23","doi-asserted-by":"publisher","first-page":"511","DOI":"10.1007\/s11263-024-02172-x","volume":"133","author":"C Xu","year":"2025","unstructured":"Xu, C., Zhu, Y., Shen, H., Chen, B., Liao, Y., Chen, X., Wang, L.: Progressive visual prompt learning with contrastive feature re-formation. Int. J. Comput. Vision 133(2), 511\u2013526 (2025)","journal-title":"Int. J. Comput. Vision"},{"key":"1878_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, Y., Liu, Z., Yang, W., Wang, Y., Liao, Q.: Clip-based synergistic knowledge transfer for text-based person retrieval. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7935\u20137939 (2024). IEEE","DOI":"10.1109\/ICASSP48485.2024.10445963"},{"key":"1878_CR25","doi-asserted-by":"crossref","unstructured":"Wang, H., Liu, F., Jiao, L., Wang, J., Hao, Z., Li, S., Li, L., Chen, P., Liu, X.: Vilt-clip: Video and language tuning clip with multimodal prompt learning and scenario-guided optimization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 5390\u20135400 (2024)","DOI":"10.1609\/aaai.v38i6.28347"},{"key":"1878_CR26","doi-asserted-by":"crossref","unstructured":"Shang, C., Song, Z., Qiu, H., Wang, L., Meng, F., Li, H.: Prompt-driven referring image segmentation with instance contrasting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4124\u20134134 (2024)","DOI":"10.1109\/CVPR52733.2024.00395"},{"key":"1878_CR27","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225 (2022)"},{"key":"1878_CR28","doi-asserted-by":"crossref","unstructured":"Yin, X., Wu, J., Yang, W., Zhou, X., Zhang, S., Zhang, T.: Hierarchy-aware interactive prompt learning for few-shot classification. IEEE Transactions on Circuits and Systems for Video Technology (2024)","DOI":"10.1109\/TCSVT.2024.3432753"},{"key":"1878_CR29","unstructured":"Yoo, S., Kim, E., Jung, D., Lee, J., Yoon, S.: Improving visual prompt tuning for self-supervised vision transformers. In: International Conference on Machine Learning, pp. 40075\u201340092 (2023). PMLR"},{"key":"1878_CR30","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C.: Visual-language prompt tuning with knowledge-guided context optimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6757\u20136767 (2023)","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"1878_CR31","unstructured":"Roy, S., Etemad, A.: Consistency-guided prompt learning for vision-language models. arXiv preprint arXiv:2306.01195 (2023)"},{"key":"1878_CR32","unstructured":"Singha, M., Jha, A., Banerjee, B.: Gopro: Generate and optimize prompts in clip using self-supervised learning. arXiv preprint arXiv:2308.11605 (2023)"},{"key":"1878_CR33","unstructured":"Ma, T., Geng, S., Wang, M., Shao, J., Lu, J., Li, H., Gao, P., Qiao, Y.: A simple long-tailed recognition baseline via vision-language model. arXiv preprint arXiv:2111.14745 (2021)"},{"key":"1878_CR34","doi-asserted-by":"crossref","unstructured":"Liu, J., Hu, T., Zhang, Y., Feng, Y., Hao, J., Lv, J., Liu, Z.: Parameter-efficient transfer learning for medical visual question answering. IEEE Transactions on Emerging Topics in Computational Intelligence (2023)","DOI":"10.1109\/TETCI.2023.3311333"},{"key":"1878_CR35","unstructured":"Kahana, J., Cohen, N., Hoshen, Y.: Improving zero-shot models with label distribution priors. arXiv preprint arXiv:2212.00784 (2022)"},{"key":"1878_CR36","unstructured":"Houlsby, N., Giurgiu, A., Jastrzebski, S., Morrone, B., De\u00a0Laroussilhe, Q., Gesmundo, A., Attariyan, M., Gelly, S.: Parameter-efficient transfer learning for nlp. In: International Conference on Machine Learning, pp. 2790\u20132799 (2019). PMLR"},{"key":"1878_CR37","first-page":"23716","volume":"35","author":"J-B Alayrac","year":"2022","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1878_CR38","doi-asserted-by":"crossref","unstructured":"Sung, Y.-L., Cho, J., Bansal, M.: Vl-adapter: Parameter-efficient transfer learning for vision-and-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5227\u20135237 (2022)","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"1878_CR39","doi-asserted-by":"crossref","unstructured":"Hasegawa, T., Nishida, K., Maeda, K., Saito, K.: Duet: Image-text contrastive transfer learning with dual-adapter tuning. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 13607\u201313624 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.839"},{"key":"1878_CR40","unstructured":"Jie, S., Deng, Z.-H.: Convolutional bypasses are better vision transformer adapters. arXiv preprint arXiv:2207.07039 (2022)"},{"key":"1878_CR41","unstructured":"Seputis, D., Mihailov, S., Chatterjee, S., Xiao, Z.: Multi-modal adapter for vision-language models. arXiv preprint arXiv:2409.02958 (2024)"},{"key":"1878_CR42","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). Ieee","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1878_CR43","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, pp. 178\u2013178 (2004). IEEE"},{"key":"1878_CR44","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505 (2012). IEEE","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"1878_CR45","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3d object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 554\u2013561 (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"1878_CR46","doi-asserted-by":"crossref","unstructured":"Nilsback, M.-E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729 (2008). IEEE","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"1878_CR47","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., Van\u00a0Gool, L.: Food-101\u2013mining discriminative components with random forests. In: Computer vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part VI 13, pp. 446\u2013461 (2014). Springer","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"1878_CR48","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)"},{"key":"1878_CR49","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A.: Sun database: Large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492 (2010). IEEE","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"1878_CR50","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3606\u20133613 (2014)","DOI":"10.1109\/CVPR.2014.461"},{"issue":"7","key":"1878_CR51","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., Borth, D.: Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing 12(7), 2217\u20132226 (2019)","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"1878_CR52","unstructured":"Soomro, K.: Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"1878_CR53","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do imagenet classifiers generalize to imagenet? In: International Conference on Machine Learning, pp. 5389\u20135400 (2019). PMLR"},{"key":"1878_CR54","unstructured":"Wang, H., Ge, S., Lipton, Z., Xing, E.P.: Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems 32 (2019)"},{"key":"1878_CR55","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D.: Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15262\u201315271 (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"1878_CR56","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Basart, S., Mu, N., Kadavath, S., Wang, F., Dorundo, E., Desai, R., Zhu, T., Parajuli, S., Guo, M., et al.: The many faces of robustness: A critical analysis of out-of-distribution generalization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8340\u20138349 (2021)","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"1878_CR57","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"1878_CR58","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"1878_CR59","doi-asserted-by":"crossref","unstructured":"Li, D., Wang, Z., Chen, Y., Jiang, R., Ding, W., Okumura, M.: A survey on deep active learning: Recent advances and new frontiers. IEEE Transactions on Neural Networks and Learning Systems (2024)","DOI":"10.1109\/TNNLS.2024.3396463"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01878-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01878-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01878-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:03:18Z","timestamp":1757926998000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01878-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,7]]},"references-count":59,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1878"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01878-3","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,7]]},"assertion":[{"value":"19 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"299"}}