{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T20:03:21Z","timestamp":1781553801967,"version":"3.54.5"},"reference-count":59,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,8,6]],"date-time":"2024-08-06T00:00:00Z","timestamp":1722902400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,6]],"date-time":"2024-08-06T00:00:00Z","timestamp":1722902400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","award":["2022ZD0160900"],"award-info":[{"award-number":["2022ZD0160900"]}],"id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010905","name":"Major Research Plan","doi-asserted-by":"publisher","award":["62076119"],"award-info":[{"award-number":["62076119"]}],"id":[{"id":"10.13039\/501100010905","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s11263-024-02172-x","type":"journal-article","created":{"date-parts":[[2024,8,6]],"date-time":"2024-08-06T11:02:56Z","timestamp":1722942176000},"page":"511-526","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":39,"title":["Progressive Visual Prompt Learning with Contrastive Feature Re-formation"],"prefix":"10.1007","volume":"133","author":[{"given":"Chen","family":"Xu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuhan","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haocheng","family":"Shen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Boheng","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yixuan","family":"Liao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoxin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3674-7718","authenticated-orcid":false,"given":"Limin","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,8,6]]},"reference":[{"key":"2172_CR1","unstructured":"Bahng H, Jahanian A, Sankaranarayanan S, Isola P (2022) Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274"},{"key":"2172_CR2","doi-asserted-by":"crossref","unstructured":"Bossard L, Guillaumin M, Gool LV (2014) Food-101\u2013mining discriminative components with random forests. In: ECCV, pp 446\u2013461","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"2172_CR3","doi-asserted-by":"crossref","unstructured":"Bossard L, Guillaumin M, Gool LV (2014) Food-101\u2013mining discriminative components with random forests. In: ECCV, pp 446\u2013461","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"2172_CR4","doi-asserted-by":"crossref","unstructured":"Cimpoi M, Maji S, Kokkinos I, Mohamed S, Vedaldi A (2014) Describing textures in the wild. In: CVPR, pp 3606\u20133613","DOI":"10.1109\/CVPR.2014.461"},{"key":"2172_CR5","doi-asserted-by":"crossref","unstructured":"Conde MV, Turgutlu K (2021) Clip-art: contrastive pre-training for fine-grained art classification. In: CVPR, pp 3956\u20133960","DOI":"10.1109\/CVPRW53098.2021.00444"},{"key":"2172_CR6","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: CVPR, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2172_CR7","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et\u00a0al (2021) An image is worth 16x16 words: Transformers for image recognition at scale. In: ICLR"},{"key":"2172_CR8","doi-asserted-by":"crossref","unstructured":"Du Y, Wei F, Zhang Z, Shi M, Gao Y, Li G (2022) Learning to prompt for open-vocabulary object detection with vision-language model. In: CVPR, pp 14084\u201314093","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"2172_CR9","doi-asserted-by":"crossref","unstructured":"Fei-Fei L, Fergus R, Perona P (2004) Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In: CVPR-W, pp 178\u2013178","DOI":"10.1109\/CVPR.2004.383"},{"key":"2172_CR10","unstructured":"Gao P, Geng S, Zhang R, Ma T, Fang R, Zhang Y, Li H, Qiao Y (2021) Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544"},{"key":"2172_CR11","doi-asserted-by":"crossref","unstructured":"Ghiasi G, Gu X, Cui Y, Lin TY (2022) Scaling open-vocabulary image segmentation with image-level labels. In: ECCV, pp 540\u2013557","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"2172_CR12","unstructured":"Glorot X, Bengio Y (2010) Understanding the difficulty of training deep feedforward neural networks. In: AISTATS, pp 249\u2013256"},{"key":"2172_CR13","unstructured":"Gu X, Lin TY, Kuo W, Cui Y (2021) Open-vocabulary object detection via vision and language knowledge distillation. In: ICLR"},{"key":"2172_CR14","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: CVPR, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"7","key":"2172_CR15","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., & Borth, D. (2019). Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 12(7), 2217\u20132226.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2172_CR16","unstructured":"Hinton G, Vinyals O, Dean J (2015) Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 2"},{"key":"2172_CR17","doi-asserted-by":"crossref","unstructured":"Hu X, Tang K, Miao C, Hua XS, Zhang H (2021) Distilling causal effect of data in class-incremental learning. In: CVPR, pp 3957\u20133966","DOI":"10.1109\/CVPR46437.2021.00395"},{"key":"2172_CR18","unstructured":"Jia C, Yang Y, Xia Y, Chen YT, Parekh Z, Pham H, Le Q, Sung YH, Li Z, Duerig T (2021) Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML, pp 4904\u20134916"},{"key":"2172_CR19","doi-asserted-by":"crossref","unstructured":"Jia M, Tang L, Chen BC, Cardie C, Belongie S, Hariharan B, Lim SN (2022) Visual prompt tuning. In: ECCV","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"2172_CR20","doi-asserted-by":"crossref","unstructured":"Khattak MU, Rasheed H, Maaz M, Khan S, Khan FS (2023) Maple: Multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 19113\u201319122","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"2172_CR21","unstructured":"Kim W, Son B, Kim I (2021) Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, PMLR, pp 5583\u20135594"},{"key":"2172_CR22","doi-asserted-by":"crossref","unstructured":"Krause J, Stark M, Deng J, Fei-Fei L (2013) 3d object representations for fine-grained categorization. In: ICCV-W, pp 554\u2013561","DOI":"10.1109\/ICCVW.2013.77"},{"key":"2172_CR23","unstructured":"Li B, Weinberger KQ, Belongie S, Koltun V, Ranftl R (2021a) Language-driven semantic segmentation. In: ICLR"},{"key":"2172_CR24","doi-asserted-by":"crossref","unstructured":"Li XL, Liang P (2021) Prefix-tuning: Optimizing continuous prompts for generation. In: ACL, pp 4582\u20134597","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"2172_CR25","unstructured":"Li Y, Liang F, Zhao L, Cui Y, Ouyang W, Shao J, Yu F, Yan J (2021b) Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. In: ICLR"},{"key":"2172_CR26","doi-asserted-by":"crossref","unstructured":"Li Z, Hoiem D (2016) Learning without forgetting. In: ECCV, pp 614\u2013629","DOI":"10.1007\/978-3-319-46493-0_37"},{"key":"2172_CR27","doi-asserted-by":"crossref","unstructured":"Liu X, Ji K, Fu Y, Du Z, Yang Z, Tang J (2021) P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"2172_CR28","doi-asserted-by":"crossref","unstructured":"Liu Y, Su Y, Liu AA, Schiele B, Sun Q (2020) Mnemonics training: Multi-class incremental learning without forgetting. In: CVPR, pp 12245\u201312254","DOI":"10.1109\/CVPR42600.2020.01226"},{"key":"2172_CR29","unstructured":"Lu J, Batra D, Parikh D, Lee S (2019) Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32"},{"key":"2172_CR30","doi-asserted-by":"crossref","unstructured":"Lu Y, Liu J, Zhang Y, Liu Y, Tian X (2022) Prompt distribution learning. In: CVPR, pp 5206\u20135215","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"2172_CR31","unstructured":"Maji S, Rahtu E, Kannala J, Blaschko M, Vedaldi A (2013) Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151"},{"key":"2172_CR32","unstructured":"Mokady R, Hertz A, Bermano AH (2021) Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734"},{"key":"2172_CR33","doi-asserted-by":"crossref","unstructured":"Nilsback ME, Zisserman A (2008) Automated flower classification over a large number of classes. In: ICVGIP, pp 722\u2013729","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"2172_CR34","doi-asserted-by":"crossref","unstructured":"Parkhi OM, Vedaldi A, Zisserman A, Jawahar C (2012) Cats and dogs. In: CVPR, pp 3498\u20133505","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"2172_CR35","unstructured":"Phuong M, Lampert C (2019) Towards understanding knowledge distillation. In: ICML, pp 5142\u20135151"},{"key":"2172_CR36","doi-asserted-by":"crossref","unstructured":"Qin C, Joty S (2022) Continual few-shot relation learning via embedding space regularization and data augmentation. In: ACL, pp 2776\u20132789","DOI":"10.18653\/v1\/2022.acl-long.198"},{"key":"2172_CR37","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: ICML, pp 8748\u20138763"},{"key":"2172_CR38","doi-asserted-by":"crossref","unstructured":"Rebuffi SA, Kolesnikov A, Sperl G, Lampert CH (2017) icarl: Incremental classifier and representation learning. In: CVPR, pp 2001\u20132010","DOI":"10.1109\/CVPR.2017.587"},{"key":"2172_CR39","doi-asserted-by":"crossref","unstructured":"Ren S, Li L, Ren X, Zhao G, Sun X (2022) Rethinking the openness of clip. arXiv preprint arXiv:2206.01986","DOI":"10.18653\/v1\/2023.findings-acl.610"},{"key":"2172_CR40","unstructured":"Riemer M, Cases I, Ajemian R, Liu M, Rish I, Tu Y, Tesauro G (2019) Learning to learn without forgetting by maximizing transfer and minimizing interference. In: ICLR"},{"key":"2172_CR41","unstructured":"Shen S, Li LH, Tan H, Bansal M, Rohrbach A, Chang KW, Yao Z, Keutzer K (2021) How much can clip benefit vision-and-language tasks? In: ICLR"},{"key":"2172_CR42","doi-asserted-by":"crossref","unstructured":"Shi H, Hayat M, Wu Y, Cai J (2022) Proposalclip: Unsupervised open-category object proposal generation via exploiting clip cues. In: CVPR, pp 9611\u20139620","DOI":"10.1109\/CVPR52688.2022.00939"},{"key":"2172_CR43","unstructured":"Shu M, Nie W, Huang DA, Yu Z, Goldstein T, Anandkumar A, Xiao C (2022) Test-time prompt tuning for zero-shot generalization in vision-language models. arXiv preprint arXiv:2209.07511"},{"key":"2172_CR44","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"2172_CR45","unstructured":"Su W, Zhu X, Cao Y, Li B, Lu L, Wei F, Dai J (2019) Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530"},{"key":"2172_CR46","unstructured":"Sun Q, Fang Y, Wu L, Wang X, Cao Y (2023) Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389"},{"key":"2172_CR47","doi-asserted-by":"crossref","unstructured":"Tan H, Bansal M (2019) Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","DOI":"10.18653\/v1\/D19-1514"},{"key":"2172_CR48","doi-asserted-by":"crossref","unstructured":"Tang M, Wang Z, Liu Z, Rao F, Li D, Li X (2021) Clip4caption: Clip for video caption. In: ACM International Conference on Multimedia, pp 4858\u20134862","DOI":"10.1145\/3474085.3479207"},{"key":"2172_CR49","first-page":"5998","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141, & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30, 5998\u20136008.","journal-title":"Advances in neural information processing systems"},{"key":"2172_CR50","unstructured":"Wang Y, Huang Z, Hong X (2022a) S-prompts learning with pre-trained transformers: An occam\u2019s razor for domain incremental learning. arXiv preprint arXiv:2207.12819"},{"key":"2172_CR51","doi-asserted-by":"crossref","unstructured":"Wang Z, Lu Y, Li Q, Tao X, Guo Y, Gong M, Liu T (2022b) Cris: Clip-driven referring image segmentation. In: CVPR, pp 11686\u201311695","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"2172_CR52","doi-asserted-by":"crossref","unstructured":"Wang Z, Zhang Z, Lee CY, Zhang H, Sun R, Ren X, Su G, Perot V, Dy J, Pfister T (2022c) Learning to prompt for continual learning. In: CVPR, pp 139\u2013149","DOI":"10.1109\/CVPR52688.2022.00024"},{"key":"2172_CR53","doi-asserted-by":"crossref","unstructured":"Xiao J, Hays J, Ehinger KA, Oliva A, Torralba A (2010) Sun database: Large-scale scene recognition from abbey to zoo. In: CVPR, pp 3485\u20133492","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"2172_CR54","unstructured":"Yuan L, Chen D, Chen YL, Codella N, Dai X, Gao J, Hu H, Huang X, Li B, Li C, et\u00a0al (2021) Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432"},{"key":"2172_CR55","doi-asserted-by":"crossref","unstructured":"Zhai X, Wang X, Mustafa B, Steiner A, Keysers D, Kolesnikov A, Beyer L (2022) Lit: Zero-shot transfer with locked-image text tuning. In: CVPR, pp 18123\u201318133","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"2172_CR56","unstructured":"Zhang R, Fang R, Gao P, Zhang W, Li K, Dai J, Qiao Y, Li H (2021) Tip-adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930"},{"key":"2172_CR57","doi-asserted-by":"crossref","unstructured":"Zhou K, Yang J, Loy CC, Liu Z (2022a) Conditional prompt learning for vision-language models. In: CVPR, pp 16816\u201316825","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"2172_CR58","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"},{"key":"2172_CR59","doi-asserted-by":"crossref","unstructured":"Zhu B, Niu Y, Han Y, Wu Y, Zhang H (2022) Prompt-aligned gradient for prompt tuning. arXiv preprint arXiv:2205.14865","DOI":"10.1109\/ICCV51070.2023.01435"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02172-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02172-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02172-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T06:39:43Z","timestamp":1737527983000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02172-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,6]]},"references-count":59,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["2172"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02172-x","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,6]]},"assertion":[{"value":"17 January 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 June 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}