{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T16:32:36Z","timestamp":1769185956032,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":34,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819555666","type":"print"},{"value":"9789819555673","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5567-3_1","type":"book-chapter","created":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:13:03Z","timestamp":1769116383000},"page":"3-17","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Dual-text Guided Cascading Visual Prompt for Vision-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0916-7854","authenticated-orcid":false,"given":"Zebao","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4207-0426","authenticated-orcid":false,"given":"Wenlong","family":"Niu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4193-2819","authenticated-orcid":false,"given":"Yue","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., Van\u00a0Gool, L.: Food-101 \u2013 mining discriminative components with random forests. In: Computer Vision \u2013 ECCV 2014, pp. 446\u2013461 (2014)","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild. In: In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.461"},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1_CR4","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: ICLR 2021-9th International Conference on Learning Representations (2021)"},{"key":"1_CR5","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: an incremental bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, pp. 178\u2013178 (2004)"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Feng, C., et al.: PromptDet: towards open-vocabulary detection using uncurated images. In: Computer Vision \u2013 ECCV 2022, pp. 701\u2013717 (2022)","DOI":"10.1007\/978-3-031-20077-9_41"},{"issue":"2","key":"1_CR7","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","volume":"132","author":"P Gao","year":"2024","unstructured":"Gao, P., et al.: CLIP-adapter: better vision-language models with feature adapters. Int. J. Comput. Vision 132(2), 581\u2013595 (2024)","journal-title":"Int. J. Comput. Vision"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"7","key":"1_CR9","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., Borth, D.: EuroSAT: a novel dataset and deep learning benchmark for land use and land cover classification. IEEE J. Sel. Top. Appl. Earth Observations Remote Sens. 12(7), 2217\u20132226 (2019)","journal-title":"IEEE J. Sel. Top. Appl. Earth Observations Remote Sens."},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., et al.: The many faces of robustness: a critical analysis of out-of-distribution generalization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8340\u20138349 (2021)","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D.: Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15262\u201315271 (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"1_CR12","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the 38th International Conference on Machine Learning. vol.\u00a0139, pp. 4904\u20134916. PMLR (2021)"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Jia, M., et al.: Visual prompt tuning. In: Computer Vision \u2013 ECCV 2022, pp. 709\u2013727 (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Khattak, M., Rasheed, H., Maaz, M., Khan, S., Khan, F.: MaPLe: multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) Workshops (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"1_CR16","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.: Align before fuse: vision and language representation learning with momentum distillation. In: Advances in Neural Information Processing Systems. vol.\u00a034, pp. 9694\u20139705. Curran Associates, Inc. (2021)"},{"issue":"9","key":"1_CR17","first-page":"1","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"1_CR18","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft (2013)"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Nilsback, M., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729 (2008)","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.V.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505 (2012)","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"1_CR21","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. vol.\u00a0139, pp. 8748\u20138763. PMLR (2021)"},{"key":"1_CR22","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do ImageNet classifiers generalize to imagenet? In: Proceedings of the 36th International Conference on Machine Learning. vol.\u00a097, pp. 5389\u20135400. PMLR (2019)"},{"key":"1_CR23","unstructured":"Soomro, K.: UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"1_CR24","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems. vol.\u00a030. Curran Associates, Inc. (2017)"},{"key":"1_CR25","unstructured":"Wang, H., Songwei, G., Lipton, Z., Xing, E.P.: Learning robust global representations by penalizing local predictive power. In: Advances in Neural Information Processing Systems (2019)"},{"key":"1_CR26","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: SimVLM: simple visual language model pretraining with weak supervision. In: ICLR 2022 - 10th International Conference on Learning Representations (2022)"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K., Oliva, A., Torralba, A.: Sun database: large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492 (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Unified contrastive learning in image-text-label space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19163\u201319173 (2022)","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C.: TCP: textual-based class-aware prompt tuning for visual-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23438\u201323448 (2024)","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"1_CR30","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. In: TMLR (2022)"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: LiT: zero-shot transfer with locked-image text tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18123\u201318133 (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"1_CR32","doi-asserted-by":"publisher","first-page":"1348","DOI":"10.1109\/TIP.2024.3362062","volume":"33","author":"C Zhao","year":"2024","unstructured":"Zhao, C., et al.: Learning domain invariant prompt for vision-language models. IEEE Trans. Image Process. 33, 1348\u20131360 (2024)","journal-title":"IEEE Trans. Image Process."},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"1_CR34","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5567-3_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:13:09Z","timestamp":1769116389000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5567-3_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819555666","9789819555673"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5567-3_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}