{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T18:06:32Z","timestamp":1779818792763,"version":"3.53.1"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032060655","type":"print"},{"value":"9783032060662","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,4]],"date-time":"2025-10-04T00:00:00Z","timestamp":1759536000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,4]],"date-time":"2025-10-04T00:00:00Z","timestamp":1759536000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-06066-2_22","type":"book-chapter","created":{"date-parts":[[2025,10,3]],"date-time":"2025-10-03T17:24:20Z","timestamp":1759512260000},"page":"369-386","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Self-generated Cross-Modal Prompt Tuning"],"prefix":"10.1007","author":[{"given":"Guiming","family":"Cao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zonghan","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huan","family":"Huo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuming","family":"Ou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guandong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,10,4]]},"reference":[{"key":"22_CR1","doi-asserted-by":"publisher","first-page":"446","DOI":"10.1007\/978-3-319-10599-4_29","volume-title":"Computer Vision - ECCV 2014","author":"L Bossard","year":"2014","unstructured":"Bossard, L., Guillaumin, M., Van Gool, L.: Food-101 - mining discriminative components with random forests. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision - ECCV 2014, pp. 446\u2013461. Springer International Publishing, Cham (2014)"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., Kholy, A.E., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: Uniter: Universal image-text representation learning (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Cho, E., Kim, J., Kim, H.J.: Distribution-aware prompt tuning for vision-language models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 22004\u201322013 (October 2023)","DOI":"10.1109\/ICCV51070.2023.02011"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild (2013)","DOI":"10.1109\/CVPR.2014.461"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Xia, G.S., Dai, D.: Decoupling zero-shot semantic segmentation (2022)","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"22_CR7","doi-asserted-by":"publisher","unstructured":"Fei-Fei, L., Fergus, R., Perona, P.: Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, pp. 178\u2013178 (2004). https:\/\/doi.org\/10.1109\/CVPR.2004.383","DOI":"10.1109\/CVPR.2004.383"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Feng, C., et al.: Promptdet: Towards open-vocabulary detection using uncurated images (2022)","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"22_CR9","unstructured":"Gao, P., et al.: Clip-adapter: Better vision-language models with feature adapters (2021)"},{"key":"22_CR10","unstructured":"Gao, P., Lu, J., Li, H., Mottaghi, R., Kembhavi, A.: Container: Context aggregation network (2021)"},{"key":"22_CR11","doi-asserted-by":"crossref","unstructured":"Helber, P., Bischke, B., Dengel, A., Borth, D.: Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification (2019)","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., et al.: The many faces of robustness: a critical analysis of out-of-distribution generalization (2021)","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D.: Natural adversarial examples (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"22_CR14","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision (2021)"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Wasim, S.T., Naseer, M., Khan, S., Yang, M.H., Khan, F.S.: Self-regulating prompts: Foundational model adaptation without forgetting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 15190\u201315200 (October 2023)","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"22_CR17","unstructured":"Kim, J.H., Jun, J., Zhang, B.T.: Bilinear attention networks (2018)"},{"key":"22_CR18","doi-asserted-by":"publisher","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: 2013 IEEE International Conference on Computer Vision Workshops, pp. 554\u2013561 (2013). https:\/\/doi.org\/10.1109\/ICCVW.2013.77","DOI":"10.1109\/ICCVW.2013.77"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Lee, D., Song, S., Suh, J., Choi, J., Lee, S., Kim, H.J.: Read-only prompt optimization for vision-language few-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1401\u20131411 (2023)","DOI":"10.1109\/ICCV51070.2023.00135"},{"key":"22_CR20","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation (2022)"},{"key":"22_CR21","doi-asserted-by":"crossref","unstructured":"Li, Z., Li, X., Fu, X., Zhang, X., Wang, W., Chen, S., Yang, J.: Promptkd: unsupervised prompt distillation for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26617\u201326626 (2024)","DOI":"10.1109\/CVPR52733.2024.02513"},{"key":"22_CR22","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks (2019)"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S., Anwer, R.M., Yang, M.H.: Class-agnostic object detection with multi-modal transformer (2022)","DOI":"10.1007\/978-3-031-20080-9_30"},{"key":"22_CR24","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft (2013)"},{"key":"22_CR25","doi-asserted-by":"publisher","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics and Image Processing, pp. 722\u2013729 (2008).https:\/\/doi.org\/10.1109\/ICVGIP.2008.47","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C.V.: Cats and dogs. 2012 IEEE Conference on Computer Vision and Pattern Recognition pp. 3498\u20133505 (2012). https:\/\/api.semanticscholar.org\/CorpusID:383200","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Qian, Q., Xu, Y., Hu, J.: Intra-modal proxy learning for zero-shot visual categorization with clip. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a036, pp. 25461\u201325474. Curran Associates, Inc. (2023)","DOI":"10.52202\/075280-1107"},{"key":"22_CR28","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"22_CR29","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: Language-guided dense prediction with context-aware prompting (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"22_CR30","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do imagenet classifiers generalize to imagenet? (2019)"},{"key":"22_CR31","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: A dataset of 101 human actions classes from videos in the wild (2012)"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: Learning cross-modality encoder representations from transformers (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"22_CR33","unstructured":"Wang, H., Ge, S., Xing, E.P., Lipton, Z.C.: Learning robust global representations by penalizing local predictive power (2019)"},{"key":"22_CR34","doi-asserted-by":"crossref","unstructured":"Wang, Y., Jiang, X., Cheng, D., Li, D., Zhao, C.: Learning hierarchical prompt with structured linguistic knowledge for vision-language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 5749\u20135757 (2024)","DOI":"10.1609\/aaai.v38i6.28387"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A.: Sun database: Large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492 (2010)","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C.: Visual-language prompt tuning with knowledge-guided context optimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6757\u20136767 (June 2023)","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"22_CR37","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C.: Tcp:textual-based class-aware prompt tuning for visual-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23438\u201323448 (June 2024)","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"22_CR38","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q.: Deep modular co-attention networks for visual question answering (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"22_CR39","doi-asserted-by":"crossref","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-Vocabulary DETR with Conditional Matching, pp. 106\u2013122. Springer Nature Switzerland (2022)","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"22_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, J., Wu, S., Gao, L., Shen, H.T., Song, J.: Dept: decoupled prompt tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12924\u201312933 (June 2024)","DOI":"10.1109\/CVPR52733.2024.01228"},{"key":"22_CR41","unstructured":"Zhang, R., et al.: Tip-adapter: Training-free clip-adapter for better vision-language modeling (2021)"},{"key":"22_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"22_CR43","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"22_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, B., Niu, Y., Han, Y., Wu, Y., Zhang, H.: Prompt-aligned gradient for prompt tuning (2024)","DOI":"10.1109\/ICCV51070.2023.01435"}],"container-title":["Lecture Notes in Computer Science","Machine Learning and Knowledge Discovery in Databases. Research Track"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-06066-2_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T17:42:03Z","timestamp":1779817323000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-06066-2_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,4]]},"ISBN":["9783032060655","9783032060662"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-06066-2_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,4]]},"assertion":[{"value":"4 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECML PKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Joint European Conference on Machine Learning and Knowledge Discovery in Databases","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Porto","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecml2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecmlpkdd.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}