{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T16:01:12Z","timestamp":1774540872555,"version":"3.50.1"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729669","type":"print"},{"value":"9783031729676","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72967-6_19","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:09:08Z","timestamp":1730574548000},"page":"340-356","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["SDPT: Synchronous Dual Prompt Tuning for\u00a0Fusion-Based Visual-Language Pre-trained 
Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2848-7642","authenticated-orcid":false,"given":"Yang","family":"Zhou","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5631-164X","authenticated-orcid":false,"given":"Yongjian","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7212-6595","authenticated-orcid":false,"given":"Jiya","family":"Saiyin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6979-0459","authenticated-orcid":false,"given":"Bingzheng","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Maode","family":"Lai","sequence":"additional","affiliation":[]},{"given":"Eric","family":"Chang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2636-7594","authenticated-orcid":false,"given":"Yan","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"19_CR1","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"issue":"1","key":"19_CR3","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"FL Chen","year":"2023","unstructured":"Chen, F.L., et al.: VLP: a survey on vision-language pre-training. Mach. Intell. Res. 20(1), 38\u201356 (2023)","journal-title":"Mach. Intell. Res."},{"key":"19_CR4","first-page":"16664","volume":"35","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: Adaptformer: adapting vision transformers for scalable visual recognition. Adv. Neural. Inf. Process. Syst. 35, 16664\u201316678 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Chowdhury, S., Nag, S., Manocha, D.: Apollo: unified adapter and prompt learning for vision language models. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 10173\u201310187 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.629"},{"key":"19_CR6","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"issue":"7","key":"19_CR7","doi-asserted-by":"publisher","first-page":"4032","DOI":"10.1109\/TGRS.2012.2228275","volume":"51","author":"I D\u00f3pido","year":"2013","unstructured":"D\u00f3pido, I., Li, J., Marpu, P.R., Plaza, A., Dias, J.M.B., Benediktsson, J.A.: Semisupervised self-learning for hyperspectral image classification. IEEE Trans. Geosci. Remote Sens. 51(7), 4032\u20134044 (2013)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"19_CR8","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arxiv 2020. 
arXiv preprint arXiv:2010.11929 (2020)"},{"key":"19_CR9","first-page":"32942","volume":"35","author":"ZY Dou","year":"2022","unstructured":"Dou, Z.Y., et al.: Coarse-to-fine vision-language pre-training with fusion in the backbone. Adv. Neural. Inf. Process. Syst. 35, 32942\u201332956 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Gao, P., et al.: Clip-adapter: better vision-language models with feature adapters. Int. J. Comput. Vision 1\u201315 (2023)","DOI":"10.1007\/s11263-023-01891-x"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"19_CR12","unstructured":"He, J., Zhou, C., Ma, X., Berg-Kirkpatrick, T., Neubig, G.: Towards a unified view of parameter-efficient transfer learning. arXiv preprint arXiv:2110.04366 (2021)"},{"key":"19_CR13","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: International Conference on Machine Learning, pp. 2790\u20132799. PMLR (2019)"},{"key":"19_CR14","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Jain, A., Tancik, M., Abbeel, P.: Putting nerf on a diet: semantically consistent few-shot view synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5885\u20135894 (2021)","DOI":"10.1109\/ICCV48922.2021.00583"},{"key":"19_CR16","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. 
PMLR (2021)"},{"key":"19_CR17","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1007\/978-3-031-19827-4_41","volume-title":"European Conference on Computer Vision","author":"M Jia","year":"2022","unstructured":"Jia, M., et al.: Visual prompt tuning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13693, pp. 709\u2013727. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_41"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"19_CR20","first-page":"9287","volume":"35","author":"C Li","year":"2022","unstructured":"Li, C., et al.: Elevater: a benchmark and toolkit for evaluating language-augmented visual models. Adv. Neural. Inf. Process. Syst. 35, 9287\u20139301 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR21","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Li, L.H., et al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., Quan, R., Zhu, L., Yang, Y.: Efficient multimodal fusion via interactive prompting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2604\u20132613 (2023)","DOI":"10.1109\/CVPR52729.2023.00256"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Gligen: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"19_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., Lischinski, D.: Styleclip: text-driven manipulation of stylegan imagery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
2085\u20132094 (2021)","DOI":"10.1109\/ICCV48922.2021.00209"},{"issue":"8","key":"19_CR29","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: Glue: a multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 568\u2013578 (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"19_CR32","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"693","DOI":"10.1007\/978-3-031-43987-2_67","volume-title":"MICCAI 2023","author":"Y Wu","year":"2023","unstructured":"Wu, Y., et al.: Zero-shot nuclei detection via visual-language pre-trained models. In: Greenspan, H., et al. (eds.) MICCAI 2023. LNCS, vol. 14225, pp. 693\u2013703. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-43987-2_67"},{"key":"19_CR33","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Xing, Y., et al.: Dual modality prompt tuning for vision-language pre-trained model. IEEE Trans. 
Multimedia (2023)","DOI":"10.1109\/TMM.2023.3291588"},{"key":"19_CR35","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token VIT: training vision transformers from scratch on imagenet. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 558\u2013567 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"19_CR37","unstructured":"Zaken, E.B., Ravfogel, S., Goldberg, Y.: Bitfit: simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199 (2021)"},{"key":"19_CR38","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225 (2022)"},{"key":"19_CR39","first-page":"36067","volume":"35","author":"H Zhang","year":"2022","unstructured":"Zhang, H., et al.: Glipv2: unifying localization and vision-language understanding. Adv. Neural. Inf. Process. Syst. 35, 36067\u201336080 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"issue":"9","key":"19_CR41","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. 
Vision"},{"issue":"1","key":"19_CR42","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1109\/JPROC.2020.3004555","volume":"109","author":"F Zhuang","year":"2020","unstructured":"Zhuang, F., et al.: A comprehensive survey on transfer learning. Proc. IEEE 109(1), 43\u201376 (2020)","journal-title":"Proc. IEEE"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72967-6_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:15:41Z","timestamp":1730574941000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72967-6_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031729669","9783031729676"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72967-6_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}