{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,22]],"date-time":"2025-07-22T10:41:25Z","timestamp":1753180885200,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819981441"},{"type":"electronic","value":"9789819981458"}],"license":[{"start":{"date-parts":[[2023,11,27]],"date-time":"2023-11-27T00:00:00Z","timestamp":1701043200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,27]],"date-time":"2023-11-27T00:00:00Z","timestamp":1701043200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-99-8145-8_7","type":"book-chapter","created":{"date-parts":[[2023,11,26]],"date-time":"2023-11-26T23:02:21Z","timestamp":1701039741000},"page":"77-89","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Efficient Prompt Tuning for Vision and Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3858-173X","authenticated-orcid":false,"given":"Bing","family":"Li","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9683-9461","authenticated-orcid":false,"given":"Feng","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6350-5770","authenticated-orcid":false,"given":"Shaokun","family":"Gao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0551-8490","authenticated-orcid":false,"given":"Qile","family":"Fan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5563-4905","authenticated-orcid":false,"given":"Yuchen","family":"Lu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7548-2475","authenticated-orcid":false,"given":"Reyu","family":"Hu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7982-3374","authenticated-orcid":false,"given":"Zhiyuan","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,27]]},"reference":[{"unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning. PMLR (2021)","key":"7_CR1"},{"unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning. PMLR (2021)","key":"7_CR2"},{"issue":"9","key":"7_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., et al.: Pre-train, prompt, and predict: a survey of prompting methods in NLP. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"doi-asserted-by":"crossref","unstructured":"Jia, M., et al.: Visual prompt tuning. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, 23\u201327 October 2022, Proceedings, Part XXXIII, pp. 709\u2013727. Springer, Cham (2022)","key":"7_CR4","DOI":"10.1007\/978-3-031-19827-4_41"},{"issue":"9","key":"7_CR5","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., et al.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"unstructured":"Zang, Y., et al.: Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225 (2022)","key":"7_CR6"},{"doi-asserted-by":"crossref","unstructured":"Shin, T., et al.: Autoprompt: eliciting knowledge from language models. arXiv preprint arXiv:2010.15980 (2020)","key":"7_CR7","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"doi-asserted-by":"crossref","unstructured":"Qin, G., Eisner, J.: Learning how to ask: Querying LMs with soft prompts. arXiv preprint arXiv:2104.06599 (2021)","key":"7_CR8","DOI":"10.18653\/v1\/2021.naacl-main.410"},{"doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","key":"7_CR9","DOI":"10.18653\/v1\/D19-1514"},{"key":"7_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"unstructured":"Lu, J., et al.: Vilbert: pretraining visiolinguistic representations for vision-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)","key":"7_CR11"},{"doi-asserted-by":"crossref","unstructured":"Zhou, K., et al.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825 (2022)","key":"7_CR12","DOI":"10.1109\/CVPR52688.2022.01631"},{"unstructured":"Sun, X., et al.: Dualcoop: fast adaptation to multi-label recognition with limited annotations. arXiv preprint arXiv:2206.09541 (2022)","key":"7_CR13"},{"unstructured":"Xing, Y., et al.: Class-aware visual prompt tuning for vision-language pre-trained model. arXiv preprint arXiv:2208.08340 (2022)","key":"7_CR14"},{"doi-asserted-by":"crossref","unstructured":"He, K., et al.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778. IEEE (2016)","key":"7_CR15","DOI":"10.1109\/CVPR.2016.90"},{"unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)","key":"7_CR16"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)","key":"7_CR17"},{"issue":"5","key":"7_CR18","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/101.34898","volume":"5","author":"N Farhat","year":"1989","unstructured":"Farhat, N.: Optoelectronic neural networks and learning machines. IEEE Circuits Devices Mag. 5(5), 32\u201341 (1989)","journal-title":"IEEE Circuits Devices Mag."},{"unstructured":"Li, Y., et al.: Supervision exists everywhere: a data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208 (2021)","key":"7_CR19"},{"issue":"1","key":"7_CR20","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1145\/584091.584093","volume":"5","author":"CE Shannon","year":"2001","unstructured":"Shannon, C.E.: A mathematical theory of communication. ACM SIGMOBILE Mob. Comput. Commun. Rev. 5(1), 3\u201355 (2001)","journal-title":"ACM SIGMOBILE Mob. Comput. Commun. Rev."},{"doi-asserted-by":"crossref","unstructured":"Deng, J., et al.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","key":"7_CR21","DOI":"10.1109\/CVPR.2009.5206848"},{"unstructured":"Fei-Fei, L., et al.: Learning generative visual models from few training examples: an incremental Bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, p. 178. IEEE (2004)","key":"7_CR22"},{"doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., et al.: Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505. IEEE (2012)","key":"7_CR23","DOI":"10.1109\/CVPR.2012.6248092"},{"doi-asserted-by":"crossref","unstructured":"Krause, J., et al.: 3D object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 554\u2013561 (2013)","key":"7_CR24","DOI":"10.1109\/ICCVW.2013.77"},{"doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729. IEEE (2008)","key":"7_CR25","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"7_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"446","DOI":"10.1007\/978-3-319-10599-4_29","volume-title":"Computer Vision \u2013 ECCV 2014","author":"L Bossard","year":"2014","unstructured":"Bossard, L., Guillaumin, M., Van Gool, L.: Food-101 \u2013 mining discriminative components with random forests. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 446\u2013461. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10599-4_29"},{"unstructured":"Maji, S., et al.: Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)","key":"7_CR27"},{"doi-asserted-by":"crossref","unstructured":"Xiao, J., et al.: Sun database: large-scale scene recognition from abbey to zoo. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492. IEEE (2010)","key":"7_CR28","DOI":"10.1109\/CVPR.2010.5539970"},{"unstructured":"Soomro, K., et al.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)","key":"7_CR29"},{"doi-asserted-by":"crossref","unstructured":"Cimpoi, M., et al.: Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3606\u20133613 (2014)","key":"7_CR30","DOI":"10.1109\/CVPR.2014.461"},{"doi-asserted-by":"crossref","unstructured":"Helber, P., et al.: Eurosat: a novel dataset and deep learning benchmark for land use and land cover classification. IEEE J. Sel. Top. Appl. Earth Observ. Remote Sens. 12(7), 2217\u20132226 (2019)","key":"7_CR31","DOI":"10.1109\/JSTARS.2019.2918242"},{"issue":"6\u20137","key":"7_CR32","doi-asserted-by":"publisher","first-page":"884","DOI":"10.1007\/s11263-018-1131-1","volume":"127","author":"S Li","year":"2019","unstructured":"Li, S., Deng, W.: Blended emotion in-the-wild: Multi-label facial expression recognition using crowdsourced annotations and deep locality feature learning. Int. J. Computer Vision 127(6\u20137), 884\u2013906 (2019)","journal-title":"Int. J. Computer Vision"}],"container-title":["Communications in Computer and Information Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-8145-8_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T18:53:48Z","timestamp":1710356028000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-8145-8_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,27]]},"ISBN":["9789819981441","9789819981458"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-8145-8_7","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2023,11,27]]},"assertion":[{"value":"27 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Changsha","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 November 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iconip2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1274","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"650","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"51% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.14","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.46","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}