{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:53Z","timestamp":1781539013475,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Science and Technology Research Project of Department of Education of Hubei Province","award":["D20241403"],"award-info":[{"award-number":["D20241403"]}]},{"name":"National Natural Science Foundation of China","award":["62306106"],"award-info":[{"award-number":["62306106"]}]},{"name":"Startup Fund of Ningbo University","award":["ZX2024000433"],"award-info":[{"award-number":["ZX2024000433"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810876","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"893-901","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning A Bank of Transferable Prompts for Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3415-1822","authenticated-orcid":false,"given":"Jiong","family":"Wang","sequence":"first","affiliation":[{"name":"Hubei University of Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9983-8377","authenticated-orcid":false,"given":"Zhongwei","family":"Huang","sequence":"additional","affiliation":[{"name":"Hubei University of Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6016-6545","authenticated-orcid":false,"given":"Chong","family":"Wang","sequence":"additional","affiliation":[{"name":"Ningbo University, Ningbo, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3465-354X","authenticated-orcid":false,"given":"Endai","family":"Huang","sequence":"additional","affiliation":[{"name":"Ningbo University, Ningbo, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1699-8656","authenticated-orcid":false,"given":"Ran","family":"Zhou","sequence":"additional","affiliation":[{"name":"Hubei University of Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6103-1797","authenticated-orcid":false,"given":"Haitao","family":"Gan","sequence":"additional","affiliation":[{"name":"Hubei University of Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3475-6186","authenticated-orcid":false,"given":"Yingying","family":"Zhu","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0217-2469","authenticated-orcid":false,"given":"Xiaoyu","family":"Shen","sequence":"additional","affiliation":[{"name":"Eastern Institute of Technology, Ningbo, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"ECCV","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc\u00a0Van Gool. 2014. Food-101\u2013mining discriminative components with random forests. In ECCV."},{"key":"e_1_3_3_1_3_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Chen Guangyi","unstructured":"Guangyi Chen, Weiran Yao, Xiangchen Song, Xinyue Li, Yongming Rao, and Kun Zhang. [n. d.]. PLOT: Prompt Learning with Optimal Transport for Vision-Language Models. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_1_4_2","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems","author":"Cheng Silin","unstructured":"Silin Cheng and Kai Han. [n. d.]. VaMP: Variational Multi-Modal Prompt Learning for Vision-Language Models. In The Thirty-ninth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01398"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.5555\/1032643.1033069"},{"key":"e_1_3_3_1_10_2","unstructured":"Peng Gao Shijie Geng Renrui Zhang Teli Ma Rongyao Fang Yongfeng Zhang Hongsheng Li and Yu Qiao. 2024. Clip-adapter: Better vision-language models with feature adapters. IJCV (2024)."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02329"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Patrick Helber Benjamin Bischke Andreas Dengel and Damian Borth. 2019. Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing (2019).","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01073"},{"key":"e_1_3_3_1_16_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Ilharco Gabriel","unstructured":"Gabriel Ilharco, Marco\u00a0Tulio Ribeiro, Mitchell Wortsman, Ludwig Schmidt, Hannaneh Hajishirzi, and Ali Farhadi. [n. d.]. Editing models with task arithmetic. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_1_17_2","volume-title":"ICML","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"e_1_3_3_1_26_2","unstructured":"Subhransu Maji Esa Rahtu Juho Kannala Matthew Blaschko and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1306.5151 (2013)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2355061"},{"key":"e_1_3_3_1_29_2","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, and Jack Clark. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_3_1_30_2","volume-title":"ICLR","author":"Roy Shuvendu","year":"2024","unstructured":"Shuvendu Roy and Ali Etemad. 2024. Consistency-guided Prompt Learning for Vision-Language Models. In ICLR."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Fei Song Yi Li Jiangmeng Li Rui Wang Changwen Zheng Fanjiang Xu and Hui Xiong. 2026. AmPLe: Supporting Vision-Language Models via Adaptive-Debiased Ensemble Multi-Prompt Learning. International Journal of Computer Vision 134 2 (2026) 67.","DOI":"10.1007\/s11263-025-02591-4"},{"key":"e_1_3_3_1_32_2","unstructured":"Khurram Soomro Amir\u00a0Roshan Zamir and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1212.0402 (2012)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02700"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00314"},{"key":"e_1_3_3_1_35_2","volume-title":"NeurIPS","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Songwei Ge, Zachary Lipton, and Eric\u00a0P Xing. 2019. Learning robust global representations by penalizing local predictive power. In NeurIPS."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01314"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3504568"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02249"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Frederic\u00a0Z Zhang Paul Albert Cristian Rodriguez-Opazo Anton van\u00a0den Hengel and Ehsan Abbasnejad. 2024. Knowledge composition using task vectors with learned anisotropic scaling. Advances in Neural Information Processing Systems 37 (2024) 67319\u201367354.","DOI":"10.52202\/079017-2149"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-8543-2_10"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Kaiyang Zhou Jingkang Yang Chen\u00a0Change Loy and Ziwei Liu. 2022. Learning to prompt for vision-language models. IJCV (2022).","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01435"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:34:06Z","timestamp":1781537646000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810876"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":46,"alternative-id":["10.1145\/3805622.3810876","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810876","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}