{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:15:26Z","timestamp":1755839726392,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658049","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"46-54","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Progressive Multi-modal Conditional Prompt Tuning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0966-2006","authenticated-orcid":false,"given":"Xiaoyu","family":"Qiu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8127-6639","authenticated-orcid":false,"given":"Hao","family":"Feng","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0965-584X","authenticated-orcid":false,"given":"Yuechen","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2188-3028","authenticated-orcid":false,"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Giuseppe Amato Paolo Bolettieri Fabio Carrara Fabrizio Falchi Claudio Gennaro Nicola Messina Lucia Vadicamo and Claudio Vairo. 2023. VISIONE: a large-scale video retrieval system with advanced search functionalities. In ICMR. 649--653.","DOI":"10.1145\/3591106.3592226"},{"key":"e_1_3_2_1_2_1","volume-title":"Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274","author":"Bahng Hyojin","year":"2022","unstructured":"Hyojin Bahng, Ali Jahanian, Swami Sankaranarayanan, and Phillip Isola. 2022. Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 (2022)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Lukas Bossard Matthieu Guillaumin and Luc Van Gool. 2014. Food-101--mining discriminative components with random forests. In ECCV. 446--461.","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Mircea Cimpoi Subhransu Maji Iasonas Kokkinos Sammy Mohamed and Andrea Vedaldi. 2014. Describing textures in the wild. In CVPR. 3606--3613.","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Jia Deng Wei Dong Richard Socher Li-Jia Li Kai Li and Li Fei-Fei. 2009. ImageNet: A large-scale hierarchical image database. In CVPR. 248--255.","DOI":"10.1109\/CVPRW.2009.5206848"},{"key":"e_1_3_2_1_6_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. 
An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Yu Du Fangyun Wei Zihe Zhang Miaojing Shi Yue Gao and Guoqi Li. 2022. Learning to prompt for open-vocabulary object detection with vision-language model. In CVPR. 14084--14093.","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"e_1_3_2_1_8_1","volume-title":"Transferring image-CLIP to video-text retrieval via temporal relations. TMM","author":"Fang Han","year":"2022","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Wenhan Luo. 2022. Transferring image-CLIP to video-text retrieval via temporal relations. TMM (2022)."},{"volume-title":"Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories","author":"Fei-Fei Li","key":"e_1_3_2_1_9_1","unstructured":"Li Fei-Fei. 2004. Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In CVPRW. IEEE, 178--178."},{"key":"e_1_3_2_1_10_1","volume-title":"Recurrent Generic Contour-based Instance Segmentation with Progressive Learning. TCSVT","author":"Feng Hao","year":"2024","unstructured":"Hao Feng, Keyi Zhou, Wengang Zhou, Yufei Yin, Jiajun Deng, Qi Sun, and Houqiang Li. 2024. Recurrent Generic Contour-based Instance Segmentation with Progressive Learning. TCSVT (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"DocScanner: Robust document image rectification with progressive learning. arXiv preprint arXiv:2110.14968","author":"Feng Hao","year":"2021","unstructured":"Hao Feng, Wengang Zhou, Jiajun Deng, Qi Tian, and Houqiang Li. 2021. DocScanner: Robust document image rectification with progressive learning. arXiv preprint arXiv:2110.14968 (2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"CLIP-Adapter: Better vision-language models with feature adapters. 
IJCV","author":"Gao Peng","year":"2023","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2023. CLIP-Adapter: Better vision-language models with feature adapters. IJCV (2023), 1--15."},{"key":"e_1_3_2_1_13_1","unstructured":"Xiuye Gu Tsung-Yi Lin Weicheng Kuo and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. In ICLR."},{"key":"e_1_3_2_1_14_1","volume-title":"PTR: Prompt tuning with rules for text classification. AI Open","author":"Han Xu","year":"2022","unstructured":"Xu Han, Weilin Zhao, Ning Ding, Zhiyuan Liu, and Maosong Sun. 2022. PTR: Prompt tuning with rules for text classification. AI Open (2022), 182--192."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","volume-title":"EuroSAT: A novel dataset and deep learning benchmark for land use and land cover classification. JSTARS","author":"Helber Patrick","year":"2019","unstructured":"Patrick Helber, Benjamin Bischke, Andreas Dengel, and Damian Borth. 2019. EuroSAT: A novel dataset and deep learning benchmark for land use and land cover classification. JSTARS (2019), 2217--2226."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Dan Hendrycks Steven Basart Norman Mu Saurav Kadavath Frank Wang Evan Dorundo Rahul Desai Tyler Zhu Samyak Parajuli Mike Guo et al. 2021a. The many faces of robustness: A critical analysis of out-of-distribution generalization. In ICCV. 8340--8349.","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Dan Hendrycks Kevin Zhao Steven Basart Jacob Steinhardt and Dawn Song. 2021b. Natural adversarial examples. In CVPR. 
15262--15271.","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_19_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. 4904--4916."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Menglin Jia Luming Tang Bor-Chun Chen Claire Cardie Serge Belongie Bharath Hariharan and Ser-Nam Lim. 2022. Visual prompt tuning. In ECCV. 709--727.","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_21_1","unstructured":"Woojeong Jin Yu Cheng Yelong Shen Weizhu Chen and Xiang Ren. 2022. A good prompt is worth millions of parameters: low-resource prompt-based learning for vision-language models. In ACL. 2763--2775."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Chen Ju Tengda Han Kunhao Zheng Ya Zhang and Weidi Xie. 2022. Prompting visual-language models for efficient video understanding. In ECCV. 105--124.","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Muhammad Uzair Khattak Hanoona Rasheed Muhammad Maaz Salman Khan and Fahad Shahbaz Khan. 2023. MaPLe: Multi-modal prompt learning. In CVPR. 19113--19122.","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Jonathan Krause Michael Stark Jia Deng and Li Fei-Fei. 2013. 3D object representations for fine-grained categorization. In ICCVW. 554--561.","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Teven Le Scao and Alexander M Rush. 2021. How many data points is a prompt worth?. In NAACL. 2627--2636.","DOI":"10.18653\/v1\/2021.naacl-main.208"},{"key":"e_1_3_2_1_26_1","unstructured":"Yi-Lun Lee Yi-Hsuan Tsai Wei-Chen Chiu and Chen-Yu Lee. 2023. Multimodal prompting with missing modalities for visual recognition. In CVPR. 
14943--14952."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Brian Lester Rami Al-Rfou and Noah Constant. 2021. The power of scale for parameter-efficient prompt tuning. In EMNLP. 3045--3059.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_2_1_28_1","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: optimizing continuous prompts for generation. In ACL. 4582--4597."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Yikang Li Jenhao Hsiao and Chiuman Ho. 2022. VideoCLIP: A cross-attention model for fast video-text retrieval task with image clip. In ICMR. 29--33.","DOI":"10.1145\/3512527.3531429"},{"key":"e_1_3_2_1_30_1","volume-title":"Zhengxiao Du, Zhilin Yang, and Jie Tang.","author":"Liu Xiao","year":"2021","unstructured":"Xiao Liu, Kaixuan Ji, Yicheng Fu, Weng Lam Tam, Zhengxiao Du, Zhilin Yang, and Jie Tang. 2021. P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)."},{"key":"e_1_3_2_1_31_1","volume-title":"GPT understands, too. AI Open","author":"Liu Xiao","year":"2023","unstructured":"Xiao Liu, Yanan Zheng, Zhengxiao Du, Ming Ding, Yujie Qian, Zhilin Yang, and Jie Tang. 2023. GPT understands, too. AI Open (2023)."},{"key":"e_1_3_2_1_32_1","unstructured":"Yuning Lu Jianzhuang Liu Yonggang Zhang Yajing Liu and Xinmei Tian. 2022. Prompt distribution learning. In CVPR. 5206--5215."},{"key":"e_1_3_2_1_33_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Swaroop Mishra Daniel Khashabi Chitta Baral and Hannaneh Hajishirzi. 2022. 
Cross-task generalization via natural language crowdsourcing instructions. In ACL. 3470--3487.","DOI":"10.18653\/v1\/2022.acl-long.244"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Maria-Elena Nilsback and Andrew Zisserman. 2008. Automated flower classification over a large number of classes. In ICVGIP. 722--729.","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Omkar M Parkhi Andrea Vedaldi Andrew Zisserman and CV Jawahar. 2012. Cats and dogs. In CVPR. 3498--3505.","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Fabio Petroni Tim Rocktäschel Sebastian Riedel Patrick Lewis Anton Bakhtin Yuxiang Wu and Alexander Miller. 2019. Language models as knowledge bases?. In EMNLP-IJCNLP. 2463--2473.","DOI":"10.18653\/v1\/D19-1250"},{"key":"e_1_3_2_1_38_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_1_39_1","unstructured":"Yongming Rao Wenliang Zhao Guangyi Chen Yansong Tang Zheng Zhu Guan Huang Jie Zhou and Jiwen Lu. 2022. DenseCLIP: Language-guided dense prediction with context-aware prompting. In CVPR. 18082--18091."},{"key":"e_1_3_2_1_40_1","unstructured":"Benjamin Recht Rebecca Roelofs Ludwig Schmidt and Vaishaal Shankar. 2019. Do ImageNet classifiers generalize to ImageNet?. In ICML. 5389--5400."},{"key":"e_1_3_2_1_41_1","unstructured":"Hengcan Shi Munawar Hayat Yicheng Wu and Jianfei Cai. 2022. ProposalCLIP: Unsupervised open-category object proposal generation via exploiting CLIP cues. In CVPR. 
9611--9620."},{"key":"e_1_3_2_1_42_1","volume-title":"Eric Wallace, and Sameer Singh.","author":"Shin Taylor","year":"2020","unstructured":"Taylor Shin, Yasaman Razeghi, Robert L Logan IV, Eric Wallace, and Sameer Singh. 2020. AutoPrompt: Eliciting knowledge from language models with automatically generated prompts. In EMNLP. 4222--4235."},{"key":"e_1_3_2_1_43_1","unstructured":"Manli Shu Weili Nie De-An Huang Zhiding Yu Tom Goldstein Anima Anandkumar and Chaowei Xiao. 2022. Test-time prompt tuning for zero-Shot generalization in vision-language models. In NeurIPS."},{"key":"e_1_3_2_1_44_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_45_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS. 6000--6010."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Eric Wallace Shi Feng Nikhil Kandpal Matt Gardner and Sameer Singh. 2019. Universal adversarial triggers for attacking and analyzing NLP. In EMNLP-IJCNLP. 2153--2162.","DOI":"10.18653\/v1\/D19-1221"},{"key":"e_1_3_2_1_47_1","unstructured":"Haohan Wang Songwei Ge Zachary Lipton and Eric P Xing. 2019. Learning robust global representations by penalizing local predictive power. NeurIPS."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Jianxiong Xiao James Hays Krista A Ehinger Aude Oliva and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In CVPR. 3485--3492.","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Hantao Yao Rui Zhang and Changsheng Xu. 2023. 
Visual-language prompt tuning with knowledge-guided context optimization. In CVPR. 6757--6767.","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"e_1_3_2_1_50_1","volume-title":"Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, et al. 2021. Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)."},{"key":"e_1_3_2_1_51_1","volume-title":"Tip-Adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930","author":"Zhang Renrui","year":"2021","unstructured":"Renrui Zhang, Rongyao Fang, Wei Zhang, Peng Gao, Kunchang Li, Jifeng Dai, Yu Qiao, and Hongsheng Li. 2021. Tip-Adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930 (2021)."},{"key":"e_1_3_2_1_52_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022a. Conditional prompt learning for vision-language models. In CVPR. 16816--16825."},{"key":"e_1_3_2_1_53_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022b. Learning to prompt for vision-language models. IJCV (2022), 2337--2348."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Beier Zhu Yulei Niu Yucheng Han Yue Wu and Hanwang Zhang. 2023. Prompt-aligned gradient for prompt tuning. In ICCV. 15659--15669.","DOI":"10.1109\/ICCV51070.2023.01435"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Yaoxin Zhuo Yikang Li Jenhao Hsiao Chiuman Ho and Baoxin Li. 2022. CLIP4Hashing: unsupervised deep hashing for cross-modal video-text retrieval. In ICMR. 
158--166.","DOI":"10.1145\/3512527.3531381"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658049","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658049","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:51:59Z","timestamp":1755766319000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658049"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":55,"alternative-id":["10.1145\/3652583.3658049","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658049","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}