{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:16Z","timestamp":1765343356878,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","funder":[{"name":"National Major Scientific Instruments and Equipments Development Project of National Natural Science Foundation of China","award":["62427820"],"award-info":[{"award-number":["62427820"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["1082204112364"],"award-info":[{"award-number":["1082204112364"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100018542","name":"Natural Science Foundation of Sichuan Province","doi-asserted-by":"publisher","award":["2024NSFSC1462"],"award-info":[{"award-number":["2024NSFSC1462"]}],"id":[{"id":"10.13039\/501100018542","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Science Fund for Creative Research Groups of Sichuan Province Natural Science Foundation","award":["2024NSFTD0035"],"award-info":[{"award-number":["2024NSFTD0035"]}]},{"name":"Natural Science Foundation of Sichuan","award":["24NSFSC3404"],"award-info":[{"award-number":["24NSFSC3404"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754713","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"5922-5931","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Dual Prompt Learning for Adapting Vision-Language Models to Downstream Image-Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5607-3265","authenticated-orcid":false,"given":"Yifan","family":"Wang","sequence":"first","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2480-878X","authenticated-orcid":false,"given":"Tao","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1749-986X","authenticated-orcid":false,"given":"Chenwei","family":"Tang","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8246-1561","authenticated-orcid":false,"given":"Caiyang","family":"Yu","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7904-9312","authenticated-orcid":false,"given":"Zhengqing","family":"Zang","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2694-7097","authenticated-orcid":false,"given":"Mengmi","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore and Deep NeuroCognition Lab, I2R and CFAR, Agency for Science, Technology and Research, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6848-5460","authenticated-orcid":false,"given":"Shudong","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6551-3884","authenticated-orcid":false,"given":"Jiancheng","family":"Lv","sequence":"additional","affiliation":[{"name":"College of Computer Science, Sichuan University, Chengdu, China and Engineering Research Center of Machine Learning and Industry Intelligence, Ministry of Education, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455679"},{"key":"e_1_3_2_1_2_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","volume-title":"MINE: Mutual Information Neural Estimation. arXiv preprint arXiv:1801.04062","author":"Belghazi Mohamed Ishmael","year":"2018","unstructured":"Mohamed Ishmael Belghazi, Aristide Baratin, Sai Rajeswar, Sherjil Ozair, Yoshua Bengio, Aaron Courville, and R Devon Hjelm. 2018. MINE: Mutual Information Neural Estimation. arXiv preprint arXiv:1801.04062 (2018)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20050-2_26"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00261"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"e_1_3_2_1_10_1","volume-title":"Domain adaptation via prompt learning","author":"Ge Chunjiang","year":"2023","unstructured":"Chunjiang Ge, Rui Huang, Mixue Xie, Zihang Lai, Shiji Song, Shuang Li, and Gao Huang. 2023. Domain adaptation via prompt learning. IEEE Transactions on Neural Networks and Learning Systems (2023), 1-11."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Machine Learning. 8109-8126","author":"Guo Yiduo","year":"2022","unstructured":"Yiduo Guo, Bing Liu, and Dongyan Zhao. 2022. Online Continual Learning through Mutual Information Maximization. In International Conference on Machine Learning. 8109-8126."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25152"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3135420"},{"key":"e_1_3_2_1_14_1","volume-title":"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In International Conference on Machine Learning. 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In International Conference on Machine Learning. 4904-4916."},{"key":"e_1_3_2_1_15_1","volume-title":"Visual Prompt Tuning. In European Conference on Computer Vision. 709-727","author":"Jia Menglin","year":"2022","unstructured":"Menglin Jia, Luming Tang, Bor-Chun Chen, Claire Cardie, Serge Belongie, Bharath Hariharan, and Ser-Nam Lim. 2022. Visual Prompt Tuning. In European Conference on Computer Vision. 709-727."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"e_1_3_2_1_17_1","first-page":"27896","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems","volume":"37","author":"Jing Dong","year":"2024","unstructured":"Dong Jing, Xiaolong He, Yutian Luo, Nanyi Fei, Wei Wei, Huiwen Zhao, Zhiwu Lu, et al., 2024. FineCLIP: Self-distilled region-based clip for better fine-grained understanding. In Proceedings of the 38th International Conference on Neural Information Processing Systems, Vol. 37. 27896-27918."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 19113-19122","author":"Muhammad Uzair","year":"2023","unstructured":"Muhammad Uzair khattak, Hanoona Rasheed, Muhammad Maaz, Salman Khan, and Fahad Shahbaz Khan. 2023. MaPLe: Multi-modal Prompt Learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 19113-19122."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevE.69.066138"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_22_1","volume-title":"Predicting Structured Data","volume":"1","author":"LeCun Yann","year":"2006","unstructured":"Yann LeCun, Sumit Chopra, Raia Hadsell, M Ranzato, Fujie Huang, et al., 2006. A Tutorial on Energy-based earning. Predicting Structured Data, Vol. 1, 0 (2006)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00135"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148470"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-Tuning: Optimizing Continuous Prompts for Generation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing Chengqing Zong Fei Xia Wenjie Li and Roberto Navigli (Eds.). 4582-4597.","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02224"},{"key":"e_1_3_2_1_28_1","volume-title":"Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision. 740-755","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e4r, and C Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision. 740-755."},{"key":"e_1_3_2_1_29_1","first-page":"21464","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","volume":"33","author":"Liu Weitang","year":"2020","unstructured":"Weitang Liu, Xiaoyun Wang, John Owens, and Yixuan Li. 2020. Energy-based Out-of-distribution Detection. In Proceedings of the 33rd International Conference on Neural Information Processing Systems, Vol. 33. 21464-21475."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01524"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657678"},{"key":"e_1_3_2_1_32_1","volume-title":"Fine-grained visual classification of aircraft. Technical report","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. Technical report (2013)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00072"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531959"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681174"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02544"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_41_1","volume-title":"VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts. arXiv preprint arXiv:2112.02399","author":"Qiu Longtian","year":"2021","unstructured":"Longtian Qiu, Renrui Zhang, Ziyu Guo, Ziyao Zeng, Zilu Guo, Yafeng Li, and Guangnan Zhang. 2021. VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts. arXiv preprint arXiv:2112.02399 (2021)."},{"key":"e_1_3_2_1_42_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. 8748-8763."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. 10299-10315","author":"Rajput Shashank","year":"2023","unstructured":"Shashank Rajput, Nikhil Mehta, Anima Singh, Raghunandan Keshavan, Trung Vu, Lukasz Heidt, Lichan Hong, Yi Tay, Vinh Q Tran, Jonah Samost, et al., 2023. Recommender systems with generative retrieval. In Proceedings of the 37th International Conference on Neural Information Processing Systems. 10299-10315."},{"key":"e_1_3_2_1_44_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01059"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 15963-15974","author":"Anasosalu Vasu Pavan Kumar","year":"2024","unstructured":"Pavan Kumar Anasosalu Vasu, Hadi Pouransari, Fartash Faghri, Raviteja Vemulapalli, and Oncel Tuzel. 2024. MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced Training. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 15963-15974."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01857"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25363"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2688133"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681556"},{"key":"e_1_3_2_1_51_1","volume-title":"Enhancing Diffusion Model Stability for Image Restoration via Gradient Management. arXiv preprint arXiv:2507.06656","author":"Wu Hongjie","year":"2025","unstructured":"Hongjie Wu, Mingqin Zhang, Linchao He, Ji-Zhe Zhou, and Jiancheng Lv. 2025. Enhancing Diffusion Model Stability for Image Restoration via Gradient Management. arXiv preprint arXiv:2507.06656 (2025)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01846"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3291588"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00273"},{"key":"e_1_3_2_1_56_1","volume-title":"FILIP: Fine-grained Interactive Language-Image Pre-Training. In The Tenth International Conference on Learning Representations.","author":"Yao Lewei","year":"2022","unstructured":"Lewei Yao, Runhui Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2022. FILIP: Fine-grained Interactive Language-Image Pre-Training. In The Tenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2024.01.004"},{"key":"e_1_3_2_1_58_1","volume-title":"GPT-NAS: Neural architecture search meets generative pre-trained transformer model. Big Data Mining and Analytics","author":"Yu Caiyang","year":"2024","unstructured":"Caiyang Yu, Xianggen Liu, Yifan Wang, Yun Liu, Wentao Feng, Xiong Deng, Chenwei Tang, and Jiancheng Lv. 2024. GPT-NAS: Neural architecture search meets generative pre-trained transformer model. Big Data Mining and Analytics (2024)."},{"key":"e_1_3_2_1_59_1","volume-title":"Rethinking neural architecture representation for predictors: Topological encoding in pixel space. Information Fusion","author":"Yu Caiyang","year":"2025","unstructured":"Caiyang Yu, Jian Wang, Yifan Wang, Wei Ju, Chenwei Tang, and Jiancheng Lv. 2025. Rethinking neural architecture representation for predictors: Topological encoding in pixel space. Information Fusion (2025)."},{"key":"e_1_3_2_1_60_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Y\u00fcksekg\u00f6n\u00fcl Mert","year":"2023","unstructured":"Mert Y\u00fcksekg\u00f6n\u00fcl, Federico Bianchi, Pratyusha Kalluri, Dan Jurafsky, and James Zou. 2023. When and Why Vision-Language Models Behave like Bags-Of-Words, and What to Do About It?. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_61_1","volume-title":"A Simple Yet Effective Multi-Modal Reward Model. arXiv preprint arXiv:2501.12368","author":"Zang Yuhang","year":"2025","unstructured":"Yuhang Zang, Xiaoyi Dong, Pan Zhang, Yuhang Cao, Ziyu Liu, Shengyuan Ding, Shenxi Wu, Yubo Ma, Haodong Duan, Wenwei Zhang, Kai Chen, Dahua Lin, and Jiaqi Wang. 2025. InternLM-XComposer2.5-Reward: A Simple Yet Effective Multi-Modal Reward Model. arXiv preprint arXiv:2501.12368 (2025)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28518"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.398"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754713","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:42Z","timestamp":1765343082000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754713"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":65,"alternative-id":["10.1145\/3746027.3754713","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754713","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}