{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T02:15:49Z","timestamp":1771467349150,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"name":"Pioneer and Leading Goose R&D Program of Zhejiang","award":["2024C01020"],"award-info":[{"award-number":["2024C01020"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755636","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"4856-4865","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["AttriPrompt: Dynamic Prompt Composition Learning for CLIP"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9061-5979","authenticated-orcid":false,"given":"Qiqi","family":"Zhan","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8765-9283","authenticated-orcid":false,"given":"Shiwei","family":"Li","sequence":"additional","affiliation":[{"name":"Hangzhou Innovation Institute, Beihang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5181-6451","authenticated-orcid":false,"given":"Qingjie","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8001-2703","authenticated-orcid":false,"given":"Yunhong","family":"Wang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Visual prompting: Modifying pixel space to adapt pre-trained models. arXiv preprint arXiv:2203.17274","author":"Bahng Hyojin","year":"2022","unstructured":"Hyojin Bahng, Ali Jahanian, Swami Sankaranarayanan, and Phillip Isola. 2022. Visual prompting: Modifying pixel space to adapt pre-trained models. arXiv preprint arXiv:2203.17274, Vol. 3, 11-12 (2022), 3."},{"key":"e_1_3_2_1_2_1","first-page":"33781","article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","volume":"35","author":"Bangalath Hanoona","year":"2022","unstructured":"Hanoona Bangalath, Muhammad Maaz, Muhammad Uzair Khattak, Salman H Khan, and Fahad Shahbaz Khan. 2022. Bridging the gap between object and image-level representations for open-vocabulary detection. Advances in Neural Information Processing Systems, Vol. 35 (2022), 33781-33794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_3_1","first-page":"446","volume-title":"Switzerland","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc Van Gool. 2014. Food-101-mining discriminative components with random forests. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part VI 13. Springer, 446-461."},{"key":"e_1_3_2_1_4_1","volume-title":"International conference on machine learning. PmLR, 1597-1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PmLR, 1597-1607."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01398"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"e_1_3_2_1_9_1","volume-title":"Openprompt: An open-source framework for prompt-learning. arXiv preprint arXiv:2111.01998","author":"Ding Ning","year":"2021","unstructured":"Ning Ding, Shengding Hu, Weilin Zhao, Yulin Chen, Zhiyuan Liu, Hai-Tao Zheng, and Maosong Sun. 2021. Openprompt: An open-source framework for prompt-learning. arXiv preprint arXiv:2111.01998 (2021)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"e_1_3_2_1_13_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"e_1_3_2_1_21_1","unstructured":"Konwoo Kim Michael Laskin Igor Mordatch and Deepak Pathak. 2021. How to adapt your large-scale vision-and-language model. (2021)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_23_1","volume-title":"Language-driven Semantic Segmentation. In The Tenth International Conference on Learning Representations, ICLR 2022","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q. Weinberger, Serge J. Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven Semantic Segmentation. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022. OpenReview.net. https:\/\/openreview.net\/forum?id=RriDjddCLN"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_30"},{"key":"e_1_3_2_1_26_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617833"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 5389-5400","author":"Recht Benjamin","year":"2019","unstructured":"Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, and Vaishaal Shankar. 2019. Do imagenet classifiers generalize to imagenet?. In International conference on machine learning. PMLR, 5389-5400."},{"key":"e_1_3_2_1_34_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Learning robust global representations by penalizing local predictive power. Advances in neural information processing systems","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Songwei Ge, Zachary Lipton, and Eric P Xing. 2019. Learning robust global representations by penalizing local predictive power. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_37_1","first-page":"3485","article-title":"Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Xiao Jianxiong","year":"2010","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3485-3492.","journal-title":"IEEE"},{"key":"e_1_3_2_1_38_1","volume-title":"Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783","author":"Yao Lewei","year":"2021","unstructured":"Lewei Yao, Runhui Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2021. Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)."},{"key":"e_1_3_2_1_39_1","volume-title":"Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, et al., 2021. Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01435"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755636","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:00:07Z","timestamp":1765342807000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755636"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755636","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755636","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}