{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:13:43Z","timestamp":1775326423166,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LDT23F01013F01"],"award-info":[{"award-number":["LDT23F01013F01"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271281, 61925107, U1936202, 62021002"],"award-info":[{"award-number":["62271281, 61925107, U1936202, 62021002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611988","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"5594-5604","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["Hierarchical Prompt Learning Using CLIP for Multi-label Classification with Single Positive Labels"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0030-551X","authenticated-orcid":false,"given":"Ao","family":"Wang","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4180-5801","authenticated-orcid":false,"given":"Hui","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1390-7424","authenticated-orcid":false,"given":"Zijia","family":"Lin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2362-6802","authenticated-orcid":false,"given":"Zixuan","family":"Ding","sequence":"additional","affiliation":[{"name":"Xidian University & Zhuoxi Institute of Brain and Intelligence, Beijing & Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6031-5245","authenticated-orcid":false,"given":"Pengzhang","family":"Liu","sequence":"additional","affiliation":[{"name":"jd.com, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7816-0587","authenticated-orcid":false,"given":"Yongjun","family":"Bao","sequence":"additional","affiliation":[{"name":"jd.com, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5702-8384","authenticated-orcid":false,"given":"Weipeng","family":"Yan","sequence":"additional","affiliation":[{"name":"jd.com, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0137-9975","authenticated-orcid":false,"given":"Guiguang","family":"Ding","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19910"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00532"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00099"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01161"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00074"},{"key":"e_1_3_2_1_8_1","unstructured":"Mark Everingham 
and John Winn. 2012. The PASCAL visual object classes challenge 2012 (VOC2012) development kit. Pattern Anal. Stat. Model. Comput. Learn. Tech. Rep Vol. 2007 (2012) 1--45."},{"key":"e_1_3_2_1_9_1","volume-title":"Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723","author":"Gao Tianyu","year":"2020","unstructured":"Tianyu Gao, Adam Fisch, and Danqi Chen. 2020. Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723 (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00324"},{"key":"e_1_3_2_1_11_1","volume-title":"A patch-based architecture for multi-label classification from single label annotations. arXiv preprint arXiv:2209.06530","author":"Jouanneau Warren","year":"2022","unstructured":"Warren Jouanneau, Aur\u00e9lie Bugeau, Marc Palyart, Nicolas Papadakis, and Laurent V\u00e9zard. 2022. A patch-based architecture for multi-label classification from single label annotations. arXiv preprint arXiv:2209.06530 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Hyperspherical Learning in Multi-Label Classification. In European Conference on Computer Vision. Springer, 38--55","author":"Ke Bo","year":"2022","unstructured":"Bo Ke, Yunquan Zhu, Mengtian Li, Xiujun Shu, Ruizhi Qiao, and Bo Ren. 2022. Hyperspherical Learning in Multi-Label Classification. In European Conference on Computer Vision. Springer, 38--55."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01376"},{"key":"e_1_3_2_1_14_1","volume-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. International Journal of Computer Vision","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David A. Shamma, Michael S. Bernstein, and Li Fei-Fei. 2017. 
Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. International Journal of Computer Vision (2017)."},{"key":"e_1_3_2_1_15_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01050"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","volume-title":"Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834","author":"Liu Shilong","year":"2021","unstructured":"Shilong Liu, Lei Zhang, Xiao Yang, Hang Su, and Jun Zhu. 2021. Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834 (2021)."},{"key":"e_1_3_2_1_19_1","volume-title":"Open-vocabulary Semantic Segmentation with Frozen Vision-Language Models. arXiv preprint arXiv:2210.15138","author":"Ma Chaofan","year":"2022","unstructured":"Chaofan Ma, Yuhuan Yang, Yanfeng Wang, Ya Zhang, and Weidi Xie. 2022. Open-vocabulary Semantic Segmentation with Frozen Vision-Language Models. arXiv preprint arXiv:2210.15138 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/219717.219748"},{"key":"e_1_3_2_1_21_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. 
arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_1_22_1","volume-title":"Semantic-aware representation blending for multi-label image recognition with partial labels. arXiv preprint arXiv:2203.02172","author":"Pu Tao","year":"2022","unstructured":"Tao Pu, Tianshui Chen, Hefeng Wu, and Liang Lin. 2022. Semantic-aware representation blending for multi-label image recognition with partial labels. arXiv preprint arXiv:2203.02172 (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00012"},{"key":"e_1_3_2_1_26_1","volume-title":"Eric Wallace, and Sameer Singh.","author":"Shin Taylor","year":"2020","unstructured":"Taylor Shin, Yasaman Razeghi, Robert L Logan IV, Eric Wallace, and Sameer Singh. 2020. Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980 (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"Dualcoop: Fast adaptation to multi-label recognition with limited annotations. arXiv preprint arXiv:2206.09541","author":"Sun Ximeng","year":"2022","unstructured":"Ximeng Sun, Ping Hu, and Kate Saenko. 2022. Dualcoop: Fast adaptation to multi-label recognition with limited annotations. 
arXiv preprint arXiv:2206.09541 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00387"},{"key":"e_1_3_2_1_29_1","unstructured":"Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.251"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6909"},{"key":"e_1_3_2_1_32_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)."},{"key":"e_1_3_2_1_33_1","volume-title":"J. International Conference on Learning Representations (ICLR","author":"Welling Max","year":"2016","unstructured":"Max Welling and Thomas N Kipf. 2016. Semi-supervised classification with graph convolutional networks. In J. International Conference on Learning Representations (ICLR 2017)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414046"},{"key":"e_1_3_2_1_35_1","first-page":"18430","article-title":"Label-Aware Global Consistency for Multi-Label Learning with Single Positive Labels","volume":"35","author":"Xie Ming-Kun","year":"2022","unstructured":"Ming-Kun Xie, Jiahao Xiao, and Sheng-Jun Huang. 2022. Label-Aware Global Consistency for Multi-Label Learning with Single Positive Labels. Advances in Neural Information Processing Systems, Vol. 35 (2022), 18430--18441.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"One Positive Label is Sufficient: Single-Positive Multi-Label Learning with Label Enhancement. 
arXiv preprint arXiv:2206.00517","author":"Xu Ning","year":"2022","unstructured":"Ning Xu, Congyu Qiao, Jiaqi Lv, Xin Geng, and Min-Ling Zhang. 2022b. One Positive Label is Sufficient: Single-Positive Multi-Label Learning with Label Enhancement. arXiv preprint arXiv:2206.00517 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"A Dual Modality Approach For (Zero-Shot) Multi-Label Classification. arXiv preprint arXiv:2208.09562","author":"Xu Shichao","year":"2022","unstructured":"Shichao Xu, Yikang Li, Jenhao Hsiao, Chiuman Ho, and Zhu Qi. 2022a. A Dual Modality Approach For (Zero-Shot) Multi-Label Classification. arXiv preprint arXiv:2208.09562 (2022)."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13440--13449","author":"Yazici Vacit Oguz","unstructured":"Vacit Oguz Yazici, Abel Gonzalez-Garcia, Arnau Ramisa, Bartlomiej Twardowski, and Joost van de Weijer. 2020. Orderless recurrent models for multi-label classification. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13440--13449."},{"key":"e_1_3_2_1_39_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547834"},{"key":"e_1_3_2_1_41_1","volume-title":"Simple and Robust Loss Design for Multi-Label Learning with Missing Labels. arXiv preprint arXiv:2112.07368","author":"Zhang Youcai","year":"2021","unstructured":"Youcai Zhang, Yuhao Cheng, Xinyu Huang, Fei Wen, Rui Feng, Yaqian Li, and Yandong Guo. 2021. Simple and Robust Loss Design for Multi-Label Learning with Missing Labels. 
arXiv preprint arXiv:2112.07368 (2021)."},{"key":"e_1_3_2_1_42_1","volume-title":"Acknowledging the Unknown for Multi-label Learning with Single Positive Labels. arXiv preprint arXiv:2203.16219","author":"Zhou Donghao","year":"2022","unstructured":"Donghao Zhou, Pengfei Chen, Qiong Wang, Guangyong Chen, and Pheng-Ann Heng. 2022a. Acknowledging the Unknown for Multi-label Learning with Single Positive Labels. arXiv preprint arXiv:2203.16219 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2017.2785795"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611988","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611988","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:12:40Z","timestamp":1755821560000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611988"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":45,"alternative-id":["10.1145\/3581783.3611988","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611988","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}