{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:10:54Z","timestamp":1765311054367,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["62272494, 62325605"],"award-info":[{"award-number":["62272494, 62325605"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755564","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"4708-4717","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DART: Dual Adaptive Refinement Transfer for Open-Vocabulary Multi-Label Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-4772-9569","authenticated-orcid":false,"given":"Haijing","family":"Liu","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9564-8288","authenticated-orcid":false,"given":"Tao","family":"Pu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2132-6515","authenticated-orcid":false,"given":"Hefeng","family":"Wu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7817-8306","authenticated-orcid":false,"given":"Keze","family":"Wang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2248-3755","authenticated-orcid":false,"given":"Liang","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"32897","article-title":"Vlmo: Unified vision-language pre-training with mixture-of-modality-experts","volume":"35","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais Khan Mohammed, Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei. 2022. Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. NeurIPS, Vol. 35 (2022), 32897-32912.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_2_1","first-page":"640","article-title":"Semantic diversity learning for zero-shot multi-label classification","author":"Ben-Cohen Avi","year":"2021","unstructured":"Avi Ben-Cohen, Nadav Zamir, Emanuel Ben-Baruch, Itamar Friedman, and Lihi Zelnik-Manor. 2021. Semantic diversity learning for zero-shot multi-label classification. In ICCV. IEEE, 640-650.","journal-title":"ICCV. IEEE"},{"key":"e_1_3_2_1_3_1","unstructured":"Shaked Brody Uri Alon and Eran Yahav. 2022. How Attentive are Graph Attention Networks?. In ICLR. OpenReview. https:\/\/openreview.net\/forum?id=F72ximsx7C1"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2913079"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2913079"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3131222"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00061"},{"key":"e_1_3_2_1_8_1","first-page":"5177","article-title":"Multi-label image recognition with graph convolutional networks","author":"Chen Zhao-Min","year":"2019","unstructured":"Zhao-Min Chen, Xiu-Shen Wei, Peng Wang, and Yanwen Guo. 2019a. Multi-label image recognition with graph convolutional networks. In CVPR. 5177-5186.","journal-title":"CVPR."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"e_1_3_2_1_12_1","first-page":"14084","article-title":"Learning to prompt for open-vocabulary object detection with vision-language model","author":"Du Yu","year":"2022","unstructured":"Yu Du, Fangyun Wei, Zihe Zhang, Miaojing Shi, Yue Gao, and Guoqi Li. 2022. Learning to prompt for open-vocabulary object detection with vision-language model. In CVPR. 14084-14093.","journal-title":"CVPR."},{"key":"e_1_3_2_1_13_1","volume-title":"A review of multi-instance learning assumptions. The knowledge engineering review","author":"Foulds James","year":"2010","unstructured":"James Foulds and Eibe Frank. 2010. A review of multi-instance learning assumptions. The knowledge engineering review, Vol. 25, 1 (2010), 1-25."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3088605"},{"volume-title":"Scaling open-vocabulary image segmentation with image-level labels","author":"Ghiasi Golnaz","key":"e_1_3_2_1_15_1","unstructured":"Golnaz Ghiasi, Xiuye Gu, Yin Cui, and Tsung-Yi Lin. 2022. Scaling open-vocabulary image segmentation with image-level labels. In ECCV. Springer, 540-557."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","unstructured":"Sunan He Taian Guo Tao Dai Ruizhi Qiao Xiujun Shu Bo Ren and Shu-Tao Xia. 2023. Open-Vocabulary Multi-Label Classification via Multi-Modal Knowledge Transfer. In AAAI Brian Williams Yiling Chen and Jennifer Neville (Eds.). 808-816. doi:10.1609\/AAAI.V37I1.25159","DOI":"10.1609\/AAAI.V37I1.25159"},{"key":"e_1_3_2_1_17_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9"},{"key":"e_1_3_2_1_18_1","volume-title":"ArXiv","volume":"2503","author":"Huang Zhongzhan","year":"2025","unstructured":"Zhongzhan Huang, Guoming Ling, Vincent S. Liang, Yupei Lin, Yandong Chen, Shanshan Zhong, Hefeng Wu, and Liang Lin. 2025a. RouterEval: A Comprehensive Benchmark for Routing LLMs to Explore Model-level Scaling Up in LLMs. ArXiv, Vol. 2503.10657 (2025)."},{"key":"e_1_3_2_1_19_1","first-page":"11442","article-title":"MiniLongBench","author":"Huang Zhongzhan","year":"2025","unstructured":"Zhongzhan Huang, Guoming Ling, Shanshan Zhong, Hefeng Wu, and Liang Lin. 2025b. MiniLongBench: The Low-cost Long Context Understanding Benchmark for Large Language Models. In ACL. 11442-11460.","journal-title":"In ACL."},{"key":"e_1_3_2_1_20_1","first-page":"8773","article-title":"A Shared Multi-Attention Framework for Multi-Label Zero-Shot Learning","author":"Huynh Dat","year":"2020","unstructured":"Dat Huynh and Ehsan Elhamifar. 2020. A Shared Multi-Attention Framework for Multi-Label Zero-Shot Learning. In CVPR. 8773-8783.","journal-title":"CVPR."},{"key":"e_1_3_2_1_21_1","first-page":"7020","article-title":"Open-vocabulary instance segmentation via robust cross-modal pseudo-labeling","author":"Huynh Dat","year":"2022","unstructured":"Dat Huynh, Jason Kuen, Zhe Lin, Jiuxiang Gu, and Ehsan Elhamifar. 2022. Open-vocabulary instance segmentation via robust cross-modal pseudo-labeling. In CVPR. 7020-7031.","journal-title":"CVPR."},{"key":"e_1_3_2_1_22_1","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. PMLR, 4904-4916.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.NEUCOM.2019.05.024"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"e_1_3_2_1_25_1","first-page":"1576","article-title":"Multi-Label Zero-Shot Learning With Structured Knowledge Graphs","author":"Lee Chung-Wei","year":"2018","unstructured":"Chung-Wei Lee, Wei Fang, Chih-Kuan Yeh, and Yu-Chiang Frank Wang. 2018. Multi-Label Zero-Shot Learning With Structured Knowledge Graphs. In CVPR. 1576-1585.","journal-title":"CVPR."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1186\/s10033-021-00598-9"},{"key":"e_1_3_2_1_27_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021b. Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS, Vol. 34 (2021), 9694-9705.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_28_1","volume-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint, Vol. abs\/1908.03557 (2019). arXiv:1908.03557 http:\/\/arxiv.org\/abs\/1908.03557"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"Xirong Li Shuai Liao Weiyu Lan Xiaoyong Du and Gang Yang. 2015. Zero-shot Image Tagging by Hierarchical Semantic Embedding. In Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval Ricardo Baeza-Yates Mounia Lalmas Alistair Moffat and Berthier A. Ribeiro-Neto (Eds.). 879-882. doi:10.1145\/2766462.2767773","DOI":"10.1145\/2766462.2767773"},{"key":"e_1_3_2_1_30_1","first-page":"740","volume-title":"Switzerland","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part v 13. Springer, 740-755."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28139"},{"key":"e_1_3_2_1_32_1","volume-title":"ArXiv","volume":"2310","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023b. Improved Baselines with Visual Instruction Tuning. ArXiv, Vol. 2310.03744 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Visual Instruction Tuning. In Annual Conference on Neural Information Processing Systems, NeurIPS","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. [n.d.]. Visual Instruction Tuning. In Annual Conference on Neural Information Processing Systems, NeurIPS, New Orleans, LA, USA, December 10-16, 2023."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02285"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3324648"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCSP48568.2020.9182362"},{"key":"e_1_3_2_1_37_1","first-page":"8711","article-title":"Discriminative Region-based Multi-Label Zero-Shot Learning","author":"Narayan Sanath","year":"2021","unstructured":"Sanath Narayan, Akshita Gupta, Salman H. Khan, Fahad Shahbaz Khan, Ling Shao, and Mubarak Shah. 2021. Discriminative Region-based Multi-Label Zero-Shot Learning. In ICCV. 8711-8720.","journal-title":"ICCV."},{"key":"e_1_3_2_1_38_1","volume-title":"ArXiv","volume":"2303","author":"AI.","year":"2023","unstructured":"OpenAI. 2023. GPT-4 technical report. ArXiv, Vol. 2303.08774 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP","author":"Pennington Jeffrey","year":"2014","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. Glove: Global Vectors for Word Representation. In Proceedings of the Conference on Empirical Methods in Natural Language Processing, EMNLP 2014, Alessandro Moschitti, Bo Pang, and Walter Daelemans (Eds.). 1532-1543."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3345652"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3395901"},{"key":"e_1_3_2_1_43_1","first-page":"8748","volume-title":"ICML","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML, Vol. 139. 8748-8763."},{"key":"e_1_3_2_1_44_1","volume-title":"Asian Conference on Computer Vision. Springer, 530-546","author":"Rahman Shafin","year":"2018","unstructured":"Shafin Rahman and Salman Khan. 2018. Deep multiple instance learning for zero-shot image tagging. In Asian Conference on Computer Vision. Springer, 530-546."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1186\/S40537-019-0278-0"},{"key":"e_1_3_2_1_47_1","unstructured":"Qwen Team. 2024a. Qwen2 Technical Report. arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"Qwen Team. 2024b. Qwen2.5: A Party of Foundation Models. https:\/\/qwenlm.github.io\/blog\/qwen2.5\/"},{"key":"e_1_3_2_1_49_1","unstructured":"LLaMA 3 Teams. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_50_1","unstructured":"Petar Velickovic Guillem Cucurull Arantxa Casanova Adriana Romero Pietro Li\u00f2 and Yoshua Bengio. 2018. Graph Attention Networks. In ICLR. OpenReview. https:\/\/openreview.net\/forum?id=rJXMpikCZ"},{"key":"e_1_3_2_1_51_1","first-page":"464","article-title":"Multi-label image recognition by recurrently discovering attentional regions","author":"Wang Zhouxia","year":"2017","unstructured":"Zhouxia Wang, Tianshui Chen, Guanbin Li, Ruijia Xu, and Liang Lin. 2017. Multi-label image recognition by recurrently discovering attentional regions. In ICCV. 464-472.","journal-title":"ICCV."},{"key":"e_1_3_2_1_52_1","first-page":"1901","volume-title":"IEEE TPAMI","volume":"38","author":"Wei Yunchao","year":"2015","unstructured":"Yunchao Wei, Wei Xia, Min Lin, Junshi Huang, Bingbing Ni, Jian Dong, Yao Zhao, and Shuicheng Yan. 2015. HCP: A flexible CNN framework for multi-label image classification. IEEE TPAMI, Vol. 38, 9 (2015), 1901-1907."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3329220"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3588255"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3453055"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578518"},{"volume-title":"Open-vocabulary detr with conditional matching","author":"Zang Yuhang","key":"e_1_3_2_1_57_1","unstructured":"Yuhang Zang, Wei Li, Kaiyang Zhou, Chen Huang, and Chen Change Loy. 2022. Open-vocabulary detr with conditional matching. In ECCV. Springer, 106-122."},{"key":"e_1_3_2_1_58_1","first-page":"14393","article-title":"Open-vocabulary object detection using captions","author":"Zareian Alireza","year":"2021","unstructured":"Alireza Zareian, Kevin Dela Rosa, Derek Hao Hu, and Shih-Fu Chang. 2021. Open-vocabulary object detection using captions. In CVPR. 14393-14402.","journal-title":"CVPR."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.644"},{"key":"e_1_3_2_1_61_1","first-page":"1473","article-title":"Scene-Aware Label Graph Learning for Multi-Label Image Classification","author":"Zhu Xuelin","year":"2023","unstructured":"Xuelin Zhu, Jian Liu, Weijia Liu, Jiawei Ge, Bo Liu, and Jiuxin Cao. 2023. Scene-Aware Label Graph Learning for Multi-Label Image Classification. In ICCV. 1473-1482.","journal-title":"ICCV."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755564","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:07:40Z","timestamp":1765310860000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755564"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":61,"alternative-id":["10.1145\/3746027.3755564","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755564","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}