{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T23:29:24Z","timestamp":1769729364873,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["62076005, U20A20398"],"award-info":[{"award-number":["62076005, U20A20398"]}]},{"name":"the Natural Science Foundation of Anhui Province","award":["2008085MF191, 2308085MF214"],"award-info":[{"award-number":["2008085MF191, 2308085MF214"]}]},{"name":"the University Synergy Innovation Program of Anhui Province, China","award":["GXXT-2021-002, GXXT-2022-029"],"award-info":[{"award-number":["GXXT-2021-002, GXXT-2022-029"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680815","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"6133-6142","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Text-Region Matching for Multi-Label Image Recognition with Missing Labels"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8681-0765","authenticated-orcid":false,"given":"Leilei","family":"Ma","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1577-4861","authenticated-orcid":false,"given":"Hongxing","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Anhui University, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3860-5139","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4191-4779","authenticated-orcid":false,"given":"Yanping","family":"Fu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0164-7944","authenticated-orcid":false,"given":"Dengdi","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Anhui University, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5300-0683","authenticated-orcid":false,"given":"Haifeng","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Anhui University, Hefei, Anhui, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00472"},{"key":"e_1_3_2_1_2_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings, Part I. Springer, 213--229","author":"Carion Nicolas","year":"2020","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-End Object Detection with Transformers. In Computer Vision - ECCV 2020 - 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I. Springer, 213--229."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01849-z"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 37th International Conference on Machine Learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In Proceedings of the 37th International Conference on Machine Learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19910"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00061"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00532"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00099"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2556288.2557011"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00331"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00074"},{"key":"e_1_3_2_1_14_1","volume-title":"Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2015","unstructured":"Mark Everingham, SM Ali Eslami, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2015. The pascal visual object classes challenge: A retrospective. International journal of computer vision, Vol. 111 (2015), 98--136."},{"key":"e_1_3_2_1_15_1","unstructured":"Chun-Mei Feng Kai Yu Xinxing Xu Yong Liu Salman Khan Wangmeng Zuo and Rick Siow Mong Goh. 2024. Text to Image for Multi-Label Image Recognition with Joint Prompt-Adapter Learning. https:\/\/openreview.net\/forum?id=5abK7RDbuW"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00275"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01908"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings, Part VII 14","author":"Joulin Armand","year":"2016","unstructured":"Armand Joulin, Laurens Van Der Maaten, Allan Jabri, and Nicolas Vasilache. 2016. Learning visual features from large weakly supervised data. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part VII 14. Springer, 67--84."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"15969","author":"Kaul Prannay","year":"2023","unstructured":"Prannay Kaul, Weidi Xie, and Andrew Zisserman. 2023. Multi-Modal Classifiers for Open-Vocabulary Object Detection. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 15946--15969."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01376"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00654"},{"key":"e_1_3_2_1_26_1","volume-title":"European Conference on Computer Vision.","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollar, and Larry Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV eccv ed.). European Conference on Computer Vision."},{"key":"e_1_3_2_1_27_1","first-page":"1","article-title":"Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing","volume":"55","author":"Liu Pengfei","year":"2023","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2023. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. Comput. Surveys, Vol. 55, 9 (2023), 1--35.","journal-title":"Comput. Surveys"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"e_1_3_2_1_29_1","volume-title":"Semantic-Aware Dual Contrastive Learning for Multi-label Image Classification. In 26th European Conference on Artificial Intelligence. 1656--1663","author":"Ma Leilei","year":"2023","unstructured":"Leilei Ma, Dengdi Sun, Lei Wang, Haifeng Zhao, and Bin Luo. 2023. Semantic-Aware Dual Contrastive Learning for Multi-label Image Classification. In 26th European Conference on Artificial Intelligence. 1656--1663."},{"key":"e_1_3_2_1_30_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_31_1","unstructured":"Adam Paszke Sam Gross Soumith Chintala Gregory Chanan Edward Yang Zachary DeVito Zeming Lin Alban Desmaison Luca Antiga and Adam Lerer. 2017. Automatic differentiation in pytorch. (2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20105"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"e_1_3_2_1_36_1","first-page":"30569","article-title":"Dualcoop: Fast adaptation to multi-label recognition with limited annotations","volume":"35","author":"Sun Ximeng","year":"2022","unstructured":"Ximeng Sun, Ping Hu, and Kate Saenko. 2022. Dualcoop: Fast adaptation to multi-label recognition with limited annotations. Advances in Neural Information Processing Systems, Vol. 35 (2022), 30569--30582.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_37_1","unstructured":"Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611988"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.251"},{"key":"e_1_3_2_1_40_1","volume-title":"SpliceMix: A Cross-scale and Semantic Blending Augmentation Strategy for Multi-label Image Classification. arXiv preprint arXiv:2311.15200","author":"Wang Lei","year":"2023","unstructured":"Lei Wang, Yibing Zhan, Leilei Ma, Dapeng Tao, Liang Ding, and Chen Gong. 2023. SpliceMix: A Cross-scale and Semantic Blending Augmentation Strategy for Multi-label Image Classification. arXiv preprint arXiv:2311.15200 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR, 9929--9939","author":"Wang Tongzhou","year":"2020","unstructured":"Tongzhou Wang and Phillip Isola. 2020. Understanding contrastive representation learning through alignment and uniformity on the hypersphere. In International conference on machine learning. PMLR, 9929--9939."},{"key":"e_1_3_2_1_42_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Wang Yifei","year":"2023","unstructured":"Yifei Wang, Qi Zhang, Tianqi Du, Jiansheng Yang, Zhouchen Lin, and Yisen Wang. 2023. A Message Passing Perspective on Learning Dynamics of Contrastive Learning. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.58"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings, Part IV 16","author":"Wu Tong","year":"2020","unstructured":"Tong Wu, Qingqiu Huang, Ziwei Liu, Yu Wang, and Dahua Lin. 2020. Distribution-balanced loss for multi-label classification in long-tailed datasets. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16. Springer, 162--178."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414046"},{"key":"e_1_3_2_1_46_1","first-page":"25731","article-title":"Class-Distribution-Aware Pseudo-Labeling for Semi-Supervised Multi-Label Learning","volume":"36","author":"Xie Ming-Kun","year":"2023","unstructured":"Ming-Kun Xie, Jiahao Xiao, Hao-Zhe Liu, Gang Niu, Masashi Sugiyama, and Sheng-Jun Huang. 2023. Class-Distribution-Aware Pseudo-Labeling for Semi-Supervised Multi-Label Learning. In Advances in Neural Information Processing Systems, Vol. 36. 25731--25747.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","unstructured":"Ming-Kun Xie Jia-Hao Xiao and Sheng-Jun Huang. 2022. Label-Aware Global Consistency for Multi-Label Learning with Single Positive Labels. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_48_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Xie Ming-Kun","year":"2024","unstructured":"Ming-Kun Xie, Jia-Hao Xiao, Pei Peng, Gang Niu, Masashi Sugiyama, and Sheng-Jun Huang. 2024. Counterfactual Reasoning for Multi-Label Image Classification via Patching-Based Training. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_49_1","volume-title":"Data-free Multi-label Image Recognition via LLM-powered Prompt Tuning. arXiv preprint arXiv:2403.01209","author":"Yang Shuo","year":"2024","unstructured":"Shuo Yang, Zirui Shang, Yongqi Wang, Derong Deng, Hongwei Chen, Qiyuan Cheng, and Xinxiao Wu. 2024. Data-free Multi-label Image Recognition via LLM-powered Prompt Tuning. arXiv preprint arXiv:2403.01209 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings, Part XXI 16","author":"Ye Jin","year":"2020","unstructured":"Jin Ye, Junjun He, Xiaojiang Peng, Wenhao Wu, and Yu Qiao. 2020. Attention-driven dynamic graph convolutional network for multi-label image recognition. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXI 16. Springer, 649--665."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6964"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00692"},{"key":"e_1_3_2_1_53_1","volume-title":"Simple and robust loss design for multi-label learning with missing labels. arXiv preprint arXiv:2112.07368","author":"Zhang Youcai","year":"2021","unstructured":"Youcai Zhang, Yuhao Cheng, Xinyu Huang, Fei Wen, Rui Feng, Yaqian Li, and Yandong Guo. 2021. Simple and robust loss design for multi-label learning with missing labels. arXiv preprint arXiv:2112.07368 (2021)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_56_1","volume-title":"Image: Learning Transferable Adapter for Multi-Label Classification. arXiv preprint arXiv:2312.04160","author":"Zhu Xuelin","year":"2023","unstructured":"Xuelin Zhu, Jiuxin Cao, Dongqi Tang, Furong Xu, Weijia Liu, Jiawei Ge, Bo Liu, Qingpei Guo, Tianyi Zhang, et al. 2023. Text as Image: Learning Transferable Adapter for Multi-Label Classification. arXiv preprint arXiv:2312.04160 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680815","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680815","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680815"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":56,"alternative-id":["10.1145\/3664647.3680815","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680815","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}