{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:51:20Z","timestamp":1765309880868,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2023-00217689; RS-2024-00358935"],"award-info":[{"award-number":["RS-2023-00217689; RS-2024-00358935"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Institute of Information & communications Technology Planning & Evaluation","award":["IITP-2025-RS-2023-00254177"],"award-info":[{"award-number":["IITP-2025-RS-2023-00254177"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755253","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"3942-3951","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["BAC-GCN: Background-Aware CLIP-GCN Framework for Unsupervised Multi-Label Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1568-7462","authenticated-orcid":false,"given":"Yonghyeon","family":"Jo","sequence":"first","affiliation":[{"name":"Department of Information Convergence Engineering, Pusan National University, Busan, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3525-062X","authenticated-orcid":false,"given":"Janghyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Information Convergence Engineering, Pusan National University, Busan, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2296-819X","authenticated-orcid":false,"given":"Jinsun","family":"Park","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Pusan National University, Busan, Republic of Korea and Center for Artificial Intelligence Research, Pusan National University, Busan, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00130"},{"key":"e_1_3_2_1_2_1","unstructured":"Rabab Abdelfattah Xin Zhang Mostafa M Fouda Xiaofeng Wang and Song Wang. 2022. G2NetPL: Generic game-theoretic network for partial-label image classification. arXiv preprint arXiv:2210.11469."},{"key":"e_1_3_2_1_3_1","unstructured":"Steven Bird Ewan Klein and Edward Loper. 2009. Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit. Available online at http:\/\/www.nltk.org\/book\/."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1102351.1102363"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557487"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00532"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00394"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00099"},{"key":"e_1_3_2_1_10_1","first-page":"39","volume-title":"Eur. Conf. Comput. Vis., Springer-Verlag","author":"Das Anurag","year":"2024","unstructured":"Anurag Das, Xinting Hu, Li Jiang, and Bernt Schiele. 2024. MTA-CLIP: Language-guided semantic segmentation with mask-text alignment. In Eur. Conf. Comput. Vis., Springer-Verlag, Cham, Switzerland, 39-56."},{"key":"e_1_3_2_1_11_1","volume-title":"Int. Conf. Learn. Represent., International Conference on Learning Representations, Virtual, arXiv:2010","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Int. Conf. Learn. Represent., International Conference on Learning Representations, Virtual, arXiv:2010.11929."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"e_1_3_2_1_13_1","unstructured":"Han Fang Pengfei Xiong Luhui Xu and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3565970.3567696"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"e_1_3_2_1_16_1","first-page":"1263","volume-title":"Neural Message Passing for Quantum Chemistry. In Int. Conf. Mach. Learn., Machine Learning Research","author":"Gilmer Justin","year":"2017","unstructured":"Justin Gilmer, Samuel S Schoenholz, Patrick F Riley, Oriol Vinyals, and George E Dahl. 2017. Neural Message Passing for Quantum Chemistry. In Int. Conf. Mach. Learn., Machine Learning Research, Sydney, Australia, 1263-1272."},{"key":"e_1_3_2_1_17_1","volume-title":"Int. Conf. Learn. Represent., International Conference on Learning Representations, Virtual, arXiv:2104","author":"Gu Xiuye","year":"2022","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2022. Open-vocabulary object detection via vision and language knowledge distillation. In Int. Conf. Learn. Represent., International Conference on Learning Representations, Virtual, arXiv:2104.13921."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00275"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3385956.3422122"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3426372"},{"key":"e_1_3_2_1_21_1","first-page":"1626","article-title":"Eyemotion: Classifying facial expressions in VR using eye-tracking cameras. In IEEE winter conference on applications of computer vision (WACV). IEEE, Piscataway","author":"Hickson Steven","year":"2019","unstructured":"Steven Hickson, Nick Dufour, Avneesh Sud, Vivek Kwatra, and Irfan Essa. 2019. Eyemotion: Classifying facial expressions in VR using eye-tracking cameras. In IEEE winter conference on applications of computer vision (WACV). IEEE, Piscataway, NJ, 1626-1635.","journal-title":"NJ"},{"key":"e_1_3_2_1_22_1","volume-title":"Int. Conf. Learn. Represent., International Conference on Learning Representations","author":"Huang Xinyu","year":"2024","unstructured":"Xinyu Huang, Youcai Zhang, Jinyu Ma, Weiwei Tian, Rui Feng, Yuejie Zhang, Yaqian Li, Yandong Guo, and Lei Zhang. 2024. Tag2text: Guiding vision-language model via image tagging. In Int. Conf. Learn. Represent., International Conference on Learning Representations, Vienna, Austria, arXiv:2303.05657."},{"key":"e_1_3_2_1_23_1","first-page":"27896","article-title":"Fineclip: Self-distilled region-based clip for better fine-grained understanding","volume":"37","author":"Jing Dong","year":"2024","unstructured":"Dong Jing, Xiaolong He, Yutian Luo, Nanyi Fei, Wei Wei, Huiwen Zhao, Zhiwu Lu, et al., 2024a. Fineclip: Self-distilled region-based clip for better fine-grained understanding. Adv. Neural Inform. Process. Syst., Vol. 37 (2024), 27896-27918.","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643888"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3427549"},{"key":"e_1_3_2_1_26_1","volume-title":"Asian Conf. Comput. Vis., Springer-Verlag, Hanoi, Vietnam, 1420-1436","author":"Kim Janghyun","year":"2024","unstructured":"Janghyun Kim, Ukcheol Shin, Seokyong Heo, and Jinsun Park. 2024b. Exploiting cross-modal cost volume for multi-sensor depth estimation. In Asian Conf. Comput. Vis., Springer-Verlag, Hanoi, Vietnam, 1420-1436."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01376"},{"key":"e_1_3_2_1_28_1","first-page":"11313","volume-title":"Int. Conf. Learn. Represent., International Conference on Learning Representations","author":"Kipf Thomas N","year":"2017","unstructured":"Thomas N Kipf and Max Welling. 2017. Semi-supervised classification with graph convolutional networks. In Int. Conf. Learn. Represent., International Conference on Learning Representations, Toulon, France, 11313-11320."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01586"},{"key":"e_1_3_2_1_31_1","first-page":"561","article-title":"Exploiting weakly supervised visual patterns to learn from partial annotations","volume":"33","author":"Kundu Kaustav","year":"2020","unstructured":"Kaustav Kundu and Joseph Tighe. 2020. Exploiting weakly supervised visual patterns to learn from partial annotations. Adv. Neural Inform. Process. Syst., Vol. 33 (2020), 561-572.","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2024.02.012"},{"key":"e_1_3_2_1_33_1","volume-title":"Int. Conf. Mach. Learn., PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In Int. Conf. Mach. Learn., PMLR, Virtual, 19730-19742."},{"key":"e_1_3_2_1_34_1","volume-title":"Int. Conf. Mach. Learn., PMLR, Virtual, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In Int. Conf. Mach. Learn., PMLR, Virtual, 12888-12900."},{"key":"e_1_3_2_1_35_1","first-page":"284","volume-title":"ADD-GCN: Adaptive Dynamic Graph Convolutional Network for Multi-Label Image Recognition. In Eur. Conf. Comput. Vis., Springer-Verlag","author":"Li Jingyuan","year":"2020","unstructured":"Jingyuan Li, Bingchen Zhao, Xianglong Liu, Yao Hu, Jinqiao Wang, and Hanqing Lu. 2020. ADD-GCN: Adaptive Dynamic Graph Convolutional Network for Multi-Label Image Recognition. In Eur. Conf. Comput. Vis., Springer-Verlag, Cham, Switzerland, 284-300."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_38_1","volume-title":"Tagclip: A local-to-global framework to enhance open-vocabulary multi-label classification of clip without training. In AAAI","author":"Lin Yuqi","year":"2024","unstructured":"Yuqi Lin, Minghao Chen, Kaipeng Zhang, Hengjia Li, Mingming Li, Zheng Yang, Dongqin Lv, Binbin Lin, Haifeng Liu, and Deng Cai. 2024. Tagclip: A local-to-global framework to enhance open-vocabulary multi-label classification of clip without training. In AAAI, Vol. 38. AAAI Press, Palo Alto, California, USA, 3513-3521."},{"key":"e_1_3_2_1_39_1","unstructured":"Qinying Liu Kecheng Zheng Wu Wei Zhan Tong Yu Liu Wei Chen Zilei Wang and Yujun Shen. 2023. TagAlign: Improving Vision-Language Alignment with Multi-Tag Classification. arXiv preprint arXiv:2312.14149."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00969"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3150868"},{"key":"e_1_3_2_1_45_1","volume-title":"Int. Conf. Mach. Learn., PMLR, Virtual, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In Int. Conf. Mach. Learn., PMLR, Virtual, 8748-8763."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/1631272.1631397"},{"key":"e_1_3_2_1_47_1","volume-title":"Int. Conf. Mach. Learn., PMLR, Virtual, 8821-8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In Int. Conf. Mach. Learn., PMLR, Virtual, 8821-8831."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01634"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489849.3489958"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3161192"},{"key":"e_1_3_2_1_52_1","volume-title":"Graph Convolutional Matrix Completion. In KDD Workshop on Deep Learning Day. arXiv","author":"van den Berg Rianne","year":"2018","unstructured":"Rianne van den Berg, Thomas N Kipf, and Max Welling. 2018. Graph Convolutional Matrix Completion. In KDD Workshop on Deep Learning Day. arXiv, London, United Kingdom, arXiv:1706.02263."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611988"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"e_1_3_2_1_55_1","first-page":"18","volume-title":"Pseudo-RIS: Distinctive Pseudo-Supervision Generation for Referring Image Segmentation. In Eur. Conf. Comput. Vis., Springer-Verlag","author":"Yu Seonghoon","year":"2024","unstructured":"Seonghoon Yu, Paul Hongsuck Seo, and Jeany Son. 2024. Pseudo-RIS: Distinctive Pseudo-Supervision Generation for Referring Image Segmentation. In Eur. Conf. Comput. Vis., Springer-Verlag, Cham, Switzerland, 18-36."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.5555\/3507788.3507793"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548343"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755253","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:47:46Z","timestamp":1765309666000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755253"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":61,"alternative-id":["10.1145\/3746027.3755253","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755253","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}