{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:20:04Z","timestamp":1765340404238,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62406347 and No. 62202302"],"award-info":[{"award-number":["No. 62406347 and No. 62202302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754936","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"7548-7557","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Addressing Granularity-induced Semantic Drift in OvOD via Graph-guided semantically consistent representation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3846-5236","authenticated-orcid":false,"given":"Hongyan","family":"Xu","sequence":"first","affiliation":[{"name":"Central South University, Changsha, China and The University of New South Wales, Sydney, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0307-5597","authenticated-orcid":false,"given":"Zhongze","family":"Wu","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9853-1953","authenticated-orcid":false,"given":"Ang","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai Maritime University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5303-1191","authenticated-orcid":false,"given":"Xi","family":"Lin","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4283-7485","authenticated-orcid":false,"given":"Yi","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9863-5404","authenticated-orcid":false,"given":"Xiu","family":"Su","sequence":"additional","affiliation":[{"name":"Central South University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_35"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"e_1_3_2_1_5_1","volume-title":"GrainBrain: Multiview Identification and Stratification of Defective Grain Kernels","author":"Fan Lei","year":"2025","unstructured":"Lei Fan, Dongdong Fan, Yiwen Ding, Yong Wu, Donglin Di, Maurice Pagnucco, and Yang Song. 2025. GrainBrain: Multiview Identification and Stratification of Defective Grain Kernels. IEEE Transactions on Industrial Informatics (2025)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00407"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27939"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01396"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01067"},{"key":"e_1_3_2_1_11_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Rui Huang Henry Zheng Yan Wang Zhuofan Xia Marco Pavone and Gao Huang. 2024. Training an Open-Vocabulary Monocular 3D Detection Model Without 3D Data. In Advances in Neural Information Processing Systems (NeurIPS).","DOI":"10.52202\/079017-2303"},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_16_1","volume-title":"Llms meet vlms: Boost open vocabulary object detection with fine-grained descriptors. arXiv preprint arXiv:2402","author":"Jin Sheng","year":"2024","unstructured":"Sheng Jin, Xueying Jiang, Jiaxing Huang, Lewei Lu, and Shijian Lu. 2024. Llms meet vlms: Boost open vocabulary object detection with fine-grained descriptors. arXiv preprint arXiv:2402.04630 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01650"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023a. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_19_1","first-page":"50105","article-title":"Zero-shot visual relation detection via composite visual cues from large language models","volume":"36","author":"Li Lin","year":"2023","unstructured":"Lin Li, Jun Xiao, Guikun Chen, Jian Shao, Yueting Zhuang, and Long Chen. 2023b. Zero-shot visual relation detection via composite visual cues from large language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 50105-50116.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_21_1","volume-title":"Learning object-language alignments for open-vocabulary object detection. arXiv preprint arXiv:2211.14843","author":"Lin Chuang","year":"2022","unstructured":"Chuang Lin, Peize Sun, Yi Jiang, Ping Luo, Lizhen Qu, Gholamreza Haffari, Zehuan Yuan, and Jianfei Cai. 2022. Learning object-language alignments for open-vocabulary object detection. arXiv preprint arXiv:2211.14843 (2022)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28127"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01574"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01206"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32594"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_27_1","volume-title":"Fine-grained visual-text prompt-driven self-training for open-vocabulary object detection","author":"Long Yanxin","year":"2023","unstructured":"Yanxin Long, Jianhua Han, Runhui Huang, Hang Xu, Yi Zhu, Chunjing Xu, and Xiaodan Liang. 2023. Fine-grained visual-text prompt-driven self-training for open-vocabulary object detection. IEEE Transactions on Neural Networks and Learning Systems (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00121"},{"key":"e_1_3_2_1_29_1","volume-title":"Codet: Co-occurrence guided region-word alignment for open-vocabulary object detection. Advances in neural information processing systems","author":"Ma Chuofan","year":"2024","unstructured":"Chuofan Ma, Yi Jiang, Xin Wen, Zehuan Yuan, and Xiaojuan Qi. 2024. Codet: Co-occurrence guided region-word alignment for open-vocabulary object detection. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Machine Learning. PMLR, 26342-26362","author":"Novack Zachary","year":"2023","unstructured":"Zachary Novack, Julian McAuley, Zachary Chase Lipton, and Saurabh Garg. 2023. Chils: Zero-shot image classification with hierarchical label sets. In International Conference on Machine Learning. PMLR, 26342-26362."},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2022. ChatGPT: A Large-Scale GPT-3.5-Based Model. (2022). https:\/\/openai.com\/blog\/chatgpt Accessed: 2022-02-03."},{"key":"e_1_3_2_1_32_1","volume-title":"Parameter-efficient Fine-tuning in Hyperspherical Space for Open-vocabulary Semantic Segmentation. arXiv preprint arXiv:2405.18840","author":"Peng Zelin","year":"2024","unstructured":"Zelin Peng, Zhengqin Xu, Zhilin Zeng, Yaoming Wang, Lingxi Xie, Qi Tian, and Wei Shen. 2024. Parameter-efficient Fine-tuning in Hyperspherical Space for Open-vocabulary Semantic Segmentation. arXiv preprint arXiv:2405.18840 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00083"},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01082"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00221"},{"key":"e_1_3_2_1_38_1","first-page":"8936","article-title":"Searching for network width with bilaterally coupled network","volume":"45","author":"Su Xiu","year":"2022","unstructured":"Xiu Su, Shan You, Jiyang Xie, Fei Wang, Chen Qian, Changshui Zhang, and Chang Xu. 2022. Searching for network width with bilaterally coupled network. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 7 (2022), 8936-8953.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Machine Learning. PMLR, 9880-9890","author":"Su Xiu","year":"2021","unstructured":"Xiu Su, Shan You, Mingkai Zheng, Fei Wang, Chen Qian, Changshui Zhang, and Chang Xu. 2021c. K-shot nas: Learnable weight-sharing for nas with k-shot supernets. In International Conference on Machine Learning. PMLR, 9880-9890."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02818"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680925"},{"key":"e_1_3_2_1_42_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00914"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"e_1_3_2_1_46_1","first-page":"9125","article-title":"Detclip: Dictionary-enriched visual-concept paralleled pre-training for open-world detection","volume":"35","author":"Yao Lewei","year":"2022","unstructured":"Lewei Yao, Jianhua Han, Youpeng Wen, Xiaodan Liang, Dan Xu, Wei Zhang, Zhenguo Li, Chunjing Xu, and Hang Xu. 2022. Detclip: Dictionary-enriched visual-concept paralleled pre-training for open-world detection. Advances in Neural Information Processing Systems, Vol. 35 (2022), 9125-9138.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02586"},{"key":"e_1_3_2_1_48_1","volume-title":"Training-free boost for open-vocabulary object detection with confidence aggregation. arXiv preprint arXiv:2404.08603","author":"Zheng Yanhao","year":"2024","unstructured":"Yanhao Zheng and Kai Liu. 2024. Training-free boost for open-vocabulary object detection with confidence aggregation. arXiv preprint arXiv:2404.08603 (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754936","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:16:21Z","timestamp":1765340181000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754936"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3754936","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754936","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}