{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:31Z","timestamp":1765339771569,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206077"],"award-info":[{"award-number":["62206077"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Inner Mongolia Natural Science Foundation for Distinguished Young Scholars","award":["2025JQ009"],"award-info":[{"award-number":["2025JQ009"]}]},{"name":"Inner Mongolia Talent Development Project for Outstanding Young Talents"},{"name":"Inner Mongolia University Graduate Research Innovation Foundation","award":["11200-5223737"],"award-info":[{"award-number":["11200-5223737"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754501","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"2596-2605","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SAM based Region-Word Clustering and Inference Score Adjusting for Open-Vocabulary Object Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3261-7256","authenticated-orcid":false,"given":"Qiuyu","family":"Liang","sequence":"first","affiliation":[{"name":"College of Computer Science, Inner Mongolia University, Hohhot, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0437-7337","authenticated-orcid":false,"given":"Yongqiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer Science, Inner Mongolia University, Hohhot, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72890-7_20"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.3724\/2096-7004.di.2024.0001"},{"key":"e_1_3_2_1_4_1","volume-title":"Open vocabulary object detection with proposal mining and prediction equalization. arXiv preprint arXiv:2206.11134","author":"Chen Peixian","year":"2022","unstructured":"Peixian Chen, Kekai Sheng, Mengdan Zhang, Mingbao Lin, Yunhang Shen, Shaohui Lin, Bo Ren, and Ke Li. 2022. Open vocabulary object detection with proposal mining and prediction equalization. arXiv preprint arXiv:2206.11134 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_6_1","first-page":"2292","article-title":"Lightspeed computation of optimal transportation distances","volume":"26","author":"Cuturi M","year":"2013","unstructured":"M Cuturi. 2013. Lightspeed computation of optimal transportation distances. Advances in Neural Information Processing Systems, Vol. 26, 2 (2013), 2292-2300.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_7_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. International journal of computer vision, Vol. 88 (2010), 303-338."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"e_1_3_2_1_10_1","volume-title":"Fast R-CNN. In Proceedings of the IEEE International Conference on Computer Vision. 1440-1448","author":"Girshick Ross","year":"2015","unstructured":"Ross Girshick. 2015. Fast R-CNN. In Proceedings of the IEEE International Conference on Computer Vision. 1440-1448."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations.","author":"Gu Xiuye","year":"2022","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2022. Open-vocabulary Object Detection via Vision and Language Knowledge Distillation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Learning Representations.","author":"Hamilton Mark","year":"2022","unstructured":"Mark Hamilton, Zhoutong Zhang, Bharath Hariharan, Noah Snavely, and William T Freeman. 2022. Unsupervised Semantic Segmentation by Distilling Feature Correspondences. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_8"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01598"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28022"},{"key":"e_1_3_2_1_18_1","volume-title":"LLMs Meet VLMs: Boost Open Vocabulary Object Detection with Fine-grained Descriptors. In The Twelfth International Conference on Learning Representations.","author":"Jin Sheng","year":"2024","unstructured":"Sheng Jin, Xueying Jiang, Jiaxing Huang, Lewei Lu, and Shijian Lu. 2024. LLMs Meet VLMs: Boost Open Vocabulary Object Detection with Fine-grained Descriptors. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01072"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01650"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_22_1","volume-title":"The Hungarian method for the assignment problem. Naval research logistics quarterly","author":"Kuhn Harold W","year":"1955","unstructured":"Harold W Kuhn. 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly, Vol. 2, 1-2 (1955), 83-97."},{"key":"e_1_3_2_1_23_1","volume-title":"Learning Object-Language Alignments for Open-Vocabulary Object Detection. In The Eleventh International Conference on Learning Representations.","author":"Lin Chuang","year":"2023","unstructured":"Chuang Lin, Peize Sun, Yi Jiang, Ping Luo, Lizhen Qu, Gholamreza Haffari, Zehuan Yuan, and Jianfei Cai. 2023. Learning Object-Language Alignments for Open-Vocabulary Object Detection. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_24_1","first-page":"740","volume-title":"Switzerland","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part v 13. Springer, 740-755."},{"key":"e_1_3_2_1_25_1","volume-title":"ESOD: Efficient Small Object Detection on High-Resolution Images","author":"Liu Kai","year":"2024","unstructured":"Kai Liu, Zhihang Fu, Sheng Jin, Ze Chen, Fan Zhou, Rongxin Jiang, Yaowu Chen, and Jieping Ye. 2024. ESOD: Efficient Small Object Detection on High-Resolution Images. IEEE Transactions on Image Processing (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_27_1","first-page":"6894","article-title":"Notice of Removal: DeltaEdit: Exploring Text-free Training for Text-Driven Image Manipulation. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Lyu Yueming","year":"2023","unstructured":"Yueming Lyu, Tianwei Lin, Fu Li, Dongliang He, Jing Dong, and Tieniu Tan. 2023. Notice of Removal: DeltaEdit: Exploring Text-free Training for Text-Driven Image Manipulation. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE Computer Society, 6894-6903.","journal-title":"IEEE Computer Society"},{"key":"e_1_3_2_1_28_1","volume-title":"Codet: Co-occurrence guided region-word alignment for open-vocabulary object detection. Advances in neural information processing systems","author":"Ma Chuofan","year":"2023","unstructured":"Chuofan Ma, Yi Jiang, Xin Wen, Zehuan Yuan, and Xiaojuan Qi. 2023. Codet: Co-occurrence guided region-word alignment for open-vocabulary object detection. Advances in neural information processing systems, Vol. 36 (2023), 71078-71094."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-44824-z"},{"key":"e_1_3_2_1_30_1","first-page":"281","volume-title":"Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability","volume":"5","author":"MacQueen James","year":"1967","unstructured":"James MacQueen. 1967. Some methods for classification and analysis of multivariate observations. In Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1: Statistics, Vol. 5. University of California press, 281-298."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.3724\/2096-7004.di.2024.0049"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_33_1","volume-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 6 (2016), 1137-1149."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110648"},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Machine Learning. PMLR, 9934-9944","author":"Sun Peize","year":"2021","unstructured":"Peize Sun, Yi Jiang, Enze Xie, Wenqi Shao, Zehuan Yuan, Changhu Wang, and Ping Luo. 2021. What makes for end-to-end object detection?. In International Conference on Machine Learning. PMLR, 9934-9944."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00622"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00681"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01100"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680642"},{"key":"e_1_3_2_1_42_1","volume-title":"Self-supervised visual representation learning with semantic grouping. Advances in neural information processing systems","author":"Wen Xin","year":"2022","unstructured":"Xin Wen, Bingchen Zhao, Anlin Zheng, Xiangyu Zhang, and Xiaojuan Qi. 2022. Self-supervised visual representation learning with semantic grouping. Advances in neural information processing systems, Vol. 35 (2022), 16423-16438."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"e_1_3_2_1_44_1","volume-title":"CORA: Adapting CLIP for Open-Vocabulary Detection with Region Prompting and Anchor Pre-Matching. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7031-7040","author":"Wu Xiaoshi","year":"2023","unstructured":"Xiaoshi Wu, Feng Zhu, Rui Zhao, and Hongsheng Li. 2023b. CORA: Adapting CLIP for Open-Vocabulary Detection with Region Prompting and Anchor Pre-Matching. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7031-7040."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3485518"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01301-6"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-024-05403-3"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109424"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00261"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"e_1_3_2_1_54_1","unstructured":"Xingyi Zhou Vladlen Koltun and Philipp Kr\u00e4henb\u00fchl. 2021. Probabilistic two-stage detection. arXiv:2103.07461 [cs.CV] https:\/\/arxiv.org\/abs\/2103.07461"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680960"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3232487"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754501","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:26Z","timestamp":1765339526000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754501"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":56,"alternative-id":["10.1145\/3746027.3754501","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754501","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}