{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:26:34Z","timestamp":1765308394864,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62172300,No. 62372326."],"award-info":[{"award-number":["No. 62172300,No. 62372326."]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755465","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"1852-1861","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-modal Prototype Guided Few-shot Object Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6773-598X","authenticated-orcid":false,"given":"Chenbo","family":"Zhang","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6213-872X","authenticated-orcid":false,"given":"Bing","family":"Huangfu","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8623-0976","authenticated-orcid":false,"given":"Hongxu","family":"Ma","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2313-7635","authenticated-orcid":false,"given":"Jihong","family":"Guan","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1949-2768","authenticated-orcid":false,"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01083"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","volume-title":"Comprehensive Multi-Modal Prototypes are Simple and Effective Classifiers for Vast-Vocabulary Object Detection. arXiv preprint arXiv:2412.17800","author":"Chen Yitong","year":"2024","unstructured":"Yitong Chen, Wenhao Yao, Lingchen Meng, Sihong Wu, Zuxuan Wu, and Yu-Gang Jiang. 2024. Comprehensive Multi-Modal Prototypes are Simple and Effective Classifiers for Vast-Vocabulary Object Detection. arXiv preprint arXiv:2412.17800 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys.org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Ziqing Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al., 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys.org (accessed 14 April 2023), Vol. 2, 3 (2023), 6."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_6_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_7_1","volume-title":"Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2015","unstructured":"Mark Everingham, SM Ali Eslami, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2015. The pascal visual object classes challenge: A retrospective. International journal of computer vision, Vol. 111 (2015), 98-136."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00407"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681176"},{"key":"e_1_3_2_1_10_1","volume-title":"Multi-modal few-shot object detection with meta-learning-based cross-modal prompting. arXiv preprint arXiv:2204.07841","author":"Han Guangxing","year":"2022","unstructured":"Guangxing Han, Long Chen, Jiawei Ma, Shiyuan Huang, Rama Chellappa, and Shih-Fu Chang. 2022. Multi-modal few-shot object detection with meta-learning-based cross-modal prompting. arXiv preprint arXiv:2204.07841 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02703"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01005"},{"key":"e_1_3_2_1_13_1","volume-title":"From clip to dino: Visual encoders shout in multi-modal large language models. arXiv preprint arXiv:2310.08825","author":"Jiang Dongsheng","year":"2023","unstructured":"Dongsheng Jiang, Yuchen Liu, Songlin Liu, Jin'e Zhao, Hao Zhang, Zhen Gao, Xiaopeng Zhang, Jin Li, and Hongkai Xiong. 2023. From clip to dino: Visual encoders shout in multi-modal large language models. arXiv preprint arXiv:2310.08825 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00851"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 15946-15969","author":"Kaul Prannay","year":"2023","unstructured":"Prannay Kaul, Weidi Xie, and Andrew Zisserman. 2023. Multi-modal classifiers for open-vocabulary object detection. In International Conference on Machine Learning. PMLR, 15946-15969."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1137\/060659624"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_20_1","first-page":"740","volume-title":"Switzerland","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part v 13. Springer, 740-755."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475428"},{"key":"e_1_3_2_1_22_1","volume-title":"European Conference on Computer Vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al., 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38-55."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2025.3550372"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25274"},{"key":"e_1_3_2_1_25_1","first-page":"1","article-title":"Few-shot object detection in aerial imagery guided by text-modal knowledge","volume":"61","author":"Lu Xiaonan","year":"2023","unstructured":"Xiaonan Lu, Xian Sun, Wenhui Diao, Yongqiang Mao, Junxi Li, Yidan Zhang, Peijin Wang, and Kun Fu. 2023b. Few-shot object detection in aerial imagery guided by text-modal knowledge. IEEE Transactions on Geoscience and Remote Sensing, Vol. 61 (2023), 1-19.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_26_1","volume-title":"MS-DETR: Towards Effective Video Moment Retrieval and Highlight Detection by Joint Motion-Semantic Learning. arXiv preprint arXiv:2507.12062","author":"Ma Hongxu","year":"2025","unstructured":"Hongxu Ma, Guanshuo Wang, Fufu Yu, Qiong Jia, and Shouhong Ding. 2025a. MS-DETR: Towards Effective Video Moment Retrieval and Highlight Detection by Joint Motion-Semantic Learning. arXiv preprint arXiv:2507.12062 (2025)."},{"key":"e_1_3_2_1_27_1","volume-title":"Fine-Grained Zero-Shot Object Detection. arXiv preprint arXiv:2507.10358","author":"Ma Hongxu","year":"2025","unstructured":"Hongxu Ma, Chenbo Zhang, Lu Zhang, Jiaogen Zhou, Jihong Guan, and Shuigeng Zhou. 2025b. Fine-Grained Zero-Shot Object Detection. arXiv preprint arXiv:2507.10358 (2025)."},{"key":"e_1_3_2_1_28_1","volume-title":"SMILe: Leveraging Submodular Mutual Information For Robust Few-Shot Object Detection. In European Conference on Computer Vision. Springer, 350-366","author":"Majee Anay","year":"2024","unstructured":"Anay Majee, Ryan Sharp, and Rishabh Iyer. 2024. SMILe: Leveraging Submodular Mutual Information For Robust Few-Shot Object Detection. In European Conference on Computer Vision. Springer, 350-366."},{"key":"e_1_3_2_1_29_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00856"},{"key":"e_1_3_2_1_31_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_32_1","volume-title":"Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767","author":"Redmon Joseph","year":"2018","unstructured":"Joseph Redmon and Ali Farhadi. 2018. Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767 (2018)."},{"key":"e_1_3_2_1_33_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_35_1","volume-title":"Prototypical networks for few-shot learning. Advances in neural information processing systems","author":"Snell Jake","year":"2017","unstructured":"Jake Snell, Kevin Swersky, and Richard Zemel. 2017. Prototypical networks for few-shot learning. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00727"},{"key":"e_1_3_2_1_37_1","volume-title":"Frustratingly simple few-shot object detection. arXiv","author":"Wang X","year":"2020","unstructured":"X Wang, TE Huang, T Darrell, JE Gonzalez, and F Yu. [n.d.]. Frustratingly simple few-shot object detection. arXiv 2020. arXiv preprint arXiv:2003.06957 ([n.d.])."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01192"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28399"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00943"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3552138"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413832"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01888"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711867"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20205"},{"key":"e_1_3_2_1_46_1","volume-title":"GCSTG: Generating Class-confusion-aware Samples with a Tree-structure Graph for Few-shot Object Detection","author":"Yang Longrong","year":"2025","unstructured":"Longrong Yang, Hanbin Zhao, Hongliang Li, Liang Qiao, Ziwei Yang, and Xi Li. 2025. GCSTG: Generating Class-confusion-aware Samples with a Tree-structure Graph for Few-shot Object Detection. IEEE Transactions on Image Processing (2025)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28528"},{"key":"e_1_3_2_1_48_1","volume-title":"Meta-detr: Image-level few-shot detection with inter-class correlation exploitation","author":"Zhang Gongjie","year":"2022","unstructured":"Gongjie Zhang, Zhipeng Luo, Kaiwen Cui, Shijian Lu, and Eric P Xing. 2022b. Meta-detr: Image-level few-shot detection with inter-class correlation exploitation. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 11 (2022), 12832-12843."},{"key":"e_1_3_2_1_49_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605","author":"Zhang Hao","year":"2022","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M Ni, and Heung-Yeung Shum. 2022a. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605 (2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548412"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00630"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01419"},{"key":"e_1_3_2_1_53_1","volume-title":"Detect everything with few examples. arXiv preprint arXiv:2309.12969","author":"Zhang Xinyu","year":"2023","unstructured":"Xinyu Zhang, Yuhan Liu, Yuting Wang, and Abdeslam Boularias. 2023a. Detect everything with few examples. arXiv preprint arXiv:2309.12969 (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446683"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548062"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755465","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:22:34Z","timestamp":1765308154000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755465"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":55,"alternative-id":["10.1145\/3746027.3755465","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755465","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}