{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:38:39Z","timestamp":1774021119306,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"CAS Project for Young Scientists in Basic Research","award":["YSBR-041"],"award-info":[{"award-number":["YSBR-041"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100004739","name":"Youth Innovation Promotion Association of the Chinese Academy of Sciences","doi-asserted-by":"publisher","award":["Y202051 and 2022196"],"award-info":[{"award-number":["Y202051 and 2022196"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100004739","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Liaoning Provincial Selecting the Best Candidates by Opening Competition Mechanism Science and Technology Program","award":["2023JH1\/10400045"],"award-info":[{"award-number":["2023JH1\/10400045"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23A20343, 61873259, 61821005 and 62206075"],"award-info":[{"award-number":["U23A20343, 61873259, 61821005 and 62206075"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681212","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"1991-2000","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Uni-YOLO: Vision-Language Model-Guided YOLO for Robust and Fast Universal Detection in the Open World"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3278-0028","authenticated-orcid":false,"given":"Xudong","family":"Wang","sequence":"first","affiliation":[{"name":"State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences &amp; University of the Chinese Academy of Sciences, Shenyang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3839-0078","authenticated-orcid":false,"given":"Weihong","family":"Ren","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering and Automation, Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4756-3962","authenticated-orcid":false,"given":"Xi'ai","family":"Chen","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences, Shenyang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8548-861X","authenticated-orcid":false,"given":"Huijie","family":"Fan","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences, Shenyang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3805-7654","authenticated-orcid":false,"given":"Yandong","family":"Tang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences, Shenyang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8039-6679","authenticated-orcid":false,"given":"Zhi","family":"Han","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Robotics, Shenyang Institute of Automation, Chinese Academy of Sciences, Shenyang, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Dongdong Chen Mingming He Qingnan Fan Jing Liao Liheng Zhang Dongdong Hou Lu Yuan and Gang Hua. 2019. Gated context aggregation network for image dehazing and deraining. In 2019 IEEE winter conference on applications of computer vision (WACV). 1375--1383.","DOI":"10.1109\/WACV.2019.00151"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00298"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3128560"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3334213"},{"key":"e_1_3_2_1_8_1","volume-title":"Federated Class-Incremental Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Dong Jiahua","year":"2022","unstructured":"Jiahua Dong, Lixu Wang, Zhen Fang, Gan Sun, Shichao Xu, Xiao Wang, and Qi Zhu. 2022. Federated Class-Incremental Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_9_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. International journal of computer vision 88 (2010), 303--338."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612146"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02131"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612189"},{"key":"e_1_3_2_1_14_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Alina Kuznetsova Hassan Rom Neil Alldrin Jasper Uijlings Ivan Krasin Jordi Pont-Tuset Shahab Kamali Stefan Popov Matteo Malloci Alexander Kolesnikov et al. 2020. The open images dataset v4: Unified image classification object detection and visual relationship detection at scale. International journal of computer vision 128 7 (2020) 1956--1981.","DOI":"10.1007\/s11263-020-01316-z"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.511"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2867951"},{"key":"e_1_3_2_1_19_1","volume-title":"Detection-friendly dehazing: Object detection in realworld hazy scenes","author":"Li Chengyang","year":"2023","unstructured":"Chengyang Li, Heng Zhou, Yang Liu, Caidong Yang, Yongqiang Xie, Zhongbo Li, and Liping Zhu. 2023. Detection-friendly dehazing: Object detection in realworld hazy scenes. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_21_1","volume-title":"Grounded Language-Image Pre-training. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10955--10965","author":"Li Liunian Harold","year":"2022","unstructured":"Liunian Harold Li, Pengchuan Zhang, Haotian Zhang, Jianwei Yang, Chunyuan Li, Yiwu Zhong, Lijuan Wang, Lu Yuan, Lei Zhang, Jenq-Neng Hwang, Kai-Wei Chang, and Jianfeng Gao. 2022. Grounded Language-Image Pre-training. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10955--10965."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613116"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00315"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00602"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612523"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20072"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2018.10.010"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14074--14083","author":"Ma Zongyang","year":"2022","unstructured":"Zongyang Ma, Guan Luo, Jin Gao, Liang Li, Yuxin Chen, Shaoru Wang, Congxuan Zhang, and Weiming Hu. 2022. Open-vocabulary one-stage detection with hierarchical visual-language knowledge distillation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14074--14083."},{"key":"e_1_3_2_1_30_1","volume-title":"Rethinking Open-World Object Detection in Autonomous Driving Scenarios. In MM '22: The 30th ACM International Conference on Multimedia","author":"Ma Zeyu","year":"2022","unstructured":"Zeyu Ma, Yang Yang, Guoqing Wang, Xing Xu, Heng Tao Shen, and Mingxing Zhang. 2022. Rethinking Open-World Object Detection in Autonomous Driving Scenarios. In MM '22: The 30th ACM International Conference on Multimedia, Lisboa, Portugal, October 10 - 14, 2022. ACM, 1279--1288."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611854"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2000.855874"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6865"},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_35_1","unstructured":"Alec Radford JongWook Kim Tao Xu Greg Brockman Christine McLeavey and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. 28492--28518 pages."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6868"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_1_38_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"e_1_3_2_1_40_1","volume-title":"EdaDet: Open-Vocabulary Object Detection Using Early Dense Alignment. 2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Shi Cheng","year":"2023","unstructured":"Cheng Shi and Sibei Yang. 2023. EdaDet: Open-Vocabulary Object Detection Using Early Dense Alignment. 2023 IEEE\/CVF International Conference on Computer Vision (ICCV) (2023), 15678--15688."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612407"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3256763"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611850"},{"key":"e_1_3_2_1_44_1","volume-title":"ERNIE: Enhanced Representation through Knowledge Integration. arXiv:1904.09223 [cs.CL]","author":"Sun Yu","year":"2019","unstructured":"Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, and Hua Wu. 2019. ERNIE: Enhanced Representation through Knowledge Integration. arXiv:1904.09223 [cs.CL]"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611907"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611909"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00721"},{"key":"e_1_3_2_1_48_1","volume-title":"Intelligent Robotics and Applications","author":"Wang Xudong","unstructured":"Xudong Wang, Xi'ai Chen, Feifan Wang, Chonglong Xu, and Yandong Tang. 2023. Image Recovery and Object Detection Integrated Algorithms for Robots in Harsh Battlefield Environments. In Intelligent Robotics and Applications. Springer Nature Singapore, Singapore, 575--585."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01100"},{"key":"e_1_3_2_1_50_1","volume-title":"Deep retinex decomposition for low-light enhancement. arXiv preprint arXiv:1808.04560","author":"Wei Chen","year":"2018","unstructured":"Chen Wei, Wenjing Wang, Wenhan Yang, and Jiaying Liu. 2018. Deep retinex decomposition for low-light enhancement. arXiv preprint arXiv:1808.04560 (2018)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"e_1_3_2_1_53_1","volume-title":"Zero-shot Object Detection Through Vision-Language Embedding Alignment. 2022 IEEE International Conference on Data Mining Workshops (ICDMW)","author":"Xie Johnathan","year":"2021","unstructured":"Johnathan Xie and Shuai Zheng. 2021. Zero-shot Object Detection Through Vision-Language Embedding Alignment. 2022 IEEE International Conference on Data Mining Workshops (ICDMW) (2021), 1--15."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611843"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612594"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_58_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605.","author":"Zhang Hao","year":"2022","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M Ni, and Heung-Yeung Shum. 2022. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2020.3035443"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01592"},{"key":"e_1_3_2_1_61_1","volume-title":"Enhancing geometric factors in model learning and inference for object detection and instance segmentation","author":"Zheng Zhaohui","year":"2021","unstructured":"Zhaohui Zheng, Ping Wang, Dongwei Ren, Wei Liu, Rongguang Ye, Qinghua Hu, and Wangmeng Zuo. 2021. Enhancing geometric factors in model learning and inference for object detection and instance segmentation. IEEE transactions on cybernetics 52, 8 (2021), 8574--8586."},{"key":"e_1_3_2_1_62_1","volume-title":"Mixed Supervision for Instance Learning in Object Detection with Few-shot Annotation. In MM '22: The 30th ACM International Conference on Multimedia","author":"Zhong Yi","year":"2022","unstructured":"Yi Zhong, Chengyao Wang, Shiyong Li, Zhu Zhou, Yaowei Wang, and Wei-Shi Zheng. 2022. Mixed Supervision for Instance Learning in Object Detection with Few-shot Annotation. In MM '22: The 30th ACM International Conference on Multimedia, Lisboa, Portugal, October 10 - 14, 2022. ACM, 648--658."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2899569"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01171"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681212","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681212","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681212"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":66,"alternative-id":["10.1145\/3664647.3681212","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681212","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}