{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:03:48Z","timestamp":1778083428677,"version":"3.51.4"},"reference-count":66,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.02586","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"5610-5619","source":"Crossref","is-referenced-by-count":33,"title":["DetCLIPv3: Towards Versatile Generative Open-Vocabulary Object Detection"],"prefix":"10.1109","author":[{"given":"Lewei","family":"Yao","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"given":"Renjie","family":"Pi","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"given":"Jianhua","family":"Han","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab"}]},{"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[{"name":"Shenzhen Campus of Sun Yat-Sen University"}]},{"given":"Hang","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab"}]},{"given":"Wei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab"}]},{"given":"Zhenguo","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x0027;s Ark Lab"}]},{"given":"Dan","family":"Xu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Natural language processing with Python: analyzing text with the natural language toolkit.","author":"Bird","year":"2009"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_17"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"ref5","author":"Chen","year":"2023","journal-title":"Pixart-alpha: Fast training of diffusion transformer for photorealistic text-to-image synthesis"},{"key":"ref6","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023","journal-title":"NeurIPS"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00729"},{"key":"ref8","author":"Dave","year":"2021","journal-title":"Evaluating large-vocabulary object detectors: The devil is in the details."},{"key":"ref9","article-title":"Unified language model pre-training for natural language understanding and generation","author":"Dong","year":"2019","journal-title":"NeurIPS"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"ref11","author":"Fontanel","year":"2022","journal-title":"Detecting the unknown in object detection."},{"key":"ref12","author":"Gao","year":"2021","journal-title":"Towards open vocabulary object detection without human-provided bounding boxes."},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"ref14","volume":"2","author":"Gu","year":"2021","journal-title":"Open-vocabulary object detection via vision and language knowledge distillation."},{"key":"ref15","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"Gu","year":"2022","journal-title":"ICLR"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"ref19","author":"Honnibal","year":"2020","journal-title":"spacy: Industrial-strength natural language processing in python."},{"key":"ref20","author":"Inkawhich","year":"2022","journal-title":"Self-trained proposal networks for the open world."},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref24","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"ICML"},{"key":"ref25","author":"Li","year":"2023","journal-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models."},{"key":"ref26","author":"Li","year":"2019","journal-title":"Visualbert: A simple and perfor-mant baseline for vision and language."},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018650"},{"key":"ref29","author":"Lin","year":"2022","journal-title":"Learning object-language alignments for open-vocabulary object de-tection."},{"key":"ref30","article-title":"Learning object-language alignments for open-vocabulary object de-tection","author":"Lin","year":"2023","journal-title":"ICLR"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.33540\/2168"},{"key":"ref34","author":"Liu","year":"2023","journal-title":"Grounding dino: Marrying dino with grounded pre-training for open-set object detection"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01462"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00583"},{"key":"ref38","author":"Miller","year":"1998","journal-title":"Wordnet, an electronic lexical database."},{"key":"ref39","author":"Minderer","year":"2023","journal-title":"Scaling open-vocabulary object detection."},{"key":"ref40","volume-title":"Improving image generation with better captions.","year":"2023"},{"key":"ref41","year":"2023","journal-title":"Gpt-4 technical report."},{"key":"ref42","article-title":"Learning transferable visual models from natural language super-vision","author":"Radford","year":"2021","journal-title":"I CML"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"ref44","author":"Schuhmann","year":"2021","journal-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs."},{"key":"ref45","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","author":"Schuhmann","year":"2022","journal-title":"NeurIPS"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3152990"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"ref50","article-title":"Learning robust global representations by penalizing local predictive power","author":"Wang","year":"2019","journal-title":"NeurIPS"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01817"},{"key":"ref52","author":"Wu","year":"2022","journal-title":"Grit: A generative region-to-text transformer for object understanding."},{"key":"ref53","author":"Xie","year":"2021","journal-title":"Zsd-yolo: Zero-shot yolo detection using vision-language knowledge distillation."},{"key":"ref54","article-title":"Detclip: Dictionary-enriched visual-concept paralleled pre-training for open-world detection","author":"Yao","year":"2022","journal-title":"NeurIPS"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00640"},{"key":"ref57","author":"Yu","year":"2023","journal-title":"Capsfusion: Rethinking image-text data at scale."},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref60","article-title":"Glipv2: Unifying localization and vision-language understanding","author":"Zhang","year":"2022","journal-title":"NeurIPS"},{"key":"ref61","article-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Zhang","year":"2023","journal-title":"ICLR"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00100"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_10"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref66","article-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2021","journal-title":"ICLR"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10658129.pdf?arnumber=10658129","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T06:22:22Z","timestamp":1726813342000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10658129\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.02586","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}