{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T06:01:40Z","timestamp":1779948100072,"version":"3.53.1"},"reference-count":80,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T00:00:00Z","timestamp":1773964800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T00:00:00Z","timestamp":1773964800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,20]]},"DOI":"10.1109\/3dv69130.2026.00171","type":"proceedings-article","created":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T19:40:49Z","timestamp":1779910849000},"page":"1801-1811","source":"Crossref","is-referenced-by-count":0,"title":["Open Vocabulary Monocular 3D Object Detection"],"prefix":"10.1109","author":[{"given":"Jin","family":"Yao","sequence":"first","affiliation":[{"name":"University of Virginia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hao","family":"Gu","sequence":"additional","affiliation":[{"name":"University of Virginia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuweiyi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Virginia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiayun","family":"Wang","sequence":"additional","affiliation":[{"name":"California Institute of Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zezhou","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of 
Virginia"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00773"},{"key":"ref2","article-title":"Arkitscenes: A diverse real-world dataset for 3d indoor scene understanding using mobile rgb-d data","author":"Baruch","year":"2021","journal-title":"NeurIPS"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01264"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3145"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3593580"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.236"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01599"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.73"},{"key":"ref12","first-page":"226","article-title":"A density-based algorithm for discovering clusters in large spatial databases with noise","author":"Ester","year":"1996","journal-title":"kdd"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73661-2_8"},{"key":"ref14","article-title":"Cityscapes 3d: Dataset and benchmark for 9 dof vehicle detection","author":"G\u00e4hlert","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_30"},{"key":"ref18","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"Gu","year":"2021","journal-title":"arXiv preprint 
arXiv"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3444912"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00398"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2303"},{"key":"ref25","author":"Huang","year":"2019","journal-title":"Cooperative holistic scene understanding: Unifying 3d object, layout, and camera pose estimation"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00927"},{"key":"ref27","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021","journal-title":"ICML"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73226-3_8"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02070"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3574363"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00506"},{"key":"ref38","article-title":"Open-vocabulary 3d detection via image-level class and debiased cross-modal contrastive learning","author":"Lu","year":"2022","journal-title":"arXiv preprint 
arXiv"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00121"},{"key":"ref40","article-title":"Open-vocabulary one-stage detection with hierarchical visual-language knowledge distillation","author":"Ma","year":"2022","journal-title":"CVPR"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00013"},{"key":"ref43","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref44","article-title":"Im2text: Describing images using 1 million captioned photographs","author":"Ordonez","year":"2011","journal-title":"NeurIPS"},{"key":"ref45","article-title":"Global-local collaborative inference with llm for lidar-based open-vocabulary detection","author":"Peng","year":"2025","journal-title":"ECCV"},{"key":"ref46","article-title":"Glrd: Global-local collaborative reason and debate with psl for 3d open-vocabulary detection","author":"Peng","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00963"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3628473"},{"key":"ref49","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"ref51","article-title":"Accelerating 3d deep learning with pytorch3d","author":"Ravi","year":"2020","journal-title":"arXiv preprint 
arXiv"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01073"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/wacv51458.2022.00133"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00208"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"ref61","first-page":"1475","article-title":"Probabilistic and geometric depth: Detecting objects in perspective","volume-title":"Conference on Robot Learning","author":"Wang","year":"2022"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_5"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3361862"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28428"},{"key":"ref65","volume-title":"Detectron2","author":"Wu","year":"2019"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.32388\/vje40l"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i15.29612"},{"key":"ref69","article-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Zhang","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref70","article-title":"Glipv2: Unifying localization and vision-language 
understanding","author":"Zhang","year":"2022","journal-title":"NeurIPS"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00480"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72907-2_1"},{"key":"ref73","article-title":"A tale of two features: Stable diffusion complements dino for zero-shot semantic correspondence","author":"Zhang","year":"2024","journal-title":"NeurIPS"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00840"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-69525-5_25"},{"key":"ref78","author":"Zhou","year":"2019","journal-title":"Objects as points"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref80","article-title":"Object2scene: Putting objects in context for open-vocabulary 3d detection","author":"Zhu","year":"2023","journal-title":"arXiv preprint arXiv"}],"event":{"name":"2026 International Conference on 3D Vision (3DV)","location":"Vancouver, BC, Canada","start":{"date-parts":[[2026,3,20]]},"end":{"date-parts":[[2026,3,23]]}},"container-title":["2026 International Conference on 3D Vision (3DV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11533157\/11533158\/11533208.pdf?arnumber=11533208","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T05:01:37Z","timestamp":1779944497000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11533208\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,20]]},"references-count":80,"URL":"https:\/\/doi.org\/10.1109\/3dv69130.2026.00171","relation":{},"subject":[],"published":{"date-parts":[[2026,3,20]]}}}