{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T22:14:16Z","timestamp":1743113656886,"version":"3.40.3"},"publisher-location":"Cham","reference-count":54,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729485"},{"type":"electronic","value":"9783031729492"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72949-2_6","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:22:17Z","timestamp":1730301737000},"page":"92-108","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["VEON: Vocabulary-Enhanced Occupancy Prediction"],"prefix":"10.1007","author":[{"given":"Jilai","family":"Zheng","sequence":"first","affiliation":[]},{"given":"Pin","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Zhongdao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Guoqing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiangxuan","family":"Ren","sequence":"additional","affiliation":[]},{"given":"Bailan","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"6_CR1","unstructured":"The bevdet codebase. https:\/\/github.com\/HuangJunJie2017\/BEVDet. Accessed 28 Oct 2023"},{"key":"6_CR2","unstructured":"CVPR 2023 3D occupancy prediction challenge. https:\/\/github.com\/CVPR2023-3D-Occupancy-Prediction\/CVPR2023-3D-Occupancy-Prediction. Accessed 28 Oct 2023"},{"key":"6_CR3","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: BERT pre-training of image transformers. In: ICLR (2021)"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Behley, J., et al.: Semantickitti: a dataset for semantic scene understanding of lidar sequences. In: ICCV, pp. 9297\u20139307 (2019)","DOI":"10.1109\/ICCV.2019.00939"},{"key":"6_CR5","unstructured":"Bhat, S.F., Alhashim, I., Wonka, P.: Adabins: depth estimation using adaptive bins. In: CVPR, pp. 4009\u20134018 (2021)"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Bhat, S.F., Alhashim, I., Wonka, P.: Localbins: improving depth estimation by learning local distributions. In: ECCV, pp. 480\u2013496 (2022)","DOI":"10.1007\/978-3-031-19769-7_28"},{"key":"6_CR7","unstructured":"Bhat, S.F., Birkl, R., Wofk, D., Wonka, P., M\u00fcller, M.: Zoedepth: zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)"},{"key":"6_CR8","unstructured":"Birkl, R., Wofk, D., M\u00fcller, M.: Midas v3. 1\u2013a model zoo for robust monocular relative depth estimation. 
arXiv preprint arXiv:2307.14460 (2023)"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"6_CR10","doi-asserted-by":"crossref","unstructured":"Cao, A.Q., de\u00a0Charette, R.: Monoscene: monocular 3D semantic scene completion. In: CVPR, pp. 3991\u20134001 (2022)","DOI":"10.1109\/CVPR52688.2022.00396"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: ICCV, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: PLA: language-driven open-vocabulary 3D scene understanding. In: CVPR, pp. 7010\u20137019 (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"6_CR13","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"6_CR14","unstructured":"Eigen, D., Puhrsch, C., Fergus, R.: Depth map prediction from a single image using a multi-scale deep network. In: NeurIPS, pp. 2366\u20132374 (2014)"},{"issue":"2","key":"6_CR15","first-page":"3795","volume":"7","author":"WK Fong","year":"2022","unstructured":"Fong, W.K., et al.: Panoptic nuscenes: a large-scale benchmark for lidar panoptic segmentation and tracking. RA-L 7(2), 3795\u20133802 (2022)","journal-title":"RA-L"},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"6_CR17","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. In: ICLR (2021)"},{"key":"6_CR18","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Planning-oriented autonomous driving. In: CVPR, pp. 17853\u201317862 (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"6_CR19","unstructured":"Huang, J., Huang, G.: Bevdet4d: exploit temporal cues in multi-camera 3D object detection. arXiv preprint arXiv:2203.17054 (2022)"},{"key":"6_CR20","unstructured":"Huang, J., Huang, G.: Bevpoolv2: a cutting-edge implementation of bevdet toward deployment. arXiv preprint arXiv:2211.17111 (2022)"},{"key":"6_CR21","unstructured":"Huang, J., Huang, G., Zhu, Z., Yun, Y., Du, D.: Bevdet: high-performance multi-camera 3D object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)"},{"issue":"10","key":"6_CR22","doi-asserted-by":"publisher","first-page":"2702","DOI":"10.1109\/TPAMI.2019.2926463","volume":"42","author":"X Huang","year":"2019","unstructured":"Huang, X., Wang, P., Cheng, X., Zhou, D., Geng, Q., Yang, R.: The apolloscape open dataset for autonomous driving and its application. TPAMI 42(10), 2702\u20132719 (2019)","journal-title":"TPAMI"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Huang, Y., Zheng, W., Zhang, B., Zhou, J., Lu, J.: Selfocc: self-supervised vision-based 3D occupancy prediction. arXiv preprint arXiv:2311.12754 (2023)","DOI":"10.1109\/CVPR52733.2024.01885"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Huang, Y., Zheng, W., Zhang, Y., Zhou, J., Lu, J.: Tri-perspective view for vision-based 3D semantic occupancy prediction. In: CVPR, pp. 
9223\u20139232 (2023)","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wu, X., Chen, X., Zhao, H., Zhu, L., Lasenby, J.: Openins3d: snap and lookup for 3D open-vocabulary instance segmentation. arXiv preprint arXiv:2309.00616 (2023)","DOI":"10.1007\/978-3-031-73033-7_10"},{"key":"6_CR26","doi-asserted-by":"crossref","unstructured":"Jiang, B., et al.: VAD: vectorized scene representation for efficient autonomous driving. In: ICCV, pp. 8340\u20138350 (2023)","DOI":"10.1109\/ICCV51070.2023.00766"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Voxformer: sparse voxel transformer for camera-based 3D semantic scene completion. In: CVPR, pp. 9087\u20139098 (2023)","DOI":"10.1109\/CVPR52729.2023.00877"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Li, Z., Snavely, N.: Megadepth: learning single-view depth prediction from internet photos. In: CVPR, pp. 2041\u20132050 (2018)","DOI":"10.1109\/CVPR.2018.00218"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Bevformer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: ECCV, pp. 1\u201318 (2022)","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"6_CR30","unstructured":"Liu, K., et al.: Weakly supervised 3D open-vocabulary segmentation. arXiv preprint arXiv:2305.14093 (2023)"},{"key":"6_CR31","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2018)"},{"key":"6_CR32","unstructured":"Lu, S., Chang, H., Jing, E.P., Boularias, A., Bekris, K.: OVIR-3D: open-vocabulary 3D instance retrieval without training on 3D data. CoRL (2023)"},{"key":"6_CR33","unstructured":"Miao, R., et al.: Occdepth: a depth-aware method for 3D semantic scene completion. arXiv preprint arXiv:2302.13540 (2023)"},{"issue":"1","key":"6_CR34","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"6_CR35","doi-asserted-by":"crossref","unstructured":"Peng, S., et al.: Openscene: 3D scene understanding with open vocabularies. In: CVPR, pp. 815\u2013824 (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: ECCV, pp. 194\u2013210 (2020)","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"6_CR37","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"issue":"3","key":"6_CR38","doi-asserted-by":"publisher","first-page":"1623","DOI":"10.1109\/TPAMI.2020.3019967","volume":"44","author":"R Ranftl","year":"2020","unstructured":"Ranftl, R., Lasinger, K., Hafner, D., Schindler, K., Koltun, V.: Towards robust monocular depth estimation: mixing datasets for zero-shot cross-dataset transfer. TPAMI 44(3), 1623\u20131637 (2020)","journal-title":"TPAMI"},{"key":"6_CR39","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., Fergus, R.: Indoor segmentation and support inference from RGBD images. In: ECCV, pp. 
746\u2013760 (2012)","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"6_CR40","unstructured":"Sun, P., et al.: Scalability in perception for autonomous driving: waymo open dataset. In: CVPR, pp. 2446\u20132454 (2020)"},{"key":"6_CR41","unstructured":"Tan, Z., Dong, Z., Zhang, C., Zhang, W., Ji, H., Li, H.: OVO: open-vocabulary occupancy. arXiv preprint arXiv:2305.16133 (2023)"},{"key":"6_CR42","doi-asserted-by":"crossref","unstructured":"Tang, P., et al.: Sparseocc: rethinking sparse latent representation for vision-based semantic occupancy prediction. In: CVPR, pp. 15035\u201315044 (2024)","DOI":"10.1109\/CVPR52733.2024.01424"},{"key":"6_CR43","unstructured":"Tian, X., Jiang, T., Yun, L., Wang, Y., Wang, Y., Zhao, H.: OCC3D: a large-scale 3D occupancy prediction benchmark for autonomous driving. arXiv preprint arXiv:2304.14365 (2023)"},{"key":"6_CR44","doi-asserted-by":"crossref","unstructured":"Tong, W., et al.: Scene as occupancy. In: ICCV, pp. 8406\u20138415 (2023)","DOI":"10.1109\/ICCV51070.2023.00772"},{"key":"6_CR45","unstructured":"Vobecky, A., et al.: POP-3D: open-vocabulary 3D occupancy prediction from images. In: NeurIPS, pp. 50545\u201350557 (2023)"},{"key":"6_CR46","doi-asserted-by":"crossref","unstructured":"Wang, G., et al.: Occgen: generative multi-modal 3D occupancy prediction for autonomous driving. arXiv preprint arXiv:2404.15014 (2024)","DOI":"10.1007\/978-3-031-72661-3_6"},{"key":"6_CR47","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Openoccupancy: a large scale benchmark for surrounding semantic occupancy perception. In: ICCV, pp. 17850\u201317859 (2023)","DOI":"10.1109\/ICCV51070.2023.01636"},{"key":"6_CR48","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhao, L., Zheng, W., Zhu, Z., Zhou, J., Lu, J.: Surroundocc: multi-camera 3D occupancy prediction for autonomous driving. In: ICCV, pp. 21729\u201321740 (2023)","DOI":"10.1109\/ICCV51070.2023.01986"},{"key":"6_CR49","doi-asserted-by":"crossref","unstructured":"Xian, K., Zhang, J., Wang, O., Mai, L., Lin, Z., Cao, Z.: Structure-guided ranking loss for single image depth prediction. In: CVPR, pp. 611\u2013620 (2020)","DOI":"10.1109\/CVPR42600.2020.00069"},{"key":"6_CR50","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In: CVPR, pp. 2945\u20132954 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"6_CR51","doi-asserted-by":"crossref","unstructured":"Yao, Y., et al.: Blendedmvs: a large-scale dataset for generalized multi-view stereo networks. In: CVPR, pp. 1790\u20131799 (2020)","DOI":"10.1109\/CVPR42600.2020.00186"},{"key":"6_CR52","unstructured":"Zhang, C., et al.: Occnerf: self-supervised multi-camera occupancy prediction with neural radiance fields. arXiv preprint arXiv:2312.09243 (2023)"},{"key":"6_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zhu, Z., Du, D.: Occformer: dual-path transformer for vision-based 3D semantic occupancy prediction. arXiv preprint arXiv:2304.05316 (2023)","DOI":"10.1109\/ICCV51070.2023.00865"},{"key":"6_CR54","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: ECCV, pp. 
696\u2013712 (2022)","DOI":"10.1007\/978-3-031-19815-1_40"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72949-2_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T14:19:01Z","timestamp":1732976341000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72949-2_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031729485","9783031729492"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72949-2_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}