{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:40:32Z","timestamp":1775230832482,"version":"3.50.1"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726606","type":"print"},{"value":"9783031726613","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72661-3_6","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T07:45:22Z","timestamp":1732607122000},"page":"95-112","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["OccGen: Generative Multi-modal 3D Occupancy Prediction for\u00a0Autonomous Driving"],"prefix":"10.1007","author":[{"given":"Guoqing","family":"Wang","sequence":"first","affiliation":[]},{"given":"Zhongdao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Pin","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Jilai","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Xiangxuan","family":"Ren","sequence":"additional","affiliation":[]},{"given":"Bailan","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"key":"6_CR1","unstructured":"Amit, T., Nachmani, E., Shaharbany, T., Wolf, L.: Segdiff: image segmentation with diffusion probabilistic models. arXiv preprint arXiv:2112.00390 (2021)"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Behley, J., et al.: Semantickitti: a dataset for semantic scene understanding of lidar sequences. In: ICCV, pp. 9297\u20139307 (2019)","DOI":"10.1109\/ICCV.2019.00939"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Berman, M., Triki, A.R., Blaschko, M.B.: The lov\u00e1sz-softmax loss: a tractable surrogate for the optimization of the intersection-over-union measure in neural networks. In: CVPR, pp. 4413\u20134421 (2018)","DOI":"10.1109\/CVPR.2018.00464"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Cao, A.Q., de\u00a0Charette, R.: Monoscene: monocular 3D semantic scene completion. In: CVPR, pp. 3991\u20134001 (2022)","DOI":"10.1109\/CVPR52688.2022.00396"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., Luo, P.: Diffusiondet: diffusion model for object detection. In: ICCV, pp. 19830\u201319843 (2023)","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Chen, T., Li, L., Saxena, S., Hinton, G., Fleet, D.J.: A generalist framework for panoptic segmentation of images and videos. arXiv preprint arXiv:2210.06366 (2022)","DOI":"10.1109\/ICCV51070.2023.00090"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., Lin, K.Y., Qian, C., Zeng, G., Li, H.: 3D sketch-aware semantic scene completion via semi-supervised structure prior. In: CVPR, pp. 4193\u20134202 (2020)","DOI":"10.1109\/CVPR42600.2020.00425"},{"key":"6_CR9","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"6_CR10","doi-asserted-by":"crossref","unstructured":"Everingham, M., Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. IJCV (2009)","DOI":"10.1007\/s11263-009-0275-4"},{"key":"6_CR11","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: NeurIPS, vol.\u00a027 (2014)"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Harakeh, A., Smart, M., Waslander, S.L.: Bayesod: a Bayesian approach for uncertainty estimation in deep object detectors. In: ICRA, pp. 87\u201393. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9196544"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"6_CR14","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Planning-oriented autonomous driving. In: CVPR, pp. 17853\u201317862 (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"6_CR16","unstructured":"Huang, J., Huang, G., Zhu, Z., Du, D.: Bevdet: high-performance multi-camera 3D object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Huang, Y., Zheng, W., Zhang, Y., Zhou, J., Lu, J.: Tri-perspective view for vision-based 3D semantic occupancy prediction. In: CVPR, pp. 9223\u20139232 (2023)","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"6_CR18","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144 (2016)"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Ji, Y., et al.: DDP: diffusion model for dense visual prediction. In: ICCV, pp. 21741\u201321752 (2023)","DOI":"10.1109\/ICCV51070.2023.01987"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Jia, X., Gao, Y., Chen, L., Yan, J., Liu, P.L., Li, H.: Driveadapter: breaking the coupling barrier of perception and planning in end-to-end autonomous driving. In: CVPR, pp. 7953\u20137963 (2023)","DOI":"10.1109\/ICCV51070.2023.00731"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Jiang, H., Cheng, T., Gao, N., Zhang, H., Liu, W., Wang, X.: Symphonize 3D semantic scene completion with contextual instance queries. In: CVPR, pp. 20258\u201320267 (2024)","DOI":"10.1109\/CVPR52733.2024.01915"},{"key":"6_CR22","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Lang, A.H., Vora, S., Caesar, H., Zhou, L., Yang, J., Beijbom, O.: Pointpillars: fast encoders for object detection from point clouds. In: CVPR, pp. 12697\u201312705 (2019)","DOI":"10.1109\/CVPR.2019.01298"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Li, J., Han, K., Wang, P., Liu, Y., Yuan, X.: Anisotropic convolutional networks for 3D semantic scene completion. In: CVPR, pp. 3351\u20133359 (2020)","DOI":"10.1109\/CVPR42600.2020.00341"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Voxformer: sparse voxel transformer for camera-based 3D semantic scene completion. In: CVPR, pp. 9087\u20139098 (2023)","DOI":"10.1109\/CVPR52729.2023.00877"},{"key":"6_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Bevdepth: acquisition of reliable depth for multi-view 3D object detection. arXiv preprint arXiv:2206.10092 (2022)","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"6_CR27","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-20077-9_1","volume-title":"ECCV 2022","author":"Z Li","year":"2022","unstructured":"Li, Z., et al.: Bevformer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 1\u201318. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_1"},{"key":"6_CR28","unstructured":"Liang, T., et al.: Bevfusion: a simple and robust lidar-camera fusion framework. In: NeurIPS (2022)"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: CVPR, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: CVPR, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Bevfusion: multi-task multi-sensor fusion with unified bird\u2019s-eye view representation. In: ICRA (2023)","DOI":"10.1109\/ICRA48891.2023.10160968"},{"issue":"2","key":"6_CR32","doi-asserted-by":"publisher","first-page":"3153","DOI":"10.1109\/LRA.2020.2974682","volume":"5","author":"A Loquercio","year":"2020","unstructured":"Loquercio, A., Segu, M., Scaramuzza, D.: A general framework for uncertainty estimation in deep learning. IEEE Robot. Autom. Lett. 5(2), 3153\u20133160 (2020)","journal-title":"IEEE Robot. Autom. Lett."},{"key":"6_CR33","unstructured":"Lu, H., et al.: Scaling multi-camera 3D object detection through weak-to-strong eliciting. arXiv preprint arXiv:2404.06700 (2024)"},{"key":"6_CR34","unstructured":"Lu, H., Zhang, Y., Lian, Q., Du, D., Chen, Y.: Towards generalizable multi-camera 3D object detection via perspective debiasing. arXiv preprint arXiv:2310.11346 (2023)"},{"key":"6_CR35","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: ICML, pp. 8162\u20138171. PMLR (2021)"},{"key":"6_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1007\/978-3-030-58568-6_12","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Philion","year":"2020","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 194\u2013210. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_12"},{"key":"6_CR37","doi-asserted-by":"crossref","unstructured":"Roldao, L., de\u00a0Charette, R., Verroust-Blondet, A.: Lmscnet: lightweight multiscale 3D semantic completion. In: 3DV (2020)","DOI":"10.1109\/3DV50981.2020.00021"},{"key":"6_CR38","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"6_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"6_CR40","doi-asserted-by":"crossref","unstructured":"Saharia, C., et al.: Palette: image-to-image diffusion models. In: SIGGRAPH, pp. 1\u201310 (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"6_CR41","unstructured":"Saxena, S., Kar, A., Norouzi, M., Fleet, D.J.: Monocular depth estimation using diffusion models. arXiv preprint arXiv:2302.14816 (2023)"},{"key":"6_CR42","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: ICML, pp. 2256\u20132265. PMLR (2015)"},{"key":"6_CR43","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"6_CR44","doi-asserted-by":"crossref","unstructured":"Song, S., Yu, F., Zeng, A., Chang, A.X., Savva, M., Funkhouser, T.: Semantic scene completion from a single depth image. In: CVPR, pp. 1746\u20131754 (2017)","DOI":"10.1109\/CVPR.2017.28"},{"key":"6_CR45","doi-asserted-by":"crossref","unstructured":"Tang, P., et al.: Sparseocc: rethinking sparse latent representation for vision-based semantic occupancy prediction. arXiv preprint arXiv:2404.09502 (2024)","DOI":"10.1109\/CVPR52733.2024.01424"},{"key":"6_CR46","unstructured":"Tian, X., Jiang, T., Yun, L., Wang, Y., Wang, Y., Zhao, H.: OCC3D: a large-scale 3D occupancy prediction benchmark for autonomous driving. arXiv preprint arXiv:2304.14365 (2023)"},{"key":"6_CR47","doi-asserted-by":"crossref","unstructured":"Tong, W., et al.: Scene as occupancy. In: ICCV, pp. 8406\u20138415 (2023)","DOI":"10.1109\/ICCV51070.2023.00772"},{"key":"6_CR48","doi-asserted-by":"crossref","unstructured":"Vora, S., Lang, A.H., Helou, B., Beijbom, O.: Pointpainting: sequential fusion for 3D object detection. In: CVPR, pp. 4604\u20134612 (2020)","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"6_CR49","doi-asserted-by":"crossref","unstructured":"Wang, C., Ma, C., Zhu, M., Yang, X.: Pointaugmenting: cross-modal augmentation for 3D object detection. In: CVPR, pp. 11794\u201311803 (2021)","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"6_CR50","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Openoccupancy: a large scale benchmark for surrounding semantic occupancy perception. In: ICCV, pp. 17850\u201317859 (2023)","DOI":"10.1109\/ICCV51070.2023.01636"},{"key":"6_CR51","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhao, L., Zheng, W., Zhu, Z., Zhou, J., Lu, J.: Surroundocc: multi-camera 3D occupancy prediction for autonomous driving. In: ICCV, pp. 21729\u201321740 (2023)","DOI":"10.1109\/ICCV51070.2023.01986"},{"key":"6_CR52","unstructured":"Wolleb, J., Sandk\u00fchler, R., Bieder, F., Valmaggia, P., Cattin, P.C.: Diffusion models for implicit image segmentation ensembles. In: MIDL, pp. 1336\u20131348 (2022)"},{"key":"6_CR53","unstructured":"Wu, J., Fang, H., Zhang, Y., Yang, Y., Xu, Y.: Medsegdiff: medical image segmentation with diffusion probabilistic model. arXiv preprint arXiv:2211.00611 (2022)"},{"key":"6_CR54","doi-asserted-by":"crossref","unstructured":"Yan, X., et al.: Sparse single sweep lidar point cloud segmentation via learning contextual shape priors from scene completion. In: AAAI, vol.\u00a035, pp. 3101\u20133109 (2021)","DOI":"10.1609\/aaai.v35i4.16419"},{"issue":"10","key":"6_CR55","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., Li, B.: SECOND: sparsely embedded convolutional detection. Sensors 18(10), 3337 (2018)","journal-title":"Sensors"},{"key":"6_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Polarnet: an improved grid representation for online lidar point clouds semantic segmentation. In: CVPR, pp. 9601\u20139610 (2020)","DOI":"10.1109\/CVPR42600.2020.00962"},{"key":"6_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zhu, Z., Du, D.: Occformer: dual-path transformer for vision-based 3D semantic occupancy prediction. In: ICCV, pp. 9433\u20139443 (2023)","DOI":"10.1109\/ICCV51070.2023.00865"},{"key":"6_CR58","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: Voxelnet: end-to-end learning for point cloud based 3D object detection. In: CVPR, pp. 4490\u20134499 (2018)","DOI":"10.1109\/CVPR.2018.00472"},{"key":"6_CR59","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: Cylindrical and asymmetrical 3D convolution networks for lidar segmentation. In: CVPR, pp. 9939\u20139948 (2021)","DOI":"10.1109\/CVPR46437.2021.00981"},{"key":"6_CR60","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: ICLR (2021). https:\/\/openreview.net\/forum?id=gZ9hCDWe6ke"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72661-3_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T08:16:36Z","timestamp":1732608996000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72661-3_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031726606","9783031726613"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72661-3_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}