{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T22:47:42Z","timestamp":1777589262536,"version":"3.51.4"},"reference-count":41,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005153","name":"National Science Fund for Distinguished Young 
Scholars","doi-asserted-by":"publisher","award":["62225308"],"award-info":[{"award-number":["62225308"]}],"id":[{"id":"10.13039\/501100005153","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Vision and Image Understanding"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.cviu.2026.104670","type":"journal-article","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T17:21:36Z","timestamp":1770657696000},"page":"104670","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["DepthOcc: Real-time and accurate 3D occupancy prediction via multi-depth fusion and temporal enhancement"],"prefix":"10.1016","volume":"265","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9814-445X","authenticated-orcid":false,"given":"Chenghai","family":"Mao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Qin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiqiang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinghong","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bin","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yan","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.cviu.2026.104670_b1","doi-asserted-by":"crossref","unstructured":"Berman,\u00a0Maxim, Triki,\u00a0Amal\u00a0Rannen, 
Blaschko,\u00a0Matthew\u00a0B., 2018. The lov\u00e1sz-softmax loss: A tractable surrogate for the optimization of the intersection-over-union measure in neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4413\u20134421.","DOI":"10.1109\/CVPR.2018.00464"},{"key":"10.1016\/j.cviu.2026.104670_b2","doi-asserted-by":"crossref","unstructured":"Caesar,\u00a0Holger, Bankiti,\u00a0Varun, Lang,\u00a0Alex\u00a0H, Vora,\u00a0Sourabh, Liong,\u00a0Venice\u00a0Erin, Xu,\u00a0Qiang, Krishnan,\u00a0Anush, Pan,\u00a0Yu, Baldan,\u00a0Giancarlo, Beijbom,\u00a0Oscar, 2020. nuscenes: A multimodal dataset for autonomous driving.","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"10.1016\/j.cviu.2026.104670_b3","doi-asserted-by":"crossref","unstructured":"Cao,\u00a0Anh-Quan, De\u00a0Charette,\u00a0Raoul, 2022. Monoscene: Monocular 3d semantic scene completion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3991\u20134001.","DOI":"10.1109\/CVPR52688.2022.00396"},{"key":"10.1016\/j.cviu.2026.104670_b4","doi-asserted-by":"crossref","DOI":"10.1109\/TIV.2024.3403134","article-title":"A Comprehensive Framework for 3D Occupancy Estimation in Autonomous Driving","author":"Gan","year":"2024","journal-title":"IEEE Trans. Intell. Veh."},{"key":"10.1016\/j.cviu.2026.104670_b5","series-title":"Real-time 3D occupancy prediction via geometric-semantic disentanglement","author":"He","year":"2024"},{"key":"10.1016\/j.cviu.2026.104670_b6","doi-asserted-by":"crossref","unstructured":"He,\u00a0Kaiming, Zhang,\u00a0Xiangyu, Ren,\u00a0Shaoqing, Sun,\u00a0Jian, 2016. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 
770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.cviu.2026.104670_b7","series-title":"FastOcc: Accelerating 3D occupancy prediction by fusing the 2d bird\u2019s-eye view and perspective view","author":"Hou","year":"2024"},{"key":"10.1016\/j.cviu.2026.104670_b8","series-title":"Bevdet4d: Exploit temporal cues in multi-camera 3d object detection","author":"Huang","year":"2022"},{"key":"10.1016\/j.cviu.2026.104670_b9","doi-asserted-by":"crossref","unstructured":"Huang,\u00a0Yuanhui, Zheng,\u00a0Wenzhao, Zhang,\u00a0Yunpeng, Zhou,\u00a0Jie, Lu,\u00a0Jiwen, 2023. Tri-perspective view for vision-based 3d semantic occupancy prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9223\u20139232.","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"10.1016\/j.cviu.2026.104670_b10","first-page":"1486","article-title":"Bevstereo: Enhancing depth estimation in multi-view 3d object detection with temporal stereo","volume":"vol. 37","author":"Li","year":"2023"},{"key":"10.1016\/j.cviu.2026.104670_b11","first-page":"1477","article-title":"Bevdepth: Acquisition of reliable depth for multi-view 3d object detection","volume":"vol. 
37","author":"Li","year":"2023"},{"key":"10.1016\/j.cviu.2026.104670_b12","series-title":"European Conference on Computer Vision","first-page":"1","article-title":"Bevformer: Learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers","author":"Li","year":"2022"},{"key":"10.1016\/j.cviu.2026.104670_b13","series-title":"Fb-occ: 3d occupancy prediction based on forward-backward view transformation","author":"Li","year":"2023"},{"key":"10.1016\/j.cviu.2026.104670_b14","series-title":"European Conference on Computer Vision","first-page":"54","article-title":"Fully sparse 3d occupancy prediction","author":"Liu","year":"2025"},{"key":"10.1016\/j.cviu.2026.104670_b15","doi-asserted-by":"crossref","unstructured":"Liu,\u00a0Ze, Lin,\u00a0Yutong, Cao,\u00a0Yue, Hu,\u00a0Han, Wei,\u00a0Yixuan, Zhang,\u00a0Zheng, Lin,\u00a0Stephen, Guo,\u00a0Baining, 2021. Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.cviu.2026.104670_b16","series-title":"European Conference on Computer Vision","first-page":"531","article-title":"Petr: Position embedding transformation for multi-view 3d object detection","author":"Liu","year":"2022"},{"key":"10.1016\/j.cviu.2026.104670_b17","doi-asserted-by":"crossref","unstructured":"Liu,\u00a0Yingfei, Yan,\u00a0Junjie, Jia,\u00a0Fan, Li,\u00a0Shuailin, Gao,\u00a0Aqi, Wang,\u00a0Tiancai, Zhang,\u00a0Xiangyu, 2023. Petrv2: A unified framework for 3d perception from multi-camera images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 
3262\u20133272.","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"10.1016\/j.cviu.2026.104670_b18","series-title":"Fixing Weight Decay Regularization in Adam","author":"Loshchilov","year":"2018"},{"key":"10.1016\/j.cviu.2026.104670_b19","series-title":"OctreeOcc: Efficient and multi-granularity occupancy prediction using octree queries","author":"Lu","year":"2023"},{"key":"10.1016\/j.cviu.2026.104670_b20","doi-asserted-by":"crossref","unstructured":"Ma,\u00a0Qihang, Tan,\u00a0Xin, Qu,\u00a0Yanyun, Ma,\u00a0Lizhuang, Zhang,\u00a0Zhizhong, Xie,\u00a0Yuan, 2024a. Cotr: Compact occupancy transformer for vision-based 3d occupancy prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19936\u201319945.","DOI":"10.1109\/CVPR52733.2024.01884"},{"issue":"12","key":"10.1016\/j.cviu.2026.104670_b21","doi-asserted-by":"crossref","first-page":"10978","DOI":"10.1109\/TPAMI.2024.3449912","article-title":"Vision-Centric BEV Perception: A Survey","volume":"46","author":"Ma","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2026.104670_b22","series-title":"MMDetection3D: OpenMMLab next-generation platform for general 3D object detection","author":"MMDetection3D Contributors","year":"2020"},{"key":"10.1016\/j.cviu.2026.104670_b23","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2025.104320","article-title":"View-to-label: Multi-view consistency for self-supervised monocular 3D object detection","volume":"254","author":"Mouawad","year":"2025","journal-title":"Comput. Vis. 
Image Underst."},{"key":"10.1016\/j.cviu.2026.104670_b24","series-title":"2024 IEEE International Conference on Robotics and Automation","first-page":"12404","article-title":"Renderocc: Vision-centric 3d occupancy prediction with 2d rendering supervision","author":"Pan","year":"2024"},{"key":"10.1016\/j.cviu.2026.104670_b25","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIV 16","first-page":"194","article-title":"Lift, splat, shoot: Encoding images from arbitrary camera rigs by implicitly unprojecting to 3d","author":"Philion","year":"2020"},{"key":"10.1016\/j.cviu.2026.104670_b26","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2023.103808","article-title":"3D detection transformer: Set prediction of objects using point clouds","volume":"236","author":"Tan","year":"2023","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2026.104670_b27","article-title":"Occ3d: A large-scale 3d occupancy prediction benchmark for autonomous driving","volume":"36","author":"Tian","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2026.104670_b28","doi-asserted-by":"crossref","unstructured":"Wang,\u00a0Yuqi, Chen,\u00a0Yuntao, Liao,\u00a0Xingyu, Fan,\u00a0Lue, Zhang,\u00a0Zhaoxiang, 2024. Panoocc: Unified occupancy representation for camera-based 3d panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 
17158\u201317168.","DOI":"10.1109\/CVPR52733.2024.01624"},{"key":"10.1016\/j.cviu.2026.104670_b29","series-title":"Conference on Robot Learning","first-page":"180","article-title":"Detr3d: 3d object detection from multi-view images via 3d-to-2d queries","author":"Wang","year":"2022"},{"key":"10.1016\/j.cviu.2026.104670_b30","series-title":"Mv-fcos3d++: Multi-view camera-only 4d object detection with pretrained monocular backbones","author":"Wang","year":"2022"},{"key":"10.1016\/j.cviu.2026.104670_b31","doi-asserted-by":"crossref","unstructured":"Wei,\u00a0Yi, Zhao,\u00a0Linqing, Zheng,\u00a0Wenzhao, Zhu,\u00a0Zheng, Zhou,\u00a0Jie, Lu,\u00a0Jiwen, 2023. Surroundocc: Multi-camera 3d occupancy prediction for autonomous driving. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 21729\u201321740.","DOI":"10.1109\/ICCV51070.2023.01986"},{"key":"10.1016\/j.cviu.2026.104670_b32","series-title":"Deep height decoupling for precise vision-based 3D occupancy prediction","author":"Wu","year":"2024"},{"key":"10.1016\/j.cviu.2026.104670_b33","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.104164","article-title":"VIDF-Net: A voxel-image dynamic fusion method for 3D object detection","volume":"249","author":"Xiang","year":"2024","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2026.104670_b34","doi-asserted-by":"crossref","unstructured":"Xue,\u00a0Youze, Chen,\u00a0Jiansheng, Wan,\u00a0Weitao, Huang,\u00a0Yiqing, Yu,\u00a0Cheng, Li,\u00a0Tianpeng, Bao,\u00a0Jiayu, 2019. Mvscrf: Learning multi-view stereo with conditional random fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 4312\u20134321.","DOI":"10.1109\/ICCV.2019.00441"},{"key":"10.1016\/j.cviu.2026.104670_b35","doi-asserted-by":"crossref","unstructured":"Yao,\u00a0Yao, Luo,\u00a0Zixin, Li,\u00a0Shiwei, Fang,\u00a0Tian, Quan,\u00a0Long, 2018. Mvsnet: Depth inference for unstructured multi-view stereo. 
In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 767\u2013783.","DOI":"10.1007\/978-3-030-01237-3_47"},{"key":"10.1016\/j.cviu.2026.104670_b36","doi-asserted-by":"crossref","unstructured":"Yao,\u00a0Yao, Luo,\u00a0Zixin, Li,\u00a0Shiwei, Shen,\u00a0Tianwei, Fang,\u00a0Tian, Quan,\u00a0Long, 2019. Recurrent mvsnet for high-resolution multi-view stereo depth inference. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 5525\u20135534.","DOI":"10.1109\/CVPR.2019.00567"},{"key":"10.1016\/j.cviu.2026.104670_b37","series-title":"European Conference on Computer Vision","first-page":"381","article-title":"CVT-Occ: Cost Volume Temporal Fusion for 3D Occupancy Prediction","author":"Ye","year":"2025"},{"key":"10.1016\/j.cviu.2026.104670_b38","series-title":"Flashocc: Fast and memory-efficient occupancy prediction via channel-to-height plugin","author":"Yu","year":"2023"},{"key":"10.1016\/j.cviu.2026.104670_b39","series-title":"Panoptic-flashocc: An efficient baseline to marry semantic occupancy with panoptic via instance center","author":"Yu","year":"2024"},{"key":"10.1016\/j.cviu.2026.104670_b40","series-title":"Vision-based 3D occupancy prediction in autonomous driving: a review and outlook","author":"Zhang","year":"2024"},{"key":"10.1016\/j.cviu.2026.104670_b41","doi-asserted-by":"crossref","unstructured":"Zhang,\u00a0Yunpeng, Zhu,\u00a0Zheng, Du,\u00a0Dalong, 2023. Occformer: Dual-path transformer for vision-based 3d semantic occupancy prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 
9433\u20139443.","DOI":"10.1109\/ICCV51070.2023.00865"}],"container-title":["Computer Vision and Image Understanding"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314226000378?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314226000378?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T02:18:19Z","timestamp":1772849899000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1077314226000378"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":41,"alternative-id":["S1077314226000378"],"URL":"https:\/\/doi.org\/10.1016\/j.cviu.2026.104670","relation":{},"ISSN":["1077-3142"],"issn-type":[{"value":"1077-3142","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"DepthOcc: Real-time and accurate 3D occupancy prediction via multi-depth fusion and temporal enhancement","name":"articletitle","label":"Article Title"},{"value":"Computer Vision and Image Understanding","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.cviu.2026.104670","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104670"}}