{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T19:33:35Z","timestamp":1773776015605,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681293","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"787-796","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["FusionOcc: Multi-Modal Fusion for 3D Occupancy Prediction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3942-8754","authenticated-orcid":false,"given":"Shuo","family":"Zhang","sequence":"first","affiliation":[{"name":"Zhejiang Lab, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2375-3854","authenticated-orcid":false,"given":"Yupeng","family":"Zhai","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3326-1632","authenticated-orcid":false,"given":"Jilin","family":"Mei","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8818-4075","authenticated-orcid":false,"given":"Yu","family":"Hu","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00939"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01656"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00396"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02076"},{"key":"e_1_3_2_1_7_1","volume-title":"Conference on Robot Learning. PMLR, 2148--2161","author":"Cheng Ran","year":"2021","unstructured":"Ran Cheng, Christopher Agia, Yuan Ren, Xinhai Li, and Liu Bingbing. 2021. S3cnet: A sparse semantic scene completion network for lidar point clouds. In Conference on Robot Learning. PMLR, 2148--2161."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3148457"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_12_1","volume-title":"Bevdet4d: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054","author":"Huang Junjie","year":"2022","unstructured":"Junjie Huang and Guan Huang. 2022. Bevdet4d: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054 (2022)."},{"key":"e_1_3_2_1_13_1","volume-title":"Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790","author":"Huang Junjie","year":"2021","unstructured":"Junjie Huang, Guan Huang, Zheng Zhu, Yun Ye, and Dalong Du. 2021. Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812383"},{"key":"e_1_3_2_1_17_1","unstructured":"Yiming Li Sihang Li Xinhao Liu Moonjun Gong Kenan Li Nuo Chen Zijun Wang Zhiheng Li Tao Jiang Fisher Yu et al. 2023. SSCBench: A Large-Scale 3D Semantic Scene Completion Benchmark for Autonomous Driving. arXiv preprint arXiv:2306.09001 (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-18916-6_42"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"e_1_3_2_1_20_1","volume-title":"Mingsheng Fang, Shiyi Lan, Jan Kautz, and Jose M Alvarez.","author":"Li Zhiqi","year":"2023","unstructured":"Zhiqi Li, Zhiding Yu, David Austin, Mingsheng Fang, Shiyi Lan, Jan Kautz, and Jose M Alvarez. 2023. Fb-occ: 3d occupancy prediction based on forward-backward view transformation. arXiv preprint arXiv:2307.01492 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_39"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3179507"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"e_1_3_2_1_25_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00459"},{"key":"e_1_3_2_1_27_1","volume-title":"Occdepth: A depth-aware method for 3d semantic scene completion. arXiv preprint arXiv:2302.13540","author":"Miao Ruihang","year":"2023","unstructured":"Ruihang Miao, Weizhou Liu, Mingrui Chen, Zheng Gong, Weixin Xu, Chen Hu, and Shuchang Zhou. 2023. Occdepth: A depth-aware method for 3d semantic scene completion. arXiv preprint arXiv:2302.13540 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Mao Shan, and Stewart Worrall.","author":"Ming Zhenxing","year":"2024","unstructured":"Zhenxing Ming, Julie Stephany Berrio, Mao Shan, and Stewart Worrall. 2024. InverseMatrixVT3D: An Efficient Projection Matrix-Based Approach for 3D Occupancy Prediction. arXiv preprint arXiv:2401.12422 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Mao Shan, and Stewart Worrall.","author":"Ming Zhenxing","year":"2024","unstructured":"Zhenxing Ming, Julie Stephany Berrio, Mao Shan, and Stewart Worrall. 2024. OccFusion: A Straightforward and Effective Multi-Sensor Fusion Framework for 3D Occupancy Prediction. arXiv preprint arXiv:2403.01644 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"MiLO: Multi-task Learning with Localization Ambiguity Suppression for Occupancy Prediction CVPR 2023 Occupancy Challenge Report. arXiv preprint arXiv:2306","author":"Jung-Hee Kim Thang Vu","year":"2023","unstructured":"Thang Vu Jung-Hee Kim Myeongjin and Kim Seokwoo Jung Seong-Gyun Jeong. 2023. MiLO: Multi-task Learning with Localization Ambiguity Suppression for Occupancy Prediction CVPR 2023 Occupancy Challenge Report. arXiv preprint arXiv:2306.11414 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Renderocc: Vision-centric 3d occupancy prediction with 2d rendering supervision. arXiv preprint arXiv:2309.09502","author":"Pan Mingjie","year":"2023","unstructured":"Mingjie Pan, Jiaming Liu, Renrui Zhang, Peixiang Huang, Xiaoqi Li, Li Liu, and Shanghang Zhang. 2023. Renderocc: Vision-centric 3d occupancy prediction with 2d rendering supervision. arXiv preprint arXiv:2309.09502 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"UniOcc: Unifying Vision-Centric 3D Occupancy Prediction with Geometric and Semantic Rendering. arXiv preprint arXiv:2306.09117","author":"Pan Mingjie","year":"2023","unstructured":"Mingjie Pan, Li Liu, Jiaming Liu, Peixiang Huang, Longlong Wang, Shanghang Zhang, Shaoqing Xu, Zhiyi Lai, and Kuiyuan Yang. 2023. UniOcc: Unifying Vision-Centric 3D Occupancy Prediction with Geometric and Semantic Rendering. arXiv preprint arXiv:2306.09117 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings, Part III 16","author":"Peng Songyou","year":"2020","unstructured":"Songyou Peng, Michael Niemeyer, Lars Mescheder, Marc Pollefeys, and Andreas Geiger. 2020. Convolutional occupancy networks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part III 16. Springer, 523--540."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings, Part XIV 16","author":"Philion Jonah","year":"2020","unstructured":"Jonah Philion and Sanja Fidler. 2020. Lift, splat, shoot: Encoding images from arbitrary camera rigs by implicitly unprojecting to 3d. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XIV 16. Springer, 194--210."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3095302"},{"key":"e_1_3_2_1_36_1","volume-title":"Roshan Ragel, and Gihan Jayatilaka.","author":"Silva Sathira","year":"2024","unstructured":"Sathira Silva, Savindu Bhashitha Wannigama, Roshan Ragel, and Gihan Jayatilaka. 2024. S2TPVFormer: Spatio-Temporal Tri-Perspective View for temporally coherent 3D Semantic Occupancy Prediction. arXiv preprint arXiv:2401.13785 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.28"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"e_1_3_2_1_39_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Tian Xiaoyu","year":"2024","unstructured":"Xiaoyu Tian, Tao Jiang, Longfei Yun, Yucheng Mao, Huitong Yang, Yue Wang, Yilun Wang, and Hang Zhao. 2024. Occ3d: A large-scale 3d occupancy prediction benchmark for autonomous driving. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00772"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01624"},{"key":"e_1_3_2_1_42_1","volume-title":"Conference on Robot Learning. PMLR, 180--191","author":"Wang Yue","year":"2022","unstructured":"Yue Wang, Vitor Campagnolo Guizilini, Tianyuan Zhang, Yilun Wang, Hang Zhao, and Justin Solomon. 2022. Detr3d: 3d object detection from multi-view images via 3d-to-2d queries. In Conference on Robot Learning. PMLR, 180--191."},{"key":"e_1_3_2_1_43_1","volume-title":"NeRF--: Neural radiance fields without known camera parameters. arXiv preprint arXiv:2102.07064","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Shangzhe Wu, Weidi Xie, Min Chen, and Victor Adrian Prisacariu. 2021. NeRF--: Neural radiance fields without known camera parameters. arXiv preprint arXiv:2102.07064 (2021)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01986"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16419"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_39"},{"key":"e_1_3_2_1_47_1","volume-title":"Sung-Ho Bae, Seungkyu Lee, and Choong Seon Hong.","author":"Zhang Chaoning","year":"2023","unstructured":"Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, and Choong Seon Hong. 2023. Faster segment anything: Towards lightweight sam for mobile applications. arXiv preprint arXiv:2306.14289 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"OccFormer: Dual-path Transformer for Vision-based 3D Semantic Occupancy Prediction. arXiv e-prints","author":"Zhang Yunpeng","year":"2023","unstructured":"Yunpeng Zhang, Zheng Zhu, and Dalong Du. 2023. OccFormer: Dual-path Transformer for Vision-based 3D Semantic Occupancy Prediction. arXiv e-prints (2023), arXiv--2304."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00981"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681293","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681293","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:43Z","timestamp":1750295863000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681293"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":49,"alternative-id":["10.1145\/3664647.3681293","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681293","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}