{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T22:04:42Z","timestamp":1767650682042,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613798","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"4074-4083","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["OccluBEV: Occlusion Aware Spatiotemporal Modeling for Multi-view 3D Object Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8331-184X","authenticated-orcid":false,"given":"Ziteng","family":"Wen","sequence":"first","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4581-8394","authenticated-orcid":false,"given":"Hai","family":"Xu","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6682-2877","authenticated-orcid":false,"given":"Chenyu","family":"Liu","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0733-8310","authenticated-orcid":false,"given":"Tao","family":"Guo","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3017-973X","authenticated-orcid":false,"given":"Jinshui","family":"Hu","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2150-1237","authenticated-orcid":false,"given":"Xuming","family":"He","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4539-3361","authenticated-orcid":false,"given":"Fengren","family":"Wang","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1979-0562","authenticated-orcid":false,"given":"Shun","family":"Lou","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3181-8546","authenticated-orcid":false,"given":"Haibo","family":"Fan","sequence":"additional","affiliation":[{"name":"iFLYTEK Research, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00938"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_1_3_1","volume-title":"Danda Pani Paudel, and Luc Van Gool","author":"Can Yigit Baran","year":"2021","unstructured":"Yigit Baran Can, Alexander Liniger, Danda Pani Paudel, and Luc Van Gool. 2021. Structured bird's-eye-view traffic scene understanding from onboard images. In ICCV. 15661--15670."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.691"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01211"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.89"},{"key":"e_1_3_2_1_8_1","volume-title":"MLOD: A multi-view 3D object detection based on robust feature fusion method. In 2019 IEEE intelligent transportation systems conference (ITSC)","author":"Deng Jian","year":"2019","unstructured":"Jian Deng and Krzysztof Czarnecki. 2019. MLOD: A multi-view 3D object detection based on robust feature fusion method. In 2019 IEEE intelligent transportation systems conference (ITSC). IEEE, 279--284."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01499"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8469--8478","author":"Hu Jordan SK","year":"2022","unstructured":"Jordan SK Hu, Tianshu Kuai, and Steven L Waslander. 2022. Point density-aware voxels for lidar 3d object detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8469--8478."},{"key":"e_1_3_2_1_11_1","volume-title":"Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790","author":"Huang Junjie","year":"2021","unstructured":"Junjie Huang, Guan Huang, Zheng Zhu, Yun Ye, and Dalong Du. 2021. Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01298"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00103"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00111"},{"key":"e_1_3_2_1_15_1","volume-title":"Perception: A Review, Evaluation and Recipe. arXiv preprint arXiv:2209.05324","author":"Li Hongyang","year":"2022","unstructured":"Hongyang Li, Chonghao Sima, Jifeng Dai, Wenhai Wang, Lewei Lu, Huijie Wang, Enze Xie, Zhiqi Li, Hanming Deng, Hao Tian, et al. 2022d. Delving into the Devils of Bird's-eye-view Perception: A Review, Evaluation and Recipe. arXiv preprint arXiv:2209.05324 (2022)."},{"key":"e_1_3_2_1_16_1","volume-title":"Bevstereo: Enhancing depth estimation in multi-view 3d object detection with dynamic temporal stereo. arXiv preprint arXiv:2209.10248","author":"Li Yinhao","year":"2022","unstructured":"Yinhao Li, Han Bao, Zheng Ge, Jinrong Yang, Jianjian Sun, and Zeming Li. 2022a. Bevstereo: Enhancing depth estimation in multi-view 3d object detection with dynamic temporal stereo. arXiv preprint arXiv:2209.10248 (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"Unifying voxel-based representation with transformer for 3d object detection. arXiv preprint arXiv:2206.00630","author":"Li Yanwei","year":"2022","unstructured":"Yanwei Li, Yilun Chen, Xiaojuan Qi, Zeming Li, Jian Sun, and Jiaya Jia. 2022b. Unifying voxel-based representation with transformer for 3d object detection. arXiv preprint arXiv:2206.00630 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Bevdepth: Acquisition of reliable depth for multi-view 3d object detection. arXiv preprint arXiv:2206.10092","author":"Li Yinhao","year":"2022","unstructured":"Yinhao Li, Zheng Ge, Guanyi Yu, Jinrong Yang, Zengran Wang, Yukang Shi, Jianjian Sun, and Zeming Li. 2022c. Bevdepth: Acquisition of reliable depth for multi-view 3d object detection. arXiv preprint arXiv:2206.10092 (2022)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00746"},{"key":"e_1_3_2_1_20_1","volume-title":"Tel Aviv","author":"Li Zhiqi","year":"2022","unstructured":"Zhiqi Li, Wenhai Wang, Hongyang Li, Enze Xie, Chonghao Sima, Tong Lu, Yu Qiao, and Jifeng Dai. 2022 e. Bevformer: Learning bird's-eye-view representation from multi-camera images via spatiotemporal transformers. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part IX. Springer, 1--18."},{"key":"e_1_3_2_1_21_1","volume-title":"Petrv2: A unified framework for 3d perception from multi-camera images. arXiv preprint arXiv:2206.01256","author":"Liu Yingfei","year":"2022","unstructured":"Yingfei Liu, Junjie Yan, Fan Jia, Shuailin Li, Qi Gao, Tiancai Wang, Xiangyu Zhang, and Jian Sun. 2022. Petrv2: A unified framework for 3d perception from multi-camera images. arXiv preprint arXiv:2206.01256 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"3D object detection from images for autonomous driving: a survey. arXiv preprint arXiv:2202.02980","author":"Ma Xinzhu","year":"2022","unstructured":"Xinzhu Ma, Wanli Ouyang, Andrea Simonelli, and Elisa Ricci. 2022. 3D object detection from images for autonomous driving: a survey. arXiv preprint arXiv:2202.02980 (2022)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00695"},{"key":"e_1_3_2_1_24_1","volume-title":"Time will tell: New outlooks and a baseline for temporal multi-view 3d object detection. arXiv preprint arXiv:2210.02443","author":"Park Jinhyung","year":"2022","unstructured":"Jinhyung Park, Chenfeng Xu, Shijia Yang, Kurt Keutzer, Kris Kitani, Masayoshi Tomizuka, and Wei Zhan. 2022. Time will tell: New outlooks and a baseline for temporal multi-view 3d object detection. arXiv preprint arXiv:2210.02443 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00102"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"e_1_3_2_1_28_1","volume-title":"Orthographic feature transform for monocular 3d object detection. arXiv preprint arXiv:1811.08188","author":"Roddick Thomas","year":"2018","unstructured":"Thomas Roddick, Alex Kendall, and Roberto Cipolla. 2018. Orthographic feature transform for monocular 3d object detection. arXiv preprint arXiv:1811.08188 (2018)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00086"},{"key":"e_1_3_2_1_31_1","volume-title":"From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network","author":"Shi Shaoshuai","year":"2020","unstructured":"Shaoshuai Shi, Zhe Wang, Jianping Shi, Xiaogang Wang, and Hongsheng Li. 2020. From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 8 (2020), 2647--2664."},{"key":"e_1_3_2_1_32_1","volume-title":"Conference on Robot Learning. PMLR, 1475--1485","author":"Wang Tai","year":"2022","unstructured":"Tai Wang, ZHU Xinge, Jiangmiao Pang, and Dahua Lin. 2022c. Probabilistic and geometric depth: Detecting objects in perspective. In Conference on Robot Learning. PMLR, 1475--1485."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00864"},{"key":"e_1_3_2_1_35_1","volume-title":"Conference on Robot Learning. PMLR, 180--191","author":"Wang Yue","year":"2022","unstructured":"Yue Wang, Vitor Campagnolo Guizilini, Tianyuan Zhang, Yilun Wang, Hang Zhao, and Justin Solomon. 2022a. Detr3d: 3d object detection from multi-view images via 3d-to-2d queries. In Conference on Robot Learning. PMLR, 180--191."},{"key":"e_1_3_2_1_36_1","volume-title":"Sts: Surround-view temporal stereo for multi-view 3d detection. arXiv preprint arXiv:2208.10145","author":"Wang Zengran","year":"2022","unstructured":"Zengran Wang, Chen Min, Zheng Ge, Yinhao Li, Zeming Li, Hongyu Yang, and Di Huang. 2022b. Sts: Surround-view temporal stereo for multi-view 3d detection. arXiv preprint arXiv:2208.10145 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00114"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Turner Whitted. 2005. An improved illumination model for shaded display. In ACM Siggraph 2005 Courses. 4-es.","DOI":"10.1145\/1198555.1198743"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01140"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Chenyu Yang Yuntao Chen Hao Tian Chenxin Tao Xizhou Zhu Zhaoxiang Zhang Gao Huang Hongyang Li Yu Qiao Lewei Lu et al. 2022. BEVFormer v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition via Perspective Supervision. arXiv preprint arXiv:2211.10439 (2022).","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00760"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"volume-title":"International Conference on Learning Representations.","author":"You Yurong","key":"e_1_3_2_1_43_1","unstructured":"Yurong You, Yan Wang, Wei-Lun Chao, Divyansh Garg, Geoff Pleiss, Bharath Hariharan, Mark Campbell, and Kilian Q Weinberger. [n.,d.]. Pseudo-LiDAR: Accurate Depth for 3D Object Detection in Autonomous Driving. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00330"},{"key":"e_1_3_2_1_45_1","volume-title":"Beverse: Unified perception and prediction in birds-eye-view for vision-centric autonomous driving. arXiv preprint arXiv:2205.09743","author":"Zhang Yunpeng","year":"2022","unstructured":"Yunpeng Zhang, Zheng Zhu, Wenzhao Zheng, Junjie Huang, Guan Huang, Jie Zhou, and Jiwen Lu. 2022. Beverse: Unified perception and prediction in birds-eye-view for vision-centric autonomous driving. arXiv preprint arXiv:2205.09743 (2022)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00472"},{"key":"e_1_3_2_1_47_1","volume-title":"Class-balanced grouping and sampling for point cloud 3d object detection. arXiv preprint arXiv:1908.09492","author":"Zhu Benjin","year":"2019","unstructured":"Benjin Zhu, Zhengkai Jiang, Xiangxin Zhou, Zeming Li, and Gang Yu. 2019. Class-balanced grouping and sampling for point cloud 3d object detection. arXiv preprint arXiv:1908.09492 (2019)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613798","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613798","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:06:18Z","timestamp":1755821178000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613798"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":47,"alternative-id":["10.1145\/3581783.3613798","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613798","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}