{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:24:06Z","timestamp":1781587446973,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["No. 62332016"],"award-info":[{"award-number":["No. 62332016"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681103","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"4620-4629","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["RayFormer: Improving Query-Based Multi-Camera 3D Object Detection via Ray-Centric Strategies"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7164-7245","authenticated-orcid":false,"given":"Xiaomeng","family":"Chu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9624-7451","authenticated-orcid":false,"given":"Jiajun","family":"Deng","sequence":"additional","affiliation":[{"name":"The University of Adelaide, Adelaide, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0964-7279","authenticated-orcid":false,"given":"Guoliang","family":"You","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0754-3953","authenticated-orcid":false,"given":"Yifan","family":"Duan","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6063-3331","authenticated-orcid":false,"given":"Yao","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9046-798X","authenticated-orcid":false,"given":"Yanyong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2019.2892405"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_1_3_1","volume-title":"OA-BEV: Bringing Object Awareness to Bird's-Eye-View Representation for Multi-Camera 3D Object Detection. CoRR","author":"Chu Xiaomeng","year":"2023","unstructured":"Xiaomeng Chu, Jiajun Deng, Yuan Zhao, Jianmin Ji, Yu Zhang, Houqiang Li, and Yanyong Zhang. 2023. OA-BEV: Bringing Object Awareness to Bird's-Eye-View Representation for Multi-Camera 3D Object Detection. CoRR, Vol. abs\/2301.05711 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2403.09212"},{"key":"e_1_3_2_1_5_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. [n.,d.]. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3--7, 2021."},{"key":"e_1_3_2_1_6_1","volume-title":"Deep Ordinal Regression Network for Monocular Depth Estimation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Fu Huan","year":"2018","unstructured":"Huan Fu, Mingming Gong, Chaohui Wang, Kayhan Batmanghelich, and Dacheng Tao. 2018. Deep Ordinal Regression Network for Monocular Depth Estimation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_7_1","volume-title":"AdaMixer: A Fast-Converging Query-Based Object Detector. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022","author":"Gao Ziteng","year":"2022","unstructured":"Ziteng Gao, Limin Wang, Bing Han, and Sheng Guo. 2022. AdaMixer: A Fast-Converging Query-Based Object Detector. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18--24, 2022. IEEE, 5354--5363."},{"key":"e_1_3_2_1_8_1","volume-title":"Deep Residual Learning for Image Recognition. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_9_1","volume-title":"BEVDet4D: Exploit Temporal Cues in Multi-camera 3D Object Detection. CoRR","author":"Huang Junjie","year":"2022","unstructured":"Junjie Huang and Guan Huang. 2022. BEVDet4D: Exploit Temporal Cues in Multi-camera 3D Object Detection. CoRR (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View. CoRR","author":"Huang Junjie","year":"2021","unstructured":"Junjie Huang, Guan Huang, Zheng Zhu, and Dalong Du. 2021. BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View. CoRR (2021)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28033"},{"key":"e_1_3_2_1_12_1","volume-title":"PolarFormer: Multi-camera 3D Object Detection with Polar Transformers. CoRR","author":"Jiang Yanqin","year":"2022","unstructured":"Yanqin Jiang, Li Zhang, Zhenwei Miao, Xiatian Zhu, Jin Gao, Weiming Hu, and Yu-Gang Jiang. 2022. PolarFormer: Multi-camera 3D Object Detection with Polar Transformers. CoRR (2022)."},{"key":"e_1_3_2_1_13_1","volume-title":"An Energy and GPU-Computation Efficient Backbone Network for Real-Time Object Detection. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPR Workshops).","author":"Lee Youngwan","year":"2019","unstructured":"Youngwan Lee, Joong-Won Hwang, Sangrok Lee, Yuseok Bae, and Jongyoul Park. 2019. An Energy and GPU-Computation Efficient Backbone Network for Real-Time Object Detection. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPR Workshops)."},{"key":"e_1_3_2_1_14_1","volume-title":"BEVStereo: Enhancing Depth Estimation in Multi-view 3D Object Detection with Dynamic Temporal Stereo. CoRR","author":"Li Yinhao","year":"2022","unstructured":"Yinhao Li, Han Bao, Zheng Ge, Jinrong Yang, Jianjian Sun, and Zeming Li. 2022. BEVStereo: Enhancing Depth Estimation in Multi-view 3D Object Detection with Dynamic Temporal Stereo. CoRR (2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"Unifying Voxel-based Representation with Transformer for 3D Object Detection. CoRR","author":"Li Yanwei","year":"2022","unstructured":"Yanwei Li, Yilun Chen, Xiaojuan Qi, Zeming Li, Jian Sun, and Jiaya Jia. 2022. Unifying Voxel-based Representation with Transformer for 3D Object Detection. CoRR, Vol. abs\/2206.00630 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"e_1_3_2_1_18_1","volume-title":"IEEE\/CVF International Conference on Computer Vision, ICCV 2023","author":"Li Zhiqi","year":"2023","unstructured":"Zhiqi Li, Zhiding Yu, Wenhai Wang, Anima Anandkumar, Tong Lu, and Jos\u00e9 M. \u00c1lvarez. 2023. FB-BEV: BEV Representation from Forward-Backward View Transformations. In IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1--6, 2023. IEEE, 6896--6905."},{"key":"e_1_3_2_1_19_1","volume-title":"Focal Loss for Dense Object Detection. In IEEE International Conference on Computer Vision, ICCV 2017","author":"Lin Tsung-Yi","year":"2017","unstructured":"Tsung-Yi Lin, Priya Goyal, Ross B. Girshick, Kaiming He, and Piotr Doll\u00e1r. 2017. Focal Loss for Dense Object Detection. In IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22--29, 2017. IEEE Computer Society, 2999--3007."},{"key":"e_1_3_2_1_20_1","volume-title":"Sparse4D: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion. CoRR","author":"Lin Xuewu","year":"2022","unstructured":"Xuewu Lin, Tianwei Lin, Zixiang Pei, Lichao Huang, and Zhizhong Su. 2022. Sparse4D: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion. CoRR, Vol. abs\/2211.10581 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"Sparse4D v2: Recurrent Temporal Fusion with Sparse Model. CoRR","author":"Lin Xuewu","year":"2023","unstructured":"Xuewu Lin, Tianwei Lin, Zixiang Pei, Lichao Huang, and Zhizhong Su. 2023. Sparse4D v2: Recurrent Temporal Fusion with Sparse Model. CoRR, Vol. abs\/2305.14018 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"e_1_3_2_1_25_1","volume-title":"Vision-based Uneven BEV Representation Learning with Polar Rasterization and Surface Estimation. In Conference on Robot Learning, CoRL 2022","volume":"446","author":"Liu Zhi","year":"2022","unstructured":"Zhi Liu, Shaoyu Chen, Xiaojie Guo, Xinggang Wang, Tianheng Cheng, Hongmei Zhu, Qian Zhang, Wenyu Liu, and Yi Zhang. 2022. Vision-based Uneven BEV Representation Learning with Polar Rasterization and Surface Estimation. In Conference on Robot Learning, CoRL 2022, 14--18 December 2022, Auckland, New Zealand (Proceedings of Machine Learning Research, Vol. 205), Karen Liu, Dana Kulic, and Jeffrey Ichnowski (Eds.). PMLR, 437--446."},{"key":"e_1_3_2_1_26_1","volume-title":"SGDR: Stochastic Gradient Descent with Warm Restarts. In International Conference on Learning Representations (ICLR).","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. SGDR: Stochastic Gradient Descent with Warm Restarts. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_27_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations (ICLR).","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_28_1","volume-title":"3D Object Detection from Images for Autonomous Driving: A Survey. CoRR","author":"Ma Xinzhu","year":"2022","unstructured":"Xinzhu Ma, Wanli Ouyang, Andrea Simonelli, and Elisa Ricci. 2022. 3D Object Detection from Images for Autonomous Driving: A Survey. CoRR (2022)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"e_1_3_2_1_30_1","volume-title":"The Eleventh International Conference on Learning Representations, ICLR 2023","author":"Park Jinhyung","year":"2023","unstructured":"Jinhyung Park, Chenfeng Xu, Shijia Yang, Kurt Keutzer, Kris M. Kitani, Masayoshi Tomizuka, and Wei Zhan. 2023. Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection. In The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1--5, 2023."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00102"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Rui Qian Xin Lai and Xirong Li. 2022. 3D Object Detection for Autonomous Driving: A Survey. Pattern Recognit. (2022).","DOI":"10.1016\/j.patcog.2022.108796"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00331"},{"key":"e_1_3_2_1_35_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00493"},{"key":"e_1_3_2_1_38_1","volume-title":"Conference on Robot Learning (CoRL).","author":"Wang Yue","year":"2021","unstructured":"Yue Wang, Vitor Guizilini, Tianyuan Zhang, Yilun Wang, Hang Zhao, and Justin Solomon. 2021. DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries. In Conference on Robot Learning (CoRL)."},{"key":"e_1_3_2_1_39_1","volume-title":"International Journal of Computer Vision (IJCV)","author":"Wang Yingjie","year":"2023","unstructured":"Yingjie Wang, Qiuyu Mao, Hanqi Zhu, Jiajun Deng, Yu Zhang, Jianmin Ji, Houqiang Li, and Yanyong Zhang. 2023 d. Multi-modal 3d object detection in autonomous driving: a survey. International Journal of Computer Vision (IJCV) (2023), 1--31."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00351"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2021.07.003"},{"key":"e_1_3_2_1_43_1","volume-title":"PolarNet: An Improved Grid Representation for Online LiDAR Point Clouds Semantic Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020","author":"Zhang Yang","year":"2020","unstructured":"Yang Zhang, Zixiang Zhou, Philip David, Xiangyu Yue, Zerong Xi, Boqing Gong, and Hassan Foroosh. 2020. PolarNet: An Improved Grid Representation for Online LiDAR Point Clouds Semantic Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13--19, 2020. Computer Vision Foundation \/ IEEE, 9598--9607."},{"key":"e_1_3_2_1_44_1","volume-title":"BEVerse: Unified Perception and Prediction in Birds-Eye-View for Vision-Centric Autonomous Driving. CoRR","author":"Zhang Yunpeng","year":"2022","unstructured":"Yunpeng Zhang, Zheng Zhu, Wenzhao Zheng, Junjie Huang, Guan Huang, Jie Zhou, and Jiwen Lu. 2022. BEVerse: Unified Perception and Prediction in Birds-Eye-View for Vision-Centric Autonomous Driving. CoRR (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Tianhao Zhao Yongcan Chen Yu Wu Tianyang Liu Bo Du Peilun Xiao Shi Qiu Hongda Yang Guozhen Li Yi Yang and Yutian Lin. 2024. Improving Bird's Eye View Semantic Segmentation by Task Decomposition. arxiv: 2404.01925 [cs.CV]","DOI":"10.1109\/CVPR52733.2024.01469"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00785"},{"key":"e_1_3_2_1_47_1","volume-title":"Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection. CoRR","author":"Zhu Benjin","year":"2019","unstructured":"Benjin Zhu, Zhengkai Jiang, Xiangxin Zhou, Zeming Li, and Gang Yu. 2019. Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection. CoRR (2019)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00953"},{"key":"e_1_3_2_1_49_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In International Conference on Learning Representations (ICLR).","author":"Zhu Xizhou","year":"2021","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2021. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00350"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681103","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681103","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:52Z","timestamp":1750294672000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681103"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":50,"alternative-id":["10.1145\/3664647.3681103","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681103","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}