{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:40:48Z","timestamp":1777491648755,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680807","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"2927-2935","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["QE-BEV: Query Evolution for Bird's Eye View Object Detection in Varied Contexts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0994-2072","authenticated-orcid":false,"given":"Jiawei","family":"Yao","sequence":"first","affiliation":[{"name":"School of Engineering and Technology, University of Washington, Tacoma, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3769-7350","authenticated-orcid":false,"given":"Yingxin","family":"Lai","sequence":"additional","affiliation":[{"name":"Department of Artificial Intelligence, Xiamen University, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5391-1354","authenticated-orcid":false,"given":"Hongrui","family":"Kou","sequence":"additional","affiliation":[{"name":"Department of Vehicle Engineering, Jilin University, Changchun, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2991-4513","authenticated-orcid":false,"given":"Tong","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Engineering and Technology, University of Washington, Tacoma, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4510-5383","authenticated-orcid":false,"given":"Ruixi","family":"Liu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Yonsei University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.704"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.597"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5364--5373","author":"Gao Ziteng","year":"2022","unstructured":"Ziteng Gao, Limin Wang, Bing Han, and Sheng Guo. 2022. Ada Mixer: A Fast-Converging Query-Based Object Detector. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5364--5373."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_7_1","volume-title":"Bevdet4d: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054","author":"Huang Junjie","year":"2022","unstructured":"Junjie Huang and Guan Huang. 2022. Bevdet4d: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054 (2022)."},{"key":"e_1_3_2_1_8_1","volume-title":"Bevpoolv2: A cutting-edge implementation of bevdet toward deployment. arXiv preprint arXiv:2211.17111","author":"Huang Junjie","year":"2022","unstructured":"Junjie Huang and Guan Huang. 2022. Bevpoolv2: A cutting-edge implementation of bevdet toward deployment. arXiv preprint arXiv:2211.17111 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790","author":"Huang Junjie","year":"2021","unstructured":"Junjie Huang, Guan Huang, Zheng Zhu, and Dalong Du. 2021. Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"The Hungarian method for the assignment problem. Naval research logistics quarterly 2, 1--2","author":"Kuhn Harold W","year":"1955","unstructured":"Harold W Kuhn. 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly 2, 1--2 (1955), 83--97."},{"key":"e_1_3_2_1_11_1","volume-title":"BEVStereo: Enhancing Depth Estimation in Multi-view 3D Object Detection with Dynamic Temporal Stereo. arXiv preprint arXiv:2209.10248","author":"Li Yinhao","year":"2022","unstructured":"Yinhao Li, Han Bao, Zheng Ge, Jinrong Yang, Jianjian Sun, and Zeming Li. 2022. BEVStereo: Enhancing Depth Estimation in Multi-view 3D Object Detection with Dynamic Temporal Stereo. arXiv preprint arXiv:2209.10248 (2022)."},{"key":"e_1_3_2_1_12_1","first-page":"18442","article-title":"Unifying Voxel-based Representation with Transformer for 3D Object Detection","volume":"35","author":"Li Y.","year":"2022","unstructured":"Y. Li, Y. Chen, X. Qi, Z. Li, J. Sun, and J. Jia. 2022. Unifying Voxel-based Representation with Transformer for 3D Object Detection. Advances in Neural Information Processing Systems 35 (2022), 18442--18455.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"Bevdepth: Acquisition of reliable depth for multi-view 3d object detection. arXiv preprint arXiv:2206.10092","author":"Li Yinhao","year":"2022","unstructured":"Yinhao Li, Zheng Ge, Guanyi Yu, Jinrong Yang, Zengran Wang, Yukang Shi, Jianjian Sun, and Zeming Li. 2022. Bevdepth: Acquisition of reliable depth for multi-view 3d object detection. arXiv preprint arXiv:2206.10092 (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"Improving BEVFormer for 3D Camera-only Object Detection: 1st Place Solution for Waymo Open Dataset Challenge","author":"Li Zhiqi","year":"2022","unstructured":"Zhiqi Li, Hanming Deng, Tianyu Li, Yangyi Huang, Chonghao Sima, Xiangwei Geng, Yulu Gao, Wenhai Wang, Yang Li, and Lewei Lu. 2023. BEVFormer ++: Improving BEVFormer for 3D Camera-only Object Detection: 1st Place Solution for Waymo Open Dataset Challenge 2022."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_17_1","volume-title":"Sparse4d: Multi-view 3d object detection with sparse spatial-temporal fusion. arXiv preprint arXiv:2211.10581","author":"Lin Xuewu","year":"2022","unstructured":"Xuewu Lin, Tianwei Lin, Zixiang Pei, Lichao Huang, and Zhizhong Su. 2022. Sparse4d: Multi-view 3d object detection with sparse spatial-temporal fusion. arXiv preprint arXiv:2211.10581 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Sparse4D v2: Recurrent Temporal Fusion with Sparse Model. arXiv preprint arXiv:2305.14018","author":"Lin Xuewu","year":"2023","unstructured":"Xuewu Lin, Tianwei Lin, Zixiang Pei, Lichao Huang, and Zhizhong Su. 2023. Sparse4D v2: Recurrent Temporal Fusion with Sparse Model. arXiv preprint arXiv:2305.14018 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Sparse-BEV: High-Performance Sparse 3D Object Detection from Multi-Camera Videos. arXiv preprint arXiv:2308.09244","author":"Liu Haisong","year":"2023","unstructured":"Haisong Liu, Yao Teng, Tao Lu, Haiguang Wang, and Limin Wang. 2023. Sparse-BEV: High-Performance Sparse 3D Object Detection from Multi-Camera Videos. arXiv preprint arXiv:2308.09244 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"e_1_3_2_1_21_1","volume-title":"PETRv2: A Unified Framework for 3D Perception from Multi-Camera Images. arXiv preprint arXiv:2206.01256","author":"Liu Yingfei","year":"2022","unstructured":"Yingfei Liu, Junjie Yan, Fan Jia, Shuailin Li, Qi Gao, Tiancai Wang, Xiangyu Zhang, and Jian Sun. 2022. PETRv2: A Unified Framework for 3D Perception from Multi-Camera Images. arXiv preprint arXiv:2206.01256 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Time will tell: New outlooks and a baseline for temporal multi-view 3d object detection. arXiv preprint arXiv:2210.02443","author":"Park Jinhyung","year":"2022","unstructured":"Jinhyung Park, Chenfeng Xu, Shijia Yang, Kurt Keutzer, Kris Kitani, Masayoshi Tomizuka, and Wei Zhan. 2022. Time will tell: New outlooks and a baseline for temporal multi-view 3d object detection. arXiv preprint arXiv:2210.02443 (2022)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"e_1_3_2_1_27_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection. arXiv preprint arXiv:2303.11926","author":"Wang Shihao","year":"2023","unstructured":"Shihao Wang, Yingfei Liu, Tiancai Wang, Ying Li, and Xiangyu Zhang. 2023. Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection. arXiv preprint arXiv:2303.11926 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Multi-view camera-only 4d object detection with pretrained monocular backbones. arXiv preprint arXiv:2207.12716","author":"Wang Tai","year":"2022","unstructured":"Tai Wang, Qing Lian, Chenming Zhu, Xinge Zhu, and Wenwei Zhang. 2022. Mv-fcos3d++: Multi-view camera-only 4d object detection with pretrained monocular backbones. arXiv preprint arXiv:2207.12716 (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00864"},{"key":"e_1_3_2_1_32_1","volume-title":"Conference on Robot Learning. PMLR, 180--191","author":"Wang Yue","year":"2022","unstructured":"Yue Wang, Vitor Campagnolo Guizilini, Tianyuan Zhang, Yilun Wang, Hang Zhao, and Justin Solomon. 2022. Detr3d: 3d object detection from multi-view images via 3d-to-2d queries. In Conference on Robot Learning. PMLR, 180--191."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 3791--3800","author":"Wang Z.","unstructured":"Z. Wang, Z. Huang, J. Fu, N. Wang, and S. Liu. 2023. Object as Query: Lifting Any 2D Object Detector to 3D Detection. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 3791--3800."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 18268--18278","author":"Yan J.","unstructured":"J. Yan, Y. Liu, J. Sun, F. Jia, S. Li, T. Wang, and X. Zhang. 2023. Cross Modal Transformer: Towards Fast and Robust 3D Object Detection. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 18268--18278."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00867"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447410"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"37","author":"Zhang Y.","unstructured":"Y. Zhang, W. Zheng, Z. Zhu, G. Huang, J. Lu, and J. Zhou. 2023. A Simple Baseline for Multi-camera 3D Object Detection. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 37. 3507--3515."},{"key":"e_1_3_2_1_38_1","unstructured":"Xizhou Zhu Weijie Su Lewei Lu Bin Li Xiaogang Wang and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680807","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680807","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680807"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":38,"alternative-id":["10.1145\/3664647.3680807","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680807","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}