{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,25]],"date-time":"2026-07-25T15:59:50Z","timestamp":1784995190743,"version":"3.55.0"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732195","type":"print"},{"value":"9783031732201","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73220-1_8","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T20:04:38Z","timestamp":1730577878000},"page":"131-147","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["RecurrentBEV: A Long-Term Temporal Fusion Framework for\u00a0Multi-view 3D 
Detection"],"prefix":"10.1007","author":[{"given":"Ming","family":"Chang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xishan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhipeng","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guanhua","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shaoli","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"8_CR1","unstructured":"Ballas, N., Yao, L., Pal, C., Courville, A.: Delving deeper into convolutional networks for learning video representations. In: ICLR (2016)"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"8_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Chen, D., Li, J., Guizilini, V., Ambrus, R.A., Gaidon, A.: Viewpoint equivariance for multi-view 3D object detection. 
In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00889"},{"issue":"4","key":"8_CR5","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE TPAMI 40(4), 834\u2013848 (2017)","journal-title":"IEEE TPAMI"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Feng, C., Jie, Z., Zhong, Y., Chu, X., Ma, L.: Aedet: azimuth-invariant multi-view 3D object detection. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02067"},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Han, C., et al.: Exploring recurrent long-term temporal fusion for multi-view 3D perception. arXiv preprint arXiv:2303.05970 (2023)","DOI":"10.1109\/LRA.2024.3401172"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"8_CR10","unstructured":"Huang, B., et al.: Fast-bev: towards real-time on-vehicle bird\u2019s-eye view perception. arXiv preprint arXiv:2301.07870 (2023)"},{"key":"8_CR11","unstructured":"Huang, J., Huang, G.: Bevdet4d: exploit temporal cues in multi-camera 3D object detection. arXiv preprint arXiv:2203.17054 (2022)"},{"key":"8_CR12","unstructured":"Huang, J., Huang, G.: Bevpoolv2: a cutting-edge implementation of bevdet toward deployment. arXiv preprint arXiv:2211.17111 (2022)"},{"key":"8_CR13","unstructured":"Huang, J., Huang, G., Zhu, Z., Ye, Y., Du, D.: Bevdet: high-performance multi-camera 3D object detection in bird-eye-view. 
arXiv preprint arXiv:2112.11790 (2021)"},{"key":"8_CR14","unstructured":"Huang, L., et al.: Leveraging vision-centric multi-modal expertise for 3D object detection. In: NeurIPS (2023)"},{"key":"8_CR15","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., et\u00a0al.: Spatial transformer networks. In: NeurIPS (2015)"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Jiang, Y., et al.: Polarformer: multi-camera 3D object detection with polar transformer. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Lee, Y., Hwang, J.W., Lee, S., Bae, Y., Park, J.: An energy and GPU-computation efficient backbone network for real-time object detection. In: CVPRW (2019)","DOI":"10.1109\/CVPRW.2019.00103"},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Li, H., et al.: DFA3D: 3D deformable attention for 2D-to-3D feature lifting. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00615"},{"key":"8_CR19","unstructured":"Li, Y., Chen, Y., Qi, X., Li, Z., Sun, J., Jia, J.: Unifying voxel-based representation with transformer for 3D object detection. In: NeurIPS (2022)"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., Bao, H., Ge, Z., Yang, J., Sun, J., Li, Z.: Bevstereo: enhancing depth estimation in multi-view 3D object detection with temporal stereo. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i2.25234"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Bevdepth: acquisition of reliable depth for multi-view 3D object detection. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Bevformer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. 
In: ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Li, Z., Yu, Z., Wang, W., Anandkumar, A., Lu, T., Alvarez, J.M.: FB-BEV: BEV representation from forward-backward view transformations. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00637"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"8_CR25","unstructured":"Lin, X., Lin, T., Pei, Z., Huang, L., Su, Z.: Sparse4d: multi-view 3D object detection with sparse spatial-temporal fusion. arXiv preprint arXiv:2211.10581 (2022)"},{"key":"8_CR26","unstructured":"Lin, X., Lin, T., Pei, Z., Huang, L., Su, Z.: Sparse4d v2: recurrent temporal fusion with sparse model. arXiv preprint arXiv:2305.14018 (2023)"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Liu, H., Teng, Y., Lu, T., Wang, H., Wang, L.: Sparsebev: high-performance sparse 3D object detection from multi-camera videos. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"8_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, T., Zhang, X., Sun, J.: Petr: position embedding transformation for multi-view 3D object detection. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: PETRV2: a unified framework for 3D perception from multi-camera images. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"8_CR31","unstructured":"Luo, Z., Zhou, C., Zhang, G., Lu, S.: DETR4D: direct multi-view 3D object detection with sparse attention. 
arXiv preprint arXiv:2212.07849 (2022)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Park, D., Ambrus, R., Guizilini, V., Li, J., Gaidon, A.: Is pseudo-lidar needed for monocular 3D object detection? In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"8_CR33","unstructured":"Park, J., et al.: Time will tell: new outlooks and a baseline for temporal multi-view 3D object detection. In: ICLR (2023)"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"8_CR35","doi-asserted-by":"crossref","unstructured":"Wang, S., Liu, Y., Wang, T., Li, Y., Zhang, X.: Exploring object-centric temporal modeling for efficient multi-view 3D object detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"8_CR36","unstructured":"Wang, Y., Guizilini, V.C., Zhang, T., Wang, Y., Zhao, H., Solomon, J.: DETR3D: 3D object detection from multi-view images via 3D-to-2D queries. In: Conference on Robot Learning (2022)"},{"key":"8_CR37","doi-asserted-by":"crossref","unstructured":"Wang, Z., Huang, Z., Fu, J., Wang, N., Liu, S.: Object as query: lifting any 2D object detector to 3D detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00351"},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"Xiong, K., et al.: Cape: camera view position embedding for multi-view 3D object detection. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02066"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"Yang, C., et al.: Bevformer V2: adapting modern image backbones to bird\u2019s-eye-view recognition via perspective supervision. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P.: Center-based 3D object detection and tracking. 
In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Zhang, J., Zhang, Y., Liu, Q., Wang, Y.: SA-BEV: generating semantic-aware bird\u2019s-eye-view feature for multi-view 3D object detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00310"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, H., Ge, Z., Li, Z., Zhang, X.: Matrixvt: efficient multi-camera to BEV transformation for 3D perception. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00785"},{"key":"8_CR43","unstructured":"Zhu, B., Jiang, Z., Zhou, X., Li, Z., Yu, G.: Class-balanced grouping and sampling for point cloud 3D object detection. arXiv preprint arXiv:1908.09492 (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73220-1_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T20:05:12Z","timestamp":1730577912000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73220-1_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031732195","9783031732201"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73220-1_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer 
Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}