{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:29:59Z","timestamp":1777656599810,"version":"3.51.4"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729720","type":"print"},{"value":"9783031729737","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72973-7_22","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:03:04Z","timestamp":1730383384000},"page":"376-392","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["HENet: Hybrid Encoding for\u00a0End-to-End Multi-task 3D Perception from\u00a0Multi-view Cameras"],"prefix":"10.1007","author":[{"given":"Zhongyu","family":"Xia","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiwei","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinhao","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongtao","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yun","family":"Xing","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shengxiang","family":"Qi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nan","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming-Hsuan","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Badrinarayanan, V., Kendall, A., Cipolla, R.: Segnet: a deep convolutional encoder-decoder architecture for image segmentation. In: IEEE TPAMI (2017)","DOI":"10.1109\/TPAMI.2016.2644615"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Brazil, G., Liu, X.: M3d-rpn: monocular 3d region proposal network for object detection. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00938"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuscenes: a multimodal dataset for autonomous driving. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Ding, M., et al.: Learning depth-guided convolutions for monocular 3D object detection. In: CVPR (2020)","DOI":"10.1109\/CVPRW50498.2020.00508"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Feng, C., Jie, Z., Zhong, Y., Chu, X., Ma, L.: Aedet: Azimuth-invariant multi-view 3D object detection. In: CVPR, pp. 21580\u201321588 (2023)","DOI":"10.1109\/CVPR52729.2023.02067"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Hu, A., et al.: Fiery: future instance prediction in bird\u2019s-eye view from surround monocular cameras. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01499"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Planning-oriented autonomous driving. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"22_CR9","unstructured":"Huang, B., et al.: Fast-bev: towards real-time on-vehicle bird\u2019s-eye view perception. arXiv preprint arXiv:2301.07870 (2023)"},{"key":"22_CR10","unstructured":"Huang, J., Huang, G.: Bevpoolv2: a cutting-edge implementation of bevdet toward deployment. arXiv preprint arXiv:2211.17111 (2022)"},{"key":"22_CR11","unstructured":"Huang, J., Huang, G., Robotics, P.: Bevdet4d: exploit temporal cues in multi-camera 3D object detection. arXiv preprint arXiv:2203.17054 (2022)"},{"key":"22_CR12","unstructured":"Huang, J., Huang, G., Zhu, Z., Ye, Y., Du, D.: Bevdet: high-performance multi-camera 3D object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, X., et al.: Far3d: expanding the horizon for surround-view 3d object detection. In: AAAI (2024)","DOI":"10.1609\/aaai.v38i3.28033"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Jiang, Y., et al.: Polarformer: multi-camera 3d object detection with polar transformer. In: AAAI, pp. 1042\u20131050 (2023)","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Lee, Y., Park, J.: Centermask: real-time anchor-free instance segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01392"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Li, Q., Wang, Y., Wang, Y., Zhao, H.: Hdmapnet: an online HD map construction and evaluation framework. In: ICRA (2022)","DOI":"10.1109\/ICRA46639.2022.9812383"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., Bao, H., Ge, Z., Yang, J., Sun, J., Li, Z.: Bevstereo: enhancing depth estimation in multi-view 3d object detection with dynamic temporal stereo. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i2.25234"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Bevdepth: acquisition of reliable depth for multi-view 3d object detection. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Bevformer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Li, Z., Yu, Z., Wang, W., Anandkumar, A., Lu, T., Alvarez, J.M.: Fb-bev: bev representation from forward-backward view transformations. In: ICCV, pp. 6919\u20136928 (2023)","DOI":"10.1109\/ICCV51070.2023.00637"},{"key":"22_CR21","unstructured":"Liang, T., et al.: Bevfusion: a simple and robust lidar-camera fusion framework. In: NeurIPS (2022)"},{"key":"22_CR22","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"22_CR24","unstructured":"Lin, X., Lin, T., Pei, Z., Huang, L., Su, Z.: Sparse4d: multi-view 3D object detection with sparse spatial-temporal fusion. arXiv preprint arXiv:2211.10581 (2022)"},{"key":"22_CR25","unstructured":"Lin, X., Lin, T., Pei, Z., Huang, L., Su, Z.: Sparse4d v2: recurrent temporal fusion with sparse model. arXiv preprint arXiv:2305.14018 (2023)"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Liu, H., Teng, Y., Lu, T., Wang, H., Wang, L.: Sparsebev: high-performance sparse 3D object detection from multi-camera videos. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"22_CR27","doi-asserted-by":"publisher","unstructured":"Liu, Y., Wang, T., Zhang, X., Sun, J.: PETR: position embedding transformation for\u00a0multi-view 3D object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXVII, pp. 531\u2013548. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_31","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Petrv2: a unified framework for 3d perception from multi-camera images. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"22_CR29","unstructured":"Nicolas, C., Francisco, M., Gabriel, S., Nicolas, U., Alexander, K., Sergey, Z.: End-to-end object detection with transformers. In: ECCV (2020)"},{"issue":"3","key":"22_CR30","doi-asserted-by":"publisher","first-page":"4867","DOI":"10.1109\/LRA.2020.3004325","volume":"5","author":"B Pan","year":"2020","unstructured":"Pan, B., Sun, J., Leung, H.Y.T., Andonian, A., Zhou, B.: Cross-view semantic segmentation for sensing surroundings. IEEE Robot. Automat. Lett. 5(3), 4867\u20134873 (2020)","journal-title":"IEEE Robot. Automat. Lett."},{"key":"22_CR31","doi-asserted-by":"crossref","unstructured":"Park, D., Ambrus, R., Guizilini, V., Li, J., Gaidon, A.: Is pseudo-lidar needed for monocular 3D object detection? In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"22_CR32","unstructured":"Park, J., et al.: Time will tell: new outlooks and a baseline for temporal multi-view 3D object detection. In: ICLR (2023)"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"22_CR34","doi-asserted-by":"crossref","unstructured":"Reading, C., Harakeh, A., Chae, J., Waslander, S.L.: Categorical depth distribution network for monocular 3d object detection. In: CVPR, pp. 8555\u20138564 (2021)","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Roddick, T., Cipolla, R.: Predicting semantic map representations from images using pyramid occupancy networks. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01115"},{"key":"22_CR36","unstructured":"Roddick, T., Kendall, A., Cipolla, R.: Orthographic feature transform for monocular 3D object detection. arXiv preprint arXiv:1811.08188 (2018)"},{"key":"22_CR37","doi-asserted-by":"crossref","unstructured":"Wang, S., Liu, Y., Wang, T., Li, Y., Zhang, X.: Exploring object-centric temporal modeling for efficient multi-view 3D object detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"22_CR38","doi-asserted-by":"publisher","unstructured":"Wang, T., Pang, J., Lin, D.: Monocular 3D object detection with\u00a0depth from\u00a0motion. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part IX, pp. 386\u2013403. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_23","DOI":"10.1007\/978-3-031-20077-9_23"},{"key":"22_CR39","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhu, X., Pang, J., Lin, D.: Fcos3d: fully convolutional one-stage monocular 3D object detection. In: ICCV (2021)","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"22_CR40","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chao, W.L., Garg, D., Hariharan, B., Campbell, M., Weinberger, K.Q.: Pseudo-lidar from visual depth estimation: bridging the gap in 3d object detection for autonomous driving. In: CVPR, pp. 8445\u20138453 (2019)","DOI":"10.1109\/CVPR.2019.00864"},{"key":"22_CR41","unstructured":"Wang, Y., Guizilini, V., Zhang, T., Wang, Y., Zhao, H., Solomon, J.: Detr3D: 3D object detection from multi-view images via 3D-to-2D queries. In: CoRL (2021)"},{"key":"22_CR42","unstructured":"Wang, Z., et al.: STS: surround-view temporal stereo for multi-view 3D detection. arXiv preprint arXiv:2208.10145 (2022)"},{"key":"22_CR43","unstructured":"Xie, E., et al.: M2bev: multi-camera joint 3d detection and segmentation with unified birds-eye view representation. arXiv preprint arXiv:2204.05088 (2022)"},{"key":"22_CR44","doi-asserted-by":"crossref","unstructured":"Yang, C., et\u00a0al.: Bevformer v2: adapting modern image backbones to bird\u2019s-eye-view recognition via perspective supervision. In: CVPR, pp. 17830\u201317839 (2023)","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"22_CR45","doi-asserted-by":"crossref","unstructured":"Yang, W., et al.: Projecting your view attentively: monocular road scene layout estimation via cross-view transformation. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01528"},{"key":"22_CR46","doi-asserted-by":"crossref","unstructured":"Yang, Z., Zhou, Y., Chen, Z., Ngiam, J.: 3D-man: 3D multi-frame attention network for object detection. In: CVPR, pp. 1863\u20131872 (2021)","DOI":"10.1109\/CVPR46437.2021.00190"},{"key":"22_CR47","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P.: Center-based 3d object detection and tracking. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"22_CR48","unstructured":"Yuan, Y., Huang, L., Guo, J., Zhang, C., Chen, X., Wang, J.: Ocnet: object context network for scene parsing. arXiv preprint arXiv:1809.00916 (2018)"},{"key":"22_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zheng, W., Zhu, Z., Huang, G., Lu, J., Zhou, J.: A simple baseline for multi-camera 3d object detection. In: AAAI, pp. 3507\u20133515 (2023)","DOI":"10.1609\/aaai.v37i3.25460"},{"key":"22_CR50","doi-asserted-by":"crossref","unstructured":"Zhou, B., Kr\u00e4henb\u00fchl, P.: Cross-view transformers for real-time map-view semantic segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01339"},{"key":"22_CR51","unstructured":"Zhu, B., Jiang, Z., Zhou, X., Li, Z., Yu, G.: Class-balanced grouping and sampling for point cloud 3d object detection. arXiv preprint arXiv:1908.09492 (2019)"},{"key":"22_CR52","doi-asserted-by":"crossref","unstructured":"Zhuang, Z., Li, R., Jia, K., Wang, Q., Li, Y., Tan, M.: Perception-aware multi-sensor fusion for 3D lidar semantic segmentation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01597"},{"key":"22_CR53","doi-asserted-by":"crossref","unstructured":"Zong, Z., et al.: Temporal enhanced training of multi-view 3D object detector via historical object prediction. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00350"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72973-7_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,15]],"date-time":"2025-02-15T15:00:01Z","timestamp":1739631601000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72973-7_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031729720","9783031729737"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72973-7_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}