{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T15:53:48Z","timestamp":1781020428629,"version":"3.54.1"},"publisher-location":"Cham","reference-count":59,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726699","type":"print"},{"value":"9783031726705","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_17","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"296-313","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["UniM$$^2$$AE: Multi-modal Masked Autoencoders with\u00a0Unified 3D Representation for\u00a03D Perception in\u00a0Autonomous Driving"],"prefix":"10.1007","author":[{"given":"Jian","family":"Zou","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1071-6371","authenticated-orcid":false,"given":"Tianyu","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5324-3642","authenticated-orcid":false,"given":"Guanglei","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8201-0864","authenticated-orcid":false,"given":"Zhenhua","family":"Guo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3415-3676","authenticated-orcid":false,"given":"Tao","family":"Luo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3044-9779","authenticated-orcid":false,"given":"Chun-Mei","family":"Feng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3330-783X","authenticated-orcid":false,"given":"Wangmeng","family":"Zuo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"17_CR1","unstructured":"Athar, A., Li, E., Casas, S., Urtasun, R.: 4D-former: multimodal 4d panoptic segmentation. In: Conference on Robot Learning, pp. 2151\u20132164. PMLR (2023)"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Bai, X., et al.: Transfusion: robust lidar-camera fusion for 3D object detection with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1090\u20131099 (2022)","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Borse, S., et al.: X-align: cross-modal cross-view alignment for bird\u2019s-eye-view segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3287\u20133297 (2023)","DOI":"10.1007\/s00138-023-01400-7"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Boulch, A., Sautier, C., Michele, B., Puy, G., Marlet, R.: Also: automotive lidar self-supervision by occupancy estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13455\u201313465 (2023)","DOI":"10.1109\/CVPR52729.2023.01293"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuscenes: a multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Chen, A., et al.: PIMAE: point cloud and image interactive masked autoencoders for 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5291\u20135301 (2023)","DOI":"10.1109\/CVPR52729.2023.00512"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Chen, X., Ma, H., Wan, J., Li, B., Xia, T.: Multi-view 3D object detection network for autonomous driving. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 1907\u20131915 (2017)","DOI":"10.1109\/CVPR.2017.691"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., Zhang, T., Wang, Y., Wang, Y., Zhao, H.: Futr3D: a unified sensor fusion framework for 3D detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 172\u2013181 (2023)","DOI":"10.1109\/CVPRW59228.2023.00022"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Focalformer3d: focusing on hard instance for 3D object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8394\u20138405 (2023)","DOI":"10.1109\/ICCV51070.2023.00771"},{"key":"17_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Y., Li, Y., Zhang, X., Sun, J., Jia, J.: Focal sparse convolutional networks for 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5428\u20135437 (2022)","DOI":"10.1109\/CVPR52688.2022.00535"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Chen, Y., Liu, J., Zhang, X., Qi, X., Jia, J.: Largekernel3d: Scaling up kernels in 3D sparse CNNs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13488\u201313498 (2023)","DOI":"10.1109\/CVPR52729.2023.01296"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Chi, X., et al.: Bev-san: accurate BEV 3D object detection via slice attention networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17461\u201317470 (2023)","DOI":"10.1109\/CVPR52729.2023.01675"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Fan, H., Su, H., Guibas, L.J.: A point set generation network for 3d object reconstruction from a single image. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 605\u2013613 (2017)","DOI":"10.1109\/CVPR.2017.264"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Fan, L., et al.: Embracing single stride 3D object detector with sparse transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8458\u20138468 (2022)","DOI":"10.1109\/CVPR52688.2022.00827"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Hess, G., Jaxing, J., Svensson, E., Hagerman, D., Petersson, C., Svensson, L.: Masked autoencoder for self-supervised pre-training on lidar point clouds. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 350\u2013359 (2023)","DOI":"10.1109\/WACVW58289.2023.00039"},{"key":"17_CR17","unstructured":"Hu, H., et al.: EA-LSS: edge-aware lift-splat-shot framework for 3D BEV object detection. arXiv preprint arXiv:2303.17895 (2023)"},{"key":"17_CR18","first-page":"19997","volume":"35","author":"L Huang","year":"2022","unstructured":"Huang, L., You, S., Zheng, M., Wang, F., Qian, C., Yamasaki, T.: Green hierarchical vision transformer for masked image modeling. Adv. Neural. Inf. Process. Syst. 35, 19997\u201320010 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: Clip2point: transfer clip to point cloud classification with image-depth pre-training. arXiv preprint arXiv:2210.01055 (2022)","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Jiao, Y., Jie, Z., Chen, S., Chen, J., Ma, L., Jiang, Y.G.: MSMDfusion: fusing lidar and camera at multiple scales with multi-depth seeds for 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21643\u201321652 (2023)","DOI":"10.1109\/CVPR52729.2023.02073"},{"key":"17_CR21","unstructured":"Li, H., et\u00a0al.: Delving into the devils of bird\u2019s-eye-view perception: a review, evaluation and recipe. IEEE Trans. Pattern Anal. Mach. Intell. (2023)"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., et\u00a0al.: Deepfusion: lidar-camera deep fusion for multi-modal 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17182\u201317191 (2022)","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"17_CR23","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-20077-9_1","volume-title":"ECCV 2022","author":"Z Li","year":"2022","unstructured":"Li, Z., et al.: BEVformer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 1\u201318. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_1"},{"key":"17_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"663","DOI":"10.1007\/978-3-030-01270-0_39","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Liang","year":"2018","unstructured":"Liang, M., Yang, B., Wang, S., Urtasun, R.: Deep continuous fusion for multi-sensor 3D object detection. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11220, pp. 663\u2013678. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01270-0_39"},{"key":"17_CR25","first-page":"10421","volume":"35","author":"T Liang","year":"2022","unstructured":"Liang, T., et al.: Bevfusion: a simple and robust lidar-camera fusion framework. Adv. Neural. Inf. Process. Syst. 35, 10421\u201310434 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR26","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"657","DOI":"10.1007\/978-3-031-20086-1_38","volume-title":"ECCV 2022","author":"H Liu","year":"2022","unstructured":"Liu, H., Cai, M., Lee, Y.J.: Masked discrimination for self-supervised learning on point clouds. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13662, pp. 657\u2013675. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20086-1_38"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: SWIN transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Bevfusion: multi-task multi-sensor fusion with unified bird\u2019s-eye view representation. In: 2023 IEEE International Conference on Robotics and Automation (ICRA), pp. 2774\u20132781. IEEE (2023)","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"17_CR29","unstructured":"Min, C., Zhao, D., Xiao, L., Nie, Y., Dai, B.: Voxel-MAE: masked autoencoders for pre-training large-scale point clouds. arXiv preprint arXiv:2206.09900 (2022)"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Nabati, R., Qi, H.: Centerfusion: center-based radar and camera fusion for 3D object detection. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1527\u20131536 (2021)","DOI":"10.1109\/WACV48630.2021.00157"},{"key":"17_CR31","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"604","DOI":"10.1007\/978-3-031-20086-1_35","volume-title":"ECCV 2022","author":"Y Pang","year":"2022","unstructured":"Pang, Y., Wang, W., Tay, F.E., Liu, W., Tian, Y., Yuan, L.: Masked autoencoders for point cloud self-supervised learning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13662, pp. 604\u2013621. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20086-1_35"},{"key":"17_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1007\/978-3-030-58568-6_12","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Philion","year":"2020","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part XIV. LNCS, vol. 12359, pp. 194\u2013210. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_12"},{"key":"17_CR33","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"17_CR34","unstructured":"Roddick, T., Kendall, A., Cipolla, R.: Orthographic feature transform for monocular 3d object detection. arXiv preprint arXiv:1811.08188 (2018)"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Sindagi, V.A., Zhou, Y., Tuzel, O.: MVX-Net: multimodal voxelnet for 3D object detection. In: 2019 International Conference on Robotics and Automation (ICRA), pp. 7276\u20137282. IEEE (2019)","DOI":"10.1109\/ICRA.2019.8794195"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Sun, P., et\u00a0al.: Scalability in perception for autonomous driving: WAYMO open dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2446\u20132454 (2020)","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"17_CR37","unstructured":"Tian, X., Jiang, T., Yun, L., Wang, Y., Wang, Y., Zhao, H.: Occ3D: a large-scale 3d occupancy prediction benchmark for autonomous driving. arXiv preprint arXiv:2304.14365 (2023)"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Tian, X., Ran, H., Wang, Y., Zhao, H.: Geomae: masked geometric target prediction for self-supervised point cloud pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13570\u201313580 (2023)","DOI":"10.1109\/CVPR52729.2023.01304"},{"key":"17_CR39","doi-asserted-by":"crossref","unstructured":"Vora, S., Lang, A.H., Helou, B., Beijbom, O.: Pointpainting: sequential fusion for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4604\u20134612 (2020)","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Wang, C., Ma, C., Zhu, M., Yang, X.: Pointaugmenting: cross-modal augmentation for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11794\u201311803 (2021)","DOI":"10.1109\/CVPR46437.2021.01162"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Unitr: a unified and efficient multi-modal transformer for bird\u2019s-eye-view representation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6792\u20136802 (2023)","DOI":"10.1109\/ICCV51070.2023.00625"},{"key":"17_CR42","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhao, L., Zheng, W., Zhu, Z., Zhou, J., Lu, J.: SurroundOcc: multi-camera 3d occupancy prediction for autonomous driving. arXiv preprint arXiv:2303.09551 (2023)","DOI":"10.1109\/ICCV51070.2023.01986"},{"key":"17_CR43","doi-asserted-by":"crossref","unstructured":"Xia, Z., Pan, X., Song, S., Li, L.E., Huang, G.: Vision transformer with deformable attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4794\u20134803 (2022)","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"17_CR44","unstructured":"Xie, E., et al.: M$$^2$$bev: multi-camera joint 3d detection and segmentation with unified birds-eye view representation. arXiv preprint arXiv:2204.05088 (2022)"},{"key":"17_CR45","unstructured":"Xie, S., et al.: Robobev: towards robust bird\u2019s eye view perception under corruptions. arXiv preprint arXiv:2304.06719 (2023)"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Xu, R., et al.: MV-JAR: masked voxel jigsaw and reconstruction for lidar-based self-supervised pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13445\u201313454 (2023)","DOI":"10.1109\/CVPR52729.2023.01292"},{"issue":"10","key":"17_CR47","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., Li, B.: Second: sparsely embedded convolutional detection. Sensors 18(10), 3337 (2018)","journal-title":"Sensors"},{"key":"17_CR48","doi-asserted-by":"crossref","unstructured":"Yang, C., et\u00a0al.: Bevformer v2: adapting modern image backbones to bird\u2019s-eye-view recognition via perspective supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17830\u201317839 (2023)","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"17_CR49","doi-asserted-by":"crossref","unstructured":"Yang, H., et al.: GD-MAE: generative decoder for MAE pre-training on lidar point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9403\u20139414 (2023)","DOI":"10.1109\/CVPR52729.2023.00907"},{"key":"17_CR50","first-page":"1992","volume":"35","author":"Z Yang","year":"2022","unstructured":"Yang, Z., Chen, J., Miao, Z., Li, W., Zhu, X., Zhang, L.: Deepinteraction: 3D object detection via modality interaction. Adv. Neural. Inf. Process. Syst. 35, 1992\u20132005 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR51","unstructured":"Yao, L., et al.: Filip: fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)"},{"key":"17_CR52","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P.: Center-based 3D object detection and tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11784\u201311793 (2021)","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"17_CR53","first-page":"16494","volume":"34","author":"T Yin","year":"2021","unstructured":"Yin, T., Zhou, X., Kr\u00e4henb\u00fchl, P.: Multimodal virtual point 3d detection. Adv. Neural. Inf. Process. Syst. 34, 16494\u201316507 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR54","doi-asserted-by":"crossref","unstructured":"Yu, X., Tang, L., Rao, Y., Huang, T., Zhou, J., Lu, J.: Point-Bert: pre-training 3D point cloud transformers with masked point modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19313\u201319322 (2022)","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"17_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Pointclip: point cloud understanding by clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8552\u20138562 (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"17_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, R., Wang, L., Qiao, Y., Gao, P., Li, H.: Learning 3D representations from 2D pre-trained models via image-to-point masked autoencoders. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21769\u201321780 (2023)","DOI":"10.1109\/CVPR52729.2023.02085"},{"key":"17_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zhu, Z., Du, D.: Occformer: dual-path transformer for vision-based 3D semantic occupancy prediction. arXiv preprint arXiv:2304.05316 (2023)","DOI":"10.1109\/ICCV51070.2023.00865"},{"key":"17_CR58","doi-asserted-by":"crossref","unstructured":"Zhou, B., Kr\u00e4henb\u00fchl, P.: Cross-view transformers for real-time map-view semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13760\u201313769 (2022)","DOI":"10.1109\/CVPR52688.2022.01339"},{"key":"17_CR59","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:21:46Z","timestamp":1727594506000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":59,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}