{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:07:52Z","timestamp":1775837272744,"version":"3.50.1"},"reference-count":106,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T00:00:00Z","timestamp":1728172800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T00:00:00Z","timestamp":1728172800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276108"],"award-info":[{"award-number":["62276108"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1007\/s11263-024-02235-z","type":"journal-article","created":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T13:01:37Z","timestamp":1728219697000},"page":"1352-1374","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":100,"title":["MapTRv2: An End-to-End Framework for Online Vectorized HD Map Construction"],"prefix":"10.1007","volume":"133","author":[{"given":"Bencheng","family":"Liao","sequence":"first","affiliation":[]},{"given":"Shaoyu","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yunchi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Qian","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Wenyu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Chang","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6732-7823","authenticated-orcid":false,"given":"Xinggang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,6]]},"reference":[{"key":"2235_CR1","doi-asserted-by":"crossref","unstructured":"Acuna, D., Ling, H., Kar, A., & Fidler, S. (2018). Efficient interactive annotation of segmentation datasets with polygon-rnn++. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 859\u2013868.","DOI":"10.1109\/CVPR.2018.00096"},{"key":"2235_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., Bankiti, V., Lang, A.H., Vora, S., Liong, V.E., Xu, Q., Krishnan, A., Pan, Y., Baldan, G., & Beijbom, O. (2020). nuscenes: A multimodal dataset for autonomous driving. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"2235_CR3","doi-asserted-by":"crossref","unstructured":"Can, Y. B., Liniger, A., Paudel, D. P., & Van\u00a0Gool, L. (2021). Structured bird\u2019s-eye-view traffic scene understanding from onboard images. In: ICCV.","DOI":"10.1109\/ICCV48922.2021.01537"},{"key":"2235_CR4","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In: ECCV.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2235_CR5","unstructured":"Chen, S., Cheng, T., Wang, X., Meng, W., Zhang, Q., & Liu, W. (2022). Efficient and robust 2d-to-bev representation learning via geometry-guided kernel transformer. arXiv preprint arXiv:2206.04584."},{"key":"2235_CR6","unstructured":"Chen, J., Deng, R., & Furukawa, Y. (2023). Polydiffuse: Polygonal shape reconstruction via guided set diffusion models. arXiv preprint arXiv:2306.01461"},{"key":"2235_CR7","doi-asserted-by":"crossref","unstructured":"Chen, L., Sima, C., Li, Y., Zheng, Z., Xu, J., Geng, X., Li, H., He, C., Shi, J., Qiao, Y., & Yan, J. (2022). Persformer: 3d lane detection via perspective transformer and the openlane benchmark. In: ECCV.","DOI":"10.1007\/978-3-031-19839-7_32"},{"key":"2235_CR8","doi-asserted-by":"crossref","unstructured":"Chen, J., Wu, Y., Tan, J., Ma, H., & Furukawa, Y. (2024). Maptracker: Tracking with strided memory fusion for consistent vector hd mapping. arXiv preprint arXiv:2403.15951","DOI":"10.1007\/978-3-031-72658-3_6"},{"key":"2235_CR9","unstructured":"Chen, S., Zhang, Y., Liao, B., Xie, J., Cheng, T., Sui, W., Zhang, Q., Huang, C., Liu, W., & Wang, X. (2023). Vma: Divide-and-conquer vectorized map annotation system for large-scale driving scene. arXiv preprint arXiv:2304.09807"},{"key":"2235_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A. G., Kirillov, A., & Girdhar, R. (2022). Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"2235_CR11","unstructured":"Dauner, D., Hallgarten, M., Geiger, A., & Chitta, K. (2023). Parting with misconceptions about learning-based vehicle motion planning. arXiv:2306.07962"},{"key":"2235_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. Ieee","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2235_CR13","doi-asserted-by":"crossref","unstructured":"Ding, W., Qiao, L., Qiu, X., & Zhang, C. (2023). Pivotnet: Vectorized pivot learning for end-to-end hd map construction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3672\u20133682.","DOI":"10.1109\/ICCV51070.2023.00340"},{"key":"2235_CR14","unstructured":"Efrat, N., Bluvstein, M., Oron, S., Levi, D., Garnett, N., & Shlomo, B. E. (2020). 3d-lanenet+: Anchor free lane detection using a semi-local representation. arXiv preprint arXiv:2011.01535"},{"key":"2235_CR15","doi-asserted-by":"crossref","unstructured":"Fang, Y., Yang, S., Wang, X., Li, Y., Fang, C., Shan, Y., Feng, B., & Liu, W. (2021). Instances as queries. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6910\u20136919.","DOI":"10.1109\/ICCV48922.2021.00683"},{"key":"2235_CR16","first-page":"26183","volume":"34","author":"Y Fang","year":"2021","unstructured":"Fang, Y., Liao, B., Wang, X., Fang, J., Qi, J., Wu, R., Niu, J., & Liu, W. (2021). You only look at one sequence: Rethinking transformer in vision through object detection. Advances in Neural Information Processing Systems, 34, 26183\u201326197.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2235_CR17","doi-asserted-by":"crossref","unstructured":"Feng, Z., Guo, S., Tan, X., Xu, K., Wang, M., & Ma, L. (2022). Rethinking efficient lane detection via curve modeling. In: CVPR.","DOI":"10.1109\/CVPR52688.2022.01655"},{"key":"2235_CR18","unstructured":"Feng, H., Zhou, W., Yin, Y., Deng, J., Sun, Q., & Li, H. (2023). Recurrent contour-based instance segmentation with progressive learning. arXiv preprint arXiv:2301.08898."},{"key":"2235_CR19","doi-asserted-by":"crossref","unstructured":"Garnett, N., Cohen, R., Pe\u2019er, T., Lahav, R., & Levi, D. (2019). 3d-lanenet: end-to-end 3d multiple lane detection. In: ICCV.","DOI":"10.1109\/ICCV.2019.00301"},{"key":"2235_CR20","doi-asserted-by":"crossref","unstructured":"Gu, X., Song, G., Gilitschenski, I., Pavone, M., & Ivanovic, B. (2024). Producing and leveraging online map uncertainty in trajectory prediction. In: CVPR.","DOI":"10.1109\/CVPR52733.2024.01376"},{"key":"2235_CR21","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, G., Zhao, P., Zhang, W., Miao, J., Wang, J., & Choe, T. E. (2020). Gen-lanenet: A generalized and scalable approach for 3d lane detection. In: European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58589-1_40"},{"issue":"4","key":"2235_CR22","doi-asserted-by":"publisher","first-page":"12","DOI":"10.1109\/MPRV.2008.80","volume":"7","author":"M Haklay","year":"2008","unstructured":"Haklay, M., & Weber, P. (2008). Openstreetmap: User-generated street maps. IEEE Pervasive computing, 7(4), 12\u201318.","journal-title":"IEEE Pervasive computing"},{"key":"2235_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2235_CR24","doi-asserted-by":"crossref","unstructured":"Hu, A., Murez, Z., Mohan, N., Dudas, S., Hawke, J., Badrinarayanan, V., Cipolla, R., & Kendall, A. (2021). FIERY: Future instance segmentation in bird\u2019s-eye view from surround monocular cameras. In: ICCV.","DOI":"10.1109\/ICCV48922.2021.01499"},{"key":"2235_CR25","unstructured":"Huang, J., & Huang, G. (2022). Bevpoolv2: A cutting-edge implementation of bevdet toward deployment. arXiv preprint arXiv:2211.17111"},{"key":"2235_CR26","unstructured":"Huang, J., Huang, G., Zhu, Z., Yun, Y., & Du, D. (2021). Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790"},{"key":"2235_CR27","doi-asserted-by":"crossref","unstructured":"Jia, D., Yuan, Y., He, H., Wu, X., Yu, H., Lin, W., Sun, L., Zhang, C., & Hu, H. (2023). Detrs with hybrid matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19702\u201319712.","DOI":"10.1109\/CVPR52729.2023.01887"},{"key":"2235_CR28","doi-asserted-by":"crossref","unstructured":"Jia, D., Yuan, Y., He, H., Wu, X., Yu, H., Lin, W., Sun, L., Zhang, C., & Hu, H. (2023). Detrs with hybrid matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19702\u201319712.","DOI":"10.1109\/CVPR52729.2023.01887"},{"key":"2235_CR29","unstructured":"Jiang, B., Chen, S., Wang, X., Liao, B., Cheng, T., Chen, J., Zhou, H., Zhang, Q., Liu, W., & Huang, C. (2022). Perceive, interact, predict: Learning dynamic and static clues for end-to-end motion prediction. arXiv preprint arXiv:2212.02181"},{"key":"2235_CR30","doi-asserted-by":"crossref","unstructured":"Jiang, B., Chen, S., Xu, Q., Liao, B., Chen, J., Zhou, H., Zhang, Q., Liu, W., Huang, C., & Wang, X. (2023). Vad: Vectorized scene representation for efficient autonomous driving. ICCV.","DOI":"10.1109\/ICCV51070.2023.00766"},{"key":"2235_CR31","doi-asserted-by":"crossref","unstructured":"Jiang, Z., Zhu, Z., Li, P., Gao, H.-a., Yuan, T., Shi, Y., Zhao, H., & Zhao, H. (2024). P-mapnet: Far-seeing map generator enhanced by both sdmap and hdmap priors. arXiv preprint arXiv:2403.10521","DOI":"10.1109\/LRA.2024.3447450"},{"key":"2235_CR32","unstructured":"Kalfaoglu, M., Ozturk, H.I., Kilinc, O., & Temizel, A. (2023). Topomask: Instance-mask-based formulation for the road topology problem via transformer-based architecture. arXiv preprint arXiv:2306.05419"},{"key":"2235_CR33","doi-asserted-by":"crossref","unstructured":"Lang, A. H., Vora, S., Caesar, H., Zhou, L., Yang, J., & Beijbom, O. (2019). Pointpillars: Fast encoders for object detection from point clouds. In:CVPR.","DOI":"10.1109\/CVPR.2019.01298"},{"key":"2235_CR34","doi-asserted-by":"crossref","unstructured":"Lazarow, J., Xu, W., & Tu, Z. (2022). Instance segmentation with mask-supervised polygonal boundary transformers. In: CVPR.","DOI":"10.1109\/CVPR52688.2022.00434"},{"key":"2235_CR35","doi-asserted-by":"crossref","unstructured":"Lee, Y., Hwang, J.-w., Lee, S., Bae, Y., & Park, J. (2019). An energy and gpu-computation efficient backbone network for real-time object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 0\u20130.","DOI":"10.1109\/CVPRW.2019.00103"},{"key":"2235_CR36","unstructured":"Li, T., Chen, L., Geng, X., Wang, H., Li, Y., Liu, Z., Jiang, S., Wang, Y., Xu, H., Xu, C., et al. (2023). Topology reasoning for driving scenes. arXiv preprint arXiv:2304.05277"},{"key":"2235_CR37","unstructured":"Li, T., Chen, L., Geng, X., Wang, H., Li, Y., Liu, Z., Jiang, S., Wang, Y., Xu, H., Xu, C., Wen, F., Luo, P., Yan, J., Zhang, W., Wang, X., Qiao, Y., & Li, H. (2023). Topology reasoning for driving scenes. arXiv preprint arXiv:2304.05277"},{"key":"2235_CR38","doi-asserted-by":"crossref","unstructured":"Li, Y., Ge, Z., Yu, G., Yang, J., Wang, Z., Shi, Y., Sun, J., & Li, Z. (2022). Bevdepth: Acquisition of reliable depth for multi-view 3d object detection. arXiv preprint arXiv:2206.10092.","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"2235_CR39","unstructured":"Li, T., Jia, P., Wang, B., Chen, L., JIANG, K., Yan, J., & Li, H. (2024). Lanesegnet: Map learning with lane segment perception for autonomous driving. In: The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=LsURkIPYR5"},{"key":"2235_CR40","unstructured":"Li, H., Sima, C., Dai, J., Wang, W., Lu, L., Wang, H., Xie, E., Li, Z., Deng, H., Tian, H., et al. (2022). Delving into the devils of bird\u2019s-eye-view perception: A review, evaluation and recipe. arXiv preprint arXiv:2209.05324"},{"key":"2235_CR41","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, W., Li, H., Xie, E., Sima, C., Lu, T., Qiao, Y., & Dai, J. (2022). Bevformer: Learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In:ECCV.","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"2235_CR42","doi-asserted-by":"crossref","unstructured":"Li, Q., Wang, Y., Wang, Y., & Zhao, H. (2022). Hdmapnet: An online hd map construction and evaluation framework. In: ICRA.","DOI":"10.1109\/ICRA46639.2022.9812383"},{"key":"2235_CR43","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, S., Zhang, X., Xu, Y., Xu, W., & Tu, Z. (2021). Pose recognition with cascade transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1944\u20131953.","DOI":"10.1109\/CVPR46437.2021.00198"},{"key":"2235_CR44","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Liu, S.-g., Guo, J., Ni, L.M.-s., & Zhang, L. (2022). Dn-detr: Accelerate detr training by introducing query denoising. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 13609\u201313617.","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"2235_CR45","doi-asserted-by":"crossref","unstructured":"Li, Y., Zhang, S., Wang, Z., Yang, S., Yang, W., Xia, S., & Zhou, E. (2021). Tokenpose: Learning keypoint tokens for human pose estimation. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), 11293\u201311302.","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"2235_CR46","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Xu, H., Liu, S., Zhang, L., Ni, L.M., & Shum, H.-Y. (2023). Mask dino: Towards a unified transformer-based framework for object detection and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3041\u20133050.","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"2235_CR47","doi-asserted-by":"crossref","unstructured":"Liao, B., Chen, S., Jiang, B., Cheng, T., Zhang, Q., Liu, W., Huang, C., & Wang, X. (2023). Lane graph as path: Continuity-preserving path-wise modeling for online lane graph construction. arXiv preprint arXiv:2303.08815","DOI":"10.1007\/978-3-031-72784-9_19"},{"key":"2235_CR48","unstructured":"Liao, B., Chen, S., Wang, X., Cheng, T., Zhang, Q., Liu, W., & Huang, C. (2023). MapTR: Structured modeling and learning for online vectorized HD map construction. In: The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=k7p_YAO7yE"},{"key":"2235_CR49","doi-asserted-by":"crossref","unstructured":"Lin, T., Goyal, P., Girshick, R. B., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In: ICCV.","DOI":"10.1109\/ICCV.2017.324"},{"key":"2235_CR50","unstructured":"Lin, X., Lin, T., Pei, Z., Huang, L., & Su, Z. (2022). Sparse4d: Multi-view 3d object detection with sparse spatial-temporal fusion. arXiv preprint arXiv:2211.10581"},{"key":"2235_CR51","doi-asserted-by":"crossref","unstructured":"Ling, H., Gao, J., Kar, A., Chen, W., & Fidler, S. (2019). Fast interactive object annotation with curve-gcn. In: CVPR.","DOI":"10.1109\/CVPR.2019.00540"},{"key":"2235_CR52","unstructured":"Liu, Z., Chen, S., Guo, X., Wang, X., Cheng, T., Zhu, H., Zhang, Q., Liu, W., & Zhang, Y. (2022). Vision-based uneven bev representation learning with polar rasterization and surface estimation. arXiv preprint arXiv:2207.01878"},{"key":"2235_CR53","doi-asserted-by":"crossref","unstructured":"Liu, R., Chen, D., Liu, T., Xiong, Z., & Yuan, Z. (2022). Learning to predict 3d lane shape and camera pose from a single image via geometry constraints. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 1765\u20131772.","DOI":"10.1609\/aaai.v36i2.20069"},{"key":"2235_CR54","doi-asserted-by":"crossref","unstructured":"Liu, L., Chen, X., Zhu, S., & Tan, P. (2021). Condlanenet: a top-to-down lane detection framework based on conditional convolution. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3773\u20133782.","DOI":"10.1109\/ICCV48922.2021.00375"},{"key":"2235_CR55","unstructured":"Liu, S., Li, F., Zhang, H., Yang, X., Qi, X., Su, H., Zhu, J., & Zhang, L. (2022). DAB-DETR: Dynamic anchor boxes are better queries for DETR. In: International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=oMI9PjOb9Jl"},{"key":"2235_CR56","doi-asserted-by":"crossref","unstructured":"Liu, Z., Liew, J. H., Chen, X., & Feng, J. (2021). Dance: A deep attentive contour model for efficient instance segmentation. In: WACVW.","DOI":"10.1109\/WACV48630.2021.00039"},{"key":"2235_CR57","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2235_CR58","doi-asserted-by":"crossref","unstructured":"Liu, Z., Tang, H., Amini, A., Yang, X., Mao, H., Rus, D., & Han, S. (2022). Bevfusion: Multi-task multi-sensor fusion with unified bird\u2019s-eye view representation. arXiv preprint arXiv:2205.13542","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"2235_CR59","unstructured":"Liu, Y., Wang, Y., Wang, Y., & Zhao, H. (2022). Vectormapnet: End-to-end vectorized hd map learning. arXiv preprint arXiv:2206.08920."},{"key":"2235_CR60","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, T., Zhang, X., & Sun, J. (2022). Petr: Position embedding transformation for multi-view 3d object detection. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVII, pp. 531\u2013548. Springer.","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"2235_CR61","doi-asserted-by":"crossref","unstructured":"Liu, R., Yuan, Z., Liu, T., & Xiong, Z. (2021). End-to-end lane shape prediction with transformers. In: WACV.","DOI":"10.1109\/WACV48630.2021.00374"},{"key":"2235_CR62","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, X., Liu, G., Zhao, J., & Xu, N. (2024). Leveraging enhanced queries of point sets for vectorized map construction. arXiv preprint arXiv:2402.17430","DOI":"10.1007\/978-3-031-72998-0_26"},{"key":"2235_CR63","unstructured":"Ma, Y., Wang, T., Bai, X., Yang, H., Hou, Y., Wang, Y., Qiao, Y., Yang, R., Manocha, D., & Zhu, X. (2022). Vision-centric bev perception: A survey. arXiv preprint arXiv:2208.02797"},{"key":"2235_CR64","doi-asserted-by":"crossref","unstructured":"Mallot, H. A., B\u00fclthoff, H. H., Little, J., & Bohrer, S. (1991). Inverse perspective mapping simplifies optical flow computation and obstacle detection. Biological Cybernetics","DOI":"10.1007\/BF00201978"},{"issue":"8","key":"2235_CR65","doi-asserted-by":"publisher","first-page":"1909","DOI":"10.1007\/s11263-023-01790-1","volume":"131","author":"J Mao","year":"2023","unstructured":"Mao, J., Shi, S., Wang, X., & Li, H. (2023). 3d object detection for autonomous driving: A comprehensive survey. International Journal of Computer Vision, 131(8), 1909\u20131963.","journal-title":"International Journal of Computer Vision"},{"key":"2235_CR66","doi-asserted-by":"crossref","unstructured":"Meng, D., Chen, X., Fan, Z., Zeng, G., Li, H., Yuan, Y., Sun, L., & Wang, J. (2021). Conditional detr for fast training convergence. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), 3631\u20133640.","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"2235_CR67","doi-asserted-by":"crossref","unstructured":"Pan, C., He, Y., Peng, J., Zhang, Q., Sui, W., & Zhang, Z. (2023). Baeformer: Bi-directional and early interaction transformers for bird\u2019s eye view semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9590\u20139599.","DOI":"10.1109\/CVPR52729.2023.00925"},{"key":"2235_CR68","doi-asserted-by":"crossref","unstructured":"Pannen, D., Liebner, M., Hempel, W., & Burgard, W. (2020). How to keep hd maps for automated driving up to date. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 2288\u20132294. IEEE.","DOI":"10.1109\/ICRA40945.2020.9197419"},{"key":"2235_CR69","doi-asserted-by":"crossref","unstructured":"Park, D., Ambrus, R., Guizilini, V., Li, J., & Gaidon, A. (2021). Is pseudo-lidar needed for monocular 3d object detection? In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3142\u20133152.","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"2235_CR70","doi-asserted-by":"crossref","unstructured":"Peng, S., Jiang, W., Pi, H., Li, X., Bao, H., & Zhou, X. (2020). Deep snake for real-time instance segmentation. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.00856"},{"key":"2235_CR71","doi-asserted-by":"crossref","unstructured":"Philion, J., & Fidler, S. (2020). Lift, splat, shoot: Encoding images from arbitrary camera rigs by implicitly unprojecting to 3d. In: ECCV.","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"2235_CR72","doi-asserted-by":"crossref","unstructured":"Qiao, L., Ding, W., Qiu, X., & Zhang, C. (2023). End-to-end vectorized hd-map construction with piecewise bezier curve. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13218\u201313228","DOI":"10.1109\/CVPR52729.2023.01270"},{"key":"2235_CR73","unstructured":"Qiao, L., Zheng, Y., Zhang, P., Ding, W., Qiu, X., Wei, X., & Zhang, C. (2023). Machmap: End-to-end vectorized solution for compact hd-map construction. arXiv preprint arXiv:2306.10301"},{"key":"2235_CR74","doi-asserted-by":"crossref","unstructured":"Qin, Z., Chen, J., Chen, C., Chen, X., & Li, X. (2022). Uniformer: Unified multi-view fusion transformer for spatial-temporal representation in bird\u2019s-eye-view. arXiv preprint arXiv:2207.08536","DOI":"10.1109\/ICCV51070.2023.00798"},{"key":"2235_CR75","doi-asserted-by":"crossref","unstructured":"Shan, T., & Englot, B. (2018). Lego-loam: Lightweight and ground-optimized lidar odometry and mapping on variable terrain. In: IROS.","DOI":"10.1109\/IROS.2018.8594299"},{"key":"2235_CR76","doi-asserted-by":"crossref","unstructured":"Shan, T., Englot, B. J., Meyers, D., Wang, W., Ratti, C., & Rus, D. (2020). LIO-SAM: tightly-coupled lidar inertial odometry via smoothing and mapping. In: IROS.","DOI":"10.1109\/ICRA48506.2021.9561996"},{"key":"2235_CR77","doi-asserted-by":"crossref","unstructured":"Shi, D., Wei, X., Li, L., Ren, Y., & Tan, W. (2022). End-to-end multi-person pose estimation with transformers. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 11059\u201311068.","DOI":"10.1109\/CVPR52688.2022.01079"},{"issue":"2","key":"2235_CR78","doi-asserted-by":"publisher","first-page":"531","DOI":"10.1007\/s11263-022-01710-9","volume":"131","author":"S Shi","year":"2023","unstructured":"Shi, S., Jiang, L., Deng, J., Wang, Z., Guo, C., Shi, J., Wang, X., & Li, H. (2023). Pv-rcnn++: Point-voxel feature set abstraction with local vector representation for 3d object detection. International Journal of Computer Vision, 131(2), 531\u2013551.","journal-title":"International Journal of Computer Vision"},{"key":"2235_CR79","unstructured":"Shin, J., Rameau, F., Jeong, H., & Kum, D. (2023). Instagram: Instance-level graph modeling for vectorized hd map learning. arXiv preprint arXiv:2301.04470"},{"key":"2235_CR80","doi-asserted-by":"crossref","unstructured":"Tabelini, L., Berriel, R., Paixao, T. M., Badue, C., De\u00a0Souza, A. F., & Oliveira-Santos, T. (2021). Keep your eyes on the lane: Real-time attention-guided lane detection. In: CVPR.","DOI":"10.1109\/CVPR46437.2021.00036"},{"key":"2235_CR81","unstructured":"Tan, M., & Le, Q. V. (2019). Efficientnet: Rethinking model scaling for convolutional neural networks. In: ICML."},{"key":"2235_CR82","unstructured":"Wang, Y., Guizilini, V.C., Zhang, T., Wang, Y., Zhao, H., & Solomon, J. (2022). Detr3d: 3d object detection from multi-view images via 3d-to-2d queries. In: Conference on Robot Learning, pp. 180\u2013191. PMLR"},{"key":"2235_CR83","doi-asserted-by":"crossref","unstructured":"Wang, S., Jia, F., Liu, Y., Zhao, Y., Chen, Z., Wang, T., Zhang, C., Zhang, X., & Zhao, F. (2024). Stream query denoising for vectorized hd map construction. arXiv preprint arXiv:2401.09112","DOI":"10.1007\/978-3-031-72655-2_12"},{"key":"2235_CR84","doi-asserted-by":"crossref","unstructured":"Wang, J., Ma, Y., Huang, S., Hui, T., Wang, F., Qian, C., & Zhang, T. (2022). A keypoint-based global association network for lane detection. In: CVPR.","DOI":"10.1109\/CVPR52688.2022.00145"},{"key":"2235_CR85","doi-asserted-by":"crossref","unstructured":"Wang, R., Qin, J., Li, K., Li, Y., Cao, D., & Xu, J. (2023). Bev-lanedet: An efficient 3d lane detection based on virtual camera via key-points. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1002\u20131011.","DOI":"10.1109\/CVPR52729.2023.00103"},{"key":"2235_CR86","doi-asserted-by":"crossref","unstructured":"Wang, Y., Zhang, X., Yang, T., & Sun, J. (2022). Anchor detr: Query design for transformer-based detector. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 2567\u20132575.","DOI":"10.1609\/aaai.v36i3.20158"},{"issue":"8","key":"2235_CR87","doi-asserted-by":"publisher","first-page":"2122","DOI":"10.1007\/s11263-023-01784-z","volume":"131","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Mao, Q., Zhu, H., Deng, J., Zhang, Y., Ji, J., Li, H., & Zhang, Y. (2023). Multi-modal 3d object detection in autonomous driving: a survey. International Journal of Computer Vision, 131(8), 2122\u20132152.","journal-title":"International Journal of Computer Vision"},{"key":"2235_CR88","doi-asserted-by":"crossref","unstructured":"Wei, F., Sun, X., Li, H., Wang, J., & Lin, S. (2020). Point-set anchors for object detection, instance segmentation and pose estimation. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part X 16, pp. 527\u2013544. Springer","DOI":"10.1007\/978-3-030-58607-2_31"},{"key":"2235_CR89","unstructured":"Wilson, B., Qi, W., Agarwal, T., Lambert, J., Singh, J., Khandelwal, S., Pan, B., Kumar, R., Hartnett, A., Pontes, J. K., et al. (2023). Argoverse 2: Next generation datasets for self-driving perception and forecasting. arXiv preprint arXiv:2301.00493."},{"key":"2235_CR90","doi-asserted-by":"crossref","unstructured":"Xie, E., Sun, P., Song, X., Wang, W., Liu, X., Liang, D., Shen, C., & Luo, P. (2020). Polarmask: Single shot instance segmentation with polar representation. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.01221"},{"key":"2235_CR91","first-page":"5385","volume":"44","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Ding, M., Zhang, R., & Luo, P. (2021). Polarmask++: Enhanced polar representation for single-shot instance segmentation and beyond. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44, 5385\u20135400.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2235_CR92","doi-asserted-by":"crossref","unstructured":"Xiong, X., Liu, Y., Yuan, T., Wang, Y., Wang, Y., & Zhao, H. (2023). Neural map prior for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 17535\u201317544","DOI":"10.1109\/CVPR52729.2023.01682"},{"key":"2235_CR93","unstructured":"Xiong, H., Shen, J., Zhu, T., & Pan, Y. (2024). Ean-mapnet: Efficient vectorized hd map construction with anchor neighborhoods. arXiv preprint arXiv:2402.18278"},{"key":"2235_CR94","doi-asserted-by":"crossref","unstructured":"Xu, W., Wang, H., Qi, F., & Lu, C. (2019). Explicit shape encoding for real-time instance segmentation. In: ICCV.","DOI":"10.1109\/ICCV.2019.00527"},{"key":"2235_CR95","unstructured":"Xu, Z., Wong, K. K., & Zhao, H. (2023). Insightmapper: A closer look at inner-instance information for vectorized high-definition mapping. arXiv preprint arXiv:2308.08543."},{"issue":"10","key":"2235_CR96","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., & Li, B. (2018). Second: Sparsely embedded convolutional detection. Sensors, 18(10), 3337.","journal-title":"Sensors"},{"key":"2235_CR97","doi-asserted-by":"crossref","unstructured":"Yuan, T., Liu, Y., Wang, Y., Wang, Y., & Zhao, H. (2024). Streammapnet: Streaming mapping network for vectorized online hd map construction. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 7356\u20137365.","DOI":"10.1109\/WACV57701.2024.00719"},{"key":"2235_CR98","doi-asserted-by":"crossref","unstructured":"Yuan, T., Mao, Y., Yang, J., Liu, Y., Wang, Y., & Zhao, H. (2024). Presight: Enhancing autonomous vehicle perception with city-scale nerf priors. arXiv preprint arXiv:2403.09079","DOI":"10.1007\/978-3-031-72980-5_19"},{"key":"2235_CR99","doi-asserted-by":"crossref","unstructured":"Zhang, J., & Singh, S. (2014). LOAM: lidar odometry and mapping in real-time. In: Robotics: Science and Systems X, University of California.","DOI":"10.15607\/RSS.2014.X.007"},{"key":"2235_CR100","unstructured":"Zhang, H., Li, F., Liu, S., Zhang, L., Su, H., Zhu, J., Ni, L., & Shum, H.-Y. (2023). DINO: DETR with improved denoising anchor boxes for end-to-end object detection. In: The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=3mRwyG5one"},{"key":"2235_CR101","unstructured":"Zhang, G., Lin, J., Wu, S., Song, Y., Luo, Z., Xue, Y., Lu, S., & Wang, Z. (2023). Online map vectorization for autonomous driving: A rasterization perspective. arXiv preprint arXiv:2306.10502"},{"key":"2235_CR102","doi-asserted-by":"crossref","unstructured":"Zhang, T., Wei, S., & Ji, S. (2022). E2ec: An end-to-end contour-based method for high-quality high-speed instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4443\u20134452.","DOI":"10.1109\/CVPR52688.2022.00440"},{"key":"2235_CR103","unstructured":"Zhang, Y., Zhu, Z., Zheng, W., Huang, J., Huang, G., Zhou, J., & Lu, J. (2022). Beverse: Unified perception and prediction in birds-eye-view for vision-centric autonomous driving. arXiv preprint arXiv:2205.09743"},{"key":"2235_CR104","doi-asserted-by":"crossref","unstructured":"Zhou, B., & Kr\u00e4henb\u00fchl, P. (2022). Cross-view transformers for real-time map-view semantic segmentation. In: CVPR.","DOI":"10.1109\/CVPR52688.2022.01339"},{"key":"2235_CR105","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2021). Deformable DETR: deformable transformers for end-to-end object detection. In: ICLR."},{"key":"2235_CR106","doi-asserted-by":"crossref","unstructured":"Zhu, C., Zhang, X., Li, Y., Qiu, L., Han, K., & Han, X. (2022). Sharpcontour: A contour-based boundary refinement approach for efficient and accurate instance segmentation. In: CVPR.","DOI":"10.1109\/CVPR52688.2022.00435"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02235-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02235-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02235-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T10:06:32Z","timestamp":1740391592000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02235-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,6]]},"references-count":106,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["2235"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02235-z","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,6]]},"assertion":[{"value":"26 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 September 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 October 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}