{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T23:20:24Z","timestamp":1772839224825,"version":"3.50.1"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,2,9]],"date-time":"2025-02-09T00:00:00Z","timestamp":1739059200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,9]],"date-time":"2025-02-09T00:00:00Z","timestamp":1739059200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02351-4","type":"journal-article","created":{"date-parts":[[2025,2,9]],"date-time":"2025-02-09T06:38:56Z","timestamp":1739083136000},"page":"3877-3890","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["LiDAR-guided Geometric Pretraining for Vision-Centric 3D Object Detection"],"prefix":"10.1007","volume":"133","author":[{"given":"Linyan","family":"Huang","sequence":"first","affiliation":[]},{"given":"Huijie","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jia","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Shengchuan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Liujuan","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Junchi","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Hongyang","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,9]]},"reference":[{"key":"2351_CR1","doi-asserted-by":"crossref","unstructured":"Bai, X., Hu, Z., Zhu, X., et\u00a0al. (2022). TransFusion: Robust lidar-camera fusion for 3d object detection with transformers. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"2351_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., Bankiti, V., Lang, A. H., et\u00a0al. (2020). nuScenes: A multimodal dataset for autonomous driving. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"2351_CR3","unstructured":"Chen, G., Choi, W., Yu, X., et\u00a0al. (2017). Learning efficient object detection models with knowledge distillation. Advances in Neural Information Processing Systems 30"},{"key":"2351_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L., Sima, C., Li, Y., et\u00a0al. (2022a). PersFormer: 3d lane detection via perspective transformer and the openlane benchmark. arXiv preprint arXiv:2203.11089","DOI":"10.1007\/978-3-031-19839-7_32"},{"key":"2351_CR5","unstructured":"Chen, Z., Li, Z., Zhang, S., et\u00a0al. (2022b). Bevdistill: Cross-modal bev distillation for multi-view 3d object detection. arXiv preprint arXiv:2211.09386"},{"key":"2351_CR6","doi-asserted-by":"crossref","unstructured":"Cho, H., Choi, J., Baek, G., et\u00a0al. (2022). itKD: Interchange transfer-based knowledge distillation for 3d object detection. arXiv preprint arXiv:2205.15531","DOI":"10.1109\/CVPR52729.2023.01301"},{"key":"2351_CR7","unstructured":"Chong, Z., Ma, X., Zhang, H., et\u00a0al. (2022). MonoDistill: Learning spatial features for monocular 3d object detection. arXiv preprint arXiv:2201.10830"},{"key":"2351_CR8","doi-asserted-by":"crossref","unstructured":"Dai, X., Jiang, Z., Wu, Z., et\u00a0al. (2021). General instance distillation for object detection. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR46437.2021.00775"},{"key":"2351_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., et\u00a0al. (2009). ImageNet: A large-scale hierarchical image database. In IEEE Conference on Computer Vision and Pattern Recognition, IEEE","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2351_CR10","doi-asserted-by":"crossref","unstructured":"Guo, X., Shi, S., Wang, X., et\u00a0al. (2021). LIGA-Stereo: Learning lidar geometry aware representations for stereo-based 3d detector. In IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV48922.2021.00314"},{"key":"2351_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, S., Hoffman, J., Malik, J. (2016). Cross modal distillation for supervision transfer. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2016.309"},{"key":"2351_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et\u00a0al. (2016). Deep residual learning for image recognition. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2016.90"},{"key":"2351_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., et\u00a0al. (2017). Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision, (pp. 2961\u20132969)","DOI":"10.1109\/ICCV.2017.322"},{"key":"2351_CR14","unstructured":"Hinton, G., Vinyals, O., Dean, J. (2015). Distilling the knowledge in a neural network (2015). arXiv preprint arXiv:1503.02531 2"},{"key":"2351_CR15","doi-asserted-by":"crossref","unstructured":"Hou, J., Xie, S., Graham, B., et\u00a0al. (2021). Pri3d: Can 3d priors help 2d representation learning? In IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV48922.2021.00564"},{"key":"2351_CR16","doi-asserted-by":"crossref","unstructured":"Hu, S., Chen, L., Wu, P., et\u00a0al. (2022a). St-p3: End-to-end vision-based autonomous driving via spatial-temporal feature learning. In European Conference on Computer Vision, Springer, (pp 533\u2013549)","DOI":"10.1007\/978-3-031-19839-7_31"},{"key":"2351_CR17","unstructured":"Hu, Y., Yang, J., Chen, L., et\u00a0al. (2022b). Goal-oriented autonomous driving. arXiv preprint arXiv:2212.10156"},{"key":"2351_CR18","unstructured":"Huang, J., & Huang, G. (2022). BEVDet4D: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054"},{"key":"2351_CR19","unstructured":"Huang, J., Huang, G., Zhu, Z., et\u00a0al. (2021). BEVDet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790"},{"key":"2351_CR20","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Zhang, L., Miao, Z., et\u00a0al. (2022). Polarformer: Multi-camera 3d object detection with polar transformers. arXiv preprint arXiv:2206.15398","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"2351_CR21","first-page":"16468","volume":"34","author":"Z Kang","year":"2021","unstructured":"Kang, Z., Zhang, P., Zhang, X., et al. (2021). Instance-conditional knowledge distillation for object detection. Advances in Neural Information Processing Systems, 34, 16468\u201316480.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2351_CR22","doi-asserted-by":"crossref","unstructured":"Lang, A. H., Vora, S., Caesar, H., et\u00a0al. (2019). PointPillars: Fast encoders for object detection from point clouds. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2019.01298"},{"key":"2351_CR23","doi-asserted-by":"crossref","unstructured":"Lee, Y., Hwang, Jw, Lee, S., et\u00a0al. (2019). An energy and gpu-computation efficient backbone network for real-time object detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","DOI":"10.1109\/CVPRW.2019.00103"},{"key":"2351_CR24","unstructured":"Li, H., Sima, C., Dai, J., et\u00a0al. (2022a). Delving into the devils of bird\u2019s-eye-view perception: A review, evaluation and recipe. arXiv preprint arXiv:2209.05324"},{"key":"2351_CR25","unstructured":"Li, J., Lu, M., Liu, J., et\u00a0al. (2022b). BEV-LGKD: A unified lidar-guided knowledge distillation framework for BEV 3d object detection. arXiv preprint arXiv:2212.00623"},{"key":"2351_CR26","unstructured":"Li, Y., Chen, Y., Qi, X., et\u00a0al. (2022c). Unifying voxel-based representation with transformer for 3d object detection. arXiv preprint arXiv:2206.00630"},{"key":"2351_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., Ge, Z., Yu, G., et\u00a0al. (2022d). BEVDepth: Acquisition of reliable depth for multi-view 3d object detection. arXiv preprint arXiv:2206.10092","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"2351_CR28","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, W., Li, H., et\u00a0al. (2022e). BEVFormer: Learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. arXiv preprint arXiv:2203.17270","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"2351_CR29","doi-asserted-by":"crossref","unstructured":"Li, Z., Yu, Z., Wang, W., et\u00a0al. (2023). Fb-bev: Bev representation from forward-backward view transformations. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 6919\u20136928)","DOI":"10.1109\/ICCV51070.2023.00637"},{"key":"2351_CR30","unstructured":"Liang, T., Xie, H., Yu, K., et\u00a0al. (2022). BEVFusion: A simple and robust lidar-camera fusion framework. arXiv preprint arXiv:2205.13790"},{"key":"2351_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, Y., Wang, S., et\u00a0al. (2020). CBNet: A novel composite backbone network architecture for object detection. In AAAI Conference on Artificial Intelligence","DOI":"10.1609\/aaai.v34i07.6834"},{"key":"2351_CR32","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, T., Zhang, X., et\u00a0al. (2022a). PETR: Position embedding transformation for multi-view 3d object detection. arXiv preprint arXiv:2203.05625","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"2351_CR33","doi-asserted-by":"crossref","unstructured":"Liu, Y., Yan, J., Jia, F., et\u00a0al. (2022b). PETRv2: A unified framework for 3d perception from multi-camera images. arXiv preprint arXiv:2206.01256","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"2351_CR34","unstructured":"Liu, Y. C., Huang, Y. K., Chiang, H. Y., et\u00a0al. (2021). Learning from 2d: Contrastive pixel-to-point knowledge transfer for 3d pretraining. arXiv preprint arXiv:2104.04687"},{"key":"2351_CR35","doi-asserted-by":"crossref","unstructured":"Park, D., Ambrus, R., Guizilini, V., et\u00a0al. (2021). Is pseudo-lidar needed for monocular 3d object detection? In IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV48922.2021.00313"},{"key":"2351_CR36","unstructured":"Park, J., Xu, C., Yang, S., et\u00a0al. (2022). Time will tell: New outlooks and a baseline for temporal multi-view 3d object detection. arXiv preprint arXiv:2210.02443"},{"key":"2351_CR37","doi-asserted-by":"crossref","unstructured":"Peng, L., Liu, F., Yu, Z., et\u00a0al. (2022). Lidar point cloud guided monocular 3d object detection. In European Conference on Computer Vision, Springer, (pp. 123\u2013139)","DOI":"10.1007\/978-3-031-19769-7_8"},{"key":"2351_CR38","doi-asserted-by":"crossref","unstructured":"Philion, J., & Fidler, S. (2020). Lift, splat, shoot: Encoding images from arbitrary camera rigs by implicitly unprojecting to 3d. In European Conference on Computer Vision, Springer","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"2351_CR39","unstructured":"Romero, A., Ballas, N., Kahou, S. E., et\u00a0al. (2014). Fitnets: Hints for thin deep nets. arXiv preprint arXiv:1412.6550"},{"key":"2351_CR40","doi-asserted-by":"crossref","unstructured":"Sautier, C., Puy, G., Gidaris, S., et\u00a0al. (2022). Image-to-lidar self-supervised distillation for autonomous driving data. In: IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR52688.2022.00966"},{"key":"2351_CR41","doi-asserted-by":"crossref","unstructured":"Shu, C., Liu, Y., Gao, J., et\u00a0al. (2021). Channel-wise knowledge distillation for dense prediction. In Proceedings of the IEEE International Conference on Computer Vision, (pp. 5311\u20135320)","DOI":"10.1109\/ICCV48922.2021.00526"},{"key":"2351_CR42","doi-asserted-by":"crossref","unstructured":"Sirko-Galouchenko, S., Boulch, A., Gidaris, S., et\u00a0al. (2024). Occfeat: Self-supervised occupancy feature prediction for pretraining bev segmentation networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 4493\u20134503)","DOI":"10.1109\/CVPRW63382.2024.00452"},{"key":"2351_CR43","doi-asserted-by":"crossref","unstructured":"Sun, P., Kretzschmar, H., Dotiwalla, X., et\u00a0al. (2020). Scalability in perception for autonomous driving: Waymo open dataset. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"2351_CR44","doi-asserted-by":"crossref","unstructured":"Tellex, S., Kollar, T., Dickerson, S., et\u00a0al. (2011). Understanding natural language commands for robotic navigation and mobile manipulation. In AAAI Conference on Artificial Intelligence, (pp 1507\u20131514)","DOI":"10.1609\/aaai.v25i1.7979"},{"key":"2351_CR45","doi-asserted-by":"crossref","unstructured":"Wang, T., Yuan, L., Zhang, X., et\u00a0al. (2019). Distilling object detectors with fine-grained feature imitation. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2019.00507"},{"key":"2351_CR46","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhu, X., Pang, J., et\u00a0al. (2021). FCOS3D: Fully convolutional one-stage monocular 3d object detection. In IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"2351_CR47","unstructured":"Wang, Y., Guizilini, V. C., Zhang, T., et\u00a0al. (2022a). DETR3D: 3d object detection from multi-view images via 3d-to-2d queries. In Conference on Robot Learning, PMLR"},{"key":"2351_CR48","unstructured":"Wang, Z., Min, C., Ge, Z., et\u00a0al. (2022b). STS: Surround-view temporal stereo for multi-view 3d detection. arXiv preprint arXiv:2208.10145"},{"key":"2351_CR49","first-page":"6119","volume":"35","author":"P Wu","year":"2022","unstructured":"Wu, P., Jia, X., Chen, L., et al. (2022). Trajectory-guided control prediction for end-to-end autonomous driving: A simple yet strong baseline. Advances in Neural Information Processing Systems, 35, 6119\u20136132.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2351_CR50","unstructured":"Wu, P., Chen, L., Li, H., et\u00a0al. (2023). Policy pre-training for autonomous driving via self-supervised geometric modeling. In The Eleventh International Conference on Learning Representations"},{"key":"2351_CR51","doi-asserted-by":"crossref","unstructured":"Xie, S., Gu, J., Guo, D., et\u00a0al. (2020). Pointcontrast: Unsupervised pre-training for 3d point cloud understanding. In European Conference on Computer Vision, Springer","DOI":"10.1007\/978-3-030-58580-8_34"},{"issue":"10","key":"2351_CR52","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., & Li, B. (2018). SECOND: Sparsely embedded convolutional detection. Sensors, 18(10), 3337.","journal-title":"Sensors"},{"key":"2351_CR53","unstructured":"Yang, J., Shi, S., Ding, R., et\u00a0al. (2022a). Towards efficient 3d object detection with knowledge distillation. arXiv preprint arXiv:2205.15156"},{"key":"2351_CR54","doi-asserted-by":"crossref","unstructured":"Yang, Z., Li, Z., Jiang, X., et\u00a0al. (2022b). Focal and global knowledge distillation for detectors. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR52688.2022.00460"},{"key":"2351_CR55","doi-asserted-by":"crossref","unstructured":"Yang, Z., Chen, L., Sun, Y., et\u00a0al. (2024). Visual point cloud forecasting enables scalable autonomous driving. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 14673\u201314684)","DOI":"10.1109\/CVPR52733.2024.01390"},{"key":"2351_CR56","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., Krahenbuhl, P. (2021). Center-based 3d object detection and tracking. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"2351_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, L., Dong, R., Tai, H. S., et\u00a0al. (2022a). PointDistiller: Structured knowledge distillation towards efficient and compact 3d detection. arXiv preprint arXiv:2205.11098","DOI":"10.1109\/CVPR52729.2023.02087"},{"key":"2351_CR58","unstructured":"Zhang, Y., Zhu, Z., Zheng, W., et\u00a0al. (2022b). BEVerse: Unified perception and prediction in birds-eye-view for vision-centric autonomous driving. arXiv preprint arXiv:2205.09743"},{"key":"2351_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Girdhar, R., Joulin, A., et\u00a0al. (2021). Self-supervised pretraining of 3d features on any point-cloud. In IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV48922.2021.01009"},{"key":"2351_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, Y., & Tuzel, O. (2018). VoxelNet: End-to-end learning for point cloud based 3d object detection. In IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2018.00472"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02351-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02351-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02351-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T06:05:11Z","timestamp":1749276311000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02351-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,9]]},"references-count":60,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2351"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02351-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,9]]},"assertion":[{"value":"13 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}