{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T23:20:24Z","timestamp":1772839224750,"version":"3.50.1"},"reference-count":80,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,3,20]],"date-time":"2025-03-20T00:00:00Z","timestamp":1742428800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,20]],"date-time":"2025-03-20T00:00:00Z","timestamp":1742428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02404-8","type":"journal-article","created":{"date-parts":[[2025,3,21]],"date-time":"2025-03-21T20:45:52Z","timestamp":1742589952000},"page":"4817-4836","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["CT3D++: Improving 3D Object Detection with Keypoint-Induced Channel-wise Transformer"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2405-9325","authenticated-orcid":false,"given":"Hualian","family":"Sheng","sequence":"first","affiliation":[]},{"given":"Sijia","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Na","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Bing","family":"Deng","sequence":"additional","affiliation":[]},{"given":"Qiao","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Min-Jian","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Jieping","family":"Ye","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,20]]},"reference":[{"key":"2404_CR1","first-page":"21665","volume":"33","author":"V Apoorv","year":"2020","unstructured":"Apoorv, V., Katharopoulos, A., & Fleuret, F. (2020). Fast transformers with clustered attention. Advances in Neural Information Processing Systems, 33, 21665\u201321674.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"11","key":"2404_CR2","doi-asserted-by":"publisher","first-page":"3174","DOI":"10.1109\/TCSVT.2017.2740321","volume":"28","author":"Y Cao","year":"2017","unstructured":"Cao, Y., Zifeng, W., & Shen, C. (2017). Estimating depth from monocular images as classification using deep fully convolutional residual networks. IEEE Transactions on Circuits and Systems for Video Technology (TCSVT), 28(11), 3174\u20133182.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)"},{"key":"2404_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N, Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In proceedings of the European conference on computer vision (pp. 213\u2013229). Springer.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2404_CR4","doi-asserted-by":"crossref","unstructured":"Chai, Y., Sun, P., Ngiam, J., Wang, W., Caine, B., Vasudevan, V., Zhang, X., & Anguelov, D.. (2021). To the point: Efficient 3d object detection in the range image with graph convolution kernels. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition pp. 16000\u201316009.","DOI":"10.1109\/CVPR46437.2021.01574"},{"key":"2404_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Y., Liu, J., Zhang, X., Qi, X., & Jia, J. (2023). Largekernel3d: Scaling up kernels in 3d sparse cnns. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 13488\u201313498).","DOI":"10.1109\/CVPR52729.2023.01296"},{"key":"2404_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Ma, H., Wan, J., Li, B., & Xia, T. (2017) Multi-view 3d object detection network for autonomous driving. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 1907\u20131915).","DOI":"10.1109\/CVPR.2017.691"},{"key":"2404_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Y., Tai, L., Sun, K., & Li, M. (2020). Monopair: Monocular 3d object detection using pairwise spatial relationships. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 12093\u201312102).","DOI":"10.1109\/CVPR42600.2020.01211"},{"issue":"3","key":"2404_CR8","doi-asserted-by":"publisher","first-page":"1328","DOI":"10.1109\/TCSVT.2021.3068834","volume":"32","author":"S Chen","year":"2021","unstructured":"Chen, S., Zhengdong, P., Fan, X., & Zou, B. (2021). Fixing defect of photometric loss for self-supervised monocular depth estimation. IEEE Transactions on Circuits and Systems for Video Technology (TCSVT), 32(3), 1328\u20131338.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)"},{"key":"2404_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Shi, S., Li, P., Zhou, W., Zhang, Y., & Li, H.. (2020). Voxel r-cnn: Towards high performance voxel-based 3d object detection. arXiv preprint arXiv:2012.15712.","DOI":"10.1609\/aaai.v35i2.16207"},{"key":"2404_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Shi, S., Li, P., Zhou, W., Zhang, Y., & Li, H.. (2021). Voxel r-cnn: Towards high performance voxel-based 3d object detection. Proceedings of the AAAI conference on artificial intelligence, 35(2), 1201\u20131209.","DOI":"10.1609\/aaai.v35i2.16207"},{"key":"2404_CR11","doi-asserted-by":"crossref","unstructured":"Ding, M., Huo, Y., Yi, H., Wang, Z., Shi, J., Lu, Z., & Luo, P. (2020). Learning depth-guided convolutions for monocular 3d object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops pp. 1000\u20131001.","DOI":"10.1109\/CVPRW50498.2020.00508"},{"key":"2404_CR12","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al, (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"2404_CR13","doi-asserted-by":"crossref","unstructured":"Fan, L., Xiong, X., Wang, F., Wang, N., & Zhang, Z. (2021). Rangedet: In defense of range view for lidar-based 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 2918\u20132927).","DOI":"10.1109\/ICCV48922.2021.00291"},{"issue":"11","key":"2404_CR14","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., & Urtasun, R. (2013). Vision meets robotics: The kitti dataset. The International Journal of Robotics Research, 32(11), 1231\u20131237.","journal-title":"The International Journal of Robotics Research"},{"issue":"2","key":"2404_CR15","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1007\/s41095-021-0229-5","volume":"7","author":"MH Guo","year":"2021","unstructured":"Guo, M. H., Cai, J. X., Liu, Z. N., Mu, T. J., Martin, R. R., & Hu, S. M. (2021). Pct: Point cloud transformer. Computational Visual Media, 7(2), 187\u2013199.","journal-title":"Computational Visual Media"},{"key":"2404_CR16","doi-asserted-by":"crossref","unstructured":"Hai, W., Wen, C., Li, W., Li, X., Yang, R., & Wang, C. (2023). Transformation-equivariant 3d object detection for autonomous driving. In Proceedings of the AAAI Conference on Artificial Intelligence,37, 2795\u20132802.","DOI":"10.1609\/aaai.v37i3.25380"},{"key":"2404_CR17","first-page":"5099","volume":"30","author":"C Hao","year":"2017","unstructured":"Hao, C., & Guibas, S. L. J. (2017). Pointnet++: Deep hierarchical feature learning on point sets in a metric space. Advances in Neural Information Processing Systems (NIPS), 30, 5099\u20135108.","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"2404_CR18","doi-asserted-by":"crossref","unstructured":"He, C., Li, R., Li, S., & Zhang, L. (2022). Voxel set transformer: A set-to-set approach to 3d object detection from point clouds. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 8417\u20138427).","DOI":"10.1109\/CVPR52688.2022.00823"},{"key":"2404_CR19","doi-asserted-by":"crossref","unstructured":"He, C., Zeng, H., Huang, J., Hua, X.-S., & Zhang, L. (2020). Structure aware single-stage 3d object detection from point cloud. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 11873\u201311882).","DOI":"10.1109\/CVPR42600.2020.01189"},{"key":"2404_CR20","doi-asserted-by":"publisher","first-page":"7126","DOI":"10.1109\/JSEN.2024.3350770","volume":"24","author":"HA Hoang","year":"2024","unstructured":"Hoang, H. A., Bui, D. C., & Yoo, M. (2024). Tsstdet: Transformation-based 3-d object detection via a spatial shape transformer. IEEE Sensors Journal, 24, 7126\u20137139.","journal-title":"IEEE Sensors Journal"},{"key":"2404_CR21","unstructured":"Huang, J., & Huang, G. (2022). Bevdet4d: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054."},{"key":"2404_CR22","unstructured":"Huang, J., Huang, G., Zhu, Z., Ye, Y., & Du, D. (2021). Bevdet: High-performance multi-camera 3d object detection in bird-eye-view. arXiv preprint arXiv:2112.11790."},{"key":"2404_CR23","doi-asserted-by":"crossref","unstructured":"Ku J., Mozifian, M., Lee, J., Harakeh, A., & Waslander, S. L. (2018). Joint 3d proposal generation and object detection from view aggregation. In 2018 IEEE\/RSJ International conference on intelligent robots and systems (IROS) (pp. 1\u20138).","DOI":"10.1109\/IROS.2018.8594049"},{"key":"2404_CR24","doi-asserted-by":"crossref","unstructured":"Lang, A.H, V., Sourabh, C., Holger, Z., Lubing, Y., Jiong, & Beijbom, O. (2019). Pointpillars: Fast encoders for object detection from point clouds. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 12697\u201312705).","DOI":"10.1109\/CVPR.2019.01298"},{"key":"2404_CR25","doi-asserted-by":"crossref","unstructured":"Law, Hei, & Deng, Jia. (2018). Cornernet: Detecting objects as paired keypoints. In Proceedings of the European Conference on Computer Vision (ECCV), pages 734\u2013750.","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"2404_CR26","unstructured":"Lee, J., Lee, Y., Kim, J., Kosiorek, A., Choi, S., & Teh, Y.\u00a0W. (2019). Set transformer: A framework for attention-based permutation-invariant neural networks. In International conference on machine learning (pp. 3744\u20133753). PMLR."},{"key":"2404_CR27","doi-asserted-by":"crossref","unstructured":"Li, B. (2017). 3d fully convolutional network for vehicle detection in point cloud. In 2017 IEEE\/RSJ international conference on intelligent robots and systems (pp. 1513\u20131518). IEEE.","DOI":"10.1109\/IROS.2017.8205955"},{"key":"2404_CR28","doi-asserted-by":"crossref","unstructured":"Li, Y., Ge, Z., Guanyi, Y., Yang, J., Wang, Z., Shi, Y., Sun, J., & Li, Z. (2023). Bevdepth: Acquisition of reliable depth for multi-view 3d object detection. Proceedings of the AAAI Conference on Artificial Intelligence,37(2), 1477\u20131485.","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"2404_CR29","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, F., & Wang, N. (2021). Lidar r-cnn: An efficient and universal 3d object detector. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 7546\u20137555).","DOI":"10.1109\/CVPR46437.2021.00746"},{"key":"2404_CR30","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, W., Li, H., Xie, E., Sima, C., Lu, T., Qiao, Y., & Dai, J. (2022). Bevformer: Learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In European conference on computer vision (pp. 1\u201318). Springer.","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"2404_CR31","doi-asserted-by":"crossref","unstructured":"Liang, Z., Zhang, Z., Zhang, M., Zhao, X., & Pu, S. (2021). Rangeioudet: Range image based real-time 3d object detector optimized by intersection over union. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 7140\u20137149).","DOI":"10.1109\/CVPR46437.2021.00706"},{"key":"2404_CR32","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin,Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021c). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2404_CR33","doi-asserted-by":"crossref","unstructured":"Liu, H., Teng, Y., Lu, T., Wang, H., & Wang, L. (2023a). Sparsebev: High-performance sparse 3d object detection from multi-camera videos. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 18580\u201318590).","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"2404_CR34","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, T., Zhang, X., & Sun, J. (2022). Petr: Position embedding transformation for multi-view 3d object detection. In European conference on computer vision (pp. 531\u2013548). Springer.","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"2404_CR35","doi-asserted-by":"crossref","unstructured":"Liu, Z., Wu, Z., & T\u00f3th, R. (2020). Smoke: Single-stage monocular 3d object detection via keypoint estimation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops (pp. 996\u2013997).","DOI":"10.1109\/CVPRW50498.2020.00506"},{"key":"2404_CR36","doi-asserted-by":"crossref","unstructured":"Liu, X., Xue, N., & Wu, T. (2021a). Learning auxiliary monocular contexts helps monocular 3d object detection. arXiv preprint arXiv:2112.04628.","DOI":"10.1609\/aaai.v36i2.20074"},{"key":"2404_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Y., Yan, J., Jia, F., Li, S., Gao, A., Wang, T., & Zhang, X. (2023b). Petrv2: A unified framework for 3d perception from multi-camera images. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 3262\u20133272).","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"2404_CR38","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, Z., Cao, Y., Hu, H., & Tong, X. (2021d). Group-free 3d object detection via transformers. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 2949\u20132958).","DOI":"10.1109\/ICCV48922.2021.00294"},{"issue":"2","key":"2404_CR39","doi-asserted-by":"publisher","first-page":"919","DOI":"10.1109\/LRA.2021.3052442","volume":"6","author":"Y Liu","year":"2021","unstructured":"Liu, Y., Yixuan, Y., & Liu, M. (2021). Ground-aware monocular 3d object detection for autonomous driving. IEEE Robotics and Automation Letters, 6(2), 919\u2013926.","journal-title":"IEEE Robotics and Automation Letters"},{"key":"2404_CR40","doi-asserted-by":"crossref","unstructured":"Lu, Y., Ma, X., Yang, L., Zhang, T., Liu, Y., Chu,Q., Yan, J., & Ouyang, W. (2021). Geometry uncertainty projection network for monocular 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (ICCV) (pp. 3111\u20133121).","DOI":"10.1109\/ICCV48922.2021.00310"},{"key":"2404_CR41","doi-asserted-by":"crossref","unstructured":"Ma, X., Liu, S., Xia, Z., Zhang, H., Zeng, X., & Ouyang, W. (2020). Rethinking pseudo-lidar representation. In European conference on computer vision (ECCV) (pp. 311\u2013327).","DOI":"10.1007\/978-3-030-58601-0_19"},{"key":"2404_CR42","doi-asserted-by":"crossref","unstructured":"Ma, X., Wang, Z., Li, H., Zhang, P., Ouyang, W., & Fan, X. (2019). Accurate monocular 3d object detection via color-embedded 3d reconstruction for autonomous driving. In Proceedings of the IEEE\/CVF international conference on computer vision (ICCV) (pp. 6851\u20136860).","DOI":"10.1109\/ICCV.2019.00695"},{"key":"2404_CR43","doi-asserted-by":"crossref","unstructured":"Mao, J., Xue, Y., Niu, M., Bai, H., Feng, J., Liang, X., Xu, H., & Xu, C. (2021). Voxel transformer for 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 3164\u20133173).","DOI":"10.1109\/ICCV48922.2021.00315"},{"key":"2404_CR44","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., & Joulin, A. (2021). An end-to-end transformer model for 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 2906\u20132917).","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"2404_CR45","unstructured":"OpenPCDet\u00a0Development Team. (2020). Openpcdet: An open-source toolbox for 3d object detection from point clouds. https:\/\/github.com\/open-mmlab\/OpenPCDet."},{"key":"2404_CR46","doi-asserted-by":"crossref","unstructured":"Pan, X., Xia, Z., Song, S., Li, L.\u00a0E., & Huang, G. (2020). 3d object detection with pointformer. arXiv preprint arXiv:2012.11409.","DOI":"10.1109\/CVPR46437.2021.00738"},{"key":"2404_CR47","doi-asserted-by":"crossref","unstructured":"Philion, Jonah, & Fidler, Sanja. (2020). Lift, splat, shoot: Encoding images from arbitrary camera rigs by implicitly unprojecting to 3d. In Computer vision\u2013ECCV 2020: 16th European conference. Glasgow August 23\u201328, 2020, proceedings, Part XIV 16 (pp. 194\u2013210). Springer.","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"2404_CR48","unstructured":"Qi, C. R., Su, H., Mo, K., & Guibas, Leonidas J. (2017a). Pointnet: Deep learning on point sets for 3d classification and segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 652\u2013660)."},{"key":"2404_CR49","doi-asserted-by":"crossref","unstructured":"Qiangeng, X., Zhong, Y., & Neumann, U. (2022). Behind the curtain: Learning occluded shapes for 3d object detection. Proceedings of the AAAI Conference on Artificial Intelligence,36(3), 2893\u20132901.","DOI":"10.1609\/aaai.v36i3.20194"},{"key":"2404_CR50","doi-asserted-by":"crossref","unstructured":"Reading, C., Harakeh, A., Chae, J., & Waslander, S. L. (2021). Categorical depth distribution network for monocular 3d object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 8555\u20138564).","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"2404_CR51","doi-asserted-by":"crossref","unstructured":"Sheng, H., Cai, S., Liu, Y., Deng, B., Huang, J., Hua, X.-S., & Zhao, M.-J. (2021). Improving 3d object detection with channel-wise transformer. In Proceedings of the IEEE\/CVF international conference on computer vision (ICCV) (pp. 2743\u20132752).","DOI":"10.1109\/ICCV48922.2021.00274"},{"key":"2404_CR52","doi-asserted-by":"crossref","unstructured":"Sheng, H., Cai, S., Zhao, N., Deng, B., Huang, J., Hua, X.-S., Zhao, M.-J., & Lee, G. H. (2022). Rethinking iou-based optimization for single-stage 3d object detection. In Proceedings of the European conference on computer vision (pp. 544\u2013561). Springer.","DOI":"10.1007\/978-3-031-20077-9_32"},{"key":"2404_CR53","doi-asserted-by":"publisher","first-page":"7591","DOI":"10.1109\/TCSVT.2023.3276518","volume":"33","author":"H Sheng","year":"2023","unstructured":"Sheng, H., Cai, S., Zhao, N., Deng, B., Zhao, M.-J., & Lee, G. H. (2023). Pdr: Progressive depth regularization for monocular 3d object detection. IEEE Transactions on Circuits and Systems for Video Technology, 33, 7591\u20137603.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2404_CR54","doi-asserted-by":"crossref","unstructured":"Shi, S., Guo, C., Jiang, L., Wang, Z., Shi, J., Wang, X., & Li, H. (2020a). Pv-rcnn: Point-voxel feature set abstraction for 3d object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 10529\u201310538).","DOI":"10.1109\/CVPR42600.2020.01054"},{"key":"2404_CR55","doi-asserted-by":"crossref","unstructured":"Shi, S., Wang, X., & Li, H. (2019). Pointrcnn: 3d object proposal generation and detection from point cloud. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 770\u2013779).","DOI":"10.1109\/CVPR.2019.00086"},{"key":"2404_CR56","doi-asserted-by":"crossref","unstructured":"Shi, X.,Ye, Q., Chen, X., Chen, C., Chen, Z., & Kim, T.-K. (2021). Geometry-based distance decomposition for monocular 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (ICCV) (pp. 15172\u201315181).","DOI":"10.1109\/ICCV48922.2021.01489"},{"issue":"2","key":"2404_CR57","doi-asserted-by":"publisher","first-page":"531","DOI":"10.1007\/s11263-022-01710-9","volume":"131","author":"S Shi","year":"2023","unstructured":"Shi, S., Jiang, L., Deng, J., Wang, Z., Guo, C., Shi, J., Wang, X., & Li, H. (2023). Pv-rcnn++: Point-voxel feature set abstraction with local vector representation for 3d object detection. International Journal of Computer Vision, 131(2), 531\u2013551.","journal-title":"International Journal of Computer Vision"},{"issue":"8","key":"2404_CR58","first-page":"2647","volume":"43","author":"S Shi","year":"2020","unstructured":"Shi, S., Wang, Z., Shi, J., Wang, X., & Li, H. (2020). From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43(8), 2647\u20132664.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2404_CR59","doi-asserted-by":"crossref","unstructured":"Song, S., & Xiao, J. (2016). Deep sliding shapes for amodal 3d object detection in rgb-d images. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 808\u2013816).","DOI":"10.1109\/CVPR.2016.94"},{"issue":"11","key":"2404_CR60","doi-asserted-by":"publisher","first-page":"4381","DOI":"10.1109\/TCSVT.2021.3049869","volume":"31","author":"M Song","year":"2021","unstructured":"Song, M., Lim, S., & Kim, W. (2021). Monocular depth estimation using laplacian pyramid-based depth residuals. IEEE Transactions on Circuits and Systems for Video Technology (TCSVT), 31(11), 4381\u20134393.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)"},{"key":"2404_CR61","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2021). Training data-efficient image transformers & distillation through attention. In International conference on machine learning (pp. 10347\u201310357). PMLR."},{"key":"2404_CR62","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. In Advances in neural information processing systems (NIPS) (pp. 5998\u20136008)."},{"key":"2404_CR63","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chao, W.-L., Garg, D., Hariharan, B., Campbell, M., & Weinberger, K. Q. (2019). Pseudo-lidar from visual depth estimation: Bridging the gap in 3d object detection for autonomous driving. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 8445\u20138453).","DOI":"10.1109\/CVPR.2019.00864"},{"key":"2404_CR64","unstructured":"Wang, Y., Guizilini, V.\u00a0C., Zhang, T., Wang, Y., Zhao, H., & Solomon, J. (2022). Detr3d: 3d object detection from multi-view images via 3d-to-2d queries. In Conference on robot learning (pp. 180\u2013191). PMLR."},{"key":"2404_CR65","unstructured":"Wang, S., Li, B. Z. , Khabsa, M., Fang, H., & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768."},{"key":"2404_CR66","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhu, X., Pang, J., & Lin, D. (2021b). Fcos3d: Fully convolutional one-stage monocular 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (ICCV) (pp. 913\u2013922).","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"2404_CR67","first-page":"13364","volume":"34","author":"L Wang","year":"2021","unstructured":"Wang, L., Zhang, L., Zhu, Y., Zhang, Z., He, T., Li, M., & Xue, X. (2021). Progressive coordinate transforms for monocular 3d object detection. Advances in Neural Information Processing Systems (NeurIPS), 34, 13364\u201313377.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"2404_CR68","doi-asserted-by":"crossref","unstructured":"Yang, C., Chen, Y., Tian, H., Tao, C., Zhu, X., Zhang, Z., Huang, G., Li, H., Qiao, Y., Lu, L., et\u00a0al. (2023). Bevformer v2: Adapting modern image backbones to bird\u2019s-eye-view recognition via perspective supervision. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 17830\u201317839).","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"2404_CR69","doi-asserted-by":"crossref","unstructured":"Yang, B., Luo, W., & Urtasun, R. (2018). Pixor: Real-time 3d object detection from point clouds. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 7652\u20137660).","DOI":"10.1109\/CVPR.2018.00798"},{"key":"2404_CR70","doi-asserted-by":"crossref","unstructured":"Yang, Z., Sun, Y., Liu, S., & Jia, J. (2020). 3dssd: Point-based 3d single stage object detector. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 11040\u201311048).","DOI":"10.1109\/CVPR42600.2020.01105"},{"key":"2404_CR71","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., & Li, B. (2018). Second: Sparsely embedded convolutional detection. Sensors, 18, 3337.","journal-title":"Sensors"},{"key":"2404_CR72","doi-asserted-by":"crossref","unstructured":"Yihan, H., Ding, Z., Ge, R., Shao, W., Huang, L., Li, K., & Liu, Q. (2022). Afdetv2: Rethinking the necessity of the second stage for object detection from point clouds. Proceedings of the AAAI conference on artificial intelligence,36, 969\u2013979.","DOI":"10.1609\/aaai.v36i1.19980"},{"key":"2404_CR73","doi-asserted-by":"crossref","unstructured":"Yin, T., Zhou, X., & Krahenbuhl, P. (2021). Center-based 3d object detection and tracking. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 11784\u201311793).","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"2404_CR74","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Hu, Q., Xu, G., Ma, Y., Wan, J., & Guo, Y. (2022). Not all points are equal: Learning highly efficient point-based detectors for 3d lidar point clouds. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 18953\u201318962).","DOI":"10.1109\/CVPR52688.2022.01838"},{"key":"2404_CR75","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Lu, J., & Zhou, J. (2021). Objects are different: Flexible monocular 3d object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 3289\u20133298).","DOI":"10.1109\/CVPR46437.2021.00330"},{"key":"2404_CR76","doi-asserted-by":"crossref","unstructured":"Zhang, R., Qiu, H., Wang, T., Guo, Z., Cui, Z., Qiao, Y., Li, H., & Gao, P. (2023). Monodetr: Depth-guided transformer for monocular 3d object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 9155\u20139166).","DOI":"10.1109\/ICCV51070.2023.00840"},{"key":"2404_CR77","doi-asserted-by":"crossref","unstructured":"Zhao, H., Jiang, L., Jia, J., Torr, P. H. S., & Koltun, V. (2021). Point transformer. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 16259\u201316268).","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"2404_CR78","doi-asserted-by":"crossref","unstructured":"Zheng, W., Tang, W., Chen, S., Jiang, L., & Fu, C-W. (2021). Cia-ssd: Confident iou-aware single-stage object detector from point cloud. Proceedings of the AAAI Conference on Artificial Intelligence,35(4), 3555\u20133562.","DOI":"10.1609\/aaai.v35i4.16470"},{"key":"2404_CR79","doi-asserted-by":"crossref","unstructured":"Zhou, Yin, & Tuzel, Oncel. (2018). Voxelnet: End-to-end learning for point cloud based 3d object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4490\u20134499).","DOI":"10.1109\/CVPR.2018.00472"},{"key":"2404_CR80","doi-asserted-by":"crossref","unstructured":"Zhou, X., Zhuo, J., & Krahenbuhl, P. (2019). Bottom-up object detection by grouping extreme and center points. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (pp. 850\u2013859).","DOI":"10.1109\/CVPR.2019.00094"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02404-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02404-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02404-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T06:00:17Z","timestamp":1749276017000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02404-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,20]]},"references-count":80,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2404"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02404-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,20]]},"assertion":[{"value":"16 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}