{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,17]],"date-time":"2025-12-17T23:25:35Z","timestamp":1766013935011,"version":"3.48.0"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"17","license":[{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202142"],"award-info":[{"award-number":["62202142"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11760-025-05024-4","type":"journal-article","created":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T20:33:03Z","timestamp":1764966783000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MSGI3D: Multimodal semantic-geometric integration for 3D object detection in cluttered scenes"],"prefix":"10.1007","volume":"19","author":[{"given":"Shuang","family":"Wang","sequence":"first","affiliation":[]},{"given":"Kunpeng","family":"Bi","sequence":"additional","affiliation":[]},{"given":"Xiangyang","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Miaohui","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,5]]},"reference":[{"key":"5024_CR1","doi-asserted-by":"crossref","unstructured":"Chen, L., Wu, P., Chitta, K., et al.: End-to-end autonomous driving: challenges and frontiers. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3435937"},{"key":"5024_CR2","doi-asserted-by":"crossref","unstructured":"Xia, Z., Li, J., Lin, Z., et al.: Openad: open-world autonomous driving benchmark for 3d object detection. arXiv preprint arXiv:2411.17761 (2024)","DOI":"10.32388\/J2781I"},{"key":"5024_CR3","doi-asserted-by":"crossref","unstructured":"Charatan, D., Li, S.L., Tagliasacchi, A., et al.: Pixelsplat: 3d gaussian splats from image pairs for scalable generalizable 3d reconstruction. In: Proceedings o f the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19457\u201319467 (2024)","DOI":"10.1109\/CVPR52733.2024.01840"},{"issue":"1","key":"5024_CR4","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., et al.: 3d convolutional neural networks for human action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35(1), 221\u2013231 (2013). https:\/\/doi.org\/10.1109\/TPAMI.2012.59","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"6","key":"5024_CR5","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., et al.: Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5024_CR6","doi-asserted-by":"crossref","unstructured":"Qi, C.R., Chen, X., Litany, O., et al.: Imvotenet: boosting 3d object detection in point clouds with image votes. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4404\u20134413 (2020)","DOI":"10.1109\/CVPR42600.2020.00446"},{"key":"5024_CR7","doi-asserted-by":"crossref","unstructured":"Rukhovich, D., Vorontsova, A., Konushin, A.: Tr3d: towards real-time indoor 3d object detection. In: 2023 IEEE International Conference on Image Processing (ICIP), IEEE, pp. 281\u2013285. (2023)","DOI":"10.1109\/ICIP49359.2023.10222644"},{"key":"5024_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102591","volume":"112","author":"X Jiang","year":"2024","unstructured":"Jiang, X., Wang, D., Bi, K., et al.: Mshp3d: multi-stage cross-modal fusion based on hybrid perception for indoor 3d object detection. Inf. Fusion 112, 102591 (2024)","journal-title":"Inf. Fusion"},{"key":"5024_CR9","unstructured":"Yang, H., Shi, C., Chen, Y., et al.: Boosting 3d object detection via object-focused image fusion. arXiv (2022)"},{"key":"5024_CR10","doi-asserted-by":"crossref","unstructured":"Xie, Q., Lai, Y.K., Wu, J., et al.: Mlcvnet: multi-level context votenet for 3d object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10447\u201310456 (2020)","DOI":"10.1109\/CVPR42600.2020.01046"},{"key":"5024_CR11","doi-asserted-by":"crossref","unstructured":"Gwak, J., Choy, C., Savarese, S.: Generative sparse detection networks for 3d single-shot object detection. In: Computer Vision\u2013ECCV 2020: 16th European Conference, August 23\u201328, 2020, Proceedings, Part IV 16, pp. 297\u2013313. Springer, Glasgow, UK (2020)","DOI":"10.1007\/978-3-030-58548-8_18"},{"key":"5024_CR12","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"5024_CR13","doi-asserted-by":"crossref","unstructured":"Choy, C., Gwak, J., Savarese, S.: 4d spatio-temporal convnets: minkowski convolutional neural networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 3075\u20133084 (2019)","DOI":"10.1109\/CVPR.2019.00319"},{"key":"5024_CR14","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1016\/j.ress.2018.11.011","volume":"182","author":"X Li","year":"2019","unstructured":"Li, X., Zhang, W., Ding, Q.: Deep learning-based remaining useful life estimation of bearings using multi-scale feature extraction. Reliab. Eng. Syst. Safety 182, 208\u2013218 (2019)","journal-title":"Reliab. Eng. Syst. Safety"},{"key":"5024_CR15","doi-asserted-by":"crossref","unstructured":"Guo, C., Fan, B., Zhang, Q., et al.: Augfpn: improving multi-scale feature learning for object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 12595\u201312604 (2020)","DOI":"10.1109\/CVPR42600.2020.01261"},{"issue":"25","key":"5024_CR16","doi-asserted-by":"publisher","first-page":"18813","DOI":"10.1007\/s00521-023-08757-w","volume":"35","author":"I Pacal","year":"2023","unstructured":"Pacal, I., K\u0131l\u0131carslan, S.: Deep learning-based approaches for robust classification of cervical cancer. Neural Comput. Appl. 35(25), 18813\u201318828 (2023)","journal-title":"Neural Comput. Appl."},{"key":"5024_CR17","unstructured":"Qi, C.R., Su, H., Mo, K., et al. (2017) Pointnet: deep learning on point sets for 3d classification and segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 652\u2013660"},{"key":"5024_CR18","unstructured":"Qi, C.R., Yi, L., Su, H., et al.: Pointnet++: Deep hierarchical feature learning on point sets in a metric space. Adv. Neural Inf. Proc. Syst. 30, (2017)"},{"key":"5024_CR19","doi-asserted-by":"crossref","unstructured":"Ding, Z., Han, X., Niethammer, M.: Votenet: a deep learning label fusion method for multi-atlas segmentation. In: Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2019: 22nd International Conference, October 13\u201317, 2019, Proceedings, Part III 22, pp. 202\u2013210. Springer, Shenzhen, China (2019)","DOI":"10.1007\/978-3-030-32248-9_23"},{"key":"5024_CR20","doi-asserted-by":"crossref","unstructured":"Shi, S., Wang, X., Li, H.: Pointrcnn: 3d object proposal generation and detection from point cloud. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 770\u2013779 (2019)","DOI":"10.1109\/CVPR.2019.00086"},{"key":"5024_CR21","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: Voxelnet: end-to-end learning for point cloud based 3d object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4490\u20134499 (2018)","DOI":"10.1109\/CVPR.2018.00472"},{"issue":"10","key":"5024_CR22","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., Li, B.: Second: sparsely embedded convolutional detection. Sens. 18(10), 3337 (2018)","journal-title":"Sens."},{"key":"5024_CR23","doi-asserted-by":"crossref","unstructured":"Rukhovich, D., Vorontsova, A., Konushin, A.: Fcaf3d: fully convolutional anchor-free 3d object detection. In: European Conference on Computer Vision, pp. 477\u2013493. Springer (2022)","DOI":"10.1007\/978-3-031-20080-9_28"},{"key":"5024_CR24","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Hui, L., Shen, Y., et al.: Spgroup3d: superpoint grouping network for indoor 3d object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 7811\u20137819 (2024)","DOI":"10.1609\/aaai.v38i7.28616"},{"key":"5024_CR25","first-page":"1","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A.: Attention is all you need. Adv. Neural Inf. Proc. Syst. 30, 1 (2017)","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"5024_CR26","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3d object detection. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 2906\u20132917 (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"5024_CR27","unstructured":"Shen, Y., Geng, Z., Yuan, Y., et al.: V-detr: detr with vertex relative position encoding for 3d object detection. arXiv preprint arXiv:2308.04409 (2023)"},{"issue":"9","key":"5024_CR28","doi-asserted-by":"publisher","first-page":"15824","DOI":"10.1109\/TITS.2022.3145588","volume":"23","author":"K Peng","year":"2022","unstructured":"Peng, K., Fei, J., Yang, K., et al.: Mass: multi-attentional semantic segmentation of lidar data for dense top-view understanding. IEEE Trans. Intell. Transp. Syst. 23(9), 15824\u201315840 (2022)","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"5024_CR29","doi-asserted-by":"crossref","unstructured":"Fei, J., Peng, K., Heidenreich, P., et al.: Pillarsegnet: pillar-based semantic grid map estimation using sparse lidar data. In: 2021 IEEE intelligent vehicles symposium (IV), IEEE, pp. 838\u2013844 (2021)","DOI":"10.1109\/IV48863.2021.9575694"},{"key":"5024_CR30","doi-asserted-by":"crossref","unstructured":"Lin, T.: Focal loss for dense object detection. arXiv preprint arXiv:1708.02002 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"5024_CR31","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., et al.: End-to-end object detection with transformers. In: European conference on computer vision, pp. 213\u2013229. Springer (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"5024_CR32","doi-asserted-by":"publisher","first-page":"1066","DOI":"10.1016\/j.procs.2022.01.135","volume":"199","author":"P Jiang","year":"2022","unstructured":"Jiang, P., Ergu, D., Liu, F., et al.: A review of yolo algorithm developments. Procedia Comput. Sci. 199, 1066\u20131073 (2022)","journal-title":"Procedia Comput. Sci."},{"key":"5024_CR33","doi-asserted-by":"crossref","unstructured":"Chen, X., Ma, H., Wan, J., et al.: Multi-view 3d object detection network for autonomous driving. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 1907\u20131915 (2017)","DOI":"10.1109\/CVPR.2017.691"},{"key":"5024_CR34","doi-asserted-by":"crossref","unstructured":"Ku, J., Mozifian, M., Lee, J., et al.: Joint 3d proposal generation and object detection from view aggregation. In: 2018 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), IEEE, pp. 1\u20138 (2018)","DOI":"10.1109\/IROS.2018.8594049"},{"key":"5024_CR35","doi-asserted-by":"crossref","unstructured":"Qi, C.R., Liu, W., Wu, C., et al.: Frustum pointnets for 3d object detection from rgb-d data. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 918\u2013927 (2018)","DOI":"10.1109\/CVPR.2018.00102"},{"key":"5024_CR36","doi-asserted-by":"crossref","unstructured":"Chen, A., Zhang, K., Zhang, R., et al.: Pimae: point cloud and image interactive masked autoencoders for 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5291\u20135301 (2023)","DOI":"10.1109\/CVPR52729.2023.00512"},{"key":"5024_CR37","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S.P., Xiao, J.: Sun rgb-d: a rgb-d scene understanding benchmark suite. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 567\u2013576 (2015)","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"5024_CR38","doi-asserted-by":"crossref","unstructured":"Huang, T., Liu, Z., Chen, X., et al.: Epnet: enhancing point features with image semantics for 3d object detection. In: Computer Vision\u2013ECCV 2020: 16th European Conference, August 23\u201328, 2020, Proceedings, Part XV 16, pp. 35\u201352. Springer, Glasgow, UK (2020)","DOI":"10.1007\/978-3-030-58555-6_3"},{"key":"5024_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., et al.: Feature pyramid networks for object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"5024_CR40","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., et al.: Scannet: richly-annotated 3d reconstructions of indoor scenes. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5828\u20135839 (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"5024_CR41","doi-asserted-by":"crossref","unstructured":"Armeni, I., Sener, O., Zamir, A.R., et al.: 3d semantic parsing of large-scale indoor spaces. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 1534\u20131543 (2016)","DOI":"10.1109\/CVPR.2016.170"},{"key":"5024_CR42","unstructured":"Contributors, M.: Mmdetection3d: openmmlab next-generation platform for general 3d object detection (2020)"},{"key":"5024_CR43","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, Z., Cao, Y., et al.: Group-free 3d object detection via transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2949\u20132958 (2021)","DOI":"10.1109\/ICCV48922.2021.00294"},{"issue":"7","key":"5024_CR44","first-page":"8324","volume":"45","author":"Z Liu","year":"2022","unstructured":"Liu, Z., Huang, T., Li, B., et al.: Epnet++: cascade bi-directional fusion for multi-modal 3d object detection. IEEE Trans. Pattern Anal. Mach. Intell. 45(7), 8324\u20138341 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5024_CR45","first-page":"29975","volume":"35","author":"H Wang","year":"2022","unstructured":"Wang, H., Ding, L., Dong, S., et al.: Cagroup3d: class-aware grouping for 3d object detection on point clouds. Adv. Neural. Inf. Process. Syst. 35, 29975\u201329988 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5024_CR46","first-page":"39876","volume":"36","author":"Z Wang","year":"2024","unstructured":"Wang, Z., Li, Y.L., Chen, X., et al.: Uni3detr: unified 3d detection transformer. Adv. Neural Inf. Proc. Syst. 36, 39876\u201339896 (2024)","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"5024_CR47","doi-asserted-by":"crossref","unstructured":"Li, Z., Yu, H., Yang, Z., et al.: Ashapeformer: semantics-guided object-level active shape encoding for 3d object detection via transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1012\u20131021 (2023)","DOI":"10.1109\/CVPR52729.2023.00104"},{"issue":"5","key":"5024_CR48","doi-asserted-by":"publisher","first-page":"2981","DOI":"10.1109\/TPAMI.2023.3336874","volume":"46","author":"Y Zheng","year":"2023","unstructured":"Zheng, Y., Duan, Y., Li, Z., et al.: Learning dynamic scene-conditioned 3d object detectors. IEEE Trans. Pattern Anal. Mach. Intell. 46(5), 2981\u20132996 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5024_CR49","doi-asserted-by":"crossref","unstructured":"Fan, G., Qi, Z., Shi, W., et al.: Point-gcc: universal self-supervised 3d scene pre-training via geometry-color contrast. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp 4709\u20134718 (2024)","DOI":"10.1145\/3664647.3681343"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-05024-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-025-05024-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-05024-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,17]],"date-time":"2025-12-17T23:20:46Z","timestamp":1766013646000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-025-05024-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":49,"journal-issue":{"issue":"17","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["5024"],"URL":"https:\/\/doi.org\/10.1007\/s11760-025-05024-4","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"type":"print","value":"1863-1703"},{"type":"electronic","value":"1863-1711"}],"subject":[],"published":{"date-parts":[[2025,12]]},"assertion":[{"value":"7 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 November 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 December 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"1423"}}