{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T00:01:51Z","timestamp":1780531311010,"version":"3.54.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T00:00:00Z","timestamp":1779321600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T00:00:00Z","timestamp":1779321600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62202142"],"award-info":[{"award-number":["62202142"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s11760-026-05440-0","type":"journal-article","created":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T18:19:26Z","timestamp":1779387566000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MIHP3D: Multi-level interaction and hierarchical perception for 3D object detection"],"prefix":"10.1007","volume":"20","author":[{"given":"Chengyi","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shengjie","family":"Ren","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linxian","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Miaohui","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,5,21]]},"reference":[{"key":"5440_CR1","doi-asserted-by":"crossref","unstructured":"Armeni, I., Sener, O., Zamir, A.R.: 3D semantic parsing of large-scale indoor spaces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1534\u20131543. (2016)","DOI":"10.1109\/CVPR.2016.170"},{"key":"5440_CR2","doi-asserted-by":"crossref","unstructured":"Choy, C., Gwak, J., Savarese, S.: 4D spatio-temporal convnets: Minkowski convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3075\u20133084. (2019)","DOI":"10.1109\/CVPR.2019.00319"},{"key":"5440_CR3","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M.: ScanNet: Richly-annotated 3D reconstructions of indoor scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5828\u20135839. (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"5440_CR4","doi-asserted-by":"crossref","unstructured":"Ding, Z., Han, X., Niethammer, M.: VoteNet: A deep learning label fusion method for multi-atlas segmentation. In: Proceedings of the International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI), pp. 202\u2013210. Springer (2019)","DOI":"10.1007\/978-3-030-32248-9_23"},{"key":"5440_CR5","doi-asserted-by":"crossref","unstructured":"Fan, G., Qi, Z., Shi, W.: Point-GCC: Universal self-supervised 3D scene pre-training via geometry-color contrast. In: Proceedings of the ACM International Conference on Multimedia (ACM MM), pp. 4709\u20134718. (2024)","DOI":"10.1145\/3664647.3681343"},{"key":"5440_CR6","doi-asserted-by":"crossref","unstructured":"Fei, J., Peng, K., Heidenreich, P.: PillarSegNet: Pillar-based semantic grid map estimation using sparse LiDAR data, (2021). (arXiv preprint)","DOI":"10.1109\/IV48863.2021.9575694"},{"key":"5440_CR7","doi-asserted-by":"crossref","unstructured":"Gwak, J., Choy, C., Savarese, S.: Generative sparse detection networks for 3D single-shot object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), Springer, pp 297\u2013313 (2020)","DOI":"10.1007\/978-3-030-58548-8_18"},{"key":"5440_CR8","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S.: Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778. (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"5440_CR9","doi-asserted-by":"crossref","unstructured":"Huang, T., Liu, Z., Chen, X., et al.: EPNet: Enhancing point features with image semantics for 3D object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), Springer, pp 35\u201352 (2020)","DOI":"10.1007\/978-3-030-58555-6_3"},{"key":"5440_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102591","volume":"112","author":"X Jiang","year":"2024","unstructured":"Jiang, X., Wang, D., Bi, K., et al.: MSHP3D: Multi-stage cross-modal fusion based on hybrid perception for indoor 3D object detection. Information Fusion 112, 102591 (2024)","journal-title":"Information Fusion"},{"key":"5440_CR11","unstructured":"Kong, W., Zeng, Z., Wen, D.: Exploring single domain generalization of LiDAR-based semantic segmentation under imperfect labels, (2025). (arXiv preprint)"},{"key":"5440_CR12","doi-asserted-by":"crossref","unstructured":"Li, J., Peng, K., Sun, Y.: Dense semantic bird-eye-view map generation from sparse LiDAR point clouds via distribution-aware feature fusion. In: Proceedings of the IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 4123\u20134129. IEEE (2025)","DOI":"10.1109\/IROS60139.2025.11246382"},{"key":"5440_CR13","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R.: Feature pyramid networks for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2117\u20132125. (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"5440_CR14","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, Z., Cao, Y.: Group-free 3D object detection via transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2949\u20132958. (2021)","DOI":"10.1109\/ICCV48922.2021.00294"},{"issue":"7","key":"5440_CR15","first-page":"8324","volume":"45","author":"Z Liu","year":"2022","unstructured":"Liu, Z., Huang, T., Li, B., et al.: EPNet++: Cascade bi-directional fusion for multi-modal 3D object detection. IEEE Trans. Pattern Anal. Mach. Intell. 45(7), 8324\u20138341 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5440_CR16","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3D object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2906\u20132917. (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"5440_CR17","doi-asserted-by":"crossref","unstructured":"Park, J., Weng, X., Man, Y.: Multi-modality task cascade for 3D object detection, (2021). (arXiv preprint)","DOI":"10.5244\/C.35.315"},{"issue":"9","key":"5440_CR18","doi-asserted-by":"publisher","first-page":"15824","DOI":"10.1109\/TITS.2022.3145588","volume":"23","author":"K Peng","year":"2022","unstructured":"Peng, K., Fei, J., Yang, K., et al.: MASS: Multi-attentional semantic segmentation of LiDAR data for dense top-view understanding. IEEE Trans. Intell. Transp. Syst. 23(9), 15824\u201315840 (2022)","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"5440_CR19","unstructured":"Qi, C.R., Su, H., Mo, K., et al.: PointNet: Deep learning on point sets for 3D classification and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 652\u2013660 (2017a)"},{"key":"5440_CR20","unstructured":"Qi, C.R., Yi, L., Su, H., et al.: PointNet++: Deep hierarchical feature learning on point sets in a metric space. Advances in Neural Information Processing Systems 30 (2017b)"},{"key":"5440_CR21","doi-asserted-by":"crossref","unstructured":"Qi, C.R., Chen, X., Litany, O.: ImVoteNet: Boosting 3D object detection in point clouds with image votes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4404\u20134413. (2020)","DOI":"10.1109\/CVPR42600.2020.00446"},{"key":"5440_CR22","doi-asserted-by":"crossref","unstructured":"Rukhovich, D., Vorontsova, A., Konushin, A.: FCAF3D: Fully convolutional anchor-free 3D object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), Springer, pp 477\u2013493 (2022)","DOI":"10.1007\/978-3-031-20080-9_28"},{"key":"5440_CR23","doi-asserted-by":"crossref","unstructured":"Rukhovich, D., Vorontsova, A., Konushin, A.: TR3D: Towards real-time indoor 3D object detection. In: Proceedings of the IEEE International Conference on Image Processing (ICIP), IEEE, pp 281\u2013285 (2023)","DOI":"10.1109\/ICIP49359.2023.10222644"},{"key":"5440_CR24","unstructured":"Sapkota, R., Roumeliotis, K.I., Cheppally, R.H.: A review of 3D object detection with vision-language models. (arXiv preprint) (2025)"},{"key":"5440_CR25","unstructured":"Shen, Y., Geng, Z., Yuan, Y., et al.: V-DETR: DETR with vertex relative position encoding for 3D object detection. arXiv preprint arXiv:2308.04409 (2023)"},{"key":"5440_CR26","doi-asserted-by":"crossref","unstructured":"Shi, S., Wang, X., Li, H.: PointRCNN: 3D object proposal generation and detection from point cloud. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013779. (2019)","DOI":"10.1109\/CVPR.2019.00086"},{"key":"5440_CR27","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S.P., Xiao, J.: SUN RGB-D: A RGB-D scene understanding benchmark suite. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 567\u2013576. (2015)","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"5440_CR28","doi-asserted-by":"crossref","unstructured":"Sun, P., Zheng, Y., Xu, W.: Completing missing entities: Exploring consistency reasoning for remote sensing object detection. IEEE Trans. Image Process. , (2026)","DOI":"10.1109\/TIP.2025.3648164"},{"issue":"18","key":"5440_CR29","doi-asserted-by":"publisher","first-page":"8615","DOI":"10.1021\/acs.jctc.5c00973","volume":"21","author":"T Sun","year":"2025","unstructured":"Sun, T., Xia, W., Shu, J., et al.: Advances and challenges in machine learning for RNA-small molecule interaction modeling. J. Chem. Theory Comput. 21(18), 8615\u20138633 (2025)","journal-title":"J. Chem. Theory Comput."},{"key":"5440_CR30","unstructured":"Wang, C., Yang, W., Liu, X.: State space model meets transformer: A new paradigm for 3D object detection, (arXiv preprint) (2025)"},{"key":"5440_CR31","first-page":"29975","volume":"35","author":"H Wang","year":"2022","unstructured":"Wang, H., Ding, L., Dong, S., et al.: CAGroup3D: Class-aware grouping for 3D object detection on point clouds. Adv. Neural. Inf. Process. Syst. 35, 29975\u201329988 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5440_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chen, X., Cao, L., et al.: Multimodal token fusion for vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 12186\u201312195 (2022b)","DOI":"10.1109\/CVPR52688.2022.01187"},{"key":"5440_CR33","first-page":"39876","volume":"36","author":"Z Wang","year":"2023","unstructured":"Wang, Z., Li, Y.L., Chen, X., et al.: Uni3DETR: Unified 3D detection transformer. Adv. Neural. Inf. Process. Syst. 36, 39876\u201339896 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5440_CR34","doi-asserted-by":"crossref","unstructured":"Xia, Z., Li, J., Lin, Z., et al.: OpenAD: Open-world autonomous driving benchmark for 3D object detection. arXiv preprint arXiv:2411.17761 (2024)","DOI":"10.32388\/J2781I"},{"key":"5440_CR35","doi-asserted-by":"crossref","unstructured":"Xie, Q., Lai, Y.K., Wu, J.: MLCVNet: Multi-level context VoteNet for 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10447\u201310456. (2020)","DOI":"10.1109\/CVPR42600.2020.01046"},{"key":"5440_CR36","doi-asserted-by":"publisher","first-page":"1857","DOI":"10.1007\/s11263-021-01456-w","volume":"129","author":"Q Xie","year":"2021","unstructured":"Xie, Q., Lai, Y.K., Wu, J., et al.: Vote-based 3D object detection with context modeling and SOB-3DNMS. Int. J. Comput. Vision 129, 1857\u20131874 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"5440_CR37","doi-asserted-by":"crossref","unstructured":"Xu, S., Zhou, D., Fang, J.: FusionPainting: Multimodal fusion with adaptive attention for 3D object detection. In: Proceedings of the IEEE International Intelligent Transportation Systems Conference (ITSC), pp. 3047\u20133054. IEEE (2021)","DOI":"10.1109\/ITSC48978.2021.9564951"},{"key":"5440_CR38","unstructured":"Yang, H., Shi, C., Chen, Y.: Boosting 3D object detection via object-focused image fusion, (2022). (arXiv preprint)"},{"key":"5440_CR39","doi-asserted-by":"crossref","unstructured":"Yin, J., Shen, J., Chen, R.: IS-Fusion: Instance-scene collaborative fusion for multimodal 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14905\u201314915. (2024)","DOI":"10.1109\/CVPR52733.2024.01412"},{"key":"5440_CR40","doi-asserted-by":"crossref","unstructured":"Yu, H., Zhang, X., Zhou, X., et al.: CIDRA-Net: Cross-modal interaction fusion network with distribution-relation awareness for robust 3D object detection. Neural Networks p 107818 (2025)","DOI":"10.1016\/j.neunet.2025.107818"},{"key":"5440_CR41","unstructured":"Zeid, K.A., Yilmaz, K., de Geus, D., et al.: DINO in the room: Leveraging 2D foundation models for 3D segmentation. arXiv preprint arXiv:2503.18944 (2025)"},{"issue":"1","key":"5440_CR42","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1109\/TIP.2018.2867733","volume":"28","author":"K Zhang","year":"2018","unstructured":"Zhang, K., Luo, W., Zhong, Y., et al.: Adversarial spatio-temporal learning for video deblurring. IEEE Trans. Image Process. 28(1), 291\u2013301 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"5440_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, M., Wang, S., Bi, K., et al.: DiR-Net: A diagnostic and iterative rectification network for cross-modal 3D object detection. Knowledge-Based Systems p 115023 (2025a)","DOI":"10.1016\/j.knosys.2025.115023"},{"key":"5440_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, X., Fan, G., Ju, Y., et al.: (2025b) SGDFuse: SAM-guided diffusion for high-fidelity infrared and visible image fusion. arXiv preprint arXiv:2508.05264","DOI":"10.1016\/j.inffus.2026.104290"},{"key":"5440_CR45","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2025.130376","volume":"641","author":"X Zhang","year":"2025","unstructured":"Zhang, X., Xu, C., Fan, G., et al.: FSCMF: A dual-branch frequency-spatial joint perception cross-modality network for visible and infrared image fusion. Neurocomputing 641, 130376 (2025)","journal-title":"Neurocomputing"},{"issue":"5","key":"5440_CR46","doi-asserted-by":"publisher","first-page":"2981","DOI":"10.1109\/TPAMI.2023.3336874","volume":"46","author":"Y Zheng","year":"2023","unstructured":"Zheng, Y., Duan, Y., Li, Z., et al.: Learning dynamic scene-conditioned 3D object detectors. IEEE Trans. Pattern Anal. Mach. Intell. 46(5), 2981\u20132996 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5440_CR47","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: VoxelNet: End-to-end learning for point cloud based 3D object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4490\u20134499. (2018)","DOI":"10.1109\/CVPR.2018.00472"},{"key":"5440_CR48","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Mottaghi, R., Kolve, E.: Target-driven visual navigation in indoor scenes using deep reinforcement learning. In: Proceedings of the IEEE International Conference on Robotics and Automation (ICRA), pp. 3357\u20133364. IEEE (2017)","DOI":"10.1109\/ICRA.2017.7989381"},{"key":"5440_CR49","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Hui, L., Shen, Y.: SPGroup3D: Superpoint grouping network for indoor 3D object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 7811\u20137819. (2024)","DOI":"10.1609\/aaai.v38i7.28616"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05440-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-026-05440-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05440-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T23:11:06Z","timestamp":1780528266000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-026-05440-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,21]]},"references-count":49,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["5440"],"URL":"https:\/\/doi.org\/10.1007\/s11760-026-05440-0","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,21]]},"assertion":[{"value":"20 December 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 April 2026","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 May 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 May 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}}],"article-number":"359"}}