{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T05:26:54Z","timestamp":1775107614282,"version":"3.50.1"},"reference-count":56,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202142"],"award-info":[{"award-number":["62202142"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.imavis.2026.105919","type":"journal-article","created":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T00:28:28Z","timestamp":1769560108000},"page":"105919","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["SRformer: A hybrid semantic-regional transformer for indoor 3D object detection"],"prefix":"10.1016","volume":"168","author":[{"given":"Kunpeng","family":"Bi","sequence":"first","affiliation":[]},{"given":"Shuang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiangyang","family":"Jiang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3351-5169","authenticated-orcid":false,"given":"Miaohui","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2026.105919_b1","doi-asserted-by":"crossref","unstructured":"Charlesm R. Qi, Hao Su, Kaichun Mo, Leonidas J. Guibas, Pointnet: Deep learning on point sets for 3d classification and segmentation, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 652\u2013660.","DOI":"10.1109\/CVPR.2017.16"},{"key":"10.1016\/j.imavis.2026.105919_b2","unstructured":"R. Charles, Li Qi, Hao Yi, Su Leonidas, J. Guibas, Pointnet++: Deep hierarchical feature learning on point sets in a metric space supplementary material."},{"key":"10.1016\/j.imavis.2026.105919_b3","unstructured":"Charles R. Qi, Or Litany, Kaiming He, Leonidas J. Guibas, Deep hough voting for 3d object detection in point clouds, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 9277\u20139286."},{"key":"10.1016\/j.imavis.2026.105919_b4","doi-asserted-by":"crossref","unstructured":"Yin Zhou, Oncel Tuzel, Voxelnet: End-to-end learning for point cloud based 3d object detection, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 4490\u20134499.","DOI":"10.1109\/CVPR.2018.00472"},{"key":"10.1016\/j.imavis.2026.105919_b5","doi-asserted-by":"crossref","unstructured":"Alex H. Lang, Sourabh Vora, Holger Caesar, Lubing Zhou, Jiong Yang, Oscar Beijbom, Pointpillars: Fast encoders for object detection from point clouds, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 12697\u201312705.","DOI":"10.1109\/CVPR.2019.01298"},{"issue":"10","key":"10.1016\/j.imavis.2026.105919_b6","doi-asserted-by":"crossref","first-page":"3337","DOI":"10.3390\/s18103337","article-title":"Second: Sparsely embedded convolutional detection","volume":"18","author":"Yan","year":"2018","journal-title":"Sensors"},{"key":"10.1016\/j.imavis.2026.105919_b7","doi-asserted-by":"crossref","unstructured":"Danila Rukhovich, Anna Vorontsova, Anton Konushin, Imvoxelnet: Image to voxels projection for monocular and multi-view general-purpose 3d object detection, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2022, pp. 2397\u20132406.","DOI":"10.1109\/WACV51458.2022.00133"},{"key":"10.1016\/j.imavis.2026.105919_b8","series-title":"2023 IEEE International Conference on Image Processing","first-page":"281","article-title":"Tr3d: Towards real-time indoor 3d object detection","author":"Rukhovich","year":"2023"},{"key":"10.1016\/j.imavis.2026.105919_b9","series-title":"Knowledge distillation via query selection for detection transformer","author":"Liu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105919_b10","series-title":"European Conference on Computer Vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.imavis.2026.105919_b11","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105919_b12","doi-asserted-by":"crossref","unstructured":"Junbo Yin, Jianbing Shen, Runnan Chen, Wei Li, Ruigang Yang, Pascal Frossard, Wenguan Wang, Is-fusion: Instance-scene collaborative fusion for multimodal 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14905\u201314915.","DOI":"10.1109\/CVPR52733.2024.01412"},{"key":"10.1016\/j.imavis.2026.105919_b13","doi-asserted-by":"crossref","unstructured":"Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen, Detrs beat yolos on real-time object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 16965\u201316974.","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"10.1016\/j.imavis.2026.105919_b14","doi-asserted-by":"crossref","unstructured":"Xiaozhi Chen, Huimin Ma, Ji Wan, Bo Li, Tian Xia, Multi-view 3d object detection network for autonomous driving, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 1907\u20131915.","DOI":"10.1109\/CVPR.2017.691"},{"key":"10.1016\/j.imavis.2026.105919_b15","first-page":"7652","article-title":"Pixor: Real-time 3d object detection from point clouds","author":"Yang","year":"2018","journal-title":"Proc. the IEEE Conf. Comput. Vis. Pattern Recognit."},{"key":"10.1016\/j.imavis.2026.105919_b16","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August (2020) 23\u201328, Proceedings, Part XV 16","first-page":"35","article-title":"Epnet: Enhancing point features with image semantics for 3d object detection","author":"Huang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105919_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2024.102591","article-title":"Mshp3d: Multi-stage cross-modal fusion based on hybrid perception for indoor 3d object detection","volume":"112","author":"Jiang","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.imavis.2026.105919_b18","unstructured":"Maosheng Ye, Shuangjie Xu, Tongyi Cao, Hvnet: Hybrid voxel network for lidar based 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 1631\u20131640."},{"key":"10.1016\/j.imavis.2026.105919_b19","doi-asserted-by":"crossref","unstructured":"Chenhang He, Hui Zeng, Jianqiang Huang, Xian-Sheng Hua, Lei Zhang, Structure aware single-stage 3d object detection from point cloud, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 11873\u201311882.","DOI":"10.1109\/CVPR42600.2020.01189"},{"key":"10.1016\/j.imavis.2026.105919_b20","doi-asserted-by":"crossref","unstructured":"Ishan Misra, Rohit Girdhar, Armand Joulin, An end-to-end transformer model for 3d object detection, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2906\u20132917.","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"10.1016\/j.imavis.2026.105919_b21","unstructured":"Weijing Shi, Raj Rajkumar, Point-gnn: Graph neural network for 3d object detection in a point cloud, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 1711\u20131719."},{"key":"10.1016\/j.imavis.2026.105919_b22","doi-asserted-by":"crossref","unstructured":"Jintai Chen, Biwen Lei, Qingyu Song, Haochao Ying, Danny Z. Chen, Jian Wu, A hierarchical graph network for 3d object detection on point clouds, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 392\u2013401.","DOI":"10.1109\/CVPR42600.2020.00047"},{"key":"10.1016\/j.imavis.2026.105919_b23","series-title":"Efficient detr: improving end-to-end object detector with dense prior","author":"Yao","year":"2021"},{"key":"10.1016\/j.imavis.2026.105919_b24","doi-asserted-by":"crossref","unstructured":"Peize Sun, Rufeng Zhang, Yi Jiang, Tao Kong, Chenfeng Xu, Wei Zhan, Masayoshi Tomizuka, Lei Li, Zehuan Yuan, Changhu Wang, et al., Sparse r-cnn: End-to-end object detection with learnable proposals, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 14454\u201314463.","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"10.1016\/j.imavis.2026.105919_b25","unstructured":"Xiyang Dai, Yinpeng Chen, Bin Xiao, Dongdong Chen, Mengchen Liu, Lu Yuan, Lei Zhang, Dynamic head: Unifying object detection heads with attentions, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 7373\u20137382."},{"key":"10.1016\/j.imavis.2026.105919_b26","series-title":"Dq-detr: Detr with dynamic query for tiny object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105919_b27","doi-asserted-by":"crossref","unstructured":"Wenyu Lv, Shangliang Xu, Yian Zhao, Guanzhong Wang, Jinman Wei, Cheng Cui, Yuning Du, Qingqing Dang, Yi Liu, Detrs beat yolos on real-time object detection, in: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2023, pp. 16965\u201316974.","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"10.1016\/j.imavis.2026.105919_b28","doi-asserted-by":"crossref","unstructured":"Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang, Conditional detr for fast training convergence, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 3651\u20133660.","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"10.1016\/j.imavis.2026.105919_b29","series-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020"},{"key":"10.1016\/j.imavis.2026.105919_b30","first-page":"2567","article-title":"Anchor detr: Query design for transformer-based detector","volume":"vol. 36","author":"Wang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105919_b31","series-title":"Dab-detr: Dynamic anchor boxes are better queries for detr","author":"Liu","year":"2022"},{"key":"10.1016\/j.imavis.2026.105919_b32","series-title":"Box-detr: Understanding and boxing conditional spatial queries","author":"Liu","year":"2023"},{"key":"10.1016\/j.imavis.2026.105919_b33","series-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Zhang","year":"2022"},{"key":"10.1016\/j.imavis.2026.105919_b34","series-title":"V-detr: Detr with vertex relative position encoding for 3d object detection","author":"Shen","year":"2023"},{"issue":"4","key":"10.1016\/j.imavis.2026.105919_b35","doi-asserted-by":"crossref","first-page":"2239","DOI":"10.1109\/TPAMI.2023.3335410","article-title":"Dn-detr: Accelerate detr training by introducing query denoising","volume":"46","author":"Li","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105919_b36","series-title":"Group detr: Fast training convergence with decoupled one-to-many label assignment","author":"Chen","year":"2022"},{"key":"10.1016\/j.imavis.2026.105919_b37","unstructured":"Christopher B. Choy, JunYoung Gwak, Silvio Savarese, Minkowski engine: A sparse tensor deep learning library for 3d computer vision, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 3121\u20133130."},{"key":"10.1016\/j.imavis.2026.105919_b38","unstructured":"Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Doll\u00e1r, Focal loss for dense object detection, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 2980\u20132988."},{"key":"10.1016\/j.imavis.2026.105919_b39","unstructured":"Angela Dai, Angel X. Chang, Manolis Savva, Maciej Halber, Thomas Funkhouser, Matthias Nie\u00dfner, Scannet: Richly-annotated 3d reconstructions of indoor scenes, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 5828\u20135839."},{"key":"10.1016\/j.imavis.2026.105919_b40","doi-asserted-by":"crossref","unstructured":"Shuran Song, Samuel P. Lichtenberg, Jianxiong Xiao, Sun rgb-d: A rgb-d scene understanding benchmark suite, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 567\u2013576.","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"10.1016\/j.imavis.2026.105919_b41","doi-asserted-by":"crossref","unstructured":"Qian Xie, Yu-Kun Lai, Jing Wu, Zhoutao Wang, Yiming Zhang, Kai Xu, Jun Wang, Mlcvnet: Multi-level context votenet for 3d object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10447\u201310456.","DOI":"10.1109\/CVPR42600.2020.01046"},{"key":"10.1016\/j.imavis.2026.105919_b42","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August (2020) 23\u201328, Proceedings, Part XII 16","first-page":"311","article-title":"H3dnet: 3d object detection using hybrid geometric primitives","author":"Zhang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105919_b43","doi-asserted-by":"crossref","unstructured":"Bowen Cheng, Lu Sheng, Shaoshuai Shi, Ming Yang, Dong Xu, Back-tracing representative points for voting-based 3d object detection in point clouds, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 8963\u20138972.","DOI":"10.1109\/CVPR46437.2021.00885"},{"key":"10.1016\/j.imavis.2026.105919_b44","doi-asserted-by":"crossref","unstructured":"Ze Liu, Zheng Zhang, Yue Cao, Han Hu, Xin Tong, Group-free 3d object detection via transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2949\u20132958.","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"10.1016\/j.imavis.2026.105919_b45","series-title":"European Conference on Computer Vision","first-page":"477","article-title":"Fcaf3d: Fully convolutional anchor-free 3d object detection","author":"Rukhovich","year":"2022"},{"key":"10.1016\/j.imavis.2026.105919_b46","first-page":"29975","article-title":"Cagroup3d: Class-aware grouping for 3d object detection on point clouds","volume":"35","author":"Wang","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105919_b47","first-page":"39876","article-title":"Uni3detr: Unified 3d detection transformer","volume":"36","author":"Wang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105919_b48","doi-asserted-by":"crossref","unstructured":"Maxim Kolodiazhnyi, Anna Vorontsova, Anton Konushin, Danila Rukhovich, Oneformer3d: One transformer for unified point cloud segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 20943\u201320953.","DOI":"10.1109\/CVPR52733.2024.01979"},{"issue":"5","key":"10.1016\/j.imavis.2026.105919_b49","doi-asserted-by":"crossref","first-page":"2981","DOI":"10.1109\/TPAMI.2023.3336874","article-title":"Learning dynamic scene-conditioned 3d object detectors","volume":"46","author":"Zheng","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105919_b50","first-page":"7811","article-title":"Spgroup3d: Superpoint grouping network for indoor 3d object detection","volume":"vol. 38","author":"Zhu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105919_b51","series-title":"Unidet3d: Multi-dataset indoor 3d object detection","author":"Kolodiazhnyi","year":"2024"},{"key":"10.1016\/j.imavis.2026.105919_b52","series-title":"State space model meets transformer: A new paradigm for 3d object detection","author":"Wang","year":"2025"},{"key":"10.1016\/j.imavis.2026.105919_b53","doi-asserted-by":"crossref","unstructured":"Yikai Wang, Xinghao Chen, Lele Cao, Wenbing Huang, Fuchun Sun, Yunhe Wang, Multimodal token fusion for vision transformers, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 12186\u201312195.","DOI":"10.1109\/CVPR52688.2022.01187"},{"issue":"7","key":"10.1016\/j.imavis.2026.105919_b54","first-page":"8324","article-title":"Epnet++: Cascade bi-directional fusion for multi-modal 3d object detection","volume":"45","author":"Liu","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105919_b55","doi-asserted-by":"crossref","unstructured":"Zechuan Li, Hongshan Yu, Zhengeng Yang, Tongjia Chen, Naveed Akhtar, Ashapeformer: Semantics-guided object-level active shape encoding for 3d object detection via transformers, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 1012\u20131021.","DOI":"10.1109\/CVPR52729.2023.00104"},{"key":"10.1016\/j.imavis.2026.105919_b56","doi-asserted-by":"crossref","unstructured":"Guofan Fan, Zekun Qi, Wenkai Shi, Kaisheng Ma, Point-gcc: Universal self-supervised 3d scene pre-training via geometry-color contrast, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 4709\u20134718.","DOI":"10.1145\/3664647.3681343"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000259?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000259?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T03:57:58Z","timestamp":1775102278000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885626000259"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":56,"alternative-id":["S0262885626000259"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105919","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SRformer: A hybrid semantic-regional transformer for indoor 3D object detection","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105919","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"105919"}}