{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:11:39Z","timestamp":1780495899053,"version":"3.54.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22B2055"],"award-info":[{"award-number":["U22B2055"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U2441251"],"award-info":[{"award-number":["U2441251"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113541","type":"journal-article","created":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T09:21:03Z","timestamp":1773825663000},"page":"113541","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["IIFNet3D: Instance-to-instance fusion with dual attention for indoor RGB-D 3D object detection"],"prefix":"10.1016","volume":"179","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5678-7551","authenticated-orcid":false,"given":"Zhihao","family":"Sun","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2087-2151","authenticated-orcid":false,"given":"Zixin","family":"Fan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1155-467X","authenticated-orcid":false,"given":"Bin","family":"Fan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9834-4087","authenticated-orcid":false,"given":"Hongmin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"10","key":"10.1016\/j.patcog.2026.113541_bib0001","doi-asserted-by":"crossref","first-page":"9925","DOI":"10.1109\/TCSVT.2024.3405992","article-title":"Selective transfer learning of cross-modality distillation for monocular 3D object detection","volume":"34","author":"Ding","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113541_bib0002","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2022.102515","article-title":"A state-of-the-art survey on augmented reality-assisted digital twin for futuristic human-centric industry transformation","volume":"81","author":"Yin","year":"2023","journal-title":"Rob. Comput. Integr. Manuf."},{"key":"10.1016\/j.patcog.2026.113541_bib0003","first-page":"5285","article-title":"SG-NAV: online 3D scene graph prompting for LLM-based zero-shot object navigation","volume":"37","author":"Yin","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113541_bib0004","series-title":"ICCV","first-page":"9277","article-title":"Deep hough voting for 3D object detection in point clouds","author":"Qi","year":"2019"},{"key":"10.1016\/j.patcog.2026.113541_bib0005","series-title":"ECCV","first-page":"311","article-title":"H3DNet: 3D object detection using hybrid geometric primitives","author":"Zhang","year":"2020"},{"key":"10.1016\/j.patcog.2026.113541_bib0006","series-title":"ICCV","first-page":"2906","article-title":"An end-to-end transformer model for 3D object detection","author":"Misra","year":"2021"},{"key":"10.1016\/j.patcog.2026.113541_bib0007","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"2949","article-title":"Group-free 3D object detection via transformers","author":"Liu","year":"2021"},{"key":"10.1016\/j.patcog.2026.113541_bib0008","series-title":"ECCV","first-page":"477","article-title":"FCAF3D: fully convolutional anchor-free 3D object detection","author":"Rukhovich","year":"2022"},{"key":"10.1016\/j.patcog.2026.113541_bib0009","series-title":"European Conference on Computer Vision","first-page":"355","article-title":"DSPDet3D: 3D small object detection with dynamic spatial pruning","author":"Xu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113541_bib0010","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"7811","article-title":"Spgroup3D: superpoint grouping network for indoor 3D object detection","volume":"38","author":"Zhu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113541_bib0011","series-title":"European Conference on Computer Vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.patcog.2026.113541_bib0012","series-title":"ECCV","first-page":"297","article-title":"Generative sparse detection networks for 3D single-shot object detection","author":"Gwak","year":"2020"},{"key":"10.1016\/j.patcog.2026.113541_bib0013","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"4622","article-title":"2D-driven 3D object detection in RGB-D images","author":"Lahoud","year":"2017"},{"key":"10.1016\/j.patcog.2026.113541_bib0014","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"244","article-title":"Pointfusion: deep sensor fusion for 3D bounding box estimation","author":"Xu","year":"2018"},{"key":"10.1016\/j.patcog.2026.113541_bib0015","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"4404","article-title":"ImvoteNet: boosting 3D object detection in point clouds with image votes","author":"Qi","year":"2020"},{"key":"10.1016\/j.patcog.2026.113541_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1090","article-title":"Transfusion: robust LiDAR-camera fusion for 3D object detection with transformers","author":"Bai","year":"2022"},{"issue":"7","key":"10.1016\/j.patcog.2026.113541_bib0017","doi-asserted-by":"crossref","first-page":"6671","DOI":"10.1109\/TCSVT.2025.3538784","article-title":"Rethinking how to capture long-range dependency in 3D object detection","volume":"35","author":"Wang","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113541_bib0018","series-title":"CVPR","first-page":"652","article-title":"PointNet: deep learning on point sets for 3D classification and segmentation","author":"Qi","year":"2017"},{"key":"10.1016\/j.patcog.2026.113541_bib0019","series-title":"NeurIPS","first-page":"5099","article-title":"PointNet++: deep hierarchical feature learning on point sets in a metric space","author":"Qi","year":"2017"},{"key":"10.1016\/j.patcog.2026.113541_bib0020","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1110","article-title":"RBGNet: ray-based grouping for 3D object detection","author":"Wang","year":"2022"},{"key":"10.1016\/j.patcog.2026.113541_bib0021","series-title":"2023 IEEE International Conference on Image Processing (ICIP)","first-page":"281","article-title":"TR3D: towards real-time indoor 3D object detection","author":"Rukhovich","year":"2023"},{"key":"10.1016\/j.patcog.2026.113541_bib0022","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.127807","article-title":"ARIoU: anchor-free rotation-decoupling IoU-based optimization for 3D object detection","volume":"594","author":"Wen","year":"2024","journal-title":"Neurocomputing"},{"issue":"10","key":"10.1016\/j.patcog.2026.113541_bib0023","doi-asserted-by":"crossref","first-page":"10134","DOI":"10.1109\/TCSVT.2025.3563083","article-title":"Dynamic learnable label assignment for indoor 3D object detection","volume":"35","author":"Liu","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113541_bib0024","article-title":"OV-GT3D: a generalizable open-vocabulary two-stage 3D detector with dual path distillation","author":"Sun","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113541_bib0025","series-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","first-page":"2397","article-title":"ImVoxelNet: image to voxels projection for monocular and multi-view general-purpose 3D object detection","author":"Rukhovich","year":"2022"},{"key":"10.1016\/j.patcog.2026.113541_bib0026","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"6996","article-title":"ImgeoNet: image-induced geometry-aware voxel representation for multi-view 3D object detection","author":"Tu","year":"2023"},{"key":"10.1016\/j.patcog.2026.113541_bib0027","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"23320","article-title":"NeRF-DET: learning geometry-aware volumetric representation for multi-view 3D object detection","author":"Xu","year":"2023"},{"key":"10.1016\/j.patcog.2026.113541_bib0028","unstructured":"H. Yan, Y. Zheng, Y. Duan, Gaussian-DET: Learning closed-surface Gaussians for 3D object detection, (2024) arXiv: 2410.01404."},{"issue":"1","key":"10.1016\/j.patcog.2026.113541_bib0029","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1145\/3503250","article-title":"NeRF: representing scenes as neural radiance fields for view synthesis","volume":"65","author":"Mildenhall","year":"2021","journal-title":"Commun. ACM"},{"key":"10.1016\/j.patcog.2026.113541_bib0030","doi-asserted-by":"crossref","first-page":"2575","DOI":"10.1109\/TIP.2025.3560240","article-title":"NeRF-DET++: incorporating semantic cues and perspective-aware depth supervision for indoor multi-view 3D detection","volume":"34","author":"Huang","year":"2025","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.113541_bib0031","unstructured":"Y. Cao, Y. Jv, D. Xu, 3DGS-DET: empower 3D Gaussian splatting with boundary guidance and box-focused sampling for 3D object detection, (2024) arXiv: 2410.01647."},{"key":"10.1016\/j.patcog.2026.113541_bib0032","article-title":"MDFusion: a multistage dynamic fusion framework for multimodal 3D object detection with leveraging cross-modal feature complementarity","author":"Wang","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.patcog.2026.113541_bib0033","article-title":"PV-MM3D: point-voxel parallel dual-stream framework with dual-attention region adaptive fusion for multimodal 3D object detection","author":"Wang","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.patcog.2026.113541_bib0034","series-title":"CVPR","first-page":"5828","article-title":"ScanNet: richly-annotated 3D reconstructions of indoor scenes","author":"Dai","year":"2017"},{"key":"10.1016\/j.patcog.2026.113541_bib0035","series-title":"CVPR","first-page":"567","article-title":"Sun RGB-D: a RGB-D scene understanding benchmark suite","author":"Song","year":"2015"},{"key":"10.1016\/j.patcog.2026.113541_bib0036","unstructured":"M. Contributors, MMDetection3D: OpenMMLab next-generation platform for general 3D object detection, https:\/\/github.com\/open-mmlab\/mmdetection3d, 2020."},{"key":"10.1016\/j.patcog.2026.113541_bib0037","first-page":"91","article-title":"Faster R-CNN: towards real-time object detection with region proposal networks","volume":"28","author":"Ren","year":"2015","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113541_bib0038","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3075","article-title":"4D spatio-temporal convnets: minkowski convolutional neural networks","author":"Choy","year":"2019"},{"key":"10.1016\/j.patcog.2026.113541_bib0039","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10447","article-title":"MLCVNet: multi-level context votenet for 3D object detection","author":"Xie","year":"2020"},{"key":"10.1016\/j.patcog.2026.113541_bib0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7463","article-title":"3D object detection with pointformer","author":"Pan","year":"2021"},{"key":"10.1016\/j.patcog.2026.113541_bib0041","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8963","article-title":"Back-tracing representative points for voting-based 3D object detection in point clouds","author":"Cheng","year":"2021"},{"key":"10.1016\/j.patcog.2026.113541_bib0042","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110061","article-title":"Objformer: boosting 3D object detection via instance-wise interaction","volume":"146","author":"Tao","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113541_bib0043","first-page":"29975","article-title":"Cagroup3D: class-aware grouping for 3D object detection on point clouds","volume":"35","author":"Wang","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"1","key":"10.1016\/j.patcog.2026.113541_bib0044","first-page":"1","article-title":"Multi-feature fusion VoteNet for 3D object detection","volume":"18","author":"Wang","year":"2022","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (TOMM)"},{"key":"10.1016\/j.patcog.2026.113541_bib0045","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5291","article-title":"Pimae: point cloud and image interactive masked autoencoders for 3D object detection","author":"Chen","year":"2023"},{"key":"10.1016\/j.patcog.2026.113541_bib0046","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12186","article-title":"Multimodal token fusion for vision transformers","author":"Wang","year":"2022"},{"issue":"11","key":"10.1016\/j.patcog.2026.113541_bib0047","doi-asserted-by":"crossref","first-page":"6474","DOI":"10.1109\/TCSVT.2023.3271318","article-title":"Semantic-context graph network for point-based 3D object detection","volume":"33","author":"Dong","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113541_bib0048","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"392","article-title":"A hierarchical graph network for 3D object detection on point clouds","author":"Chen","year":"2020"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005078?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005078?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T13:12:02Z","timestamp":1780492322000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326005078"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":48,"alternative-id":["S0031320326005078"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113541","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"IIFNet3D: Instance-to-instance fusion with dual attention for indoor RGB-D 3D object detection","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113541","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113541"}}