{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T03:04:06Z","timestamp":1780542246607,"version":"3.54.1"},"reference-count":66,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100003819","name":"Hubei Province Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003819","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Signal Processing: Image Communication"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.image.2026.117599","type":"journal-article","created":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T06:49:34Z","timestamp":1779518974000},"page":"117599","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["A hybrid architecture with CNN and ViT dual branches for indoor scene recognition"],"prefix":"10.1016","volume":"147","author":[{"given":"Chen","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8862-5834","authenticated-orcid":false,"given":"Zhongcheng","family":"Dai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiong","family":"Pan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.image.2026.117599_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2022.105036","article-title":"A survey of visual navigation: From geometry to embodied ai","volume":"114","author":"Zhang","year":"2022","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.image.2026.117599_b2","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"451","article-title":"Hierarchy of alternating specialists for scene recognition","author":"Kim","year":"2018"},{"issue":"4","key":"10.1016\/j.image.2026.117599_b3","doi-asserted-by":"crossref","first-page":"2028","DOI":"10.1109\/TIP.2017.2666739","article-title":"Weakly supervised patchnets: Describing and aggregating local patches for scene recognition","volume":"26","author":"Wang","year":"2017","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117599_b4","doi-asserted-by":"crossref","first-page":"474","DOI":"10.1016\/j.patcog.2017.09.025","article-title":"Scene recognition with objectness","volume":"74","author":"Cheng","year":"2018","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.image.2026.117599_b5","doi-asserted-by":"crossref","first-page":"145","DOI":"10.1023\/A:1011139631724","article-title":"Modeling the shape of the scene: A holistic representation of the spatial envelope","volume":"42","author":"Oliva","year":"2001","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.image.2026.117599_b6","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","article-title":"Distinctive image features from scale-invariant keypoints","volume":"60","author":"Lowe","year":"2004","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.image.2026.117599_b7","series-title":"Proceedings of the 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition","first-page":"886","article-title":"Histograms of oriented gradients for human detection","volume":"Vol. 1","author":"Dalal","year":"2005"},{"key":"10.1016\/j.image.2026.117599_b8","series-title":"Proceedings of the Ninth IEEE International Conference on Computer Vision","first-page":"1470","article-title":"Video google: A text retrieval approach to object matching in videos","author":"Sivic","year":"2003"},{"key":"10.1016\/j.image.2026.117599_b9","series-title":"Proceedings of the Tenth IEEE International Conference on Computer Vision","first-page":"1458","article-title":"The pyramid match kernel: Discriminative classification with sets of image features","volume":"Vol. 2","author":"Grauman","year":"2005"},{"issue":"2","key":"10.1016\/j.image.2026.117599_b10","doi-asserted-by":"crossref","first-page":"808","DOI":"10.1109\/TIP.2016.2629443","article-title":"Locally supervised deep hybrid model for scene recognition","volume":"26","author":"Guo","year":"2016","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117599_b11","doi-asserted-by":"crossref","first-page":"938","DOI":"10.1016\/j.ins.2022.07.188","article-title":"Joint global metric learning and local manifold preservation for scene recognition","volume":"610","author":"Wang","year":"2022","journal-title":"Inform. Sci."},{"issue":"20","key":"10.1016\/j.image.2026.117599_b12","doi-asserted-by":"crossref","first-page":"4143","DOI":"10.3390\/rs13204143","article-title":"TRS: Transformers for remote sensing scene classification","volume":"13","author":"Zhang","year":"2021","journal-title":"Remote. Sens."},{"key":"10.1016\/j.image.2026.117599_b13","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.image.2026.117599_b14","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.image.2026.117599_b15","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"558","article-title":"Tokens-to-token vit: Training vision transformers from scratch on imagenet","author":"Yuan","year":"2021"},{"issue":"12","key":"10.1016\/j.image.2026.117599_b16","doi-asserted-by":"crossref","first-page":"14679","DOI":"10.1109\/TITS.2023.3300537","article-title":"CMX: Cross-modal fusion for rgb-x semantic segmentation with transformers","volume":"24","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.image.2026.117599_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2023.105534","article-title":"HiFuse: Hierarchical multi-scale feature fusion network for medical image classification","volume":"87","author":"Huo","year":"2024","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.image.2026.117599_b18","doi-asserted-by":"crossref","DOI":"10.1016\/j.compbiomed.2025.110164","article-title":"Effivit: Hybrid cnn-transformer for retinal imaging","volume":"191","author":"Rajatha","year":"2025","journal-title":"Comput. Biol. Med."},{"key":"10.1016\/j.image.2026.117599_b19","doi-asserted-by":"crossref","first-page":"188","DOI":"10.1016\/j.neucom.2016.11.023","article-title":"G-MS2F: Googlenet based multi-stage feature fusion of deep CNN for scene recognition","volume":"225","author":"Tang","year":"2017","journal-title":"Neurocomputing"},{"key":"10.1016\/j.image.2026.117599_b20","series-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.image.2026.117599_b21","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.image.2026.117599_b22","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4700","article-title":"Densely connected convolutional networks","author":"Huang","year":"2017"},{"key":"10.1016\/j.image.2026.117599_b23","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"571","article-title":"Scene recognition with CNNs: objects, scales and dataset bias","author":"Herranz","year":"2016"},{"issue":"7","key":"10.1016\/j.image.2026.117599_b24","doi-asserted-by":"crossref","first-page":"3372","DOI":"10.1109\/TIP.2016.2567076","article-title":"A discriminative representation of convolutional features for indoor scene recognition","volume":"25","author":"Khan","year":"2016","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117599_b25","series-title":"Proceedings of the 2017 2nd IEEE International Conference on Computational Intelligence and Applications","first-page":"345","article-title":"Sparse decomposition of convolutional features for scene recognition","author":"Xie","year":"2017"},{"issue":"5","key":"10.1016\/j.image.2026.117599_b26","doi-asserted-by":"crossref","first-page":"1182","DOI":"10.1109\/TMM.2019.2942478","article-title":"Hierarchical coding of convolutional features for scene recognition","volume":"22","author":"Xie","year":"2019","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.image.2026.117599_b27","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"270","article-title":"Interactive: Inter-layer activeness propagation","author":"Xie","year":"2016"},{"key":"10.1016\/j.image.2026.117599_b28","series-title":"Proceedings of the AAAI conference on Artificial Intelligence","first-page":"7178","article-title":"Dictionary learning inspired deep network for scene recognition","volume":"Vol. 32","author":"Liu","year":"2018"},{"issue":"10","key":"10.1016\/j.image.2026.117599_b29","doi-asserted-by":"crossref","first-page":"4829","DOI":"10.1109\/TIP.2016.2599292","article-title":"A spatial layout and scale invariant feature representation for indoor scene classification","volume":"25","author":"Hayat","year":"2016","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117599_b30","doi-asserted-by":"crossref","first-page":"612","DOI":"10.1016\/j.neucom.2021.12.053","article-title":"Scale attentive network for scene recognition","volume":"492","author":"Yuan","year":"2022","journal-title":"Neurocomputing"},{"issue":"15","key":"10.1016\/j.image.2026.117599_b31","doi-asserted-by":"crossref","first-page":"18431","DOI":"10.1007\/s10489-023-04460-4","article-title":"Building discriminative features of scene recognition using multi-stages of inception-resnet-v2","volume":"53","author":"Khan","year":"2023","journal-title":"Appl. Intell."},{"key":"10.1016\/j.image.2026.117599_b32","unstructured":"H. Touvron, M. Cord, M. Douze, F. Massa, A. Sablayrolles, H. J\u00e9gou, Training data-efficient image transformers & distillation through attention, in: Proceedings of the International Conference on Machine Learning, Vol. 139, 2021, pp. 10347\u201310357."},{"key":"10.1016\/j.image.2026.117599_b33","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"568","article-title":"Pyramid vision transformer: A versatile backbone for dense prediction without convolutions","author":"Wang","year":"2021"},{"key":"10.1016\/j.image.2026.117599_b34","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"10012","article-title":"Swin transformer: Hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.image.2026.117599_b35","series-title":"2025 IEEE International Conference on Image Processing","first-page":"1720","article-title":"Facelivt: Face recognition using linear vision transformer with structural reparameterization for mobile device","author":"Setyawan","year":"2025"},{"key":"10.1016\/j.image.2026.117599_b36","series-title":"Advancing vision transformers with group-mix attention","author":"Ge","year":"2023"},{"key":"10.1016\/j.image.2026.117599_b37","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10936","article-title":"On the faithfulness of vision transformer explanations","author":"Wu","year":"2024"},{"key":"10.1016\/j.image.2026.117599_b38","first-page":"1","article-title":"SCViT: A spatial-channel feature preserving vision transformer for remote sensing image scene classification","volume":"60","author":"Lv","year":"2022","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"issue":"21","key":"10.1016\/j.image.2026.117599_b39","doi-asserted-by":"crossref","first-page":"24947","DOI":"10.1007\/s10489-023-04725-y","article-title":"HELViT: highly efficient lightweight vision transformer for remote sensing image scene classification","volume":"53","author":"Guo","year":"2023","journal-title":"Appl. Intell."},{"key":"10.1016\/j.image.2026.117599_b40","first-page":"1","article-title":"Hierarchical feature fusion of transformer with patch dilating for remote sensing scene classification","volume":"61","author":"Chen","year":"2023","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.image.2026.117599_b41","series-title":"Multi-scale context aggregation by dilated convolutions","author":"Yu","year":"2015"},{"key":"10.1016\/j.image.2026.117599_b42","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7132","article-title":"Squeeze-and-excitation networks","author":"Hu","year":"2018"},{"key":"10.1016\/j.image.2026.117599_b43","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"Imagenet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vis."},{"issue":"6","key":"10.1016\/j.image.2026.117599_b44","doi-asserted-by":"crossref","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","article-title":"Places: A 10 million image database for scene recognition","volume":"40","author":"Zhou","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.image.2026.117599_b45","series-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017"},{"key":"10.1016\/j.image.2026.117599_b46","series-title":"Proceedings of the 2022 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"CAT: Cross attention in vision transformer","author":"Lin","year":"2022"},{"key":"10.1016\/j.image.2026.117599_b47","series-title":"Proceedings of the 2009 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"413","article-title":"Recognizing indoor scenes","author":"Quattoni","year":"2009"},{"key":"10.1016\/j.image.2026.117599_b48","series-title":"Proceedings of the 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition","first-page":"3485","article-title":"Sun database: Large-scale scene recognition from abbey to zoo","author":"Xiao","year":"2010"},{"key":"10.1016\/j.image.2026.117599_b49","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"367","article-title":"Conformer: Local features coupling global representations for visual recognition","author":"Peng","year":"2021"},{"key":"10.1016\/j.image.2026.117599_b50","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"22","article-title":"Cvt: Introducing convolutions to vision transformers","author":"Wu","year":"2021"},{"key":"10.1016\/j.image.2026.117599_b51","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"12175","article-title":"CMT: Convolutional neural networks meet vision transformers","author":"Guo","year":"2022"},{"key":"10.1016\/j.image.2026.117599_b52","first-page":"487","article-title":"Learning deep features for scene recognition using places database","volume":"27","author":"Zhou","year":"2014","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"12","key":"10.1016\/j.image.2026.117599_b53","doi-asserted-by":"crossref","first-page":"2335","DOI":"10.1109\/TPAMI.2017.2651061","article-title":"Compositional model based fisher vector coding for image classification","volume":"39","author":"Liu","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"6","key":"10.1016\/j.image.2026.117599_b54","doi-asserted-by":"crossref","first-page":"1263","DOI":"10.1109\/TCSVT.2015.2511543","article-title":"Hybrid CNN and dictionary-based models for scene recognition and domain adaptation","volume":"27","author":"Xie","year":"2015","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"8","key":"10.1016\/j.image.2026.117599_b55","doi-asserted-by":"crossref","first-page":"2309","DOI":"10.1007\/s11263-021-01475-7","article-title":"Cross-modal pyramid translation for rgb-d scene recognition","volume":"129","author":"Du","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.image.2026.117599_b56","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"8322","author":"Qiu","year":"2021"},{"key":"10.1016\/j.image.2026.117599_b57","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7287","article-title":"MPViT: Multi-path vision transformer for dense prediction","author":"Lee","year":"2022"},{"key":"10.1016\/j.image.2026.117599_b58","unstructured":"H. Huang, X. Zhou, J. Cao, R. He, T. Tan, Vision transformer with super token sampling, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2023, pp. 22690\u201322699."},{"issue":"6","key":"10.1016\/j.image.2026.117599_b59","doi-asserted-by":"crossref","first-page":"2721","DOI":"10.1109\/TIP.2017.2686017","article-title":"Multi-scale multi-feature context modeling for scene recognition in the semantic manifold","volume":"26","author":"Song","year":"2017","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117599_b60","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107256","article-title":"Semantic-aware scene recognition","volume":"102","author":"L\u00f3pez-Cifuentes","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.image.2026.117599_b61","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2021.107470","article-title":"Content and context features for scene image representation","volume":"232","author":"Sitaula","year":"2021","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.image.2026.117599_b62","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.117505","article-title":"Embedding metric learning into an extreme learning machine for scene recognition","volume":"203","author":"C. Wang","year":"2022","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.image.2026.117599_b63","doi-asserted-by":"crossref","first-page":"5877","DOI":"10.1109\/TIP.2020.2986599","article-title":"Scene recognition with prototype-agnostic scene layout","volume":"29","author":"Chen","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.image.2026.117599_b64","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2022.108530","article-title":"Scene recognition using multiple representation network","volume":"118","author":"Lin","year":"2022","journal-title":"Appl. Soft Comput."},{"key":"10.1016\/j.image.2026.117599_b65","doi-asserted-by":"crossref","first-page":"141","DOI":"10.1109\/TMM.2020.3046877","article-title":"Amorphous region context modeling for scene recognition","volume":"24","author":"Zeng","year":"2020","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.image.2026.117599_b66","series-title":"Proceedings of the 2023 International Joint Conference on Neural Networks","first-page":"01","article-title":"SRRM: Semantic region relation model for indoor scene recognition","author":"Song","year":"2023"}],"container-title":["Signal Processing: Image Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0923596526001220?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0923596526001220?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T02:36:28Z","timestamp":1780540588000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0923596526001220"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":66,"alternative-id":["S0923596526001220"],"URL":"https:\/\/doi.org\/10.1016\/j.image.2026.117599","relation":{},"ISSN":["0923-5965"],"issn-type":[{"value":"0923-5965","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A hybrid architecture with CNN and ViT dual branches for indoor scene recognition","name":"articletitle","label":"Article Title"},{"value":"Signal Processing: Image Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.image.2026.117599","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"117599"}}