{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T12:17:37Z","timestamp":1780575457340,"version":"3.54.1"},"reference-count":261,"publisher":"Springer Science and Business Media LLC","issue":"S3","license":[{"start":{"date-parts":[[2023,10,4]],"date-time":"2023-10-04T00:00:00Z","timestamp":1696377600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,10,4]],"date-time":"2023-10-04T00:00:00Z","timestamp":1696377600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Artif Intell Rev"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10462-023-10595-0","type":"journal-article","created":{"date-parts":[[2023,10,4]],"date-time":"2023-10-04T03:24:46Z","timestamp":1696389886000},"page":"2917-2970","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":385,"title":["A survey of the vision transformers and their CNN-transformer based variants"],"prefix":"10.1007","volume":"56","author":[{"given":"Asifullah","family":"Khan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zunaira","family":"Rauf","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Anabia","family":"Sohail","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abdul Rehman","family":"Khan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hifsa","family":"Asif","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Aqsa","family":"Asif","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Umair","family":"Farooq","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2023,10,4]]},"reference":[{"key":"10595_CR1","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1007\/S10462-020-09855-0\/TABLES\/4","volume":"54","author":"O Agbo-Ajala","year":"2021","unstructured":"Agbo-Ajala O, Viriri S (2021) Deep learning approach for facial age classification: a survey of the state-of-the-art. Artif Intell Rev 54:179\u2013213. https:\/\/doi.org\/10.1007\/S10462-020-09855-0\/TABLES\/4","journal-title":"Artif Intell Rev"},{"key":"10595_CR2","doi-asserted-by":"publisher","DOI":"10.3390\/rs15071860","author":"AA Aleissaee","year":"2022","unstructured":"Aleissaee AA, Kumar A, Anwer RM et al (2022) Transformers in remote sensing: a survey. Remote Sensing. https:\/\/doi.org\/10.3390\/rs15071860","journal-title":"Remote Sensing"},{"key":"10595_CR3","doi-asserted-by":"publisher","DOI":"10.3390\/s23052385","author":"AM Ali","year":"2023","unstructured":"Ali AM, Benjdira B, Koubaa A et al (2023a) Vision transformers in image restoration: a survey. Sensors. https:\/\/doi.org\/10.3390\/s23052385","journal-title":"Sensors"},{"key":"10595_CR4","doi-asserted-by":"crossref","unstructured":"Ali ML, Rauf Z, Khan A et al (2023b) CB-HVTNet: a channel-boosted hybrid vision transformer network for lymphocyte assessment in histopathological images","DOI":"10.1109\/ACCESS.2023.3324383"},{"key":"10595_CR5","doi-asserted-by":"publisher","first-page":"7024","DOI":"10.3390\/S22187024","volume":"22","author":"L An","year":"2022","unstructured":"An L, Wang L, Li Y (2022) HEA-Net: attention and MLP hybrid encoder architecture for medical image segmentation. Sensors 22:7024. https:\/\/doi.org\/10.3390\/S22187024","journal-title":"Sensors"},{"key":"10595_CR6","unstructured":"Arjovsky M, Chintala S, Bottou L (2017) Wasserstein GAN"},{"key":"10595_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2022.106439","volume":"152","author":"H Bao","year":"2023","unstructured":"Bao H, Zhu Y, Li Q (2023a) Hybrid-scale contextual fusion network for medical image segmentation. Comput Biol Med 152:106439. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2022.106439","journal-title":"Comput Biol Med"},{"key":"10595_CR8","doi-asserted-by":"publisher","unstructured":"Bao Q, Liu Y, Gang B, et al (2023b) SCTANet: a spatial attention-guided CNN-transformer aggregation network for deep face image super-resolution. IEEE Trans Multimed 1\u201312. https:\/\/doi.org\/10.1109\/TMM.2023.3238522","DOI":"10.1109\/TMM.2023.3238522"},{"key":"10595_CR9","unstructured":"Beal J, Kim E, Tzeng E et al (2020) Toward transformer-based object detection"},{"key":"10595_CR10","doi-asserted-by":"publisher","first-page":"2470","DOI":"10.3390\/ELECTRONICS10202470","volume":"10","author":"D Bhatt","year":"2021","unstructured":"Bhatt D, Patel C, Talsania H et al (2021) CNN variants for computer vision: history, architecture, application, challenges and future scope. Electron 10:2470. https:\/\/doi.org\/10.3390\/ELECTRONICS10202470","journal-title":"Electron"},{"key":"10595_CR11","doi-asserted-by":"publisher","first-page":"178","DOI":"10.1109\/CEI52496.2021.9574462","volume":"2021","author":"J Bi","year":"2021","unstructured":"Bi J, Zhu Z, Meng Q (2021) Transformer in computer vision. IEEE Int Conf Comput Sci Electron Inf Eng Intell Control Technol CEI 2021:178\u2013188. https:\/\/doi.org\/10.1109\/CEI52496.2021.9574462","journal-title":"IEEE Int Conf Comput Sci Electron Inf Eng Intell Control Technol CEI"},{"key":"10595_CR12","doi-asserted-by":"publisher","unstructured":"Cao X, Li X, Ma L, et al (2022) AggPose: deep aggregation vision transformer for infant pose estimation. IJCAI Int Jt Conf Artif Intell 5045\u20135051. https:\/\/doi.org\/10.24963\/ijcai.2022\/700","DOI":"10.24963\/ijcai.2022\/700"},{"key":"10595_CR13","doi-asserted-by":"publisher","unstructured":"Cao H, Wang Y, Chen J, et al (2023) Swin-Unet: Unet-like pure transformer for\u00a0medical image segmentation. 205\u2013218. https:\/\/doi.org\/10.1007\/978-3-031-25066-8_9","DOI":"10.1007\/978-3-031-25066-8_9"},{"key":"10595_CR14","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume":"12346","author":"N Carion","year":"2020","unstructured":"Carion N, Massa F, Synnaeve G et al (2020) End-to-end object detection with transformers. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 12346:213\u2013229. https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2022.108827","volume":"130","author":"H Chen","year":"2022","unstructured":"Chen H, Li C, Wang G et al (2022a) GasHis-transformer: a multi-scale visual transformer approach for gastric histopathological image detection. Pattern Recognit 130:108827. https:\/\/doi.org\/10.1016\/J.PATCOG.2022.108827","journal-title":"Pattern Recognit"},{"key":"10595_CR16","doi-asserted-by":"publisher","first-page":"445","DOI":"10.1016\/J.INFFUS.2022.10.030","volume":"91","author":"J Chen","year":"2023","unstructured":"Chen J, Chen X, Chen S et al (2023a) Shape-former: bridging CNN and transformer via ShapeConv for multimodal image matching. Inf Fusion 91:445\u2013457. https:\/\/doi.org\/10.1016\/J.INFFUS.2022.10.030","journal-title":"Inf Fusion"},{"key":"10595_CR17","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1016\/J.NEUCOM.2023.01.033","volume":"527","author":"J Chen","year":"2023","unstructured":"Chen J, Ding J, Yu Y, Gong W (2023b) THFuse: an infrared and visible image fusion network using transformer and hybrid feature extractor. Neurocomputing 527:71\u201382. https:\/\/doi.org\/10.1016\/J.NEUCOM.2023.01.033","journal-title":"Neurocomputing"},{"key":"10595_CR18","doi-asserted-by":"publisher","first-page":"371","DOI":"10.3390\/RS15020371","volume":"15","author":"J Chen","year":"2023","unstructured":"Chen J, Hong H, Song B et al (2023c) MDCT: multi-Kernel dilated convolution and transformer for one-stage object detection of remote sensing images. Remote Sens 15:371. https:\/\/doi.org\/10.3390\/RS15020371","journal-title":"Remote Sens"},{"key":"10595_CR19","doi-asserted-by":"publisher","first-page":"521","DOI":"10.1016\/J.NEUNET.2023.04.045","volume":"164","author":"J Chen","year":"2023","unstructured":"Chen J, Zhang Y, Pan Y et al (2023d) A transformer-based deep neural network model for SSVEP classification. Neural Netw 164:521\u2013534. https:\/\/doi.org\/10.1016\/J.NEUNET.2023.04.045","journal-title":"Neural Netw"},{"key":"10595_CR20","doi-asserted-by":"crossref","unstructured":"Chen J, Ho CM (2022) MM-ViT: multi-modal video transformer for compressed video action recognition. pp. 1910\u20131921","DOI":"10.1109\/WACV51458.2022.00086"},{"key":"10595_CR21","doi-asserted-by":"publisher","unstructured":"Chen CF, Fan Q, Panda R (2021a) CrossViT: cross-attention multi-scale vision transformer for image classification. Proc IEEE Int Conf Comput Vis 347\u2013356. https:\/\/doi.org\/10.48550\/arxiv.2103.14899","DOI":"10.48550\/arxiv.2103.14899"},{"key":"10595_CR22","doi-asserted-by":"crossref","unstructured":"Chen J, Lu Y, Yu Q et al (2021b) TransUNet: transformers make strong encoders for medical image segmentation","DOI":"10.1109\/IGARSS46834.2022.9883628"},{"key":"10595_CR23","unstructured":"Chen S, Yu T, Li P (2021c) MVT: Multi-view vision transformer for 3D object recognition"},{"key":"10595_CR24","doi-asserted-by":"publisher","unstructured":"Chen Z, Xie L, Niu J et al (2021d) Visformer: the vision-friendly transformer. Proc IEEE Int Conf Comput Vis 569\u2013578. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00063","DOI":"10.1109\/ICCV48922.2021.00063"},{"key":"10595_CR25","doi-asserted-by":"publisher","unstructured":"Chen Z, Zhu Y, Zhao C et al (2021e) DPT: deformable patch-based transformer for visual recognition. MM 2021e Proc 29th ACM Int Conf Multimed 2899\u20132907. https:\/\/doi.org\/10.1145\/3474085.3475467","DOI":"10.1145\/3474085.3475467"},{"key":"10595_CR26","unstructured":"Chen S, Ge C, Tong Z, et al (2022b) Token merging: your ViT but faster"},{"key":"10595_CR27","doi-asserted-by":"crossref","unstructured":"Chen S, Ge C, Tong Z, et al (2022c) AdaptFormer: adapting vision transformers for scalable visual recognition","DOI":"10.1109\/ICCV48922.2021.01205"},{"key":"10595_CR28","unstructured":"Chen S, Ye T, Liu Y, Chen E (2022d) Dual-former: hybrid self-attention transformer for efficient image restoration"},{"key":"10595_CR29","doi-asserted-by":"publisher","unstructured":"Chen Y, Dai X, Chen D, et al (2022e) Mobile-former: bridging MobileNet and transformer. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022e-June:5260\u20135269. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00520","DOI":"10.1109\/CVPR52688.2022.00520"},{"key":"10595_CR30","doi-asserted-by":"crossref","unstructured":"Cheng M, Ma H, Ma Q, et al (2023) Hybrid transformer and CNN attention network for stereo image super-resolution","DOI":"10.1109\/CVPRW59228.2023.00171"},{"key":"10595_CR31","first-page":"9355","volume":"12","author":"X Chu","year":"2021","unstructured":"Chu X, Tian Z, Wang Y et al (2021a) Twins: revisiting the design of spatial attention in vision transformers. Adv Neural Inf Process Syst 12:9355\u20139366","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR32","unstructured":"Chu X, Tian Z, Zhang B et al (2021b) Conditional positional encodings for vision transformers"},{"key":"10595_CR33","doi-asserted-by":"publisher","first-page":"3965","DOI":"10.48550\/arxiv.2106.04803","volume":"5","author":"Z Dai","year":"2021","unstructured":"Dai Z, Liu H, Le QV, Tan M (2021) CoAtNet: marrying convolution and attention for all data sizes. Adv Neural Inf Process Syst 5:3965\u20133977. https:\/\/doi.org\/10.48550\/arxiv.2106.04803","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR34","unstructured":"Dehghani M, Mustafa B, Djolonga J et al (2023) Patch n\u2019 Pack: NaViT, a Vision transformer for any aspect ratio and resolution"},{"key":"10595_CR35","doi-asserted-by":"publisher","DOI":"10.1016\/J.MARPOLBUL.2023.114834","volume":"190","author":"S Dehghani-Dehcheshmeh","year":"2023","unstructured":"Dehghani-Dehcheshmeh S, Akhoondzadeh M, Homayouni S (2023) Oil spills detection from SAR Earth observations based on a hybrid CNN transformer networks. Mar Pollut Bull 190:114834. https:\/\/doi.org\/10.1016\/J.MARPOLBUL.2023.114834","journal-title":"Mar Pollut Bull"},{"key":"10595_CR36","doi-asserted-by":"publisher","DOI":"10.3390\/rs15051219","author":"Y Deng","year":"2023","unstructured":"Deng Y, Meng Y, Chen J et al (2023) TChange: a hybrid transformer-CNN change detection network. Remote Sens. https:\/\/doi.org\/10.3390\/rs15051219","journal-title":"Remote Sens"},{"key":"10595_CR37","unstructured":"Devlin J, Chang MW, Lee K, Toutanova K (2018) BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. NAACL HLT 2019 - 2019 Conf North Am Chapter Assoc Comput Linguist Hum Lang Technol Proc Conf 1:4171\u20134186"},{"key":"10595_CR38","doi-asserted-by":"publisher","first-page":"1132","DOI":"10.1007\/S10489-022-03642-W\/FIGURES\/9","volume":"53","author":"T Dhamija","year":"2023","unstructured":"Dhamija T, Gupta A, Gupta S et al (2023) Semantic segmentation in medical images through transfused convolution and transformer networks. Appl Intell 53:1132\u20131148. https:\/\/doi.org\/10.1007\/S10489-022-03642-W\/FIGURES\/9","journal-title":"Appl Intell"},{"key":"10595_CR39","doi-asserted-by":"publisher","first-page":"1116","DOI":"10.1109\/TMI.2018.2878669","volume":"38","author":"J Dolz","year":"2019","unstructured":"Dolz J, Gopinath K, Yuan J et al (2019) HyperDense-net: a hyper-densely connected CNN for multi-modal image segmentation. IEEE Trans Med Imaging 38:1116\u20131126. https:\/\/doi.org\/10.1109\/TMI.2018.2878669","journal-title":"IEEE Trans Med Imaging"},{"key":"10595_CR40","doi-asserted-by":"publisher","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A et al (2020) An image is worth 16x16 words: transformers for image recognition at scale. https:\/\/doi.org\/10.48550\/arxiv.2010.11929","DOI":"10.48550\/arxiv.2010.11929"},{"key":"10595_CR41","doi-asserted-by":"publisher","unstructured":"Du Y, Liu Z, Li J, Zhao WX (2022) A survey of vision-language pre-trained models. IJCAI Int Jt Conf Artif Intell. 5436\u20135443. https:\/\/doi.org\/10.24963\/ijcai.2022\/762","DOI":"10.24963\/ijcai.2022\/762"},{"key":"10595_CR42","doi-asserted-by":"publisher","DOI":"10.1007\/S10462-023-10455-X\/TABLES\/3","author":"MJ Er","year":"2023","unstructured":"Er MJ, Zhang Y, Chen J, Gao W (2023) Ship detection with deep learning: a survey. Artif Intell Rev. https:\/\/doi.org\/10.1007\/S10462-023-10455-X\/TABLES\/3","journal-title":"Artif Intell Rev"},{"key":"10595_CR43","doi-asserted-by":"publisher","unstructured":"Fan Y, Lu X, Li D, Liu Y (2016) Video-based emotion recognition using CNN-RNN and C3D hybrid networks. ICMI 2016 Proc 18th ACM Int Conf Multimodal Interact 445\u2013450. https:\/\/doi.org\/10.1145\/2993148.2997632","DOI":"10.1145\/2993148.2997632"},{"key":"10595_CR44","doi-asserted-by":"publisher","first-page":"167","DOI":"10.32604\/CMC.2018.02356","volume":"57","author":"W Fang","year":"2018","unstructured":"Fang W, Zhang F, Sheng VS, Ding Y (2018) A method for improving CNN-based image recognition using DCGAN. Comput Mater Contin 57:167\u2013178. https:\/\/doi.org\/10.32604\/CMC.2018.02356","journal-title":"Comput Mater Contin"},{"key":"10595_CR45","doi-asserted-by":"publisher","unstructured":"Fang J, Lin H, Chen X, Zeng K (2022) A hybrid network of CNN and transformer for lightweight image super-resolution. IEEE Comput Soc Conf Comput Vis Pattern Recognit Work 2022-June:1102\u20131111. https:\/\/doi.org\/10.1109\/CVPRW56347.2022.00119","DOI":"10.1109\/CVPRW56347.2022.00119"},{"key":"10595_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/J.ENGFAILANAL.2022.107039","volume":"145","author":"Q Feng","year":"2023","unstructured":"Feng Q, Li F, Li H et al (2023) Hybrid convolution and transformer network for coupler fracture failure pattern segmentation recognition in heavy-haul trains. Eng Fail Anal 145:107039. https:\/\/doi.org\/10.1016\/J.ENGFAILANAL.2022.107039","journal-title":"Eng Fail Anal"},{"key":"10595_CR47","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/J.NEUNET.2021.07.019","volume":"144","author":"S Frolov","year":"2021","unstructured":"Frolov S, Hinz T, Raue F et al (2021) Adversarial text-to-image synthesis: a review. Neural Netw 144:187\u2013209. https:\/\/doi.org\/10.1016\/J.NEUNET.2021.07.019","journal-title":"Neural Netw"},{"key":"10595_CR48","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1007\/978-3-030-87199-4_6\/COVER","volume":"12903","author":"Y Gao","year":"2021","unstructured":"Gao Y, Zhou M, Metaxas DN (2021) UTNet: a hybrid transformer architecture for medical image segmentation. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 12903:61\u201371. https:\/\/doi.org\/10.1007\/978-3-030-87199-4_6\/COVER","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.neunet.2023.02.021","volume":"162","author":"P Gao","year":"2022","unstructured":"Gao P, Yang X, Zhang R et al (2022b) Generalised image outpainting with U-transformer. Neural Netw 162:1\u201310. https:\/\/doi.org\/10.1016\/j.neunet.2023.02.021","journal-title":"Neural Netw"},{"key":"10595_CR50","doi-asserted-by":"publisher","unstructured":"Gao G, Xu Z, Li J et al (2022a) CTCNet: a CNN-transformer cooperation network for face image super-resolution. https:\/\/doi.org\/10.1109\/TIP.2023.3261747","DOI":"10.1109\/TIP.2023.3261747"},{"key":"10595_CR51","first-page":"4193","volume":"34","author":"C Ge","year":"2021","unstructured":"Ge C, Liang Y, Song Y et al (2021) Revitalizing CNN attention via transformers in self-supervised visual representation learning. Adv Neural Inf Process Syst 34:4193\u20134206","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR52","doi-asserted-by":"crossref","unstructured":"Graham B, El-Nouby A, Touvron H et al (2021) LeViT: a vision transformer in convnet\u2019s clothing for faster inference. Proc IEEE Int Conf Comput Vis 12239\u201312249","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"10595_CR53","doi-asserted-by":"publisher","first-page":"515","DOI":"10.3390\/S23010515","volume":"23","author":"H Guo","year":"2023","unstructured":"Guo H, Song M, Ding Z et al (2023) Vision-based efficient robotic manipulation with a dual-streaming compact convolutional transformer. Sensors 23:515. https:\/\/doi.org\/10.3390\/S23010515","journal-title":"Sensors"},{"key":"10595_CR54","doi-asserted-by":"publisher","unstructured":"Guo J, Han K, Wu H, et al (2021) CMT: convolutional neural networks meet vision transformers. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022-June:12165\u201312175. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01186","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"10595_CR55","unstructured":"Habib G, Saleem TJ, Lall B (2023) Knowledge distillation in vision transformers: a critical review"},{"key":"10595_CR56","doi-asserted-by":"publisher","unstructured":"Hampali S, Sarkar SD, Rad M, Lepetit V (2021) Keypoint transformer: solving joint identification in challenging hands and object interactions for accurate 3D pose estimation. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022-June:11080\u201311090. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01081","DOI":"10.1109\/CVPR52688.2022.01081"},{"key":"10595_CR57","first-page":"15908","volume":"19","author":"K Han","year":"2021","unstructured":"Han K, Xiao A, Wu E et al (2021) Transformer in transformer. Adv Neural Inf Process Syst 19:15908\u201315919","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR58","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1109\/TPAMI.2022.3152247","volume":"45","author":"K Han","year":"2023","unstructured":"Han K, Wang Y, Chen H et al (2023) A survey on vision transformer. IEEE Trans Pattern Anal Mach Intell 45:87\u2013110. https:\/\/doi.org\/10.1109\/TPAMI.2022.3152247","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10595_CR59","unstructured":"Hassani A, Walton S, Shah N et al (2021) Escaping the big data paradigm with compact transformers"},{"key":"10595_CR60","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2023.106629","volume":"155","author":"Q He","year":"2023","unstructured":"He Q, Yang Q, Xie M (2023) HCTNet: a hybrid CNN-transformer network for breast ultrasound image segmentation. Comput Biol Med 155:106629. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2023.106629","journal-title":"Comput Biol Med"},{"key":"10595_CR61","doi-asserted-by":"publisher","unstructured":"Heidari M, Kazerouni A, Soltany M et al (2022) HiFormer: hierarchical multi-scale representations using transformers for medical image segmentation. Proc 2023 IEEE Winter Conf Appl Comput Vision, WACV 2023 6191\u20136201. https:\/\/doi.org\/10.1109\/WACV56688.2023.00614","DOI":"10.1109\/WACV56688.2023.00614"},{"key":"10595_CR62","doi-asserted-by":"publisher","first-page":"7512","DOI":"10.1007\/S10489-022-03867-9\/TABLES\/4","volume":"53","author":"YJ Heo","year":"2023","unstructured":"Heo YJ, Yeo WH, Kim BG (2023) DeepFake detection algorithm based on improved vision transformer. Appl Intell 53:7512\u20137527. https:\/\/doi.org\/10.1007\/S10489-022-03867-9\/TABLES\/4","journal-title":"Appl Intell"},{"key":"10595_CR63","doi-asserted-by":"publisher","unstructured":"Heo B, Yun S, Han D et al (2021) Rethinking spatial dimensions of vision transformers. Proc IEEE Int Conf Comput Vis 11916\u201311925. https:\/\/doi.org\/10.48550\/arxiv.2103.16302","DOI":"10.48550\/arxiv.2103.16302"},{"key":"10595_CR64","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1016\/J.INS.2021.08.043","volume":"580","author":"Q Huang","year":"2021","unstructured":"Huang Q, Huang C, Wang X, Jiang F (2021a) Facial expression recognition with grid-wise attention and visual transformer. Inf Sci (NY) 580:35\u201354. https:\/\/doi.org\/10.1016\/J.INS.2021.08.043","journal-title":"Inf Sci (NY)"},{"key":"10595_CR65","doi-asserted-by":"publisher","DOI":"10.1117\/1.JRS.17.026510","volume":"17","author":"K Huang","year":"2023","unstructured":"Huang K, Wen M, Wang C, Ling L (2023a) FPDT: a multi-scale feature pyramidal object detection transformer. J Appl Remote Sensing 17:026510. https:\/\/doi.org\/10.1117\/1.JRS.17.026510","journal-title":"J Appl Remote Sensing"},{"key":"10595_CR66","doi-asserted-by":"publisher","first-page":"386","DOI":"10.1016\/J.BBE.2023.02.002","volume":"43","author":"X Huang","year":"2023","unstructured":"Huang X, Chen J, Chen M et al (2023b) FRE-Net: full-region enhanced network for nuclei segmentation in histopathology images. Biocybern Biomed Eng 43:386\u2013401. https:\/\/doi.org\/10.1016\/J.BBE.2023.02.002","journal-title":"Biocybern Biomed Eng"},{"key":"10595_CR67","unstructured":"Huang J, Zhu Z, Huang G (2019) Multi-stage HRNet: multiple stage high-resolution network for human pose estimation"},{"key":"10595_CR68","unstructured":"Huang Z, Ben Y, Luo G et al (2021b) Shuffle transformer: rethinking spatial shuffle for vision transformer"},{"key":"10595_CR69","unstructured":"Islam MA, Kowal M, Jia S, et al (2021) Position, padding and predictions: a deeper look at position information in CNNs. ArXiv"},{"key":"10595_CR70","unstructured":"Islam K (2022) Recent advances in vision transformer: a survey and outlook of recent work"},{"key":"10595_CR71","doi-asserted-by":"publisher","DOI":"10.1016\/J.JAG.2023.103333","volume":"120","author":"A Jamali","year":"2023","unstructured":"Jamali A, Roy SK, Ghamisi P (2023) WetMapFormer: a unified deep CNN and vision transformer for complex wetland mapping. Int J Appl Earth Obs Geoinf 120:103333. https:\/\/doi.org\/10.1016\/J.JAG.2023.103333","journal-title":"Int J Appl Earth Obs Geoinf"},{"key":"10595_CR72","doi-asserted-by":"publisher","first-page":"421","DOI":"10.1007\/S11633-022-1394-4\/METRICS","volume":"20","author":"GP Ji","year":"2023","unstructured":"Ji GP, Zhuge M, Gao D et al (2023) Masked vision-language transformer in fashion. Mach Intell Res 20:421\u2013434. https:\/\/doi.org\/10.1007\/S11633-022-1394-4\/METRICS","journal-title":"Mach Intell Res"},{"key":"10595_CR73","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2022.106207","volume":"150","author":"S Jiang","year":"2022","unstructured":"Jiang S, Li J (2022) TransCUNet: UNet cross fused transformer for medical image segmentation. Comput Biol Med 150:106207. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2022.106207","journal-title":"Comput Biol Med"},{"key":"10595_CR74","first-page":"14745","volume":"18","author":"Y Jiang","year":"2021","unstructured":"Jiang Y, Chang S, Wang Z (2021) TransGAN: two pure transformers can make one strong GAN, and that can scale up. Adv Neural Inf Process Syst 18:14745\u201314758","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR75","doi-asserted-by":"publisher","DOI":"10.1016\/J.JVCIR.2022.103664","volume":"89","author":"K Jiang","year":"2022","unstructured":"Jiang K, Peng P, Lian Y, Xu W (2022) The encoding method of position embeddings in vision transformer. J vis Commun Image Represent 89:103664. https:\/\/doi.org\/10.1016\/J.JVCIR.2022.103664","journal-title":"J vis Commun Image Represent"},{"key":"10595_CR76","doi-asserted-by":"publisher","unstructured":"Jiang A, Yan N, Wang F et al (2019) Visible image recognition of power transformer equipment based on mask R-CNN. iSPEC 2019\u20132019 IEEE Sustain Power Energy Conf Grid Mod Energy Revolution, Proc 657\u2013661. https:\/\/doi.org\/10.1109\/ISPEC48194.2019.8975213","DOI":"10.1109\/ISPEC48194.2019.8975213"},{"key":"10595_CR77","doi-asserted-by":"publisher","unstructured":"Jin W, Yu H, Luo X (2021) CvT-ASSD: convolutional vision-transformer based attentive single shot MultiBox detector. Proc Int Conf Tools with Artif Intell ICTAI 2021:736\u2013744. https:\/\/doi.org\/10.1109\/ICTAI52525.2021.00117","DOI":"10.1109\/ICTAI52525.2021.00117"},{"key":"10595_CR78","doi-asserted-by":"publisher","unstructured":"Jing Y, Wang F (2022) TP-VIT: a two-pathway vision transformer for video action recognition. ICASSP, IEEE Int Conf Acoust Speech Signal Process\u2014Proc 2022-May:2185\u20132189. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747276","DOI":"10.1109\/ICASSP43922.2022.9747276"},{"key":"10595_CR79","doi-asserted-by":"publisher","unstructured":"Jing T, Meng Q-H, Hou H-R (2023) SmokeSeger: a transformer-CNN coupled model for urban scene smoke segmentation. IEEE Trans Ind Informatics 1\u201312. https:\/\/doi.org\/10.1109\/TII.2023.3271441","DOI":"10.1109\/TII.2023.3271441"},{"key":"10595_CR80","doi-asserted-by":"publisher","unstructured":"Kanwal N, Eftest\u00f8l T, Khoraminia F et al (2023) Vision transformers for\u00a0small histological datasets learned through knowledge distillation. 167\u2013179. https:\/\/doi.org\/10.1007\/978-3-031-33380-4_13","DOI":"10.1007\/978-3-031-33380-4_13"},{"key":"10595_CR81","doi-asserted-by":"publisher","unstructured":"Karras T, Laine S, Aittala M et al (2019) Analyzing and improving the image quality of StyleGAN. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 8107\u20138116. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00813","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"10595_CR82","doi-asserted-by":"publisher","DOI":"10.1016\/J.NEURI.2021.100035","volume":"2","author":"G Kaur","year":"2022","unstructured":"Kaur G, Sinha R, Tiwari PK et al (2022) Face mask recognition system using CNN model. Neurosci Inform 2:100035. https:\/\/doi.org\/10.1016\/J.NEURI.2021.100035","journal-title":"Neurosci Inform"},{"key":"10595_CR83","doi-asserted-by":"publisher","DOI":"10.1016\/J.MEDIA.2023.102758","volume":"85","author":"J Ke","year":"2023","unstructured":"Ke J, Lu Y, Shen Y et al (2023) ClusterSeg: a crowd cluster pinpointed nucleus segmentation framework with cross-modality datasets. Med Image Anal 85:102758. https:\/\/doi.org\/10.1016\/J.MEDIA.2023.102758","journal-title":"Med Image Anal"},{"key":"10595_CR84","doi-asserted-by":"publisher","first-page":"5455","DOI":"10.1007\/s10462-020-09825-6","volume":"53","author":"A Khan","year":"2020","unstructured":"Khan A, Sohail A, Zahoora U, Qureshi AS (2020) A survey of the recent architectures of deep convolutional neural networks. Artif Intell Rev 53:5455\u20135516. https:\/\/doi.org\/10.1007\/s10462-020-09825-6","journal-title":"Artif Intell Rev"},{"key":"10595_CR85","doi-asserted-by":"publisher","first-page":"1745","DOI":"10.1111\/coin.12459","volume":"37","author":"A Khan","year":"2021","unstructured":"Khan A, Qureshi AS, Wahab N et al (2021a) A recent survey on the applications of genetic programming in image processing. Comput Intell 37:1745\u20131778. https:\/\/doi.org\/10.1111\/coin.12459","journal-title":"Comput Intell"},{"key":"10595_CR86","doi-asserted-by":"publisher","DOI":"10.1145\/3505244","author":"S Khan","year":"2021","unstructured":"Khan S, Naseer M, Hayat M et al (2021b) Transformers in vision: a survey. ACM Comput Surv. https:\/\/doi.org\/10.1145\/3505244","journal-title":"ACM Comput Surv"},{"key":"10595_CR87","doi-asserted-by":"publisher","DOI":"10.1093\/JMICRO\/DFAC027","author":"SH Khan","year":"2022","unstructured":"Khan SH, Shah NS, Nuzhat R et al (2022) Malaria parasite classification framework using a novel channel squeezed and boosted CNN. Microscopy. https:\/\/doi.org\/10.1093\/JMICRO\/DFAC027","journal-title":"Microscopy"},{"key":"10595_CR88","doi-asserted-by":"publisher","DOI":"10.1080\/0952813X.2023.2165724","author":"A Khan","year":"2023","unstructured":"Khan A, Khan SH, Saif M et al (2023) A survey of deep learning techniques for the analysis of COVID-19 and their usability for detecting omicron. J Exp Theor Artif Intell. https:\/\/doi.org\/10.1080\/0952813X.2023.2165724","journal-title":"J Exp Theor Artif Intell"},{"key":"10595_CR89","doi-asserted-by":"crossref","unstructured":"Khan SH, Khan A, Lee YS et al (2021c) Segmentation of shoulder muscle MRI using a new region and edge based deep auto-encoder","DOI":"10.1007\/s11042-022-14061-x"},{"key":"10595_CR90","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2023.109659","volume":"141","author":"BJ Kim","year":"2023","unstructured":"Kim BJ, Choi H, Jang H et al (2023) Improved robustness of vision transformers via prelayernorm in patch embedding. Pattern Recognit 141:109659. https:\/\/doi.org\/10.1016\/J.PATCOG.2023.109659","journal-title":"Pattern Recognit"},{"key":"10595_CR91","doi-asserted-by":"crossref","unstructured":"Kirillov A, Mintun E, Ravi N et al (2023) Segment anything","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"10595_CR92","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1162\/NECO.1989.1.4.541","volume":"1","author":"Y LeCun","year":"1989","unstructured":"LeCun Y, Boser B, Denker JS et al (1989) Backpropagation applied to handwritten zip code recognition. Neural Comput 1:541\u2013551. https:\/\/doi.org\/10.1162\/NECO.1989.1.4.541","journal-title":"Neural Comput"},{"key":"10595_CR93","unstructured":"Lee K, Chang H, Jiang L et al (2021a) ViTGAN: training gans with vision transformers"},{"key":"10595_CR94","doi-asserted-by":"publisher","unstructured":"Lee Y, Kim J, Willette J, Hwang SJ (2021b) MPViT: multi-path vision transformer for dense prediction. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022:7277\u20137286. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00714","DOI":"10.1109\/CVPR52688.2022.00714"},{"key":"10595_CR95","unstructured":"Leong MC, Zhang H, Tan HL et al (2022) Combined CNN transformer encoder for enhanced fine-grained human action recognition"},{"key":"10595_CR96","doi-asserted-by":"publisher","first-page":"884","DOI":"10.3390\/AGRICULTURE12060884","volume":"12","author":"X Li","year":"2022","unstructured":"Li X, Li S (2022a) Transformer help CNN see better: a lightweight hybrid apple disease identification model based on transformers. Agriculture 12:884. https:\/\/doi.org\/10.3390\/AGRICULTURE12060884","journal-title":"Agriculture"},{"key":"10595_CR97","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3164083","author":"Y Li","year":"2021","unstructured":"Li Y, Yao T, Pan Y, Mei T (2021b) Contextual transformer networks for visual recognition. IEEE Trans Pattern Anal Mach Intell. https:\/\/doi.org\/10.1109\/TPAMI.2022.3164083","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10595_CR98","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1007\/978-3-031-15937-4_65\/COVER","volume":"13532","author":"Z Li","year":"2022","unstructured":"Li Z, Li D, Xu C et al (2022b) TFCNs: a CNN-transformer hybrid network for\u00a0medical image segmentation. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13532:781\u2013792. https:\/\/doi.org\/10.1007\/978-3-031-15937-4_65\/COVER","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR99","doi-asserted-by":"publisher","DOI":"10.1016\/J.BSPC.2023.104636","volume":"83","author":"G Li","year":"2023","unstructured":"Li G, Chen R, Zhang J et al (2023a) Fusing enhanced transformer and large kernel CNN for malignant thyroid nodule segmentation. Biomed Signal Process Control 83:104636. https:\/\/doi.org\/10.1016\/J.BSPC.2023.104636","journal-title":"Biomed Signal Process Control"},{"key":"10595_CR100","doi-asserted-by":"publisher","DOI":"10.1016\/J.JVCIR.2022.103692","volume":"90","author":"G Li","year":"2023","unstructured":"Li G, Yao H, Le Y, Qin C (2023b) Recaptured screen image identification based on vision transformer. J vis Commun Image Represent 90:103692. https:\/\/doi.org\/10.1016\/J.JVCIR.2022.103692","journal-title":"J vis Commun Image Represent"},{"key":"10595_CR101","doi-asserted-by":"publisher","DOI":"10.1016\/J.MEDIA.2023.102762","volume":"85","author":"J Li","year":"2023","unstructured":"Li J, Chen J, Tang Y et al (2023c) Transforming medical imaging with Transformers? A comparative review of key properties, current progresses, and future perspectives. Med Image Anal 85:102762. https:\/\/doi.org\/10.1016\/J.MEDIA.2023.102762","journal-title":"Med Image Anal"},{"key":"10595_CR102","doi-asserted-by":"publisher","first-page":"361","DOI":"10.3390\/RS15020361","volume":"15","author":"J Li","year":"2023","unstructured":"Li J, Du Q, Li W et al (2023d) MCAFNet: a multiscale channel attention fusion network for semantic segmentation of remote sensing images. Remote Sens 15:361. https:\/\/doi.org\/10.3390\/RS15020361","journal-title":"Remote Sens"},{"key":"10595_CR103","doi-asserted-by":"publisher","DOI":"10.1016\/J.JVCIR.2023.103800","volume":"92","author":"R Li","year":"2023","unstructured":"Li R, Mai Z, Zhang Z et al (2023e) TransCAM: transformer attention-based CAM refinement for weakly supervised semantic segmentation. J vis Commun Image Represent 92:103800. https:\/\/doi.org\/10.1016\/J.JVCIR.2023.103800","journal-title":"J vis Commun Image Represent"},{"key":"10595_CR104","doi-asserted-by":"publisher","DOI":"10.1016\/J.JKSUCI.2022.09.013","volume":"35","author":"X Li","year":"2023","unstructured":"Li X, Li X, Zhang S et al (2023f) SLViT: shuffle-convolution-based lightweight vision transformer for effective diagnosis of sugarcane leaf diseases. J King Saud Univ Comput Inf Sci 35:101401. https:\/\/doi.org\/10.1016\/J.JKSUCI.2022.09.013","journal-title":"J King Saud Univ Comput Inf Sci"},{"key":"10595_CR105","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPAG.2023.107651","volume":"205","author":"X Li","year":"2023","unstructured":"Li X, Xiang Y, Li S (2023g) Combining convolutional and vision transformer structures for sheep face recognition. Comput Electron Agric 205:107651. https:\/\/doi.org\/10.1016\/J.COMPAG.2023.107651","journal-title":"Comput Electron Agric"},{"key":"10595_CR106","doi-asserted-by":"publisher","unstructured":"Li C, Tang T, Wang G, et al (2021a) BossNAS: exploring hybrid CNN-transformers with block-wisely self-supervised neural architecture search. Proc IEEE Int Conf Comput Vis 12261\u201312271. https:\/\/doi.org\/10.48550\/arxiv.2103.12424","DOI":"10.48550\/arxiv.2103.12424"},{"key":"10595_CR107","doi-asserted-by":"publisher","unstructured":"Li Y, Zhang K, Cao J et al (2021c) LocalViT: bringing locality to vision transformers. https:\/\/doi.org\/10.48550\/arxiv.2104.05707","DOI":"10.48550\/arxiv.2104.05707"},{"key":"10595_CR108","doi-asserted-by":"publisher","unstructured":"Li Y, Zhang S, Wang Z et al (2021d) TokenPose: Learning Keypoint Tokens for Human Pose Estimation. Proc IEEE Int Conf Comput Vis 11293\u201311302. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01112","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"10595_CR109","doi-asserted-by":"publisher","first-page":"230","DOI":"10.3390\/UNIVERSE9050230","volume":"9","author":"J Lian","year":"2023","unstructured":"Lian J, Liu T, Zhou Y et al (2023) Aurora classification in all-sky images via CNN-transformer. Universe 9:230. https:\/\/doi.org\/10.3390\/UNIVERSE9050230","journal-title":"Universe"},{"key":"10595_CR110","doi-asserted-by":"publisher","first-page":"2754","DOI":"10.1080\/01431161.2023.2208711","volume":"44","author":"S Liang","year":"2023","unstructured":"Liang S, Hua Z, Li J (2023) Hybrid transformer-CNN networks using superpixel segmentation for remote sensing building change detection. Int J Remote Sensing 44:2754\u20132780. https:\/\/doi.org\/10.1080\/01431161.2023.2208711","journal-title":"Int J Remote Sensing"},{"key":"10595_CR111","doi-asserted-by":"publisher","unstructured":"Lin S, Xie H, Wang B et al (2022) Knowledge distillation via the target-aware transformer. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022-June:10905\u201310914. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01064","DOI":"10.1109\/CVPR52688.2022.01064"},{"key":"10595_CR112","doi-asserted-by":"publisher","first-page":"1089","DOI":"10.1007\/s10462-018-9641-3","volume":"52","author":"X Liu","year":"2018","unstructured":"Liu X, Deng Z, Yang Y (2018) Recent progress in semantic image segmentation. Artif Intell Rev 52:1089\u20131106. https:\/\/doi.org\/10.1007\/s10462-018-9641-3","journal-title":"Artif Intell Rev"},{"key":"10595_CR113","doi-asserted-by":"publisher","DOI":"10.1016\/J.ENGAPPAI.2023.106184","volume":"123","author":"J Liu","year":"2023","unstructured":"Liu J, Li H, Kong W (2023a) Multi-level learning counting via pyramid vision transformer and CNN. Eng Appl Artif Intell 123:106184. https:\/\/doi.org\/10.1016\/J.ENGAPPAI.2023.106184","journal-title":"Eng Appl Artif Intell"},{"key":"10595_CR114","unstructured":"Liu Y, Wu Y-H, Sun G et al (2021a) Vision transformers with hierarchical attention"},{"key":"10595_CR115","doi-asserted-by":"publisher","unstructured":"Liu Y, Zhang YY, Wang Y et al (2021b) A survey of visual transformers. IEEE Transactions on Neural Networks and Learning Systems. pp. 1-21. https:\/\/doi.org\/10.1109\/TNNLS.2022.3227717","DOI":"10.1109\/TNNLS.2022.3227717"},{"key":"10595_CR116","doi-asserted-by":"publisher","unstructured":"Liu Z, Lin Y, Cao Y et al (2021c) Swin transformer: hierarchical vision transformer using shifted windows. Proc IEEE Int Conf Comput Vis 9992\u201310002. https:\/\/doi.org\/10.48550\/arxiv.2103.14030","DOI":"10.48550\/arxiv.2103.14030"},{"key":"10595_CR117","unstructured":"Liu Y, Ong N, Peng K et al (2023b) MMViT: multiscale multiview vision transformers"},{"key":"10595_CR118","doi-asserted-by":"publisher","first-page":"5288","DOI":"10.3390\/S23115288","volume":"23","author":"T Lu","year":"2023","unstructured":"Lu T, Wan L, Qi S, Gao M (2023a) Land cover classification of UAV remote sensing based on transformer\u2013CNN hybrid architecture. Sensors 23:5288. https:\/\/doi.org\/10.3390\/S23115288","journal-title":"Sensors"},{"key":"10595_CR119","doi-asserted-by":"publisher","first-page":"1211","DOI":"10.1109\/JSTARS.2023.3234161","volume":"16","author":"W Lu","year":"2023","unstructured":"Lu W, Lan C, Niu C et al (2023b) A CNN-transformer hybrid model based on CSWin transformer for UAV image object detection. IEEE J Sel Top Appl Earth Obs Remote Sens 16:1211\u20131231. https:\/\/doi.org\/10.1109\/JSTARS.2023.3234161","journal-title":"IEEE J Sel Top Appl Earth Obs Remote Sens"},{"key":"10595_CR120","doi-asserted-by":"publisher","DOI":"10.1016\/J.MEDIA.2023.102760","volume":"85","author":"J Lyu","year":"2023","unstructured":"Lyu J, Li G, Wang C et al (2023) Region-focused multi-view transformer-based generative adversarial network for cardiac cine MRI reconstruction. Med Image Anal 85:102760. https:\/\/doi.org\/10.1016\/J.MEDIA.2023.102760","journal-title":"Med Image Anal"},{"key":"10595_CR121","doi-asserted-by":"publisher","first-page":"1236","DOI":"10.1109\/TAFFC.2021.3122146","volume":"14","author":"F Ma","year":"2023","unstructured":"Ma F, Sun B, Li S (2023a) Facial expression recognition with visual transformers and attentional selective fusion. IEEE Trans Affect Comput 14:1236\u20131248. https:\/\/doi.org\/10.1109\/TAFFC.2021.3122146","journal-title":"IEEE Trans Affect Comput"},{"key":"10595_CR122","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2022.106533","volume":"153","author":"Z Ma","year":"2023","unstructured":"Ma Z, Qi Y, Xu C et al (2023b) ATFE-Net: axial transformer and feature enhancement-based CNN for ultrasound breast mass segmentation. Comput Biol Med 153:106533. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2022.106533","journal-title":"Comput Biol Med"},{"key":"10595_CR123","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-031-25082-8_1\/COVER","volume":"13807","author":"M Maaz","year":"2023","unstructured":"Maaz M, Shaker A, Cholakkal H et al (2023) EdgeNeXt: efficiently amalgamated CNN-transformer architecture for mobile vision applications. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13807:3\u201320. https:\/\/doi.org\/10.1007\/978-3-031-25082-8_1\/COVER","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR124","doi-asserted-by":"publisher","unstructured":"Maaz M, Shaker A, Cholakkal H et al (2022) EdgeNeXt: efficiently amalgamated CNN-transformer architecture for mobile vision applications. https:\/\/doi.org\/10.48550\/arxiv.2206.10589","DOI":"10.48550\/arxiv.2206.10589"},{"key":"10595_CR125","doi-asserted-by":"crossref","unstructured":"Mao W, Ge Y, Shen C, et al (2021) TFPose: direct human pose estimation with transformers","DOI":"10.1007\/978-3-031-20068-7_5"},{"key":"10595_CR126","doi-asserted-by":"crossref","unstructured":"Mathian E, Liu H, Fernandez-Cuesta L et al (2022) HaloAE: an halonet based local transformer auto-encoder for anomaly detection and localization","DOI":"10.5220\/0011865900003417"},{"key":"10595_CR127","doi-asserted-by":"publisher","first-page":"5521","DOI":"10.3390\/APP13095521","volume":"13","author":"J Maur\u00edcio","year":"2023","unstructured":"Maur\u00edcio J, Domingues I, Bernardino J (2023) Comparing vision transformers and convolutional neural networks for image classification: a literature review. Appl Sci 13:5521. https:\/\/doi.org\/10.3390\/APP13095521","journal-title":"Appl Sci"},{"key":"10595_CR128","doi-asserted-by":"publisher","first-page":"3809","DOI":"10.3390\/S23083809","volume":"23","author":"JN Mogan","year":"2023","unstructured":"Mogan JN, Lee CP, Lim KM et al (2023) Gait-CNN-ViT: multi-model gait recognition with convolutional neural networks and vision transformer. Sensors 23:3809. https:\/\/doi.org\/10.3390\/S23083809","journal-title":"Sensors"},{"key":"10595_CR129","unstructured":"Morra L, Piano L, Lamberti F, Tommasi T (2020) Bridging the gap between natural and medical images through deep colorization. In: Proceedings\u2014International Conference on Pattern Recognition"},{"key":"10595_CR130","doi-asserted-by":"publisher","first-page":"734","DOI":"10.3390\/S23020734","volume":"23","author":"O Moutik","year":"2023","unstructured":"Moutik O, Sekkat H, Tigani S et al (2023) Convolutional neural networks or vision transformers: who will win the race for action recognitions in visual data? Sensors 23:734. https:\/\/doi.org\/10.3390\/S23020734","journal-title":"Sensors"},{"key":"10595_CR131","doi-asserted-by":"publisher","first-page":"1489","DOI":"10.3390\/MATH11061489","volume":"11","author":"SI Nafisah","year":"2023","unstructured":"Nafisah SI, Muhammad G, Hossain MS, AlQahtani SA (2023) A comparative evaluation between convolutional neural networks and vision transformers for COVID-19 detection. Mathematics 11:1489. https:\/\/doi.org\/10.3390\/MATH11061489","journal-title":"Mathematics"},{"key":"10595_CR132","doi-asserted-by":"publisher","DOI":"10.1016\/J.IMAVIS.2021.104284","volume":"115","author":"S Naveen","year":"2021","unstructured":"Naveen S, Ram Kiran MSS, Indupriya M et al (2021) Transformer models for enhancing AttnGAN based text to image generation. Image vis Comput 115:104284. https:\/\/doi.org\/10.1016\/J.IMAVIS.2021.104284","journal-title":"Image vis Comput"},{"key":"10595_CR133","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1007\/978-3-031-17266-3_5\/COVER","volume":"13574","author":"A Obeid","year":"2022","unstructured":"Obeid A, Mahbub T, Javed S et al (2022) NucDETR: end-to-end transformer for nucleus detection in histopathology images. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13574:47\u201357. https:\/\/doi.org\/10.1007\/978-3-031-17266-3_5\/COVER","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR134","doi-asserted-by":"crossref","unstructured":"Pan X, Ge C, Lu R et al (2022) On the Integration of Self-Attention and Convolution. 815\u2013825","DOI":"10.1109\/CVPR52688.2022.00089"},{"key":"10595_CR135","doi-asserted-by":"publisher","unstructured":"Parmar N, Vaswani A, Uszkoreit J et al (2018) Image transformer. 35th Int Conf Mach Learn ICML 2018 9:6453\u20136462. https:\/\/doi.org\/10.48550\/arxiv.1802.05751","DOI":"10.48550\/arxiv.1802.05751"},{"key":"10595_CR136","doi-asserted-by":"publisher","first-page":"419","DOI":"10.3390\/INFO13090419","volume":"13","author":"R Pecoraro","year":"2022","unstructured":"Pecoraro R, Basile V, Bono V (2022) Local multi-head channel self-attention for facial expression recognition. Information 13:419. https:\/\/doi.org\/10.3390\/INFO13090419","journal-title":"Information"},{"key":"10595_CR137","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3243048","author":"Z Peng","year":"2023","unstructured":"Peng Z, Guo Z, Huang W et al (2023) Conformer: local features coupling global representations for recognition and detection. IEEE Trans Pattern Anal Mach Intell. https:\/\/doi.org\/10.1109\/TPAMI.2023.3243048","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10595_CR138","doi-asserted-by":"publisher","unstructured":"Peng Z, Huang W, Gu S, et al (2021) Conformer: local features coupling global representations for visual recognition. Proc IEEE Int Conf Comput Vis 357\u2013366. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00042","DOI":"10.1109\/ICCV48922.2021.00042"},{"key":"10595_CR139","doi-asserted-by":"publisher","first-page":"10957","DOI":"10.1007\/S00521-023-08277-7\/TABLES\/7","volume":"35","author":"J Quan","year":"2023","unstructured":"Quan J, Ge B, Wang M (2023) CrackViT: a unified CNN-transformer model for pixel-level crack extraction. Neural Comput Appl 35:10957\u201310973. https:\/\/doi.org\/10.1007\/S00521-023-08277-7\/TABLES\/7","journal-title":"Neural Comput Appl"},{"key":"10595_CR140","doi-asserted-by":"publisher","DOI":"10.1007\/S10462-023-10414-6","author":"G Rafiq","year":"2023","unstructured":"Rafiq G, Rafiq M, Gyu GS et al (2023) Video description: a comprehensive survey of deep learning approaches. Artif Intell Rev. https:\/\/doi.org\/10.1007\/S10462-023-10414-6","journal-title":"Artif Intell Rev"},{"key":"10595_CR141","doi-asserted-by":"publisher","first-page":"10881","DOI":"10.1109\/ACCESS.2023.3241334","volume":"11","author":"S Raghavendra","year":"2023","unstructured":"Raghavendra S, Ramyashree ASK et al (2023) Efficient deep learning approach to recognize person attributes by using hybrid transformers for surveillance scenarios. IEEE Access 11:10881\u201310893. https:\/\/doi.org\/10.1109\/ACCESS.2023.3241334","journal-title":"IEEE Access"},{"key":"10595_CR142","doi-asserted-by":"crossref","unstructured":"Ranftl R, Bochkovskiy A, Koltun V (2021) Vision transformers for dense prediction. Proc IEEE Int Conf Comput Vis 12159\u201312168","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"10595_CR143","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3273451","author":"D Rao","year":"2022","unstructured":"Rao D, Wu X-J, Xu T (2022) TGFuse: an infrared and visible image fusion approach based on transformer and generative adversarial network. IEEE Trans Image Process. https:\/\/doi.org\/10.1109\/TIP.2023.3273451","journal-title":"IEEE Trans Image Process"},{"key":"10595_CR144","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1093\/jmicro\/dfac051","volume":"72","author":"Z Rauf","year":"2023","unstructured":"Rauf Z, Sohail A, Khan SH et al (2023) Attention-guided multi-scale deep object detection framework for lymphocyte analysis in IHC histological images. Reprod Syst Sex Disord 72:27\u201342. https:\/\/doi.org\/10.1093\/jmicro\/dfac051","journal-title":"Reprod Syst Sex Disord"},{"key":"10595_CR145","unstructured":"Rehman A, Khan A (2023) MaxViT-UNet: multi-axis attention for medical image segmentation. arXiv Prepr arXiv230508396"},{"key":"10595_CR146","doi-asserted-by":"publisher","unstructured":"Ren P, Li C, Wang G et al (2022) Beyond fixation: dynamic window visual transformer. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022-June:11977\u201311987. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01168","DOI":"10.1109\/CVPR52688.2022.01168"},{"key":"10595_CR147","doi-asserted-by":"publisher","DOI":"10.1016\/J.MEASUREMENT.2023.112961","volume":"216","author":"ST Seydi","year":"2023","unstructured":"Seydi ST, Sadegh M (2023) Improved burned area mapping using monotemporal Landsat-9 imagery and convolutional shift-transformer. Measurement 216:112961. https:\/\/doi.org\/10.1016\/J.MEASUREMENT.2023.112961","journal-title":"Measurement"},{"key":"10595_CR148","doi-asserted-by":"publisher","first-page":"93","DOI":"10.3390\/DRONES7020093","volume":"7","author":"MBA Shafri","year":"2023","unstructured":"Shafri MBA, Al-Ruzouq HZM, Shanableh R et al (2023) Large-scale date palm tree segmentation from multiscale UAV-based and aerial images using deep vision transformers. Drones 7:93. https:\/\/doi.org\/10.3390\/DRONES7020093","journal-title":"Drones"},{"key":"10595_CR149","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.102802","author":"F Shamshad","year":"2023","unstructured":"Shamshad F, Khan S, Zamir SW et al (2023) Transformers in medical imaging: a survey. Med Image Anal. https:\/\/doi.org\/10.1016\/j.media.2023.102802","journal-title":"Med Image Anal"},{"key":"10595_CR150","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPMEDIMAG.2022.102055","volume":"97","author":"X Shen","year":"2022","unstructured":"Shen X, Xu J, Jia H et al (2022) Self-attentional microvessel segmentation via squeeze-excitation transformer Unet. Comput Med Imaging Graph 97:102055. https:\/\/doi.org\/10.1016\/J.COMPMEDIMAG.2022.102055","journal-title":"Comput Med Imaging Graph"},{"key":"10595_CR151","doi-asserted-by":"publisher","first-page":"200","DOI":"10.1016\/J.PATREC.2022.11.023","volume":"166","author":"R Shi","year":"2023","unstructured":"Shi R, Yang S, Chen Y et al (2023) CNN-transformer for visual-tactile fusion applied in road recognition of autonomous vehicles. Pattern Recognit Lett 166:200\u2013208. https:\/\/doi.org\/10.1016\/J.PATREC.2022.11.023","journal-title":"Pattern Recognit Lett"},{"key":"10595_CR152","unstructured":"Si C, Yu W, Zhou P et al (2022) Inception transformer"},{"key":"10595_CR153","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. 3rd Int Conf Learn Represent ICLR 2015\u2014Conf Track Proc"},{"key":"10595_CR154","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2021.102121","volume":"72","author":"A Sohail","year":"2021","unstructured":"Sohail A, Khan A, Nisar H et al (2021a) Mitotic nuclei analysis in breast cancer histopathology images using deep ensemble classifier. Med Image Anal 72:102121. https:\/\/doi.org\/10.1016\/j.media.2021.102121","journal-title":"Med Image Anal"},{"key":"10595_CR155","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2021.102121","volume":"72","author":"A Sohail","year":"2021","unstructured":"Sohail A, Khan A, Nisar H et al (2021b) Mitotic nuclei analysis in breast cancer histopathology images using deep ensemble classifier. Med Image Anal 72:102121. https:\/\/doi.org\/10.1016\/j.media.2021.102121","journal-title":"Med Image Anal"},{"key":"10595_CR156","doi-asserted-by":"publisher","first-page":"18201","DOI":"10.1007\/S10489-022-03472-W\/TABLES\/3","volume":"52","author":"L Song","year":"2022","unstructured":"Song L, Liu G, Ma M (2022a) TD-Net:unsupervised medical image registration network based on transformer and CNN. Appl Intell 52:18201\u201318209. https:\/\/doi.org\/10.1007\/S10489-022-03472-W\/TABLES\/3","journal-title":"Appl Intell"},{"key":"10595_CR157","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3256763","author":"Y Song","year":"2023","unstructured":"Song Y, He Z, Qian H, Du X (2023) Vision transformers for single image dehazing. IEEE Trans Image Process. https:\/\/doi.org\/10.1109\/TIP.2023.3256763","journal-title":"IEEE Trans Image Process"},{"key":"10595_CR158","doi-asserted-by":"publisher","unstructured":"Song Z, Yu J, Chen YPP, Yang W (2022b) Transformer tracking with cyclic shifting window attention. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 2022b-June:8781\u20138790. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00859","DOI":"10.1109\/CVPR52688.2022.00859"},{"key":"10595_CR159","unstructured":"Springenberg M, Frommholz A, Wenzel M et al (2022) From CNNs to vision transformers\u2014a comprehensive evaluation of deep learning models for histopathology"},{"key":"10595_CR160","doi-asserted-by":"publisher","unstructured":"Srinivas A, Lin TY, Parmar N, et al (2021) Bottleneck transformers for visual recognition. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit. pp. 16514\u201316524. https:\/\/doi.org\/10.1109\/CVPR46437.2021.01625","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"10595_CR161","unstructured":"Stoffl L, Vidal M, Mathis A (2021) End-to-end trainable multi-instance pose estimation with transformers"},{"key":"10595_CR162","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2023.109443","volume":"139","author":"W Su","year":"2023","unstructured":"Su W, Wang Y, Li K et al (2023) Hybrid token transformer for deep face recognition. Pattern Recognit 139:109443. https:\/\/doi.org\/10.1016\/J.PATCOG.2023.109443","journal-title":"Pattern Recognit"},{"key":"10595_CR163","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3231725","author":"S Sun","year":"2022","unstructured":"Sun S, Yue X, Zhao H et al (2022) Patch-based separable transformer for visual recognition. IEEE Trans Pattern Anal Mach Intell. https:\/\/doi.org\/10.1109\/TPAMI.2022.3231725","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10595_CR164","doi-asserted-by":"crossref","unstructured":"Sun K, Xiao B, Liu D, Wang J (2019) Deep high-resolution representation learning for human pose estimation. pp. 5693\u20135703","DOI":"10.1109\/CVPR.2019.00584"},{"key":"10595_CR165","unstructured":"Tan M, Le Q V. (2019) EfficientNet: rethinking model scaling for convolutional neural networks. 36th Int Conf Mach Learn ICML 2019 2019-June:10691\u201310700"},{"key":"10595_CR166","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3256320","author":"LCO Tiong","year":"2023","unstructured":"Tiong LCO, Sigmund D, Teoh ABJ (2023) Face-periocular cross-identification via contrastive hybrid attention vision transformer. IEEE Signal Process Lett. https:\/\/doi.org\/10.1109\/LSP.2023.3256320","journal-title":"IEEE Signal Process Lett"},{"key":"10595_CR167","doi-asserted-by":"publisher","first-page":"702","DOI":"10.1109\/WACV56688.2023.00077","volume":"2023","author":"D Torbunov","year":"2022","unstructured":"Torbunov D, Huang Y, Yu H et al (2022) UVCGAN: UNet vision transformer cycle-consistent GAN for unpaired image-to-image translation. Proc\u20142023 IEEE Winter Conf Appl Comput Vision. WACV 2023:702\u2013712. https:\/\/doi.org\/10.1109\/WACV56688.2023.00077","journal-title":"WACV"},{"key":"10595_CR168","doi-asserted-by":"publisher","unstructured":"Touvron H, Cord M, Douze M et al (2020) Training data-efficient image transformers & distillation through attention. https:\/\/doi.org\/10.48550\/arxiv.2012.12877","DOI":"10.48550\/arxiv.2012.12877"},{"key":"10595_CR169","doi-asserted-by":"publisher","unstructured":"Touvron H, Cord M, Sablayrolles A, et al (2021) Going deeper with Image Transformers. Proc IEEE Int Conf Comput Vis. pp. 32\u201342. https:\/\/doi.org\/10.48550\/arxiv.2103.17239","DOI":"10.48550\/arxiv.2103.17239"},{"key":"10595_CR170","doi-asserted-by":"crossref","unstructured":"Tragakis A, Kaul C, Murray-Smith R, Husmeier D (2022) The fully convolutional transformer for medical image segmentation. Institute of Electrical and Electronics Engineers Inc.","DOI":"10.1109\/WACV56688.2023.00365"},{"key":"10595_CR171","doi-asserted-by":"publisher","first-page":"5662","DOI":"10.1109\/JSTARS.2022.3190322","volume":"15","author":"J Tu","year":"2022","unstructured":"Tu J, Mei G, Ma Z, Piccialli F (2022a) SWCGAN: generative adversarial network combining swin transformer and CNN for remote sensing image super-resolution. IEEE J Sel Top Appl Earth Obs Remote Sens 15:5662\u20135673. https:\/\/doi.org\/10.1109\/JSTARS.2022.3190322","journal-title":"IEEE J Sel Top Appl Earth Obs Remote Sens"},{"key":"10595_CR172","doi-asserted-by":"publisher","first-page":"459","DOI":"10.48550\/arxiv.2204.01697","volume":"13684","author":"Z Tu","year":"2022","unstructured":"Tu Z, Talebi H, Zhang H et al (2022b) MaxViT: multi-axis vision transformer. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13684:459\u2013479. https:\/\/doi.org\/10.48550\/arxiv.2204.01697","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR173","unstructured":"Ulhaq A, Akhtar N, Pogrebna G, Mian A (2022) Vision transformers for action recognition: a survey"},{"key":"10595_CR174","doi-asserted-by":"publisher","DOI":"10.1016\/J.ENGAPPAI.2023.106173","volume":"123","author":"W Ullah","year":"2023","unstructured":"Ullah W, Hussain T, Ullah FUM et al (2023) TransCNN: hybrid CNN and transformer mechanism for surveillance anomaly detection. Eng Appl Artif Intell 123:106173. https:\/\/doi.org\/10.1016\/J.ENGAPPAI.2023.106173","journal-title":"Eng Appl Artif Intell"},{"key":"10595_CR175","unstructured":"Vaswani A, Brain G, Shazeer N et al (2017a) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"10595_CR176","doi-asserted-by":"publisher","unstructured":"Vaswani A, Shazeer N, Parmar N et al (2017b) Attention is all you need. Adv Neural Inf Process Syst 2017b:5999\u20136009. https:\/\/doi.org\/10.48550\/arxiv.1706.03762","DOI":"10.48550\/arxiv.1706.03762"},{"key":"10595_CR177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863","author":"Y Wang","year":"2020","unstructured":"Wang Y, Xu Z, Wang X et al (2020) End-to-end video instance segmentation with transformers. Proc IEEE Comput Soc Conf Comput vis Pattern Recognit. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00863","journal-title":"Proc IEEE Comput Soc Conf Comput vis Pattern Recognit"},{"key":"10595_CR178","doi-asserted-by":"publisher","first-page":"415","DOI":"10.1007\/s41095-022-0274-8","volume":"8","author":"W Wang","year":"2021","unstructured":"Wang W, Xie E, Li X et al (2021b) PVT v2: improved baselines with pyramid vision transformer. Comput vis Media 8:415\u2013424. https:\/\/doi.org\/10.1007\/s41095-022-0274-8","journal-title":"Comput vis Media"},{"key":"10595_CR300","unstructured":"Wang Y, Yang Y, Bai J, Zhang M, Bai J, Yu J, Zhang C, Huang G, Tong Y (2021c). Evolving attention with residual convolutions. In International conference on machine learning, PMLR, pp. 10971\u201310980"},{"key":"10595_CR179","doi-asserted-by":"publisher","first-page":"3941","DOI":"10.1007\/S11063-022-10794-W\/TABLES\/8","volume":"54","author":"R Wang","year":"2022","unstructured":"Wang R, Geng F, Wang X (2022a) MTPose: human pose estimation with high-resolution multi-scale transformers. Neural Process Lett 54:3941\u20133964. https:\/\/doi.org\/10.1007\/S11063-022-10794-W\/TABLES\/8","journal-title":"Neural Process Lett"},{"key":"10595_CR180","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3187135","author":"W Wang","year":"2022","unstructured":"Wang W, Tang C, Wang X, Zheng B (2022c) A ViT-based multiscale feature fusion approach for remote sensing image segmentation. IEEE Geosci Remote Sens Lett. https:\/\/doi.org\/10.1109\/LGRS.2022.3187135","journal-title":"IEEE Geosci Remote Sens Lett"},{"key":"10595_CR181","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3212434","author":"Y Wang","year":"2022","unstructured":"Wang Y, Qiu Y, Cheng P, Zhang J (2022d) Hybrid CNN-transformer features for visual place recognition. IEEE Trans Circuits Syst Video Technol. https:\/\/doi.org\/10.1109\/TCSVT.2022.3212434","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"10595_CR182","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPAG.2023.107682","volume":"206","author":"J Wang","year":"2023","unstructured":"Wang J, Zhang Z, Luo L et al (2023a) DualSeg: fusing transformer and CNN structure for image segmentation in complex vineyard environment. Comput Electron Agric 206:107682. https:\/\/doi.org\/10.1016\/J.COMPAG.2023.107682","journal-title":"Comput Electron Agric"},{"key":"10595_CR183","doi-asserted-by":"publisher","DOI":"10.1016\/J.BSPC.2023.104976","volume":"85","author":"L Wang","year":"2023","unstructured":"Wang L, Pan L, Wang H et al (2023b) DHUnet: Dual-branch hierarchical global\u2013local fusion network for whole slide image segmentation. Biomed Signal Process Control 85:104976. https:\/\/doi.org\/10.1016\/J.BSPC.2023.104976","journal-title":"Biomed Signal Process Control"},{"key":"10595_CR184","doi-asserted-by":"publisher","first-page":"2936","DOI":"10.3390\/RS15112936","volume":"15","author":"W Wang","year":"2023","unstructured":"Wang W, Wang J, Lu B et al (2023d) MCPT: mixed convolutional parallel transformer for polarimetric SAR image classification. Remote Sens 15:2936. https:\/\/doi.org\/10.3390\/RS15112936","journal-title":"Remote Sens"},{"key":"10595_CR185","doi-asserted-by":"crossref","unstructured":"Wang L, Tien A (2023) Aerial image object detection with vision transformer detector (ViTDet)","DOI":"10.1109\/IGARSS52108.2023.10282836"},{"key":"10595_CR186","unstructured":"Wang Y, Yang Y, Bai J, Zhang M (2021) Evolving attention with residual convolutions. Proceedings of the 38th International Conference on Machine Learning, PMLR 139"},{"key":"10595_CR187","doi-asserted-by":"publisher","unstructured":"Wang H, Zhu Y, Adam H et al (2021a) Max-DeepLab: end-to-end panoptic segmentation with mask transformers. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit 5459\u20135470. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00542","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"10595_CR188","doi-asserted-by":"publisher","unstructured":"Wang W, Xie E, Li X et al (2021c) Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. Proc IEEE Int Conf Comput Vis. pp. 548\u2013558. https:\/\/doi.org\/10.48550\/arxiv.2102.12122","DOI":"10.48550\/arxiv.2102.12122"},{"key":"10595_CR189","doi-asserted-by":"crossref","unstructured":"Wang W, Dai J, Chen Z et al (2022b) InternImage: exploring large-scale vision foundation models with deformable convolutions. pp. 14408\u201314419","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"10595_CR190","doi-asserted-by":"crossref","unstructured":"Wang W, Chen W, Qiu Q et al (2023c) CrossFormer++: a versatile vision transformer hinging on cross-scale attention","DOI":"10.1109\/TPAMI.2023.3341806"},{"key":"10595_CR191","doi-asserted-by":"publisher","unstructured":"Wei Z, Pan H, Li L et al (2023) DMFormer: closing the gap between CNN and vision transformers. ICASSP 2023\u20132023 IEEE Int Conf Acoust Speech Signal Process. pp. 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10097256","DOI":"10.1109\/ICASSP49357.2023.10097256"},{"key":"10595_CR192","doi-asserted-by":"publisher","first-page":"16591","DOI":"10.1109\/ACCESS.2021.3053408","volume":"9","author":"W Weng","year":"2015","unstructured":"Weng W, Zhu X (2015) U-Net: convolutional networks for biomedical image segmentation. IEEE Access 9:16591\u201316603. https:\/\/doi.org\/10.1109\/ACCESS.2021.3053408","journal-title":"IEEE Access"},{"key":"10595_CR193","doi-asserted-by":"crossref","unstructured":"Wensel J, Ullah H, Member SS et al (2022) ViT-ReT: vision and recurrent transformer neural networks for human activity recognition in videos","DOI":"10.1109\/ACCESS.2023.3293813"},{"key":"10595_CR194","doi-asserted-by":"crossref","unstructured":"Woo S, Debnath S, Hu R et al (2023) ConvNeXt V2: Co-designing and scaling convnets with masked autoencoders","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"10595_CR195","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1016\/J.PATCOG.2019.01.006","volume":"90","author":"Z Wu","year":"2019","unstructured":"Wu Z, Shen C, van den Hengel A (2019) Wider or deeper: revisiting the ResNet model for visual recognition. Pattern Recognit 90:119\u2013133. https:\/\/doi.org\/10.1016\/J.PATCOG.2019.01.006","journal-title":"Pattern Recognit"},{"key":"10595_CR196","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.2103.15808","author":"H Wu","year":"2021","unstructured":"Wu H, Xiao B, Codella N et al (2021a) CvT: introducing convolutions to vision transformers. Proc IEEE Int Conf Comput vis. https:\/\/doi.org\/10.48550\/arxiv.2103.15808","journal-title":"Proc IEEE Int Conf Comput vis"},{"key":"10595_CR197","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00988","author":"K Wu","year":"2021","unstructured":"Wu K, Peng H, Chen M et al (2021b) Rethinking and improving relative position encoding for vision transformer. Proc IEEE Int Conf Comput vis. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00988","journal-title":"Proc IEEE Int Conf Comput vis"},{"key":"10595_CR198","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1007\/978-3-031-19803-8_5","volume":"13681","author":"K Wu","year":"2022","unstructured":"Wu K, Zhang J, Peng H et al (2022a) TinyViT: fast pretraining distillation for small vision transformers. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13681:68\u201385. https:\/\/doi.org\/10.1007\/978-3-031-19803-8_5","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR199","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2022.3200438","author":"Q Wu","year":"2022","unstructured":"Wu Q, Wu Y, Zhang Y, Zhang L (2022b) A local-global estimator based on large kernel CNN and transformer for human pose estimation and running pose measurement. IEEE Trans Instrum Meas. https:\/\/doi.org\/10.1109\/TIM.2022.3200438","journal-title":"IEEE Trans Instrum Meas"},{"key":"10595_CR200","doi-asserted-by":"publisher","DOI":"10.1016\/J.BSPC.2022.103896","volume":"78","author":"Y Wu","year":"2022","unstructured":"Wu Y, Wang G, Wang Z et al (2022c) DI-Unet: dimensional interaction self-attention for medical image segmentation. Biomed Signal Process Control 78:103896. https:\/\/doi.org\/10.1016\/J.BSPC.2022.103896","journal-title":"Biomed Signal Process Control"},{"key":"10595_CR201","doi-asserted-by":"publisher","first-page":"768","DOI":"10.1109\/TETCI.2022.3210992","volume":"7","author":"Y Wu","year":"2023","unstructured":"Wu Y, Lian C, Zeng Z et al (2023b) An aggregated convolutional transformer based on slices and channels for multivariate time series classification. IEEE Trans Emerg Top Comput Intell 7:768\u2013779. https:\/\/doi.org\/10.1109\/TETCI.2022.3210992","journal-title":"IEEE Trans Emerg Top Comput Intell"},{"key":"10595_CR202","doi-asserted-by":"publisher","DOI":"10.1016\/J.CMPB.2023.107452","volume":"233","author":"Z Wu","year":"2023","unstructured":"Wu Z, Liao W, Yan C et al (2023c) Deep learning based MRI reconstruction with transformer. Comput Methods Programs Biomed 233:107452. https:\/\/doi.org\/10.1016\/J.CMPB.2023.107452","journal-title":"Comput Methods Programs Biomed"},{"key":"10595_CR203","doi-asserted-by":"crossref","unstructured":"Wu J, Fu R, Fang H, et al (2023a) MedSegDiff-V2: diffusion based medical image segmentation with transformer","DOI":"10.1609\/aaai.v38i6.28418"},{"key":"10595_CR204","doi-asserted-by":"publisher","first-page":"581","DOI":"10.3390\/S23020581","volume":"23","author":"Z Xia","year":"2023","unstructured":"Xia Z, Kim J (2023b) Enhancing mask transformer with auxiliary convolution layers for semantic segmentation. Sensors 23:581. https:\/\/doi.org\/10.3390\/S23020581","journal-title":"Sensors"},{"key":"10595_CR205","doi-asserted-by":"publisher","first-page":"4784","DOI":"10.1109\/CVPR52688.2022.00475","volume":"2022","author":"Z Xia","year":"2022","unstructured":"Xia Z, Pan X, Song S et al (2022) Vision transformer with deformable attention. Proc IEEE Comput Soc Conf Comput vis Pattern Recognit 2022:4784\u20134793. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00475","journal-title":"Proc IEEE Comput Soc Conf Comput vis Pattern Recognit"},{"key":"10595_CR206","doi-asserted-by":"publisher","first-page":"144","DOI":"10.1111\/AAB.12804","volume":"182","author":"W Xia","year":"2023","unstructured":"Xia W, Han D, Li D et al (2023a) An ensemble learning integration of multiple CNN with improved vision transformer models for pest classification. Ann Appl Biol 182:144\u2013158. https:\/\/doi.org\/10.1111\/AAB.12804","journal-title":"Ann Appl Biol"},{"key":"10595_CR207","first-page":"30392","volume":"36","author":"T Xiao","year":"2021","unstructured":"Xiao T, Singh M, Mintun E et al (2021) Early convolutions help transformers see better. Adv Neural Inf Process Syst 36:30392\u201330400","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR208","doi-asserted-by":"publisher","DOI":"10.1016\/J.BSPC.2023.104791","volume":"84","author":"H Xiao","year":"2023","unstructured":"Xiao H, Li L, Liu Q et al (2023) Transformers in medical image segmentation: a review. Biomed Signal Process Control 84:104791. https:\/\/doi.org\/10.1016\/J.BSPC.2023.104791","journal-title":"Biomed Signal Process Control"},{"key":"10595_CR209","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P et al (2017) Aggregated residual transformations for deep neural networks. openaccess.thecvf.com","DOI":"10.1109\/CVPR.2017.634"},{"key":"10595_CR210","unstructured":"Xiong S, Kasaei H (2022) Fine-grained object categorization for service robots"},{"key":"10595_CR211","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00983","author":"W Xu","year":"2021","unstructured":"Xu W, Xu Y, Chang T, Tu Z (2021a) Co-scale conv-attentional image transformers. Proc IEEE Int Conf Comput vis. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00983","journal-title":"Proc IEEE Int Conf Comput vis"},{"key":"10595_CR212","first-page":"28522","volume":"34","author":"Y Xu","year":"2021","unstructured":"Xu Y, Zhang Q, Zhang J, Tao D (2021b) ViTAE: vision transformer advanced by exploring intrinsic inductive bias. Adv Neural Inf Process Syst 34:28522\u201328535","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR213","doi-asserted-by":"publisher","first-page":"6753","DOI":"10.1007\/s10489-022-03785-w","volume":"53","author":"T Xue","year":"2023","unstructured":"Xue T, Ma P (2023) TC-net: transformer combined with cnn for image denoising. Appl Intell 53:6753\u20136762. https:\/\/doi.org\/10.1007\/s10489-022-03785-w","journal-title":"Appl Intell"},{"key":"10595_CR214","doi-asserted-by":"publisher","first-page":"3059","DOI":"10.3390\/IJERPH20043059","volume":"20","author":"C Yan","year":"2023","unstructured":"Yan C, Fan X, Fan J et al (2023) HyFormer: hybrid transformer and CNN for pixel-level multispectral image land cover classification. Int J Environ Res Public Heal 20:3059. https:\/\/doi.org\/10.3390\/IJERPH20043059","journal-title":"Int J Environ Res Public Heal"},{"key":"10595_CR215","unstructured":"Yan H, Li Z, Li W, Wang C, Wu M, Zhang C (2021) Contnet: why not use convolution and transformer at the same time? arXiv preprint arXiv:2104.13497"},{"key":"10595_CR216","doi-asserted-by":"publisher","DOI":"10.1016\/J.ESWA.2022.119024","volume":"213","author":"H Yang","year":"2023","unstructured":"Yang H, Yang D (2023) CSwin-PNet: a CNN-swin transformer combined pyramid network for breast lesion segmentation in ultrasound images. Expert Syst Appl 213:119024. https:\/\/doi.org\/10.1016\/J.ESWA.2022.119024","journal-title":"Expert Syst Appl"},{"key":"10595_CR217","doi-asserted-by":"publisher","first-page":"1388","DOI":"10.1109\/IGARSS46834.2022.9884262","volume":"2022","author":"J Yang","year":"2022","unstructured":"Yang J, Du B, Wu C (2022a) Hybrid vision transformer model for hyperspectral image classification. Int Geosci Remote Sens Symp 2022:1388\u20131391. https:\/\/doi.org\/10.1109\/IGARSS46834.2022.9884262","journal-title":"Int Geosci Remote Sens Symp"},{"key":"10595_CR218","doi-asserted-by":"publisher","first-page":"1545","DOI":"10.24963\/IJCAI.2022\/215","volume":"2","author":"S Yang","year":"2022","unstructured":"Yang S, Guo W, Ren Y (2022b) CrowdFormer: an overlap patching vision transformer for top-down crowd counting. IJCAI Int Jt Conf Artif Intell 2:1545\u20131551. https:\/\/doi.org\/10.24963\/IJCAI.2022\/215","journal-title":"IJCAI Int Jt Conf Artif Intell"},{"key":"10595_CR219","doi-asserted-by":"publisher","DOI":"10.1016\/J.BSPC.2022.104376","volume":"80","author":"J Yang","year":"2023","unstructured":"Yang J, Tu J, Zhang X et al (2023a) TSE DeepLab: an efficient visual transformer for medical image segmentation. Biomed Signal Process Control 80:104376. https:\/\/doi.org\/10.1016\/J.BSPC.2022.104376","journal-title":"Biomed Signal Process Control"},{"key":"10595_CR220","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109232","volume":"136","author":"S Yang","year":"2023","unstructured":"Yang S, Feng Z, Wang Z et al (2023b) Detecting and grouping keypoints for multi-person pose estimation using instance-aware attention. Pattern Recognit 136:109232. https:\/\/doi.org\/10.1016\/j.patcog.2022.109232","journal-title":"Pattern Recognit"},{"key":"10595_CR221","doi-asserted-by":"publisher","DOI":"10.1016\/J.CMPB.2023.107348","volume":"230","author":"Y Yang","year":"2023","unstructured":"Yang Y, Zhang L, Ren L, Wang X (2023c) MMViT-Seg: a lightweight transformer and CNN fusion network for COVID-19 segmentation. Comput Methods Programs Biomed 230:107348. https:\/\/doi.org\/10.1016\/J.CMPB.2023.107348","journal-title":"Comput Methods Programs Biomed"},{"key":"10595_CR222","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/J.PATREC.2018.05.018","volume":"118","author":"G Yao","year":"2019","unstructured":"Yao G, Lei T, Zhong J (2019) A review of convolutional-neural-network-based action recognition. Pattern Recognit Lett 118:14\u201322. https:\/\/doi.org\/10.1016\/J.PATREC.2018.05.018","journal-title":"Pattern Recognit Lett"},{"key":"10595_CR223","doi-asserted-by":"publisher","unstructured":"Yao T, Li Y, Pan Y, Wang Y, Zhang XP, Mei T (2023) Dual vision transformer. IEEE Trans Pattern Anal Mach Intell. Chicago. https:\/\/doi.org\/10.1109\/TPAMI.2023.3268446","DOI":"10.1109\/TPAMI.2023.3268446"},{"key":"10595_CR224","doi-asserted-by":"publisher","first-page":"342","DOI":"10.1016\/J.NEUCOM.2022.10.081","volume":"520","author":"C Yao","year":"2023","unstructured":"Yao C, Feng L, Kong Y et al (2023) Transformers and CNNs fusion network for salient object detection. Neurocomputing 520:342\u2013355. https:\/\/doi.org\/10.1016\/J.NEUCOM.2022.10.081","journal-title":"Neurocomputing"},{"key":"10595_CR225","doi-asserted-by":"publisher","first-page":"10494","DOI":"10.1109\/CVPR.2019.01075","volume":"2019","author":"L Ye","year":"2019","unstructured":"Ye L, Rochan M, Liu Z, Wang Y (2019) Cross-modal self-attention network for referring image segmentation. Proc IEEE Comput Soc Conf Comput vis Pattern Recognit 2019:10494\u201310503. https:\/\/doi.org\/10.1109\/CVPR.2019.01075","journal-title":"Proc IEEE Comput Soc Conf Comput vis Pattern Recognit"},{"key":"10595_CR226","doi-asserted-by":"publisher","first-page":"2827","DOI":"10.1109\/TIP.2023.3274988","volume":"32","author":"D Ye","year":"2023","unstructured":"Ye D, Ni Z, Wang H et al (2023a) CSformer: bridging convolution and transformer for compressive sensing. IEEE Trans Image Process 32:2827\u20132842. https:\/\/doi.org\/10.1109\/TIP.2023.3274988","journal-title":"IEEE Trans Image Process"},{"key":"10595_CR227","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2023.3241825","author":"T Ye","year":"2023","unstructured":"Ye T, Qin W, Zhao Z et al (2023b) Real-time object detection network in UAV-vision based on CNN and transformer. IEEE Trans Instrum Meas. https:\/\/doi.org\/10.1109\/TIM.2023.3241825","journal-title":"IEEE Trans Instrum Meas"},{"key":"10595_CR228","doi-asserted-by":"publisher","DOI":"10.1016\/J.INFRARED.2023.104640","volume":"131","author":"S Yi","year":"2023","unstructured":"Yi S, Li L, Liu X et al (2023) HCTIRdeblur: a hybrid convolution-transformer network for single infrared image deblurring. Infrared Phys Technol 131:104640. https:\/\/doi.org\/10.1016\/J.INFRARED.2023.104640","journal-title":"Infrared Phys Technol"},{"key":"10595_CR229","doi-asserted-by":"publisher","first-page":"2377","DOI":"10.3390\/MATH11102377","volume":"11","author":"G Yu","year":"2023","unstructured":"Yu G, Zhou X (2023) An improved YOLOv5 crack detection method combined with a bottleneck transformer. Math 11:2377. https:\/\/doi.org\/10.3390\/MATH11102377","journal-title":"Math"},{"key":"10595_CR230","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00062","author":"K Yuan","year":"2021","unstructured":"Yuan K, Guo S, Liu Z et al (2021a) Incorporating convolution designs into visual transformers. Proc IEEE Int Conf Comput vis. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00062","journal-title":"Proc IEEE Int Conf Comput vis"},{"key":"10595_CR231","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2022.109228","volume":"136","author":"F Yuan","year":"2023","unstructured":"Yuan F, Zhang Z, Fang Z (2023a) An effective CNN and transformer complementary network for medical image segmentation. Pattern Recognit 136:109228. https:\/\/doi.org\/10.1016\/J.PATCOG.2022.109228","journal-title":"Pattern Recognit"},{"key":"10595_CR232","doi-asserted-by":"publisher","DOI":"10.1007\/S10278-023-00842-9\/TABLES\/8","author":"J Yuan","year":"2023","unstructured":"Yuan J, Zhou F, Guo Z et al (2023b) HCformer: hybrid CNN-transformer for LDCT image denoising. J Digit Imaging. https:\/\/doi.org\/10.1007\/S10278-023-00842-9\/TABLES\/8","journal-title":"J Digit Imaging"},{"key":"10595_CR233","doi-asserted-by":"crossref","unstructured":"Yuan L, Chen Y, Wang T, et al (2021b) Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet. Proc IEEE Int Conf Comput Vis 538\u2013547","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"10595_CR234","doi-asserted-by":"publisher","DOI":"10.1016\/j.pdpdt.2021.102676","volume":"37","author":"MM Zafar","year":"2021","unstructured":"Zafar MM, Rauf Z, Sohail A et al (2021) Detection of tumour infiltrating lymphocytes in CD3 and CD8 stained histopathological images using a two-phase deep CNN. Photodiagnosis Photodyn Ther 37:102676. https:\/\/doi.org\/10.1016\/j.pdpdt.2021.102676","journal-title":"Photodiagnosis Photodyn Ther"},{"key":"10595_CR235","doi-asserted-by":"publisher","first-page":"2726","DOI":"10.3390\/S22072726","volume":"22","author":"MM Zahoor","year":"2022","unstructured":"Zahoor MM, Qureshi SA, Bibi S et al (2022) A new deep hybrid boosted and ensemble learning-based brain tumor analysis using MRI. Sensors 22:2726. https:\/\/doi.org\/10.3390\/S22072726","journal-title":"Sensors"},{"key":"10595_CR236","doi-asserted-by":"publisher","first-page":"15475","DOI":"10.48550\/arxiv.2105.13677","volume":"19","author":"QL Zhang","year":"2021","unstructured":"Zhang QL, Bin YY (2021) ResT: an efficient transformer for visual recognition. Adv Neural Inf Process Syst 19:15475\u201315485. https:\/\/doi.org\/10.48550\/arxiv.2105.13677","journal-title":"Adv Neural Inf Process Syst"},{"key":"10595_CR237","doi-asserted-by":"publisher","first-page":"2127","DOI":"10.1007\/S13042-022-01750-0\/TABLES\/8","volume":"14","author":"X Zhang","year":"2022","unstructured":"Zhang X, Zhang Y (2022) Conv-PVT: a fusion architecture of convolution and pyramid vision transformer. Int J Mach Learn Cybern 14:2127\u20132136. https:\/\/doi.org\/10.1007\/S13042-022-01750-0\/TABLES\/8","journal-title":"Int J Mach Learn Cybern"},{"key":"10595_CR238","doi-asserted-by":"publisher","first-page":"7267","DOI":"10.1109\/CVPR52688.2022.00713","volume":"2022","author":"C Zhang","year":"2021","unstructured":"Zhang C, Zhang M, Zhang S et al (2021a) Delving deep into the generalization of vision transformers under distribution shifts. Proc IEEE Comput Soc Conf Comput vis Pattern Recognit 2022:7267\u20137276. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00713","journal-title":"Proc IEEE Comput Soc Conf Comput vis Pattern Recognit"},{"key":"10595_CR239","doi-asserted-by":"publisher","first-page":"1614","DOI":"10.1109\/JAS.2020.1003390","volume":"8","author":"K Zhang","year":"2021","unstructured":"Zhang K, Su Y, Guo X et al (2021b) MU-GAN: facial attribute editing based on multi-attention mechanism. IEEE\/CAA J Autom Sin 8:1614\u20131626. https:\/\/doi.org\/10.1109\/JAS.2020.1003390","journal-title":"IEEE\/CAA J Autom Sin"},{"key":"10595_CR240","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1007\/978-3-030-87193-2_2\/COVER","volume":"12901","author":"Y Zhang","year":"2021","unstructured":"Zhang Y, Liu H, Hu Q (2021c) TransFuse: fusing transformers and CNNs for medical image segmentation. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 12901:14\u201324. https:\/\/doi.org\/10.1007\/978-3-030-87193-2_2\/COVER","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR241","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2022.106292","volume":"151","author":"N Zhang","year":"2022","unstructured":"Zhang N, Yu L, Zhang D et al (2022b) APT-net: adaptive encoding and parallel decoding transformer for medical image segmentation. Comput Biol Med 151:106292. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2022.106292","journal-title":"Comput Biol Med"},{"key":"10595_CR242","doi-asserted-by":"publisher","first-page":"466","DOI":"10.1007\/978-3-031-19806-9_27","volume":"13685","author":"Q Zhang","year":"2022","unstructured":"Zhang Q, Xu Y, Zhang J, Tao D (2022c) VSA: learning varied-size window attention in vision transformers. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13685:466\u2013483. https:\/\/doi.org\/10.1007\/978-3-031-19806-9_27","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR243","doi-asserted-by":"publisher","first-page":"1141","DOI":"10.1007\/s11263-022-01739-w","volume":"131","author":"Q Zhang","year":"2022","unstructured":"Zhang Q, Xu Y, Zhang J, Tao D (2022d) ViTAEv2: vision transformer advanced by exploring inductive bias for image recognition and beyond. Int J Comput vis 131:1141\u20131162. https:\/\/doi.org\/10.1007\/s11263-022-01739-w","journal-title":"Int J Comput vis"},{"key":"10595_CR244","doi-asserted-by":"publisher","first-page":"1013","DOI":"10.1007\/S10462-022-10192-7\/FIGURES\/2","volume":"56","author":"J Zhang","year":"2023","unstructured":"Zhang J, Li C, Yin Y et al (2023a) Applications of artificial neural networks in microorganism image analysis: a comprehensive review from conventional multilayer perceptron to popular convolutional neural network and potential visual transformer. Artif Intell Rev 56:1013\u20131070. https:\/\/doi.org\/10.1007\/S10462-022-10192-7\/FIGURES\/2","journal-title":"Artif Intell Rev"},{"key":"10595_CR245","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2023.3245674","author":"X Zhang","year":"2023","unstructured":"Zhang X, Cheng S, Wang L, Li H (2023b) Asymmetric cross-attention hierarchical network based on CNN and transformer for bitemporal remote sensing images change detection. IEEE Trans Geosci Remote Sens. https:\/\/doi.org\/10.1109\/TGRS.2023.3245674","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"10595_CR246","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2023.106967","volume":"161","author":"Z Zhang","year":"2023","unstructured":"Zhang Z, Sun G, Zheng K et al (2023c) TC-Net: A joint learning framework based on CNN and vision transformer for multi-lesion medical images segmentation. Comput Biol Med 161:106967. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2023.106967","journal-title":"Comput Biol Med"},{"key":"10595_CR247","doi-asserted-by":"crossref","unstructured":"Zhang N, Nex F, Vosselman G, Kerle N (2022a) Lite-Mono: a lightweight CNN and transformer architecture for self-supervised monocular depth estimation","DOI":"10.1109\/CVPR52729.2023.01778"},{"key":"10595_CR248","doi-asserted-by":"publisher","DOI":"10.1109\/AIAM57466.2022.00091","author":"L Zhao","year":"2022","unstructured":"Zhao L, Yu Q, Yang Y (2022a) Video person re-identification based on transformer-CNN model. 4th Int Conf Artif Intell Adv Manuf. https:\/\/doi.org\/10.1109\/AIAM57466.2022.00091","journal-title":"4th Int Conf Artif Intell Adv Manuf"},{"key":"10595_CR249","doi-asserted-by":"publisher","first-page":"1252","DOI":"10.1109\/LSP.2022.3176486","volume":"29","author":"M Zhao","year":"2022","unstructured":"Zhao M, Cao G, Huang X, Yang L (2022b) Hybrid transformer-CNN for real image denoising. IEEE Signal Process Lett 29:1252\u20131256. https:\/\/doi.org\/10.1109\/LSP.2022.3176486","journal-title":"IEEE Signal Process Lett"},{"key":"10595_CR250","doi-asserted-by":"publisher","first-page":"559","DOI":"10.1007\/978-3-031-20500-2_46\/COVER","volume":"13605","author":"S Zhao","year":"2022","unstructured":"Zhao S, Liu K, Huang Y et al (2022c) DPIT: dual-pipeline integrated transformer for\u00a0human pose estimation. Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics) 13605:559\u2013576. https:\/\/doi.org\/10.1007\/978-3-031-20500-2_46\/COVER","journal-title":"Lect Notes Comput Sci (including Subser Lect Notes Artif Intell Lect Notes Bioinformatics)"},{"key":"10595_CR251","doi-asserted-by":"publisher","DOI":"10.1016\/J.COMPBIOMED.2022.106513","volume":"153","author":"X Zhao","year":"2023","unstructured":"Zhao X, Yang T, Li B, Zhang X (2023) SwinGAN: a dual-domain swin transformer-based generative adversarial network for MRI reconstruction. Comput Biol Med 153:106513. https:\/\/doi.org\/10.1016\/J.COMPBIOMED.2022.106513","journal-title":"Comput Biol Med"},{"key":"10595_CR252","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1117\/12.2653776","volume":"12464","author":"T Zheng","year":"2023","unstructured":"Zheng T, Oda H, Hayashi Y et al (2023) L-former: a lightweight transformer for realistic medical image generation and its application to super-resolution. SPIE 12464:245\u2013250. https:\/\/doi.org\/10.1117\/12.2653776","journal-title":"SPIE"},{"key":"10595_CR253","doi-asserted-by":"publisher","DOI":"10.1016\/J.DISPLA.2022.102352","volume":"76","author":"Z Zhou","year":"2023","unstructured":"Zhou Z, Li G, Wang G (2023a) A hybrid of transformer and CNN for efficient single image super-resolution via multi-level distillation. Displays 76:102352. https:\/\/doi.org\/10.1016\/J.DISPLA.2022.102352","journal-title":"Displays"},{"key":"10595_CR254","doi-asserted-by":"publisher","DOI":"10.1111\/MICE.13003","author":"Z Zhou","year":"2023","unstructured":"Zhou Z, Zhang J, Gong C (2023b) Hybrid semantic segmentation for tunnel lining cracks based on Swin Transformer and convolutional neural network. Comput Civ Infrastruct Eng. https:\/\/doi.org\/10.1111\/MICE.13003","journal-title":"Comput Civ Infrastruct Eng"},{"key":"10595_CR255","unstructured":"Zhou D, Kang B, Jin X et al (2021) DeepViT: towards deeper vision transformer"},{"key":"10595_CR256","doi-asserted-by":"publisher","first-page":"2242","DOI":"10.1109\/ICCV.2017.244","volume":"2017","author":"JY Zhu","year":"2017","unstructured":"Zhu JY, Park T, Isola P, Efros AA (2017) Unpaired Image-to-Image translation using cycle-consistent adversarial networks. Proc IEEE Int Conf Comput vis 2017:2242\u20132251. https:\/\/doi.org\/10.1109\/ICCV.2017.244","journal-title":"Proc IEEE Int Conf Comput vis"},{"key":"10595_CR257","doi-asserted-by":"publisher","first-page":"6015","DOI":"10.3390\/S23136015","volume":"23","author":"D Zhu","year":"2023","unstructured":"Zhu D, Tan J, Wu C et al (2023a) Crop disease identification by fusing multiscale convolution and vision transformer. Sensors 23:6015. https:\/\/doi.org\/10.3390\/S23136015","journal-title":"Sensors"},{"key":"10595_CR258","doi-asserted-by":"publisher","first-page":"203","DOI":"10.3934\/MFC.2022018","volume":"6","author":"X Zhu","year":"2023","unstructured":"Zhu X, Li Z, Sun J et al (2023b) Expression recognition method combining convolutional features and transformer. Math Found Comput 6:203\u2013217. https:\/\/doi.org\/10.3934\/MFC.2022018","journal-title":"Math Found Comput"},{"key":"10595_CR259","doi-asserted-by":"publisher","DOI":"10.1016\/J.ESWA.2022.119452","volume":"216","author":"U Zidan","year":"2023","unstructured":"Zidan U, Gaber MM, Abdelsamea MM (2023) SwinCup: cascaded swin transformer for histopathological structures segmentation in colorectal cancer. Expert Syst Appl 216:119452. https:\/\/doi.org\/10.1016\/J.ESWA.2022.119452","journal-title":"Expert Syst Appl"},{"key":"10595_CR260","doi-asserted-by":"publisher","DOI":"10.1007\/S13748-023-00300-1\/FIGURES\/4","author":"P Zou","year":"2023","unstructured":"Zou P, Wu JS (2023) SwinE-UNet3+: swin transformer encoder network for medical image segmentation. Prog Artif Intell. https:\/\/doi.org\/10.1007\/S13748-023-00300-1\/FIGURES\/4","journal-title":"Prog Artif Intell"}],"container-title":["Artificial Intelligence Review"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-023-10595-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10462-023-10595-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-023-10595-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T23:44:06Z","timestamp":1730245446000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10462-023-10595-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,4]]},"references-count":261,"journal-issue":{"issue":"S3","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["10595"],"URL":"https:\/\/doi.org\/10.1007\/s10462-023-10595-0","relation":{},"ISSN":["0269-2821","1573-7462"],"issn-type":[{"value":"0269-2821","type":"print"},{"value":"1573-7462","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10,4]]},"assertion":[{"value":"4 October 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing financial and\/or non-financial interests about the described work.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}