{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T17:50:40Z","timestamp":1774893040505,"version":"3.50.1"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2022,12,22]],"date-time":"2022-12-22T00:00:00Z","timestamp":1671667200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,12,22]],"date-time":"2022-12-22T00:00:00Z","timestamp":1671667200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s13042-022-01750-0","type":"journal-article","created":{"date-parts":[[2022,12,22]],"date-time":"2022-12-22T12:04:43Z","timestamp":1671710683000},"page":"2127-2136","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Conv-PVT: a fusion architecture of convolution and pyramid vision transformer"],"prefix":"10.1007","volume":"14","author":[{"given":"Xin","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yi","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,12,22]]},"reference":[{"issue":"1","key":"1750_CR1","first-page":"1420","volume":"15","author":"HA Afan","year":"2021","unstructured":"Afan HA, Ibrahem Ahmed Osman A, Essam Y et al (2021) Modeling the fluctuations of groundwater level by employing ensemble deep learning techniques. Eng Appl Comput Fluid Mech 15(1):1420\u20131439","journal-title":"Eng Appl Comput Fluid Mech"},{"key":"1750_CR2","first-page":"2","volume":"34","author":"Y Bai","year":"2021","unstructured":"Bai Y, Mei J, Yuille A et al (2021) Are transformers more robust than CNNs? Adv Neural Inf Process Syst 34:2","journal-title":"Adv Neural Inf Process Syst"},{"key":"1750_CR3","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, et\u00a0al (2020) End-to-end object detection with transformers. In: European Conference on Computer Vision, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"issue":"15","key":"1750_CR4","doi-asserted-by":"publisher","first-page":"11381","DOI":"10.1007\/s00500-019-04602-2","volume":"24","author":"P Chaudhari","year":"2020","unstructured":"Chaudhari P, Agrawal H, Kotecha K (2020) Data augmentation using MG-GAN for improved cancer classification on gene expression data. Soft Comput 24(15):11381\u201311391. https:\/\/doi.org\/10.1007\/s00500-019-04602-2","journal-title":"Soft Comput"},{"issue":"1","key":"1750_CR5","first-page":"248","volume":"16","author":"C Chen","year":"2022","unstructured":"Chen C, Zhang Q, Kashani MH et al (2022) Forecast of rainfall distribution based on fixed sliding window long short-term memory. Eng Appl Comput Fluid Mech 16(1):248\u2013261","journal-title":"Eng Appl Comput Fluid Mech"},{"key":"1750_CR6","doi-asserted-by":"crossref","unstructured":"Chen H (2021) Pre-trained image processing transformer. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12,299\u201312,310","DOI":"10.1109\/CVPR46437.2021.01212"},{"issue":"1","key":"1750_CR7","first-page":"965","volume":"16","author":"W Chen","year":"2022","unstructured":"Chen W, Sharifrazi D, Liang G et al (2022) Accurate discharge coefficient prediction of streamlined weirs by coupling linear regression and deep convolutional gated recurrent unit. Eng Appl Comput Fluid Mech 16(1):965\u2013976","journal-title":"Eng Appl Comput Fluid Mech"},{"key":"1750_CR8","doi-asserted-by":"crossref","unstructured":"Chollet F (2017) Xception: Deep learning with depthwise separable convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1251\u20131258","DOI":"10.1109\/CVPR.2017.195"},{"key":"1750_CR9","unstructured":"d\u2019Ascoli S, Touvron H, Leavitt M, et\u00a0al (2021) ConViT: Improving vision transformers with soft convolutional inductive biases. In: ICML"},{"key":"1750_CR10","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, et\u00a0al (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1750_CR11","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et\u00a0al (2021) An image is worth 16x16 words: Transformers for image recognition at scale. In: ICLR"},{"key":"1750_CR12","doi-asserted-by":"publisher","first-page":"25111","DOI":"10.1109\/ACCESS.2020.2970836","volume":"8","author":"Y Fan","year":"2020","unstructured":"Fan Y, Xu K, Wu H et al (2020) Spatiotemporal modeling for nonlinear distributed thermal processes based on kl decomposition, mlp and lstm network. IEEE Access 8:25111\u201325121","journal-title":"IEEE Access"},{"key":"1750_CR13","unstructured":"Geirhos R, Rubisch P, Michaelis C, et\u00a0al (2019) ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness. In: International Conference on Learning Representations"},{"key":"1750_CR14","unstructured":"Glorot X, Bengio Y (2010) Understanding the difficulty of training deep feedforward neural networks. In: Proceedings of the thirteenth international conference on artificial intelligence and statistics, pp 249\u2013256"},{"key":"1750_CR15","unstructured":"Goyal P (2017) Accurate, large minibatch SGD: training ImageNet in 1 hour. arXiv preprint arXiv:1706.02677"},{"issue":"2","key":"1750_CR16","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1007\/s41095-021-0229-5","volume":"7","author":"MH Guo","year":"2021","unstructured":"Guo MH, Cai JX, Liu ZN et al (2021) PCT: Point cloud transformer. Comput Vis Media 7(2):187\u2013199","journal-title":"Comput Vis Media"},{"key":"1750_CR17","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, et\u00a0al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"1750_CR18","doi-asserted-by":"crossref","unstructured":"He K, Gkioxari G, Doll\u00e1r P, et\u00a0al (2017) Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision, pp 2961\u20132969","DOI":"10.1109\/ICCV.2017.322"},{"key":"1750_CR19","unstructured":"Hendrycks D, Dietterich T (2019) Benchmarking Neural Network Robustness to Common Corruptions and Perturbations. 
In: Proceedings of the International Conference on Learning Representations"},{"key":"1750_CR20","doi-asserted-by":"crossref","unstructured":"Hendrycks D, Zhao K, Basart S, et\u00a0al (2021) Natural Adversarial Examples. CVPR","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"1750_CR21","unstructured":"Howard AG, Zhu M, Chen B, et\u00a0al (2017) Mobilenets: efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861"},{"key":"1750_CR22","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1016\/j.patrec.2021.04.024","volume":"148","author":"Y Jin","year":"2021","unstructured":"Jin Y, Han D, Ko H (2021) TrSeg: transformer for semantic segmentation. Pattern Recogn Lett 148:29\u201335. https:\/\/doi.org\/10.1016\/j.patrec.2021.04.024","journal-title":"Pattern Recogn Lett"},{"key":"1750_CR23","doi-asserted-by":"crossref","unstructured":"He K, Gkioxari G, Doll\u00e1r P, Girshick R (2017) Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision, pp 2961\u20132969","DOI":"10.1109\/ICCV.2017.322"},{"key":"1750_CR24","doi-asserted-by":"crossref","unstructured":"Kirillov A, Girshick R, He K, et\u00a0al (2019) Panoptic feature pyramid networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6399\u20136408","DOI":"10.1109\/CVPR.2019.00656"},{"key":"1750_CR25","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton G (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inf Process Syst 25:1097\u20131105","journal-title":"Adv Neural Inf Process Syst"},{"key":"1750_CR26","doi-asserted-by":"crossref","unstructured":"Lin T, Doll\u00e1r P, Girshick R, et\u00a0al (2017) Feature pyramid networks for object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"key":"1750_CR27","doi-asserted-by":"crossref","unstructured":"Lin T, Goyal P, Girshick R, et\u00a0al (2017) Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision, pp 2980\u20132988","DOI":"10.1109\/ICCV.2017.324"},{"key":"1750_CR28","doi-asserted-by":"crossref","unstructured":"Lin TY (2014) Microsoft coco: common objects in context. In: European conference on computer vision, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1750_CR29","doi-asserted-by":"crossref","unstructured":"Lin TY, Goyal P, Girshick R, et\u00a0al (2017) Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision, pp 2980\u20132988","DOI":"10.1109\/ICCV.2017.324"},{"key":"1750_CR30","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, et\u00a0al (2021) Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). pp 10,012\u201310,022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1750_CR31","unstructured":"Loshchilov I, Hutter F (2017) SGDR: stochastic gradient descent with warm restarts. In: Proceedings of the International Conference on Learning Representations 2017"},{"key":"1750_CR32","unstructured":"Loshchilov I, Hutter F (2019) Decoupled weight decay regularization. In: ICLR
p\u00a05"},{"key":"1750_CR33","unstructured":"Radford A, Narasimhan K, Salimans T, et\u00a0al (2018) Improving language understanding by generative pre-training"},{"key":"1750_CR34","unstructured":"Raghu M, Unterthiner T, Kornblith S, et\u00a0al (2021) Do vision transformers see like convolutional neural networks? In: Thirty-Fifth Conference on Neural Information Processing Systems"},{"key":"1750_CR35","doi-asserted-by":"crossref","unstructured":"Shamshirband S, Rabczuk T, Chau KW (2019) A survey of deep learning techniques: application in wind and solar energy resources. IEEE Access 7:164,650\u2013164,666","DOI":"10.1109\/ACCESS.2019.2951750"},{"key":"1750_CR36","unstructured":"Simonyan K, Zisserman A (2015) Very deep convolutional networks for large-scale image recognition. In: ICLR"},{"key":"1750_CR37","doi-asserted-by":"crossref","unstructured":"Sun P (2021) Sparse r-cnn: End-to-end object detection with learnable proposals. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14,454\u201314,463","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"1750_CR38","doi-asserted-by":"crossref","unstructured":"Szegedy C (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1750_CR39","doi-asserted-by":"crossref","unstructured":"Szegedy C, Vanhoucke V, Ioffe S, et\u00a0al (2016) Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2818\u20132826","DOI":"10.1109\/CVPR.2016.308"},{"key":"1750_CR40","unstructured":"Touvron H, Cord M, Douze M, et\u00a0al (2021) Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning. PMLR, pp 10,347\u201310,357"},{"key":"1750_CR41","unstructured":"Vaswani A (2017) Attention is all you need. In: Advances in neural information processing systems. p 5998\u20136008"},{"key":"1750_CR42","unstructured":"Wang P (2021) Scaled relu matters for training vision transformers. ArXiv Prepr ArXiv210903810"},{"key":"1750_CR43","doi-asserted-by":"crossref","unstructured":"Wang W, Xie E, Li X, et\u00a0al (2021) Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: ICCV, vol\u00a03","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"1750_CR44","doi-asserted-by":"crossref","unstructured":"Wu H, Xiao B, Codella N, et\u00a0al (2021) CvT: Introducing convolutions to vision transformers. In: ICCV, vol\u00a03","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"1750_CR45","first-page":"2","volume":"2","author":"T Xiao","year":"2021","unstructured":"Xiao T, Dollar P, Singh M et al (2021) Early convolutions help transformers see better. Adv Neural Inf Process Syst 2:2","journal-title":"Adv Neural Inf Process Syst"},{"key":"1750_CR46","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P, et\u00a0al (2017) Aggregated residual transformations for deep neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1492\u20131500","DOI":"10.1109\/CVPR.2017.634"},{"key":"1750_CR47","unstructured":"Yan H, Li Z, Li W, et\u00a0al (2021) ConTNet: Why not use convolution and transformer at the same time? In: ArXiv210413497 Cs. 
http:\/\/arxiv.org\/abs\/2104.13497"},{"key":"1750_CR48","doi-asserted-by":"publisher","unstructured":"Yuan H, Cai Z, Zhou H, et\u00a0al (2021) TransAnomaly: Video Anomaly Detection Using Video Vision Transformer. IEEE Access 9:123,977\u2013123,986. https:\/\/doi.org\/10.1109\/ACCESS.2021.3109102","DOI":"10.1109\/ACCESS.2021.3109102"},{"key":"1750_CR49","doi-asserted-by":"crossref","unstructured":"Yuan K, Guo S, Liu Z, et\u00a0al (2021) Incorporating convolution designs into visual transformers. In: ICCV","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"1750_CR50","doi-asserted-by":"crossref","unstructured":"Yuan L, Chen Y, Wang T, et\u00a0al (2021) Tokens-to-Token ViT: Training Vision Transformers From Scratch on ImageNet. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). pp 558\u2013567","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"1750_CR51","unstructured":"Zhang H, Ciss\u00e9 M, Dauphin YN, et\u00a0al (2018) Mixup: Beyond empirical risk minimization. In: ICLR"},{"key":"1750_CR52","doi-asserted-by":"crossref","unstructured":"Zhang P (2021) Multi-Scale Vision Longformer: A New Vision Transformer for High-Resolution Image Encoding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 2998\u20133008","DOI":"10.1109\/ICCV48922.2021.00299"},{"key":"1750_CR53","doi-asserted-by":"crossref","unstructured":"Zhang X, Zhou X, Lin M, et\u00a0al (2018) Shufflenet: an extremely efficient convolutional neural network for mobile devices. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6848\u20136856","DOI":"10.1109\/CVPR.2018.00716"},{"key":"1750_CR54","doi-asserted-by":"crossref","unstructured":"Zhang Y (2021) Vidtr: Video transformer without convolutions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 13,577\u201313,587","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"1750_CR55","doi-asserted-by":"crossref","unstructured":"Zhou B, Zhao H, Puig X, et\u00a0al (2017) Scene parsing through ade20k dataset. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 633\u2013641","DOI":"10.1109\/CVPR.2017.544"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-022-01750-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-022-01750-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-022-01750-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,5,11]],"date-time":"2023-05-11T03:13:53Z","timestamp":1683774833000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-022-01750-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12,22]]},"references-count":55,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["1750"],"URL":"https:\/\/doi.org\/10.1007\/s13042-022-01750-0","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,12,22]]},"assertion":[{"value":"17 June 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 December 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 December 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"I declare that I have no financial support or personal relationships with other people or organizations that could inappropriately influence this work. There is no professional or other personal interest of any nature or kind in any products, services, or company that could be construed as influencing the position presented in, or the review of, this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declaration of Interest Statement"}}]}}