{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:29:38Z","timestamp":1740122978128,"version":"3.37.3"},"reference-count":75,"publisher":"Springer Science and Business Media LLC","issue":"41","license":[{"start":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T00:00:00Z","timestamp":1711411200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T00:00:00Z","timestamp":1711411200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-18911-8","type":"journal-article","created":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T06:19:47Z","timestamp":1711433987000},"page":"88717-88744","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["FusFormer: global and detail feature fusion transformer for semantic segmentation of small objects"],"prefix":"10.1007","volume":"83","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4160-0849","authenticated-orcid":false,"given":"Zheng","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Houjin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jupeng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Baozheng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changyong","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,3,26]]},"reference":[{"key":"18911_CR1","doi-asserted-by":"crossref","unstructured":"Gao X, Wang B, Tao D, Li X (2011) A relay level set method for automatic image segmentation. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics) 41(2):518\u2013525","DOI":"10.1109\/TSMCB.2010.2065800"},{"issue":"8","key":"18911_CR2","doi-asserted-by":"publisher","first-page":"1426","DOI":"10.1109\/TCYB.2014.2352343","volume":"45","author":"K Zhang","year":"2015","unstructured":"Zhang K, Liu Q, Song H, Li X (2015) A variational approach to simultaneous image segmentation and bias correction. IEEE Trans Cybern 45(8):1426\u20131437","journal-title":"IEEE Trans Cybern"},{"key":"18911_CR3","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3431\u20133440","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"18911_CR4","doi-asserted-by":"crossref","unstructured":"Zhang H, Dana K, Shi J, Zhang Z., Wang X, Tyagi A, Agrawal A (2018) Context encoding for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7151\u20137160","DOI":"10.1109\/CVPR.2018.00747"},{"key":"18911_CR5","doi-asserted-by":"crossref","unstructured":"Zhang F, Chen Y, Li Z, Hong Z, Liu J, Ma F, Han J, Ding E (2019) Acfnet: Attentional class feature network for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6798\u20136807","DOI":"10.1109\/ICCV.2019.00690"},{"key":"18911_CR6","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly, S et\u00a0al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"18911_CR7","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie E, Wang W, Yu Z, Anandkumar A, Alvarez JM, Luo P (2021) Segformer: Simple and efficient design for semantic segmentation with transformers. Adv Neural Inf Process Syst 34:12077\u201312090","journal-title":"Adv Neural Inf Process Syst"},{"key":"18911_CR8","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"18911_CR9","unstructured":"Yan H, Zhang C, Wu M (2022) Lawin transformer: Improving semantic segmentation transformer with multi-scale representations via large window attention. arXiv:2201.01615"},{"key":"18911_CR10","doi-asserted-by":"crossref","unstructured":"Meng Z, Fan X, Chen X, Chen M, Tong Y (2017) Detecting small signs from large images. In: 2017 IEEE international conference on information reuse and integration (IRI), pp 217\u2013224","DOI":"10.1109\/IRI.2017.57"},{"key":"18911_CR11","doi-asserted-by":"crossref","unstructured":"Li J, Liang X, Wei Y, Xu T, Feng J, Yan S (2017) Perceptual generative adversarial networks for small object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1222\u20131230","DOI":"10.1109\/CVPR.2017.211"},{"key":"18911_CR12","doi-asserted-by":"crossref","unstructured":"Li H, Lin Z, Shen X, Brandt J, Hua G (2015) A convolutional neural network cascade for face detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5325\u20135334","DOI":"10.1109\/CVPR.2015.7299170"},{"key":"18911_CR13","doi-asserted-by":"crossref","unstructured":"Zheng S, Jayasumana S, Romera-Paredes B, Vineet V, Su Z, Du D, Huang C, Torr PH (2015) Conditional random fields as recurrent neural networks. In: Proceedings of the IEEE international conference on computer vision, pp 1529\u20131537","DOI":"10.1109\/ICCV.2015.179"},{"issue":"6","key":"18911_CR14","doi-asserted-by":"publisher","first-page":"2643","DOI":"10.1109\/TIP.2018.2888701","volume":"28","author":"D Guo","year":"2018","unstructured":"Guo D, Zhu L, Lu Y, Yu H, Wang S (2018) Small object sensitive segmentation of urban street scene with spatial adjacency between object classes. IEEE Trans Image Process 28(6):2643\u20132653","journal-title":"IEEE Trans Image Process"},{"key":"18911_CR15","unstructured":"Kr\u00e4henb\u00fchl P, Koltun V (2011) Efficient inference in fully connected crfs with gaussian edge potentials. Advances in neural information processing systems 24"},{"key":"18911_CR16","doi-asserted-by":"crossref","unstructured":"Chandra S, Kokkinos I (2016) Fast, exact and multi-scale inference for semantic image segmentation with deep gaussian crfs. In: Computer Vision\u2013ECCV 2016: 14th european conference, amsterdam, the netherlands, October 11\u201314, 2016, Proceedings, Part VII 14, pp 402\u2013418","DOI":"10.1007\/978-3-319-46478-7_25"},{"issue":"12","key":"18911_CR17","doi-asserted-by":"publisher","first-page":"2481","DOI":"10.1109\/TPAMI.2016.2644615","volume":"39","author":"V Badrinarayanan","year":"2017","unstructured":"Badrinarayanan V, Kendall A, Cipolla R (2017) Segnet: A deep convolutional encoder-decoder architecture for image segmentation. IEEE Trans Pattern Anal Mach Intell 39(12):2481\u20132495","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"18911_CR18","doi-asserted-by":"crossref","unstructured":"Strudel R, Garcia R, Laptev I, Schmid C (2021) Segmenter: Transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7262\u20137272","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"18911_CR19","doi-asserted-by":"crossref","unstructured":"Wu H, Xiao B, Codella N, Liu M, Dai X, Yuan L, Zhang L (2021) Cvt: Introducing convolutions to vision transformers. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 22\u201331","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"18911_CR20","doi-asserted-by":"crossref","unstructured":"Yuan K, Guo S, Liu Z, Zhou A, Yu F, Wu W (2021) Incorporating convolution designs into visual transformers. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 579\u2013588","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"18911_CR21","unstructured":"Yan H, Li Z, Li W, Wang C, Wu M, Zhang C (2021) Contnet: Why not use convolution and transformer at the same time. arXiv:2104.13497"},{"key":"18911_CR22","first-page":"965","volume":"34","author":"Z Dai","year":"2021","unstructured":"Dai Z, Liu H, Le QV, Tan M (2021) Coatnet: Marrying convolution and attention for all data sizes. Adv Neural Inf Process Syst 34:965\u20133977","journal-title":"Adv Neural Inf Process Syst"},{"key":"18911_CR23","doi-asserted-by":"crossref","unstructured":"Cordts M, Omran M, Ramos S, Rehfeld T, Enzweiler M, Benenson R, Franke U, Roth S, Schiele B (2016) The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3213\u20133223","DOI":"10.1109\/CVPR.2016.350"},{"key":"18911_CR24","doi-asserted-by":"crossref","unstructured":"Zhou B, Zhao H, Puig X, Fidler, S, Barriuso A, Torralba A (2017) Scene parsing through ade20k dataset. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 633\u2013641","DOI":"10.1109\/CVPR.2017.544"},{"issue":"1","key":"18911_CR25","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham M, Eslami SMA, Van Gool L, Williams CKI, Winn J, Zisserman A (2015) The pascal visual object classes challenge: A retrospective. Int J Comput Vis 111(1):98\u2013136","journal-title":"Int J Comput Vis"},{"key":"18911_CR26","doi-asserted-by":"crossref","unstructured":"Fu J, Liu J, Tian H, Li Y, Bao Y, Fang Z, Lu H (2019) Dual attention network for scene segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3146\u20133154","DOI":"10.1109\/CVPR.2019.00326"},{"key":"18911_CR27","doi-asserted-by":"crossref","unstructured":"Huang Z, Wang X, Huang L, Huang C, Wei Y, Liu W (2019) Ccnet: Criss-cross attention for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 603\u2013612","DOI":"10.1109\/ICCV.2019.00069"},{"key":"18911_CR28","doi-asserted-by":"crossref","unstructured":"Yuan Y, Chen X, Chen X, Wang J (2019) Segmentation transformer: Object-contextual representations for semantic segmentation. arXiv:1909.11065","DOI":"10.1007\/978-3-030-58539-6_11"},{"key":"18911_CR29","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"18911_CR30","doi-asserted-by":"crossref","unstructured":"Takikawa T, Acuna D, Jampani V, Fidler S (2019) Gated-scnn: Gated shape cnns for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 5229\u20135238","DOI":"10.1109\/ICCV.2019.00533"},{"key":"18911_CR31","doi-asserted-by":"crossref","unstructured":"Jin Z, Liu B, Chu Q, Yu N (2021) Isnet: Integrate image-level and semantic-level context for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7189\u20137198","DOI":"10.1109\/ICCV48922.2021.00710"},{"key":"18911_CR32","doi-asserted-by":"crossref","unstructured":"Zheng S, Lu J, Zhao H, Zhu X, Luo Z, Wang Y, Fu Y, Feng J, Xiang T, Torr PH et\u00a0al (2021) Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6881\u20136890","DOI":"10.1109\/CVPR46437.2021.00681"},{"issue":"12","key":"18911_CR33","doi-asserted-by":"publisher","first-page":"2481","DOI":"10.1109\/TPAMI.2016.2644615","volume":"39","author":"V Badrinarayanan","year":"2017","unstructured":"Badrinarayanan V, Kendall A, Cipolla R (2017) Segnet: A deep convolutional encoder-decoder architecture for image segmentation. IEEE Trans Pattern Anal Mach Intell 39(12):2481\u20132495","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"18911_CR34","first-page":"1","volume":"60","author":"A Ma","year":"2021","unstructured":"Ma A, Wang J, Zhong Y, Zheng Z (2021) Factseg: Foreground activation-driven small object semanticsegmentation in large-scale remote sensing imagery. IEEE Trans Geosci Remote Sens 60:1\u201316","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"18911_CR35","doi-asserted-by":"crossref","unstructured":"Sun C, Shrivastava A, Singh S, Gupta A (2017) Revisiting unreasonable effectiveness of data in deep learning era. In: Proceedings of the IEEE international conference on computer vision, pp 843\u2013852","DOI":"10.1109\/ICCV.2017.97"},{"key":"18911_CR36","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"18911_CR37","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles A, J\u00e9gou H (2021) Training data-efficient image transformers & distillation through attention. In: International conference on machine learning, pp 10347\u201310357"},{"key":"18911_CR38","doi-asserted-by":"crossref","unstructured":"Chen C-FR, Fan Q, Panda R (2021) Crossvit: Cross-attention multi-scale vision transformer for image classification. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 357\u2013366","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"18911_CR39","doi-asserted-by":"crossref","unstructured":"Wang W, Xie E, Li X, Fan D-P, Song K, Liang D, Lu T, Luo P, Shao L (2021) Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 568\u2013578","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"18911_CR40","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1016\/j.neucom.2023.01.055","volume":"525","author":"S Xu","year":"2023","unstructured":"Xu S, Gu J, Hua Y, Liu Y (2023) Dktnet: Dual-key transformer network for small object detection. Neurocomputing 525:29\u201341","journal-title":"Neurocomputing"},{"key":"18911_CR41","first-page":"15475","volume":"34","author":"Q Zhang","year":"2021","unstructured":"Zhang Q, Yang Y-B (2021) Rest: An efficient transformer for visual recognition. Adv Neural Inf Process Syst 34:15475\u201315485","journal-title":"Adv Neural Inf Process Syst"},{"key":"18911_CR42","doi-asserted-by":"crossref","unstructured":"Zhao H, Shi J, Qi X, Wang X, Jia J (2017) Pyramid scene parsing network. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2881\u20132890","DOI":"10.1109\/CVPR.2017.660"},{"issue":"4","key":"18911_CR43","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"L-C Chen","year":"2017","unstructured":"Chen L-C, Papandreou G, Kokkinos I, Murphy K, Yuille AL (2017) Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans Pattern Anal Mach Intell 40(4):834\u2013848","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"18911_CR44","unstructured":"Chen L-C, Papandreou G, Schroff F, Adam H (2017) Rethinking atrous convolution for semantic image segmentation. arXiv:1706.05587"},{"key":"18911_CR45","doi-asserted-by":"crossref","unstructured":"Chen L-C, Zhu Y, Papandreou G, Schroff F, Adam H (2018) Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Proceedings of the european conference on computer vision (ECCV), pp 801\u2013818","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"18911_CR46","doi-asserted-by":"crossref","unstructured":"Yang M, Yu K, Zhang C, Li Z, Yang K (2018) Denseaspp for semantic segmentation in street scenes. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3684\u20133692","DOI":"10.1109\/CVPR.2018.00388"},{"key":"18911_CR47","doi-asserted-by":"publisher","first-page":"360","DOI":"10.1109\/TIP.2019.2930906","volume":"29","author":"Y Liu","year":"2019","unstructured":"Liu Y, Han J, Zhang Q, Shan C (2019) Deep salient object detection with contextual information guidance. IEEE Trans Image Process 29:360\u2013374","journal-title":"IEEE Trans Image Process"},{"key":"18911_CR48","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1016\/j.neucom.2020.11.022","volume":"428","author":"Y Liu","year":"2021","unstructured":"Liu Y, Duanmu M, Huo Z, Qi H, Chen Z, Li L, Zhang Q (2021) Exploring multi-scale deformable context and channel-wise attention for salient object detection. Neurocomputing 428:92\u2013103","journal-title":"Neurocomputing"},{"key":"18911_CR49","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv:1607.06450"},{"key":"18911_CR50","unstructured":"Hendrycks D, Gimpel K (2016) Gaussian error linear units (gelus). arXiv:1606.08415"},{"key":"18911_CR51","unstructured":"Guo M-H, Lu C-Z, Hou Q, Liu Z, Cheng M-M, Hu S-M (2022) Segnext: Rethinking convolutional attention design for semantic segmentation. arXiv:2209.08575"},{"key":"18911_CR52","unstructured":"Deng H, Ren Q, Chen X, Zhang H, Ren J, Zhang Q (2021) Discovering and explaining the representation bottleneck of dnns. arXiv:2111.06236"},{"key":"18911_CR53","unstructured":"Li S, Wang Z, Liu Z, Tan C, Lin H, Wu D, Chen Z, Zheng J, Li SZ (2022) Efficient multi-order gated aggregation network. arXiv:2211.03295"},{"key":"18911_CR54","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"18911_CR55","doi-asserted-by":"crossref","unstructured":"Szegedy C, Ioffe S, Vanhoucke V, Alemi AA (2017) Inception-v4, inception-resnet and the impact of residual connections on learning. In: Thirty-first AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"18911_CR56","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P, Tu Z, He K (2017) Aggregated residual transformations for deep neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1492\u20131500","DOI":"10.1109\/CVPR.2017.634"},{"key":"18911_CR57","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.neunet.2017.12.012","volume":"107","author":"S Elfwing","year":"2018","unstructured":"Elfwing S, Uchibe E, Doya K (2018) Sigmoid-weighted linear units for neural network function approximation in reinforcement learning. Neural Netw 107:3\u201311","journal-title":"Neural Netw"},{"key":"18911_CR58","doi-asserted-by":"crossref","unstructured":"Shi W, Caballero J, Husz\u00e1r F, Totz J, Aitken AP, Bishop R, Rueckert, D, Wang, Z (2016) Real-time single image and video super-resolution using an efficient sub-pixel convolutional neural network. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1874\u20131883","DOI":"10.1109\/CVPR.2016.207"},{"key":"18911_CR59","first-page":"25346","volume":"34","author":"M Mao","year":"2021","unstructured":"Mao M, Zhang R, Zheng H, Ma T, Peng Y, Ding E, Zhang B, Han S et al (2021) Dual-stream network for visual recognition. Adv Neural Inf Process Syst 34:25346\u201325358","journal-title":"Adv Neural Inf Process Syst"},{"key":"18911_CR60","unstructured":"Wang Y, Sun H, Wang X, Zhang B, Li C, Xin Y, Zhang B, Ding E, Han S (2022) Maformer: A transformer network with multi-scale attention fusion for visual recognition. arXiv:2209.01620"},{"key":"18911_CR61","doi-asserted-by":"crossref","unstructured":"Huang S, Lu Z, Cheng R, He C (2021) Fapn: Feature-aligned pyramid network for dense image prediction. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 864\u2013873","DOI":"10.1109\/ICCV48922.2021.00090"},{"key":"18911_CR62","unstructured":"Islam MA, Jia S, Bruce ND (2020) How much position information do convolutional neural networks encode. arXiv:2001.08248"},{"key":"18911_CR63","unstructured":"Chu X, Tian Z, Zhang B, Wang X, Wei X, Xia H, Shen C (2021) Conditional positional encodings for vision transformers. arXiv:2102.10882"},{"key":"18911_CR64","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: Accelerating deep network training by reducing internal covariate shift. In: International conference on machine learning, pp 448\u2013456"},{"key":"18911_CR65","unstructured":"Glorot X, Bordes A, Bengio Y (2011) Deep sparse rectifier neural networks. In: Proceedings of the fourteenth international conference on artificial intelligence and statistics, pp 315\u2013323"},{"key":"18911_CR66","unstructured":"Contributors M (2020) MMSegmentation: OpenMMLab Semantic Segmentation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmsegmentation"},{"key":"18911_CR67","unstructured":"Loshchilov I, Hutter F (2017) Decoupled weight decay regularization. arXiv:1711.05101"},{"key":"18911_CR68","doi-asserted-by":"crossref","unstructured":"Zhu Z, Xu M, Bai S, Huang T, Bai X (2019) Asymmetric non-local neural networks for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 593\u2013602","DOI":"10.1109\/ICCV.2019.00068"},{"key":"18911_CR69","doi-asserted-by":"publisher","first-page":"1169","DOI":"10.1109\/TIP.2020.3042065","volume":"30","author":"T Wu","year":"2020","unstructured":"Wu T, Tang S, Zhang R, Cao J, Zhang Y (2020) Cgnet: A light-weight context guided network for semantic segmentation. IEEE Trans Image Process 30:1169\u20131179","journal-title":"IEEE Trans Image Process"},{"key":"18911_CR70","doi-asserted-by":"crossref","unstructured":"Zhao H, Zhang Y, Liu S, Shi J, Loy CC, Lin D, Jia J (2018) Psanet: Point-wise spatial attention network for scene parsing. In: Proceedings of the european conference on computer vision (ECCV), pp 267\u2013283","DOI":"10.1007\/978-3-030-01240-3_17"},{"key":"18911_CR71","doi-asserted-by":"crossref","unstructured":"Xiao T, Liu Y, Zhou B, Jiang Y, Sun J (2018) Unified perceptual parsing for scene understanding. In: Proceedings of the european conference on computer vision (ECCV), pp 418\u2013434","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"18911_CR72","doi-asserted-by":"crossref","unstructured":"Li X, Zhong Z, Wu J, Yang Y, Lin Z, Liu H (2019) Expectation-maximization attention networks for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 9167\u20139176","DOI":"10.1109\/ICCV.2019.00926"},{"key":"18911_CR73","unstructured":"Park N, Kim S (2022) How do vision transformers work. arXiv:2202.06709"},{"key":"18911_CR74","unstructured":"Pan Z, Cai J, Zhuang B (2022) Fast vision transformers with hilo attention. arXiv:2205.13213"},{"key":"18911_CR75","doi-asserted-by":"crossref","unstructured":"Bai J, Yuan L, Xia S-T, Yan S, Li Z, Liu W (2022) Improving vision transformers by revisiting high-frequency components. arXiv:2204.00993","DOI":"10.1007\/978-3-031-20053-3_1"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18911-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-18911-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18911-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T12:11:15Z","timestamp":1734955875000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-18911-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,26]]},"references-count":75,"journal-issue":{"issue":"41","published-online":{"date-parts":[[2024,12]]}},"alternative-id":["18911"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-18911-8","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2024,3,26]]},"assertion":[{"value":"15 May 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 October 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 March 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}},{"value":"Informed consent was obtained from all individual participants included in the study.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and Informed Consent"}}]}}