{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T00:52:15Z","timestamp":1771030335577,"version":"3.50.1"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,7,9]],"date-time":"2025-07-09T00:00:00Z","timestamp":1752019200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,7,9]],"date-time":"2025-07-09T00:00:00Z","timestamp":1752019200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"DOI":"10.1007\/s11063-025-11774-6","type":"journal-article","created":{"date-parts":[[2025,7,10]],"date-time":"2025-07-10T09:23:18Z","timestamp":1752139398000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["A Transformer-Based Hierarchical Hybrid Encoder Network for Semantic Segmentation"],"prefix":"10.1007","volume":"57","author":[{"given":"Shan","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Xuan","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Kaiwen","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Yuan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,9]]},"reference":[{"key":"11774_CR1","unstructured":"Jonathan L, Evan S, Trevor D (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440"},{"key":"11774_CR2","doi-asserted-by":"crossref","unstructured":"Olaf R, Philipp F, Thomas B (2015) U-net: Convolutional networks for biomedical image segmentation. In: Nassir, N., Joachim, H., M., W.W., F., F.A. (eds.) Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015, pp. 234\u2013241. Springer, New York","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"11774_CR3","doi-asserted-by":"crossref","unstructured":"Kaiming H, Xiangyu Z, Shaoqing R, Jian S (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"12","key":"11774_CR4","doi-asserted-by":"publisher","first-page":"2481","DOI":"10.1109\/TPAMI.2016.2644615","volume":"39","author":"B Vijay","year":"2017","unstructured":"Vijay B, Alex K, Roberto C (2017) Segnet: A deep convolutional encoder-decoder architecture for image segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence 39(12):2481\u20132495. https:\/\/doi.org\/10.1109\/TPAMI.2016.2644615","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"11774_CR5","doi-asserted-by":"publisher","unstructured":"Karen S, Andrew Z (2014) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556https:\/\/doi.org\/10.48550\/arXiv.1409.1556","DOI":"10.48550\/arXiv.1409.1556"},{"key":"11774_CR6","unstructured":"Wenjie L, Yujia L, Raquel U, Richard Z (2016) Understanding the effective receptive field in deep convolutional neural networks. In: Proceedings of the 30th International Conference on Neural Information Processing Systems, pp. 4905\u20134913. Curran Associates Inc., Red Hook, NY, USA"},{"key":"11774_CR7","doi-asserted-by":"publisher","unstructured":"Saining X, Ross G, Piotr D, Zhuowen T, Kaiming H (2017) Aggregated residual transformations for deep neural networks. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5987\u20135995. https:\/\/doi.org\/10.1109\/CVPR.2017.634","DOI":"10.1109\/CVPR.2017.634"},{"key":"11774_CR8","unstructured":"Hengshuang Z, Jianping S, Xiaojuan Q, Xiaogang W, Jiaya J (2017) Pyramid scene parsing network. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6230\u20136239"},{"key":"11774_CR9","first-page":"833","volume-title":"Computer Vision - ECCV 2018","author":"C Liang-Chieh","year":"2018","unstructured":"Liang-Chieh C, Yukun Z, George P, Florian S, Hartwig A (2018) Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Ferrari V, Hebert M, Sminchisescu C, Weiss Y (eds) Computer Vision - ECCV 2018. Springer, Munich, Germany, pp 833\u2013851"},{"key":"11774_CR10","doi-asserted-by":"publisher","unstructured":"Alexey D, Lucas B, Alexander K, Dirk W, Xiaohua Z, Thomas U, Mostafa D, Matthias M, Georg H, Sylvain G, et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929https:\/\/doi.org\/10.48550\/arXiv.2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"key":"11774_CR11","unstructured":"Jia D, Wei D, Richard S, Li-Jia L, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE, Miami, FL, USA"},{"key":"11774_CR12","unstructured":"Sixiao Z, Jiachen L, Hengshuang Z, Xiatian Z, Zekun L, Yabiao W, Yanwei F, Jianfeng F, Tao X, HS, TP, Li Z (2021) Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6877\u20136886. IEEE, online"},{"key":"11774_CR13","doi-asserted-by":"publisher","unstructured":"Enze X, Wenhai W, Zhiding Y, Anima A, M AJ, Ping L (2021) Segformer: Simple and efficient design for semantic segmentation with transformers. Advances in Neural Information Processing Systems 34, 12077\u201312090 https:\/\/doi.org\/10.48550\/arXiv.2105.15203","DOI":"10.48550\/arXiv.2105.15203"},{"key":"11774_CR14","unstructured":"Maoke Y, Kun Y, Chi Z, Zhiwei L, Kuiyuan Y (2018) Denseaspp for semantic segmentation in street scenes. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3684\u20133692. IEEE, Salt Lake City, UT, USA"},{"key":"11774_CR15","doi-asserted-by":"publisher","unstructured":"Liang-Chieh C, George P, Iasonas K, Kevin M, L YA (2014) Semantic image segmentation with deep convolutional nets and fully connected crfs. arXiv preprint arXiv:1412.7062https:\/\/doi.org\/10.48550\/arXiv.1412.7062","DOI":"10.48550\/arXiv.1412.7062"},{"key":"11774_CR16","doi-asserted-by":"publisher","unstructured":"Liang-Chieh C, George P, Florian S, Hartwig A (2017) Rethinking atrous convolution for semantic image segmentation. arXiv preprint arXiv:1706.05587https:\/\/doi.org\/10.48550\/arXiv.1706.05587","DOI":"10.48550\/arXiv.1706.05587"},{"key":"11774_CR17","doi-asserted-by":"crossref","unstructured":"Guosheng L, Anton M, Chunhua S, Ian R (2017) Refinenet: Multi-path refinement networks for high-resolution semantic segmentation. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5168\u20135177. IEEE, Honolulu, HI, USA","DOI":"10.1109\/CVPR.2017.549"},{"key":"11774_CR18","unstructured":"Wenhai W, Enze X, Xiang L, Deng-Ping F, Kaitao S, Ding L, Tong L, Ping L, Ling S (2021) Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 548\u2013558. IEEE, online"},{"key":"11774_CR19","doi-asserted-by":"crossref","unstructured":"Haiping W, Bin X, Noel C, Mengchen L, Xiyang D, Lu Y, Lei Z (2021) Cvt: Introducing convolutions to vision transformers. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 22\u201331. IEEE, online","DOI":"10.1109\/ICCV48922.2021.00009"},{"issue":"11","key":"11774_CR20","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ac9830","volume":"2022","author":"S d\u2019Ascoli","year":"2022","unstructured":"d\u2019Ascoli S, Touvron H, Leavitt ML, Morcos AS, Biroli G, Sagun L (2022) Convit: improving vision transformers with soft convolutional inductive biases*. Journal of Statistical Mechanics: Theory and Experiment 2022(11):114005. https:\/\/doi.org\/10.1088\/1742-5468\/ac9830","journal-title":"Journal of Statistical Mechanics: Theory and Experiment"},{"key":"11774_CR21","unstructured":"Byeongho H, Sangdoo Y, Dongyoon H, Sanghyuk C, Junsuk C, Joon OS (2021) Rethinking spatial dimensions of vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11936\u201311945. IEEE, online"},{"key":"11774_CR22","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9992\u201310002. IEEE, online","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"11774_CR23","doi-asserted-by":"crossref","unstructured":"Mark S, Andrew H, Menglong Z, Andrey Z, Liang-Chieh C (2018) Mobilenetv2: Inverted residuals and linear bottlenecks. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4510\u20134520. IEEE, Salt Lake City, UT, USA","DOI":"10.1109\/CVPR.2018.00474"},{"key":"11774_CR24","doi-asserted-by":"crossref","unstructured":"Andrew H, Mark S, Bo C, Weijun W, Liang-Chieh C, Mingxing T, Grace C, Vijay V, Yukun Z, Ruoming P, Hartwig A, Quoc L (2019) Searching for mobilenetv3. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1314\u20131324. IEEE, Seoul, South Korea","DOI":"10.1109\/ICCV.2019.00140"},{"key":"11774_CR25","doi-asserted-by":"publisher","unstructured":"Sachin M, Mohammad R (2021) Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer. arXiv preprint arXiv:2110.02178https:\/\/doi.org\/10.48550\/arXiv.2110.02178","DOI":"10.48550\/arXiv.2110.02178"},{"key":"11774_CR26","unstructured":"Robin S, Ricardo G, Ivan L, Cordelia S (2021) Segmenter: Transformer for semantic segmentation. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7242\u20137252. IEEE, online"},{"key":"11774_CR27","unstructured":"Ben G, Alaaeldin E-N, Touvron H, Stock P, Joulin A, J\u00e9gou H, Douze M (2021) Levit: a vision transformer in convnet\u2019s clothing for faster inference. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 12239\u201312249. IEEE, online"},{"key":"11774_CR28","doi-asserted-by":"crossref","unstructured":"Wenqiang Z, Zilong H, Guozhong L, Tao C, Xinggang W, Wenyu L, Gang Y, Chunhua S (2022) Topformer: Token pyramid transformer for mobile semantic segmentation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12073\u201312083. IEEE, New Orleans, LA, USA","DOI":"10.1109\/CVPR52688.2022.01177"},{"key":"11774_CR29","doi-asserted-by":"crossref","unstructured":"Hu C, Yueyue W, Joy C, Dongsheng J, Xiaopeng Z, Qi T, Manning W (2023) Swin-unet: Unet-like pure transformer for medical image segmentation. In: L., K., T., M., K., N. (eds.) Computer Vision \u2013 ECCV 2022 Workshops, pp. 205\u2013218. Springer, Tel Aviv, Israel","DOI":"10.1007\/978-3-031-25066-8_9"},{"key":"11774_CR30","unstructured":"Marius C, Omran M, Sebastian R, Timo R, Markus E, Rodrigo B, Uwe F, Stefan R, Bernt S (2016) The cityscapes dataset for semantic urban scene understanding. In: Proc. of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, Las Vegas, NV, USA"},{"key":"11774_CR31","first-page":"5122","volume-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Z Bolei","year":"2017","unstructured":"Bolei Z, Hang Z, Xavier P, Sanja F, Adela B, Antonio T (2017) Scene parsing through ade20k dataset. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, Honolulu, HI, USA, pp 5122\u20135130"},{"issue":"2","key":"11774_CR32","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham M, Van Gool L, Williams CKI, Winn J, Zisserman A (2010) The pascal visual object classes (voc) challenge. International Journal of Computer Vision 88(2):303\u2013338","journal-title":"International Journal of Computer Vision"},{"key":"11774_CR33","doi-asserted-by":"crossref","unstructured":"Hang Z, Kristin D, Jianping S, Zhongyue Z, Xiaogang W, Ambrish T, Amit A (2018) Context encoding for semantic segmentation. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7151\u20137160. IEEE, Salt Lake City, UT, USA","DOI":"10.1109\/CVPR.2018.00747"},{"key":"11774_CR34","doi-asserted-by":"publisher","unstructured":"Zilong H, Xinggang W, Yunchao W, Lichao H, Humphrey S, Wenyu L, S HT (2023) Ccnet: Criss-cross attention for semantic segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence 45(6):6896\u20136908. https:\/\/doi.org\/10.1109\/TPAMI.2020.3007032","DOI":"10.1109\/TPAMI.2020.3007032"},{"key":"11774_CR35","doi-asserted-by":"crossref","unstructured":"Yuhui Y, Xilin C, Jingdong W (2020) Object-contextual representations for semantic segmentation. In: A., V., H., B., T., B., J., F. (eds.) Computer Vision \u2013 ECCV 2020, pp. 173\u2013190. Springer, online","DOI":"10.1007\/978-3-030-58539-6_11"},{"key":"11774_CR36","doi-asserted-by":"crossref","unstructured":"Huiyu W, Yukun Z, Bradley G, Hartwig A, Alan Y, Liang-Chieh C (2020) Axial-deeplab: Stand-alone axial-attention for panoptic segmentation. In: A., V., H., B., T., B., J., F. (eds.) Computer Vision \u2013 ECCV 2020, pp. 108\u2013126. Springer, online","DOI":"10.1007\/978-3-030-58548-8_7"},{"key":"11774_CR37","unstructured":"Yanwei L, Lin S, Yukang C, Zeming L, Xiangyu Z, Xingang W, Jian S (2020) Learning dynamic routing for semantic segmentation. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8550\u20138559. IEEE, online"},{"key":"11774_CR38","doi-asserted-by":"crossref","unstructured":"Chenxi L, Liang-Chieh C, Florian S, Hartwig A, Wei H, L, YA, Li F-F (2019) Auto-deeplab: Hierarchical neural architecture search for semantic image segmentation. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 82\u201392. IEEE, Long Beach, CA, USA","DOI":"10.1109\/CVPR.2019.00017"},{"key":"11774_CR39","doi-asserted-by":"publisher","unstructured":"Fu J, Liu J, Tian H, Li Y, Bao Y, Fang Z, Lu H (2019) Dual attention network for scene segmentation. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3141\u20133149. https:\/\/doi.org\/10.1109\/CVPR.2019.00326","DOI":"10.1109\/CVPR.2019.00326"},{"key":"11774_CR40","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1007\/978-3-030-01240-3_17","volume-title":"Computer Vision - ECCV 2018","author":"H Zhao","year":"2018","unstructured":"Zhao H, Zhang Y, Liu S, Shi J, Loy CC, Lin D, Jia J (2018) Psanet: Point-wise spatial attention network for scene parsing. In: Ferrari V, Hebert M, Sminchisescu C, Weiss Y (eds) Computer Vision - ECCV 2018. Springer, Cham, pp 270\u2013286"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-025-11774-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-025-11774-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-025-11774-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,7]],"date-time":"2025-09-07T04:09:47Z","timestamp":1757218187000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-025-11774-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,9]]},"references-count":40,"journal-issue":{"issue":"4","published-online":{"date-parts":[[2025,8]]}},"alternative-id":["11774"],"URL":"https:\/\/doi.org\/10.1007\/s11063-025-11774-6","relation":{},"ISSN":["1573-773X"],"issn-type":[{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,9]]},"assertion":[{"value":"12 May 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 July 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest\/Competing Interests"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"All authors agreed to participate in this paper.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to Participate"}},{"value":"Not applicable","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for Publication"}},{"value":"The code are available from the corresponding author upon reasonable request.","order":6,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code Availability"}}],"article-number":"66"}}