{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T03:11:49Z","timestamp":1758597109545,"version":"3.44.0"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T00:00:00Z","timestamp":1757030400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T00:00:00Z","timestamp":1757030400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"name":"National Science and Technology Major Project of High Resolution Earth Observation System of China","award":["80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23"],"award-info":[{"award-number":["80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23","80-Y50G19-9001-22\/23"]}]},{"name":"Science and Technology Research Project of Henan Province","award":["222102210061","222102210061","222102210061","222102210061","222102210061","222102210061"],"award-info":[{"award-number":["222102210061","222102210061","222102210061","222102210061","222102210061","222102210061"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11760-025-04575-w","type":"journal-article","created":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T16:15:55Z","timestamp":1757088955000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhanced Building Extraction via STMC-UNet: Integrating Super Token Transformer and Multi-scale Convolution"],"prefix":"10.1007","volume":"19","author":[{"given":"Ke","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Cong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yanna","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ru","family":"Miao","sequence":"additional","affiliation":[]},{"given":"Shihao","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Yuanxing","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,5]]},"reference":[{"issue":"4","key":"4575_CR1","doi-asserted-by":"publisher","first-page":"808","DOI":"10.3390\/rs13040808","volume":"13","author":"B Neupane","year":"2021","unstructured":"Neupane, B., Horanont, T., Aryal, J.: Deep learning-based semantic segmentation of urban features in satellite images: A review and meta-analysis. Remote Sensing 13(4), 808 (2021)","journal-title":"Remote Sensing"},{"key":"4575_CR2","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 801\u2013818 (2018)","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"4575_CR3","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5693\u20135703 (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"4575_CR4","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"4575_CR5","doi-asserted-by":"crossref","unstructured":"Wang, Q., Wu, B., Zhu, P., Li, P., Zuo, W., Hu, Q.: Eca-net: Efficient channel attention for deep convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11534\u201311542 (2020)","DOI":"10.1109\/CVPR42600.2020.01155"},{"key":"4575_CR6","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"4575_CR7","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"4575_CR8","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"4575_CR9","unstructured":"Chen, J., Lu, Y., Yu, Q., Luo, X., Adeli, E., Wang, Y., Lu, L., Yuille, A.L., Zhou, Y.: Transunet: Transformers make strong encoders for medical image segmentation. arXiv preprint arXiv:2102.04306 (2021)"},{"key":"4575_CR10","first-page":"1","volume":"21","author":"M Lin","year":"2024","unstructured":"Lin, M., Jing, W., Li, C., Jung, A.: Optimized vectorizing of building structures with switch: High-efficiency convolutional channel-switch hybridization strategy. IEEE Geosci. Remote Sens. Lett. 21, 1\u20135 (2024)","journal-title":"IEEE Geosci. Remote Sens. Lett."},{"key":"4575_CR11","doi-asserted-by":"crossref","unstructured":"Lin, M., Jing, W., Zou, W., Qiu, Z., Li, C.: Gaussian-based swap operator for context-aware extraction of building boundary vectors. IEEE Transactions on Geoscience and Remote Sensing (2025)","DOI":"10.1109\/TGRS.2025.3532830"},{"key":"4575_CR12","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"4575_CR13","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: Convolutional networks for biomedical image segmentation. In: Medical Image Computing and Computer-assisted intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18, pp. 234\u2013241 (2015). Springer","DOI":"10.1007\/978-3-319-24574-4_28"},{"issue":"4","key":"4575_CR14","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"L-C Chen","year":"2017","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4575_CR15","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2881\u20132890 (2017)","DOI":"10.1109\/CVPR.2017.660"},{"issue":"9","key":"4575_CR16","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.3390\/rs14091956","volume":"14","author":"H Wang","year":"2022","unstructured":"Wang, H., Chen, X., Zhang, T., Xu, Z., Li, J.: Cctnet: Coupled cnn and transformer network for crop segmentation of remote sensing images. Remote Sensing 14(9), 1956 (2022)","journal-title":"Remote Sensing"},{"key":"4575_CR17","first-page":"1","volume":"19","author":"L Wang","year":"2022","unstructured":"Wang, L., Li, R., Duan, C., Zhang, C., Meng, X., Fang, S.: A novel transformer based semantic segmentation scheme for fine-resolution remote sensing images. IEEE Geosci. Remote Sens. Lett. 19, 1\u20135 (2022)","journal-title":"IEEE Geosci. Remote Sens. Lett."},{"key":"4575_CR18","doi-asserted-by":"crossref","unstructured":"Wu, Y., Zhang, M.: Swin-cfnet: An attempt at fine-grained urban green space classification using swin transformer and convolutional neural network. IEEE Geoscience and Remote Sensing Letters (2024)","DOI":"10.1109\/LGRS.2024.3404393"},{"issue":"6","key":"4575_CR19","doi-asserted-by":"publisher","first-page":"1856","DOI":"10.1109\/TMI.2019.2959609","volume":"39","author":"Z Zhou","year":"2019","unstructured":"Zhou, Z., Siddiquee, M.M.R., Tajbakhsh, N., Liang, J.: Unet++: Redesigning skip connections to exploit multiscale features in image segmentation. IEEE Trans. Med. Imaging 39(6), 1856\u20131867 (2019)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"4575_CR20","doi-asserted-by":"crossref","unstructured":"Huang, H., Lin, L., Tong, R., Hu, H., Zhang, Q., Iwamoto, Y., Han, X., Chen, Y.-W., Wu, J.: Unet 3+: A full-scale connected unet for medical image segmentation. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1055\u20131059 (2020). IEEE","DOI":"10.1109\/ICASSP40776.2020.9053405"},{"key":"4575_CR21","doi-asserted-by":"crossref","unstructured":"Wang, H., Cao, P., Wang, J., Zaiane, O.R.: Uctransnet: rethinking the skip connections in u-net from a channel-wise perspective with transformer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 2441\u20132449 (2022)","DOI":"10.1609\/aaai.v36i3.20144"},{"key":"4575_CR22","unstructured":"Huang, H., Zhou, X., Cao, J., He, R., Tan, T.: Vision transformer with super token sampling. arXiv preprint arXiv:2211.11167 (2022)"},{"key":"4575_CR23","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J.-Y., Kweon, I.S.: Cbam: Convolutional block attention module. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"4575_CR24","unstructured":"Mnih, V.: Machine learning for aerial image labeling. PhD thesis, University of Toronto (Canada) (2013)"},{"issue":"1","key":"4575_CR25","doi-asserted-by":"publisher","first-page":"574","DOI":"10.1109\/TGRS.2018.2858817","volume":"57","author":"S Ji","year":"2018","unstructured":"Ji, S., Wei, S., Lu, M.: Fully convolutional networks for multisource building extraction from an open aerial and satellite imagery data set. IEEE Trans. Geosci. Remote Sens. 57(1), 574\u2013586 (2018)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"4575_CR26","doi-asserted-by":"crossref","unstructured":"Maggiori, E., Tarabalka, Y., Charpiat, G., Alliez, P.: Can semantic labeling methods generalize to any city? the inria aerial image labeling benchmark. In: 2017 IEEE International Geoscience and Remote Sensing Symposium (IGARSS), pp. 3226\u20133229 (2017). IEEE","DOI":"10.1109\/IGARSS.2017.8127684"},{"key":"4575_CR27","first-page":"1140","volume":"35","author":"M-H Guo","year":"2022","unstructured":"Guo, M.-H., Lu, C.-Z., Hou, Q., Liu, Z., Cheng, M.-M., Hu, S.-M.: Segnext: Rethinking convolutional attention design for semantic segmentation. Adv. Neural. Inf. Process. Syst. 35, 1140\u20131156 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4575_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11976\u201311986 (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"4575_CR29","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: Simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4575_CR30","doi-asserted-by":"crossref","unstructured":"Yu, W., Luo, M., Zhou, P., Si, C., Zhou, Y., Wang, X., Feng, J., Yan, S.: Metaformer is actually what you need for vision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10819\u201310829 (2022)","DOI":"10.1109\/CVPR52688.2022.01055"},{"issue":"5","key":"4575_CR31","doi-asserted-by":"publisher","first-page":"1111","DOI":"10.3390\/electronics12051111","volume":"12","author":"Z Wang","year":"2023","unstructured":"Wang, Z., Li, J., Tan, Z., Liu, X., Li, M.: Swin-upernet: A semantic segmentation model for mangroves and spartina alterniflora loisel based on upernet. Electronics 12(5), 1111 (2023)","journal-title":"Electronics"},{"key":"4575_CR32","doi-asserted-by":"crossref","unstructured":"Wang, T., Xu, C., Liu, B., Yang, G., Zhang, E., Niu, D., Zhang, H.: Mcat-unet: Convolutional and cross-shaped window attention enhanced unet for efficient high-resolution remote sensing image segmentation. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing (2024)","DOI":"10.1109\/JSTARS.2024.3397488"},{"key":"4575_CR33","doi-asserted-by":"publisher","first-page":"51","DOI":"10.1016\/j.isprsjprs.2024.01.022","volume":"209","author":"Y Li","year":"2024","unstructured":"Li, Y., Hong, D., Li, C., Yao, J., Chanussot, J.: Hd-net: High-resolution decoupled network for building footprint extraction via deeply supervised body and boundary decomposition. ISPRS J. Photogramm. Remote. Sens. 209, 51\u201365 (2024)","journal-title":"ISPRS J. Photogramm. Remote. Sens."},{"key":"4575_CR34","doi-asserted-by":"publisher","first-page":"824","DOI":"10.1016\/j.isprsjprs.2025.01.017","volume":"220","author":"Z Wang","year":"2025","unstructured":"Wang, Z., Yi, J., Chen, A., Chen, L., Lin, H., Xu, K.: Accurate semantic segmentation of very high-resolution remote sensing images considering feature state sequences: From benchmark datasets to urban applications. ISPRS J. Photogramm. Remote. Sens. 220, 824\u2013840 (2025)","journal-title":"ISPRS J. Photogramm. Remote. Sens."}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04575-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-025-04575-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04575-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T13:16:09Z","timestamp":1758546969000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-025-04575-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,5]]},"references-count":34,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["4575"],"URL":"https:\/\/doi.org\/10.1007\/s11760-025-04575-w","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"type":"print","value":"1863-1703"},{"type":"electronic","value":"1863-1711"}],"subject":[],"published":{"date-parts":[[2025,9,5]]},"assertion":[{"value":"18 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 July 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 September 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}],"article-number":"1019"}}