{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:27:41Z","timestamp":1761388061561,"version":"build-2065373602"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Innovation and Entrepreneurship Training Programme for University Students Project, China","award":["2024098DCXM","2024098DCXM","2024098DCXM","2024009DCXM"],"award-info":[{"award-number":["2024098DCXM","2024098DCXM","2024098DCXM","2024009DCXM"]}]},{"name":"Zhongshan Science and Technology Planning Project","award":["2020AG019"],"award-info":[{"award-number":["2020AG019"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s00530-025-01888-1","type":"journal-article","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T10:13:57Z","timestamp":1755771237000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Spcformer: spatial perception correction transformer for semantic segmentation of scene parsing"],"prefix":"10.1007","volume":"31","author":[{"given":"Zhengan","family":"Lu","sequence":"first","affiliation":[]},{"given":"Zhuang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Shuobin","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Zizhao","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Binghua","family":"Su","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,21]]},"reference":[{"key":"1888_CR1","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis. (ICCV), pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1888_CR2","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. Proc. Conf. Adv. Neural Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Proc. Conf. Adv. Neural Inf. Process. Syst."},{"key":"1888_CR3","unstructured":"Chen, Z., Duan, Y., Wang, W., He, J., Lu, T., Dai, J., Qiao, Y.: Vision transformer adapter for dense predictions (2022). arXiv preprint arXiv:2205.08534"},{"key":"1888_CR4","doi-asserted-by":"crossref","unstructured":"Xiong, Y., Li, Z., Chen, Y., Wang, F., Zhu, X., Luo, J., Wang, W., Lu, T., Li, H., Qiao, Y., et al.: Efficient deformable ConvNets: rethinking dynamic and sparse operator for vision applications (2024). arXiv preprint arXiv:2401.06197","DOI":"10.1109\/CVPR52733.2024.00540"},{"key":"1888_CR5","doi-asserted-by":"crossref","unstructured":"Pan, X., Wang, S., Liu, Y., Wen, L., Lu, M.: iPCa-Former: A multi-task transformer framework for perceiving incidental prostate cancer. IEEE Signal Process. Lett. (2024)","DOI":"10.1109\/LSP.2024.3372787"},{"key":"1888_CR6","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, a \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"1888_CR7","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H., Wei, Y.: Deformable convolutional networks. In: Proc. IEEE Int. Conf. Comput. Vis. (ICCV), pp. 764\u2013773 (2017)","DOI":"10.1109\/ICCV.2017.89"},{"key":"1888_CR8","doi-asserted-by":"crossref","unstructured":"Zhu, X., Hu, H., Lin, S., Dai, J.: \u201cDeformable ConvNets V2: More Deformable, Better Results,\u201d In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 9308-9316 (2019)","DOI":"10.1109\/CVPR.2019.00953"},{"key":"1888_CR9","doi-asserted-by":"crossref","unstructured":"Wang, W., Dai, J., Chen, Z., Huang, Z., Li, Z., Zhu, X., Hu, X., Lu, T., Lu, L., Li, H., et al.: InternImage: Exploring large-scale vision foundation models with deformable convolutions. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 14408\u201314419 (2023)","DOI":"10.1109\/CVPR52729.2023.01385"},{"issue":"2","key":"1888_CR10","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/s00530-024-01262-7","volume":"30","author":"Z Liang","year":"2024","unstructured":"Liang, Z., Dong, W., Zhang, B.: A dual-branch hybrid network of CNN and transformer with adaptive keyframe scheduling for video semantic segmentation. Multimedia Syst. 30(2), 67 (2024)","journal-title":"Multimedia Syst."},{"issue":"3","key":"1888_CR11","first-page":"3072","volume":"37","author":"Y Xu","year":"2023","unstructured":"Xu, Y., Yang, Y., Zhang, L.: DeMT: deformable mixer transformer for multi-task learning of dense prediction. Proc. AAAI Conf. Artif. Intell. 37(3), 3072\u20133080 (2023)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"issue":"3","key":"1888_CR12","doi-asserted-by":"publisher","first-page":"143","DOI":"10.1007\/s00530-024-01342-8","volume":"30","author":"F Sun","year":"2024","unstructured":"Sun, F., He, N., Li, R., Wang, X., Xu, S.: GD-PAN: a multiscale fusion architecture applied to object detection in UAV aerial images. Multimedia Syst. 30(3), 143 (2024)","journal-title":"Multimedia Syst."},{"key":"1888_CR13","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (GELUs) (2016). arXiv preprint arXiv:1606.08415"},{"key":"1888_CR14","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Chen, X., Liu, X., Cho, N.-G., Lee, S.-W., Fidler, S., Urtasun, R., Yuille, A.: The role of context for object detection and semantic segmentation in the wild. In: Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"1888_CR15","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., Fergus, R.: Indoor segmentation and support inference from RGBD images. In: Proc. Eur. Conf. Comput. Vis. (ECCV), Florence, Italy, Oct. pp. 746\u2013760 (2012)","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"1888_CR16","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S.P., Xiao, J.: SUN RGB-D: a RGB-D scene understanding benchmark suite. In: Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 567\u2013576 (2015)","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"1888_CR17","doi-asserted-by":"crossref","unstructured":"Seichter, D., Fischedick, S.B., K\u00f6hler, M., Gro\u00df, H.-M.: Efficient multi-task RGB-D scene analysis for indoor environments. In: Proc. 2022 Int. Joint Conf. Neural Netw. (IJCNN), pp. 1\u201310 (2022) IEEE","DOI":"10.1109\/IJCNN55064.2022.9892852"},{"key":"1888_CR18","doi-asserted-by":"crossref","unstructured":"Takikawa, T., Acuna, D., Jampani, V., Fidler, S.: Gated-SCNN: Gated shape CNNs for semantic segmentation. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis. (ICCV), pp. 5229\u20135238 (2019)","DOI":"10.1109\/ICCV.2019.00533"},{"issue":"5","key":"1888_CR19","doi-asserted-by":"publisher","first-page":"1926","DOI":"10.1109\/TCSVT.2020.3015866","volume":"31","author":"J Ji","year":"2020","unstructured":"Ji, J., Shi, R., Li, S., Chen, P., Miao, Q.: Encoder-decoder with cascaded CRFs for semantic segmentation. IEEE Trans. Circuits Syst. Video Technol. 31(5), 1926\u20131938 (2020)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1888_CR20","unstructured":"Chen, L.-C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation (2017). arXiv preprint arXiv:1706.05587"},{"key":"1888_CR21","doi-asserted-by":"crossref","unstructured":"Yu, C., Wang, J., Gao, C., Yu, G., Shen, C., Sang, N.: Context prior for scene segmentation. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 12416\u201312425 (2020)","DOI":"10.1109\/CVPR42600.2020.01243"},{"key":"1888_CR22","doi-asserted-by":"crossref","unstructured":"Yu, C., Wang, J., Peng, C., Gao, C., Yu, G., Sang, N.: Learning a discriminative feature network for semantic segmentation. In: Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 1857\u20131866 (2018)","DOI":"10.1109\/CVPR.2018.00199"},{"key":"1888_CR23","doi-asserted-by":"crossref","unstructured":"Ye, H., Xu, D.: InvPT++: inverted pyramid multi-task transformer for visual scene understanding. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3397031"},{"key":"1888_CR24","doi-asserted-by":"crossref","unstructured":"Vandenhende, S., Georgoulis, S., Van Gool, L.: Mti-net: Multi-scale task interaction networks for multi-task learning. In: Proc. 16th Eur. Conf. Comput. Vis. (ECCV), Glasgow, UK, Aug. 23\u201328, 2020, pp. 527\u2013543, Springer (2020)","DOI":"10.1007\/978-3-030-58548-8_31"},{"key":"1888_CR25","doi-asserted-by":"publisher","unstructured":"Xu, Y., Li, X., Yuan, H., Yang, Y., Zhang, L.: Multi-task learning with multi-query transformer for dense prediction. IEEE Trans. Circuits Syst. Video Technol. (2023). [Online]. https:\/\/doi.org\/10.1109\/TCSVT.2023","DOI":"10.1109\/TCSVT.2023"},{"key":"1888_CR26","unstructured":"Yin, B., Zhang, X., Li, Z., Liu, L., Cheng, M.-M., Hou, Q.: Dformer: rethinking RGBD representation learning for semantic segmentation (2023). arXiv preprint arXiv:2309.09668"},{"key":"1888_CR27","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1109\/LSP.2024.3378120","volume":"31","author":"Y Zhang","year":"2024","unstructured":"Zhang, Y., Zhou, W., Ran, X., Fang, M.: Lightweight dual stream network with knowledge distillation for RGB-D scene parsing. IEEE Signal Process. Lett. 31, 855\u2013859 (2024)","journal-title":"IEEE Signal Process. Lett."},{"issue":"1","key":"1888_CR28","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1109\/TIV.2022.3164899","volume":"8","author":"W Zhou","year":"2022","unstructured":"Zhou, W., Dong, S., Lei, J., Yu, L.: MTANet: multitask-aware network with hierarchical multimodal fusion for RGB-T urban scene understanding. IEEE Trans. Intell. Veh. 8(1), 48\u201358 (2022)","journal-title":"IEEE Trans. Intell. Veh."},{"issue":"12","key":"1888_CR29","doi-asserted-by":"publisher","first-page":"7096","DOI":"10.1109\/TCSVT.2023.3275314","volume":"33","author":"W Zhou","year":"2023","unstructured":"Zhou, W., Zhang, H., Yan, W., Lin, W.: MMSMCNet: modal memory sharing and morphological complementary networks for RGB-T urban scene semantic segmentation. IEEE Trans. Circuits Syst. Video Technol. 33(12), 7096\u20137108 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1888_CR30","doi-asserted-by":"publisher","first-page":"2526","DOI":"10.1109\/TMM.2021.3086618","volume":"24","author":"W Zhou","year":"2021","unstructured":"Zhou, W., Lin, X., Lei, J., Yu, L., Hwang, J.-N.: MFFENet: multiscale feature fusion and enhancement network for RGB-thermal urban road scene parsing. IEEE Trans. Multimedia 24, 2526\u20132538 (2021)","journal-title":"IEEE Trans. Multimedia"},{"issue":"4","key":"1888_CR31","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1109\/JSTSP.2022.3174338","volume":"16","author":"W Zhou","year":"2022","unstructured":"Zhou, W., Yang, E., Lei, J., Yu, L.: FRNet: feature reconstruction network for RGB-D indoor scene parsing. IEEE J. Sel. Top. Signal Process. 16(4), 677\u2013687 (2022)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"issue":"4","key":"1888_CR32","doi-asserted-by":"publisher","first-page":"666","DOI":"10.1109\/JSTSP.2022.3159032","volume":"16","author":"W Zhou","year":"2022","unstructured":"Zhou, W., Jin, J., Lei, J., Yu, L.: CIMFNet: cross-layer interaction and multiscale fusion network for semantic segmentation of high-resolution remote sensing images. IEEE J. Sel. Top. Signal Process. 16(4), 666\u2013676 (2022)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"1888_CR33","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1016\/j.inffus.2023.01.016","volume":"94","author":"W Zhou","year":"2023","unstructured":"Zhou, W., Yue, Y., Fang, M., Qian, X., Yang, R., Yu, L.: BCINet: bilateral cross-modal interaction network for indoor scene understanding in RGB-D images. Inf. Fusion 94, 32\u201342 (2023)","journal-title":"Inf. Fusion"},{"key":"1888_CR34","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis., pp. 12179\u201312188 (2021)","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"1888_CR35","doi-asserted-by":"crossref","unstructured":"Liu, Z., Hu, H., Lin, Y., Yao, Z., Xie, Z., Wei, Y., Ning, J., Cao, Y., Zhang, Z., Dong, L., et al.: Swin Transformer V2: scaling up capacity and resolution. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR), pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"1888_CR36","doi-asserted-by":"crossref","unstructured":"Ye, H., Xu, D.: Inverted pyramid multi-task transformer for dense scene understanding. In: Proc. European Conf. Comput. Vis. (ECCV), pp. 514\u2013530, Springer (2022)","DOI":"10.1007\/978-3-031-19812-0_30"},{"key":"1888_CR37","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Ravi, N., Van Der Maaten, L., Joulin, A., Misra, I.: Omnivore: a single model for many visual modalities. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 16102\u201316112 (2022)","DOI":"10.1109\/CVPR52688.2022.01563"},{"issue":"5","key":"1888_CR38","doi-asserted-by":"publisher","first-page":"2563","DOI":"10.1007\/s00530-023-01152-4","volume":"29","author":"A Yang","year":"2023","unstructured":"Yang, A., Liu, Y., Cheng, S., Cao, J., Ji, Z., Pang, Y.: Spatial attention-guided deformable fusion network for salient object detection. Multimedia Syst. 29(5), 2563\u20132573 (2023)","journal-title":"Multimedia Syst."},{"issue":"4","key":"1888_CR39","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/s00530-024-01415-8","volume":"30","author":"F Liu","year":"2024","unstructured":"Liu, F., Jiang, A., Chen, L.: A multi-scale channel attention network with federated learning for magnetic resonance image super-resolution. Multimedia Syst. 30(4), 214 (2024)","journal-title":"Multimedia Syst."},{"key":"1888_CR40","unstructured":"Hou, R., Chang, H., Ma, B., Shan, S., Chen, X.: Cross attention network for few-shot classification. In: Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"1888_CR41","doi-asserted-by":"crossref","unstructured":"Voita, E., Talbot, D., Moiseev, F., Sennrich, R., Titov, I.: Analyzing multi-head self-attention: specialized heads do the heavy lifting, the rest can be pruned (2019). arXiv:1905.09418","DOI":"10.18653\/v1\/P19-1580"},{"key":"1888_CR42","doi-asserted-by":"crossref","unstructured":"Br\u00fcggemann, D., Kanakis, M., Obukhov, A., Georgoulis, S., Van Gool, L.: Exploring relational context for multi-task dense prediction. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis., pp. 15869\u201315878 (2021)","DOI":"10.1109\/ICCV48922.2021.01557"},{"key":"1888_CR43","doi-asserted-by":"crossref","unstructured":"Dong, X., Yokoya, N.: Understanding dark scenes by contrasting multi-modal observations. In: Proc. IEEE\/CVF Winter Conf. Appl. Comput. Vis., pp. 840\u2013850 (2024)","DOI":"10.1109\/WACV57701.2024.00089"},{"key":"1888_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, J., Liu, H., Yang, K., Hu, X., Liu, R., Stiefelhagen, R.: CMX: Cross-modal fusion for RGB-X semantic segmentation with transformers. IEEE Trans. Intell. Transp. Syst., IEEE (2023)","DOI":"10.1109\/TITS.2023.3300537"},{"key":"1888_CR45","doi-asserted-by":"publisher","first-page":"2567","DOI":"10.1109\/LSP.2022.3229594","volume":"29","author":"E Yang","year":"2022","unstructured":"Yang, E., Zhou, W., Qian, X., Yu, L.: MGCNet: multilevel gated collaborative network for RGB-D semantic segmentation of indoor scene. IEEE Signal Process. Lett. 29, 2567\u20132571 (2022)","journal-title":"IEEE Signal Process. Lett."},{"key":"1888_CR46","doi-asserted-by":"crossref","unstructured":"Bachmann, R., Mizrahi, D., Atanov, A., Zamir, A.: Multimae: multi-modal multi-task masked autoencoders. In: Proc. Eur. Conf. Comput. Vis., pp. 348\u2013367, Springer (2022)","DOI":"10.1007\/978-3-031-19836-6_20"},{"issue":"2","key":"1888_CR47","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1007\/s41745-019-0098-4","volume":"99","author":"P Netrapalli","year":"2019","unstructured":"Netrapalli, P.: Stochastic gradient descent and its variants in machine learning. J. Indian Inst. Sci. 99(2), 201\u2013213 (2019)","journal-title":"J. Indian Inst. Sci."},{"issue":"2","key":"1888_CR48","first-page":"123","volume":"31","author":"S Wei","year":"2024","unstructured":"Wei, S., Zhou, Z., Lu, Z., Yuan, Z., Su, B.: HDBFormer: efficient RGB-D semantic segmentation with a heterogeneous dual-branch framework. IEEE Signal Process. Lett. 31(2), 123\u2013130 (2024)","journal-title":"IEEE Signal Process. Lett."},{"key":"1888_CR49","doi-asserted-by":"crossref","unstructured":"Xu, D., Ouyang, W., Wang, X., Sebe, N.: Pad-net: Multi-tasks guided prediction-and-distillation network for simultaneous depth estimation and scene parsing. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 675\u2013684 (2018)","DOI":"10.1109\/CVPR.2018.00077"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01888-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01888-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01888-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:24:16Z","timestamp":1761387856000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01888-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,21]]},"references-count":49,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["1888"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01888-1","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,8,21]]},"assertion":[{"value":"1 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"320"}}