{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:05:19Z","timestamp":1757617519737,"version":"3.44.0"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T00:00:00Z","timestamp":1738886400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T00:00:00Z","timestamp":1738886400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00371-025-03820-0","type":"journal-article","created":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T05:51:01Z","timestamp":1738907461000},"page":"7509-7520","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Unsupervised video object segmentation with mask transformer: boosting accuracy and efficiency through feature fusion"],"prefix":"10.1007","volume":"41","author":[{"given":"Daikun","family":"Qu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongwei","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingzhu","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,7]]},"reference":[{"key":"3820_CR1","doi-asserted-by":"crossref","unstructured":"Wang, J., Cao, H., Sun, C., Huang, Z., & Zhang, Y.: Motion perception-driven multimodal self-supervised video object segmentation. Visual Computer, 1\u201318 (2024)","DOI":"10.1007\/s00371-024-03597-8"},{"key":"3820_CR2","first-page":"596","volume-title":"European Conference on Computer Vision","author":"G Pei","year":"2022","unstructured":"Pei, G., Shen, F., Yao, Y., Xie, G.S., Tang, Z., Tang, J.: Hierarchical feature alignment network for unsupervised video object segmentation. In: European Conference on Computer Vision, pp. 596\u2013613. Springer Nature Switzerland, Cham (2022)"},{"key":"3820_CR3","doi-asserted-by":"crossref","unstructured":"Ji, G. P., Fu, K., Wu, Z., Fan, D. P., Shen, J., & Shao, L.: Full-duplex strategy for video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 4922\u20134933. (2021)","DOI":"10.1109\/ICCV48922.2021.00488"},{"key":"3820_CR4","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhang, L., Qi, J., Lu, H., Wang, S., & Zhang, X.: Learning motion-appearance co-attention for zero-shot video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 1564\u20131573. (2021)","DOI":"10.1109\/ICCV48922.2021.00159"},{"key":"3820_CR5","doi-asserted-by":"crossref","unstructured":"Ren, S., Liu, W., Liu, Y., Chen, H., Han, G., & He, S.: Reciprocal transformations for unsupervised video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 15455\u201315464. (2021)","DOI":"10.1109\/CVPR46437.2021.01520"},{"key":"3820_CR6","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., & Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929. (2020)"},{"key":"3820_CR7","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TMM.2021.3120873","volume":"25","author":"X Lin","year":"2021","unstructured":"Lin, X., Sun, S., Huang, W., Sheng, B., Li, P., Feng, D.D.: EAPT: efficient attention pyramid transformer for image processing. IEEE Trans. Multimed. 25, 50\u201361 (2021)","journal-title":"IEEE Trans. Multimed."},{"key":"3820_CR8","doi-asserted-by":"crossref","unstructured":"Hou, B., & Li, G. PCCFormer: Parallel coupled convolutional transformer for image super-resolution. Vis. Comput. 1\u201312. (2024)","DOI":"10.1007\/s00371-023-03257-3"},{"key":"3820_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11042-021-11772-5","volume":"81","author":"X Su","year":"2022","unstructured":"Su, X., Gao, M., Ren, J., et al.: Face mask detection and classification via deep transfer learning. Multimed Tools Appl 81, 1\u201320 (2022)","journal-title":"Multimed Tools Appl"},{"key":"3820_CR10","first-page":"17864","volume":"34","author":"B Cheng","year":"2021","unstructured":"Cheng, B., Schwing, A., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. Adv. Neural. Inf. Process. Syst. 34, 17864\u201317875 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3820_CR11","doi-asserted-by":"crossref","unstructured":"Saragadam, V., & Sankaranarayanan, A.C.: Programmable spectrometry: Per-pixel material classification using learned spectral filters. In: 2020 IEEE International Conference on Computational Photography (ICCP), pp. 1\u201310. IEEE (2020","DOI":"10.1109\/ICCP48838.2020.9105281"},{"key":"3820_CR12","first-page":"8","volume":"2","author":"A Faktor","year":"2014","unstructured":"Faktor, A., Irani, M.: Video segmentation by nonlocal consensus voting. BMVC 2, 8 (2014)","journal-title":"BMVC"},{"key":"3820_CR13","doi-asserted-by":"crossref","unstructured":"Miao, J., Wei, Y., & Yang, Y.: Memory aggregation networks for efficient interactive video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10366\u201310375. (2020)","DOI":"10.1109\/CVPR42600.2020.01038"},{"key":"3820_CR14","doi-asserted-by":"crossref","unstructured":"Wang, W., Shen, J., & Porikli, F.: Saliency-aware geodesic video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3395\u20133402. (2015)","DOI":"10.1109\/CVPR.2015.7298961"},{"issue":"6","key":"3820_CR15","doi-asserted-by":"publisher","first-page":"1187","DOI":"10.1109\/TPAMI.2013.242","volume":"36","author":"P Ochs","year":"2013","unstructured":"Ochs, P., Malik, J., Brox, T.: Segmentation of moving objects by long term video analysis. IEEE Trans. Pattern Anal. Mach. Intell. 36(6), 1187\u20131200 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3820_CR16","first-page":"16857","volume":"33","author":"K Song","year":"2020","unstructured":"Song, K., Tan, X., Qin, T., Lu, J., Liu, T.Y.: Mpnet: Masked and permuted pre-training for language understanding. Adv. Neural. Inf. Process. Syst. 33, 16857\u201316867 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3820_CR17","doi-asserted-by":"crossref","unstructured":"Fragkiadaki, K., Arbelaez, P., Felsen, P., & Malik, J.: Learning to segment moving objects in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4083\u20134090. (2015)","DOI":"10.1109\/CVPR.2015.7299035"},{"key":"3820_CR18","doi-asserted-by":"crossref","unstructured":"Tokmakov, P., Alahari, K., & Schmid, C.: Learning motion patterns in videos. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3386\u20133394. (2017)","DOI":"10.1109\/CVPR.2017.64"},{"key":"3820_CR19","doi-asserted-by":"crossref","unstructured":"Lu, X., Wang, W., Ma, C., Shen, J., Shao, L., & Porikli, F.: See more, know more: Unsupervised video object segmentation with co-attention siamese networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3623\u20133632. (2019)","DOI":"10.1109\/CVPR.2019.00374"},{"key":"3820_CR20","doi-asserted-by":"crossref","unstructured":"Yang, Z., Wang, Q., Bertinetto, L., Hu, W., Bai, S., & Torr, P.H.: Anchor diffusion for unsupervised video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 931\u2013940. (2019)","DOI":"10.1109\/ICCV.2019.00102"},{"issue":"6","key":"3820_CR21","doi-asserted-by":"publisher","first-page":"7099","DOI":"10.1109\/TPAMI.2022.3225573","volume":"45","author":"T Zhou","year":"2022","unstructured":"Zhou, T., Porikli, F., Crandall, D.J., Van Gool, L., Wang, W.: A survey on deep learning technique for video segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 7099\u20137122 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3820_CR22","unstructured":"Cheng, Y., Li, L., Xu, Y., Li, X., Yang, Z., Wang, W., & Yang, Y.: Segment and track anything. arXiv preprint arXiv:2305.06558. (2023)"},{"key":"3820_CR23","doi-asserted-by":"crossref","unstructured":"Lu, X., Wang, W., Shen, J., Tai, Y.W., Crandall, D.J., & Hoi, S.C.: Learning video object segmentation from unlabeled videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 8960\u20138970. (2020)","DOI":"10.1109\/CVPR42600.2020.00898"},{"key":"3820_CR24","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Wang, Y., Wang, L., Zhao, X., Lu, H., Wang, Y., & Zhang, L.: Isomer: Isomerous transformer for zero-shot video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, (pp. 966\u2013976). (2023)","DOI":"10.1109\/ICCV51070.2023.00095"},{"key":"3820_CR25","doi-asserted-by":"crossref","unstructured":"Lee, M., Cho, S., Lee, S., Park, C., & Lee, S.: Unsupervised video object segmentation via prototype memory network. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5924\u20135934, (2023)","DOI":"10.1109\/WACV56688.2023.00587"},{"key":"3820_CR26","doi-asserted-by":"crossref","unstructured":"Cho, S., Lee, M., Lee, S., Park, C., Kim, D., & Lee, S.: Treating motion as option to reduce motion dependency in unsupervised video object segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5140\u20135149. (2023)","DOI":"10.1109\/WACV56688.2023.00511"},{"key":"3820_CR27","first-page":"4918","volume":"38","author":"H Song","year":"2024","unstructured":"Song, H., Su, T., Zheng, Y., Zhang, K., Liu, B., Liu, D.: Generalizable fourier augmentation for unsupervised video object segmentation. Proc AAAI Conf Artif Intell 38, 4918\u20134924 (2024)","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"3820_CR28","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., & Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3431\u20133440. (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"issue":"4","key":"3820_CR29","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L. C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. L. (2017). Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence, 40(4), 834\u2013848.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"3820_CR30","doi-asserted-by":"crossref","unstructured":"Jiang, K., Wang, Z., Wang, Z., Yi, P., Jiang, J., Xiao, J., & Lin, C.W.: Danet: Image deraining via dynamic association learning. In: IJCAI, pp. 980\u2013986. (2022)","DOI":"10.24963\/ijcai.2022\/137"},{"key":"3820_CR31","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., & Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 2881\u20132890. (2017)","DOI":"10.1109\/CVPR.2017.660"},{"issue":"8","key":"3820_CR32","doi-asserted-by":"publisher","first-page":"2375","DOI":"10.1007\/s11263-021-01465-9","volume":"129","author":"Y Yuan","year":"2021","unstructured":"Yuan, Y., Huang, L., Guo, J., Zhang, C., Chen, X., Wang, J.: OCNet: Object context for semantic segmentation. Int. J. Comput. Vision 129(8), 2375\u20132398 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"3820_CR33","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wang, X., Huang, L., Huang, C., Wei, Y., & Liu, W.: Ccnet: Criss-cross attention for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 603\u2013612. (2019)","DOI":"10.1109\/ICCV.2019.00069"},{"key":"3820_CR34","first-page":"657","volume":"2019","author":"P Bharati","year":"2020","unstructured":"Bharati, P., Pramanik, A.: Deep learning techniques\u2014R-CNN to mask R-CNN: a survey. Comput. Intell. Pattern Recog.: Proc. CIPR 2019, 657\u2013668 (2020)","journal-title":"Comput. Intell. Pattern Recog.: Proc. CIPR"},{"key":"3820_CR35","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J.: Deformable detr: Deformable transformers for end-to-end object detection.\u00a0arXiv preprint arXiv:2010.04159. (2020)"},{"key":"3820_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., & Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer vision, pp. 10012\u201310022. (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"3","key":"3820_CR37","doi-asserted-by":"publisher","first-page":"2936","DOI":"10.1007\/s10489-022-03659-1","volume":"53","author":"Z He","year":"2023","unstructured":"He, Z., He, Y., Cao, W.: Deformable image registration with attention-guided fusion of multi-scale deformation fields. Appl. Intell. 53(3), 2936\u20132950 (2023)","journal-title":"Appl. Intell."},{"issue":"10","key":"3820_CR38","first-page":"5393","volume":"9","author":"U Ruby","year":"2020","unstructured":"Ruby, U., Yendapalli, V.: Binary cross entropy with deep learning technique for image classification. Int. J. Adv. Trends Comput. Sci. Eng 9(10), 5393\u20135397 (2020)","journal-title":"Int. J. Adv. Trends Comput. Sci. Eng"},{"key":"3820_CR39","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van Gool, L., Gross, M., & Sorkine-Hornung, A. A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732. (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"3820_CR40","doi-asserted-by":"crossref","unstructured":"Prest, A., Leistner, C., Civera, J., Schmid, C., & Ferrari, V.: Learning object class detectors from weakly annotated video. In: 2012 IEEE Conference on computer vision and pattern recognition, pp. 3282\u20133289. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248065"},{"key":"3820_CR41","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58536-5_24","volume-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part II 16","author":"Z Teed","year":"2020","unstructured":"Teed, Z., Deng, J.: Raft: Recurrent all-pairs field transforms for optical flow. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part II 16, pp. 402\u2013419. Springer International Publishing, Cham (2020)"},{"key":"3820_CR42","unstructured":"Loshchilov, I., & Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101. (2017)"},{"key":"3820_CR43","first-page":"715","volume-title":"Proceedings of the European conference on computer vision (ECCV)","author":"H Song","year":"2018","unstructured":"Song, H., Wang, W., Zhao, S., Shen, J., Lam, K.M.: Pyramid dilated deeper convlstm for video salient object detection. In: Proceedings of the European conference on computer vision (ECCV), pp. 715\u2013731. Springer International Publishing, Cham (2018)"},{"key":"3820_CR44","doi-asserted-by":"crossref","unstructured":"Wang, W., Song, H., Zhao, S., Shen, J., Zhao, S., Hoi, S.C., & Ling, H.: Learning unsupervised video object segmentation through visual attention. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 3064\u20133074. (2019)","DOI":"10.1109\/CVPR.2019.00318"},{"key":"3820_CR45","doi-asserted-by":"crossref","unstructured":"Wang, W., Lu, X., Shen, J., Crandall, D.J., & Shao, L.: Zero-shot video object segmentation via attentive graph neural networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9236\u20139245. (2019)","DOI":"10.1109\/ICCV.2019.00933"},{"issue":"07","key":"3820_CR46","first-page":"13066","volume":"34","author":"T Zhou","year":"2020","unstructured":"Zhou, T., Wang, S., Zhou, Y., Yao, Y., Li, J., Shao, L.: Motion-attentive transition for zero-shot video object segmentation. Proc. AAAI Conf Artif. Intell. 34(07), 13066\u201313073 (2020)","journal-title":"Proc. AAAI Conf Artif. Intell."},{"key":"3820_CR47","doi-asserted-by":"publisher","first-page":"661","DOI":"10.1007\/978-3-030-58580-8_39","volume-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16","author":"X Lu","year":"2020","unstructured":"Lu, X., Wang, W., Danelljan, M., Zhou, T., Shen, J., Van Gool, L.: Video object segmentation with episodic graph memory networks. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16, pp. 661\u2013679. Springer International Publishing, Cham (2020)"},{"key":"3820_CR48","unstructured":"Kr\u00e4henb\u00fchl, P., & Koltun, V.: Efficient inference in fully connected crfs with gaussian edge potentials. Adv. Neural Inf. Process. Syst. 24 (2011)"},{"key":"3820_CR49","doi-asserted-by":"crossref","unstructured":"Li, S., Seybold, B., Vorobyov, A., Lei, X., & Kuo, C.C.J.: Unsupervised video object segmentation with motion-based bilateral networks. In: Proceedings of the European Conference on Computer Vision (ECCV): pp. 207\u2013223. (2018)","DOI":"10.1007\/978-3-030-01219-9_13"},{"key":"3820_CR50","doi-asserted-by":"crossref","unstructured":"Zhao, X., Pang, Y., Yang, J., Zhang, L., & Lu, H.: Multi-source fusion and automatic predictor selection for zero-shot video object segmentation. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2645\u20132653. (2021)","DOI":"10.1145\/3474085.3475192"},{"issue":"3","key":"3820_CR51","first-page":"2109","volume":"35","author":"D Liu","year":"2021","unstructured":"Liu, D., Yu, D., Wang, C., Zhou, P.: F2net: Learning to focus on the foreground for unsupervised video object segmentation. Proc. AAAI Conf. Artif. Intell. 35(3), 2109\u20132117 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"3820_CR52","doi-asserted-by":"crossref","unstructured":"Zhang, K., Zhao, Z., Liu, D., Liu, Q., & Liu, B.: Deep transport network for unsupervised video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 8781\u20138790. (2021)","DOI":"10.1109\/ICCV48922.2021.00866"},{"key":"3820_CR53","doi-asserted-by":"publisher","first-page":"490","DOI":"10.1007\/978-3-030-58568-6_29","volume-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIV 16","author":"L Zhang","year":"2020","unstructured":"Zhang, L., Zhang, J., Lin, Z., M\u011bch, R., Lu, H., He, Y.: Unsupervised video object segmentation with joint hotspot tracking. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIV 16, pp. 490\u2013506. Springer International Publishing, Cham (2020)"},{"key":"3820_CR54","doi-asserted-by":"crossref","unstructured":"Yu, W., Luo, M., Zhou, P., Si, C., Zhou, Y., Wang, X.,  & Yan, S. (2022). Metaformer is actually what you need for vision. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 10819\u201310829).","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"3820_CR55","doi-asserted-by":"publisher","first-page":"393","DOI":"10.1007\/978-3-031-19211-1_33","volume-title":"International conference on wireless algorithms, systems, and applications","author":"Z Chen","year":"2022","unstructured":"Chen, Z., Zhong, F., Luo, Q., Zhang, X., Zheng, Y.: Edgevit: Efficient visual modeling for edge computing. In: International conference on wireless algorithms, systems, and applications, pp. 393\u2013405. Springer Nature Switzerland, Cham (2022)"},{"key":"3820_CR56","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.P., Song, K., Liang, D., & Shao, L.: Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 568\u2013578. (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"3820_CR57","unstructured":"Tan, M., & Le, Q.: Efficientnetv2: Smaller models and faster training. In: International conference on machine learning, pp. 10096\u201310106. PMLR (2021)"},{"key":"3820_CR58","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Chen, X., Chen, X., & Wang, J.: Segmentation Transformer: Object-Contextual Representations for Semantic Segmentation, 2021 (1909)","DOI":"10.1007\/978-3-030-58539-6_11"},{"key":"3820_CR59","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Zhu, Y., Papandreou, G., Schroff, F., & Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Proceedings of the European conference on computer vision (ECCV), pp. 801\u2013818. (2018)","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"3820_CR60","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., & Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125. (2017)","DOI":"10.1109\/CVPR.2017.106"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03820-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-03820-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03820-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T04:42:59Z","timestamp":1757133779000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-03820-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,7]]},"references-count":60,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["3820"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-03820-0","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2025,2,7]]},"assertion":[{"value":"17 January 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 February 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interests"}}]}}