{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,16]],"date-time":"2026-07-16T02:42:20Z","timestamp":1784169740572,"version":"3.55.0"},"reference-count":93,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T00:00:00Z","timestamp":1737763200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T00:00:00Z","timestamp":1737763200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s11263-025-02345-2","type":"journal-article","created":{"date-parts":[[2025,1,25]],"date-time":"2025-01-25T02:06:15Z","timestamp":1737770775000},"page":"3645-3666","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":45,"title":["SeaFormer++: Squeeze-Enhanced Axial Transformer for Mobile Visual Recognition"],"prefix":"10.1007","volume":"133","author":[{"given":"Qiang","family":"Wan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zilong","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiachen","family":"Lu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gang","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1031-5420","authenticated-orcid":false,"given":"Li","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,1,25]]},"reference":[{"key":"2345_CR1","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J. & Ferrari, V. (2018). Coco-stuff: Thing and stuff classes in context. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2018.00132"},{"key":"2345_CR2","doi-asserted-by":"crossref","unstructured":"Cao, Y., Xu, J., Lin, S., Wei, F. & Hu, H. (2019). Gcnet: Non-local networks meet squeeze-excitation networks and beyond. In IEEE International Conference on Computer Vision Workshops.","DOI":"10.1109\/ICCVW.2019.00246"},{"key":"2345_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Chen, D., Liu, M., Dong, X., Yuan, L. & Liu, Z. (2022). Mobile-former: Bridging mobilenet and transformer. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.00520"},{"key":"2345_CR4","unstructured":"Chen, Y., Kalantidis, Y., Li, J., Yan, S. & Feng, J. (2018). $${\\rm A}^2$$-nets: Double attention networks. In Advances in Neural Information Processing Systems."},{"key":"2345_CR5","unstructured":"Chen, L.-C., Papandreou, G., Schroff, F. & Adam, H. (2017). Rethinking atrous convolution for semantic image segmentation. arXiv preprint."},{"key":"2345_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Q., Wu, Q., Wang, J., Hu, Q., Hu, T., Ding, E., Cheng, J. & Wang, J. (2022). Mixformer: Mixing features across windows and dimensions. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.00518"},{"key":"2345_CR7","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Zhu, Y., Papandreou, G., Schroff, F. & Adam, H. (2018). Encoder-decoder with atrous separable convolution for semantic image segmentation. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"2345_CR8","doi-asserted-by":"crossref","unstructured":"Cho, J. H. & Hariharan, B. (2019). On the efficacy of knowledge distillation. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00489"},{"key":"2345_CR9","unstructured":"Choromanski, K., Likhosherstov, V., Dohan, D., Song, X., Gane, A., Sarlos, T., Hawkins, P., Davis, J., Mohiuddin, A., Kaiser, L., et al. (2021). Rethinking attention with performers. In International Conference on Learning Representations."},{"key":"2345_CR10","unstructured":"Contributors, T. (2019). TNN: A high-performance, lightweight neural network inference framework."},{"key":"2345_CR11","unstructured":"Contributors, M. (2020). MMSegmentation: Openmmlab semantic segmentation toolbox and benchmark."},{"key":"2345_CR12","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S. & Schiele, B. (2016). The cityscapes dataset for semantic urban scene understanding. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2016.350"},{"key":"2345_CR13","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K. & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2345_CR14","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In International Conference on Learning Representations."},{"key":"2345_CR15","doi-asserted-by":"crossref","unstructured":"Gao, H., Wang, Z. & Ji, S. (2020). Kronecker attention networks. In ACM SIGKDD.","DOI":"10.1145\/3394486.3403065"},{"key":"2345_CR16","doi-asserted-by":"crossref","unstructured":"He, T., Shen, C., Tian, Z., Gong, D., Sun, C. & Yan, Y. (2019). Knowledge adaptation for efficient semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00067"},{"key":"2345_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S. & Sun, J. (2016). Deep residual learning for image recognition. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2345_CR18","doi-asserted-by":"crossref","unstructured":"Heo, B., Kim, J., Yun, S., Park, H., Kwak, N. & Choi, J. Y. (2019). A comprehensive overhaul of feature distillation. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00201"},{"key":"2345_CR19","unstructured":"Hinton, G. (2015). Distilling the knowledge in a neural network. arXiv preprint."},{"key":"2345_CR20","unstructured":"Ho, J., Kalchbrenner, N., Weissenborn, D. & Salimans, T. (2019). Axial attention in multidimensional transformers. arXiv preprint."},{"key":"2345_CR21","unstructured":"Hong, Y., Pan, H., Sun, W. & Jia, Y. (2021). Deep dual-resolution networks for real-time and accurate semantic segmentation of road scenes. arXiv preprint."},{"key":"2345_CR22","doi-asserted-by":"crossref","unstructured":"Hou, Q., Zhang, L., Cheng, M.-M. & Feng, J. (2020). Strip pooling: Rethinking spatial pooling for scene parsing. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR42600.2020.00406"},{"key":"2345_CR23","doi-asserted-by":"crossref","unstructured":"Hou, Q., Zhou, D. & Feng, J. (2021). Coordinate attention for efficient mobile network design. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.01350"},{"key":"2345_CR24","doi-asserted-by":"crossref","unstructured":"Howard, A., Sandler, M., Chu, G., Chen, L.-C., Chen, B., Tan, M., Wang, W., Zhu, Y., Pang, R., Vasudevan, V., et al. (2019). Searching for mobilenetv3. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00140"},{"key":"2345_CR25","doi-asserted-by":"crossref","unstructured":"Hu, B., Zhou, S., Xiong, Z. & Wu, F. (2022). Cross-resolution distillation for efficient 3d medical image registration. IEEE Transactions on Circuits and Systems for Video Technology.","DOI":"10.1109\/TCSVT.2022.3178178"},{"key":"2345_CR26","unstructured":"Huang, Z., Ben, Y., Luo, G., Cheng, P., Yu, G. & Fu, B. (2021). Shuffle transformer: Rethinking spatial shuffle for vision transformer. arXiv preprint."},{"key":"2345_CR27","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wang, X., Huang, L., Huang, C., Wei, Y. & Liu, W. (2019). Ccnet: Criss-cross attention for semantic segmentation. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2019.00069"},{"key":"2345_CR28","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wei, Y., Wang, X., Liu, W., Huang, T. S., Shi, H. (2021). Alignseg: Feature-aligned segmentation networks. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2021.3062772"},{"key":"2345_CR29","unstructured":"Huang, L., Yuan, Y., Guo, J., Zhang, C., Chen, X. & Wang, J. (2019). Interlaced sparse self-attention for semantic segmentation. arXiv preprint."},{"key":"2345_CR30","unstructured":"Ioffe, S. & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. In International Conference on Machine Learning."},{"key":"2345_CR31","unstructured":"Kim, J., Park, S. & Kwak, N. (2018). Paraphrasing complex network: Network compression via factor transfer. Advances in Neural Information Processing Systems."},{"key":"2345_CR32","unstructured":"Kingma, D. P. & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint."},{"key":"2345_CR33","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K. & Doll\u00e1r, P. (2019). Panoptic feature pyramid networks. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00656"},{"key":"2345_CR34","unstructured":"Li, J., Hassani, A., Walton, S. & Shi, H. (2021). Convmlp: Hierarchical convolutional mlps for vision. arXiv preprint."},{"key":"2345_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y., Hu, J., Wen, Y., Evangelidis, G., Salahi, K., Wang, Y., Tulyakov, S. & Ren, J. (2023). Rethinking vision transformers for mobilenet size and speed. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.01549"},{"key":"2345_CR36","doi-asserted-by":"crossref","unstructured":"Li, Z., Li, X., Yang, L., Zhao, B., Song, R., Luo, L., Li, J. & Yang, J. (2023). Curriculum temperature for knowledge distillation. In AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v37i2.25236"},{"key":"2345_CR37","doi-asserted-by":"crossref","unstructured":"Li, X., Li, X., You, A., Zhang, L., Cheng, G., Yang, K., Tong, Y. & Lin, Z. (2021). Towards efficient scene understanding via squeeze reasoning. IEEE Transactions on Image Processing.","DOI":"10.1109\/TIP.2021.3099369"},{"key":"2345_CR38","doi-asserted-by":"crossref","unstructured":"Li, X., Li, X., Zhang, L., Cheng, G., Shi, J., Lin, Z., Tan, S. & Tong, Y. (2020). Improving semantic segmentation via decoupled body and edge supervision. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58520-4_26"},{"key":"2345_CR39","doi-asserted-by":"crossref","unstructured":"Li, H., Xiong, P., Fan, H. & Sun, J. (2019). Dfanet: Deep feature aggregation for real-time semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00975"},{"key":"2345_CR40","doi-asserted-by":"crossref","unstructured":"Li, Z., Ye, J., Song, M., Huang, Y. & Pan, Z. (2021). Online knowledge distillation for efficient pose estimation. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV48922.2021.01153"},{"key":"2345_CR41","unstructured":"Li, Y., Yuan, G., Wen, Y., Hu, E., Evangelidis, G., Tulyakov, S., Wang, Y. & Ren, J. (2022). Efficientformer: Vision transformers at mobilenet speed. arXiv preprint."},{"key":"2345_CR42","doi-asserted-by":"crossref","unstructured":"Li, X., Zhang, L., Cheng, G., Yang, K., Tong, Y., Zhu, X. & Xiang, T. (2021). Global aggregation then local distribution for scene parsing. IEEE Transactions on Image Processing.","DOI":"10.1109\/TIP.2021.3099366"},{"key":"2345_CR43","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., He, K. & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2017.324"},{"key":"2345_CR44","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, K., Liu, C., Qin, Z., Luo, Z. & Wang, J. (2019). Structured knowledge distillation for semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00271"},{"key":"2345_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S. & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2345_CR46","unstructured":"Liu, P. J., Saleh, M., Pot, E., Goodrich, B., Sepassi, R., Kaiser, L. & Shazeer, N. (2018). Generating wikipedia by summarizing long sequences. In International Conference on Learning Representations."},{"key":"2345_CR47","unstructured":"Liu, R., Yang, K., Roitberg, A., Zhang, J., Peng, K., Liu, H. & Stiefelhagen, R. (2022). Transkd: Transformer knowledge distillation for efficient semantic segmentation. arXiv preprint."},{"key":"2345_CR48","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E. & Darrell, T. (2015). Fully convolutional networks for semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"2345_CR49","doi-asserted-by":"crossref","unstructured":"Luong, M.-T., Pham, H. & Manning, C. D. (2015). Effective approaches to attention-based neural machine translation. arXiv preprint.","DOI":"10.18653\/v1\/D15-1166"},{"key":"2345_CR50","doi-asserted-by":"crossref","unstructured":"Ma, N., Zhang, X., Zheng, H.-T. & Sun, J. (2018). Shufflenet v2: Practical guidelines for efficient cnn architecture design. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"2345_CR51","unstructured":"Mehta, S. & Rastegari, M. (2022). Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer. In International Conference on Learning Representations."},{"key":"2345_CR52","unstructured":"Mehta, S. & Rastegari, M. (2022). Separable self-attention for mobile vision transformers. arXiv preprint."},{"key":"2345_CR53","doi-asserted-by":"crossref","unstructured":"Mirzadeh, S. I., Farajtabar, M., Li, A., Levine, N., Matsukawa, A. & Ghasemzadeh, H. (2020). Improved knowledge distillation via teacher assistant. In AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"2345_CR54","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Chen, X., Liu, X., Cho, N.-G., Lee, S.-W., Fidler, S., Urtasun, R. & Yuille, A. (2014). The role of context for object detection and semantic segmentation in the wild. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2014.119"},{"key":"2345_CR55","doi-asserted-by":"crossref","unstructured":"Pan, J., Bulat, A., Tan, F., Zhu, X., Dudziak, L., Li, H., Tzimiropoulos, G. & Martinez, B. (2022). Edgevits: Competing light-weight cnns on mobile devices with vision transformers. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-20083-0_18"},{"key":"2345_CR56","doi-asserted-by":"crossref","unstructured":"Pan, X., Ge, C., Lu, R., Song, S., Chen, G., Huang, Z. & Huang, G. (2022). On the integration of self-attention and convolution. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.00089"},{"key":"2345_CR57","doi-asserted-by":"crossref","unstructured":"Park, W., Kim, D., Lu, Y. & Cho, M. (2019). Relational knowledge distillation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00409"},{"key":"2345_CR58","unstructured":"Poudel, R. P., Liwicki, S. & Cipolla, R. (2019). Fast-scnn: Fast semantic segmentation network. In British Machine Vision Conference."},{"key":"2345_CR59","doi-asserted-by":"crossref","unstructured":"Qi, L., Kuen, J., Gu, J., Lin, Z., Wang, Y., Chen, Y., Li, Y. & Jia, J. (2021). Multi-scale aligned distillation for low-resolution detection. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.01421"},{"key":"2345_CR60","unstructured":"Romero, A., Ballas, N., Kahou, S. E., Chassang, A., Gatta, C. & Bengio, Y. (2014). Fitnets: Hints for thin deep nets. arXiv preprint."},{"key":"2345_CR61","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A. & Chen, L.-C. (2018). Mobilenetv2: Inverted residuals and linear bottlenecks. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2018.00474"},{"key":"2345_CR62","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S. & Li, H. (2021). Efficient attention: Attention with linear complexities. In IEEE Winter Conference on Applications of Computer Vision."},{"key":"2345_CR63","unstructured":"Tan, M. & Le, Q. (2019). Efficientnet: Rethinking model scaling for convolutional neural networks. In International Conference on Machine Learning."},{"key":"2345_CR64","unstructured":"Tang, S., Sun, T., Peng, J., Chen, G., Hao, Y., Lin, M., Xiao, Z., You, J. & Liu, Y. (2023). Pp-mobileseg: Explore the fast and accurate semantic segmentation model on mobile devices. arXiv preprint."},{"key":"2345_CR65","unstructured":"Tian, Y., Krishnan, D. & Isola, P. (2019). Contrastive representation distillation. arXiv preprint."},{"key":"2345_CR66","doi-asserted-by":"crossref","unstructured":"Vasu, P. K. A., Gabriel, J., Zhu, J., Tuzel, O. & Ranjan, A. (2022). An improved one millisecond mobile backbone. arXiv preprint.","DOI":"10.1109\/CVPR52729.2023.00764"},{"key":"2345_CR67","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141. & Polosukhin, I. (2017). Attention is all you need. In Advances in Neural Information Processing Systems."},{"key":"2345_CR68","unstructured":"Wan, Q., Huang, Z., Lu, J., Yu, G. & Zhang, L. (2023). Seaformer: Squeeze-enhanced axial transformer for mobile semantic segmentation. In International Conference on Learning Representations."},{"key":"2345_CR69","doi-asserted-by":"crossref","unstructured":"Wang, J., Chen, Y., Zheng, Z., Li, X., Cheng, M.-M. & Hou, Q. (2024). Crosskd: Cross-head knowledge distillation for object detection. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52733.2024.01563"},{"key":"2345_CR70","unstructured":"Wang, S., Li, B. Z., Khabsa, M., Fang, H. & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv preprint."},{"key":"2345_CR71","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P. & Shao, L. (2021). Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"2345_CR72","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhu, Y., Green, B., Adam, H., Yuille, A. & Chen, L.-C. (2020). Axial-deeplab: Stand-alone axial-attention for panoptic segmentation. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58548-8_7"},{"key":"2345_CR73","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J.-Y. & Kweon, I. S. (2018). Cbam: Convolutional block attention module. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"2345_CR74","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J. M. & Luo, P. (2021). Segformer: Simple and efficient design for semantic segmentation with transformers. In Advances in Neural Information Processing Systems."},{"key":"2345_CR75","doi-asserted-by":"crossref","unstructured":"Xu, W., Xu, Y., Chang, T. & Tu, Z. (2021). Co-scale conv-attentional image transformers. In IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"2345_CR76","unstructured":"Yan, H., Li, Z., Li, W., Wang, C., Wu, M. & Zhang, C. (2021). Contnet: Why not use convolution and transformer at the same time? arXiv preprint."},{"key":"2345_CR77","doi-asserted-by":"crossref","unstructured":"Yang, C., Wang, Y., Zhang, J., Zhang, H., Wei, Z., Lin, Z. & Yuille, A. (2022). Lite vision transformer with enhanced self-attention. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01169"},{"key":"2345_CR78","doi-asserted-by":"crossref","unstructured":"Yang, C., Xie, L., Su, C. & Yuille, A. L. (2019). Snapshot distillation: Teacher-student optimization in one generation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00297"},{"key":"2345_CR79","doi-asserted-by":"crossref","unstructured":"Yang, C., Zhou, H., An, Z., Jiang, X., Xu, Y. & Zhang, Q. (2022). Cross-image relational knowledge distillation for semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01200"},{"key":"2345_CR80","doi-asserted-by":"crossref","unstructured":"Yim, J., Joo, D., Bae, J. & Kim, J. (2017). A gift from knowledge distillation: Fast optimization, network minimization and transfer learning. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2017.754"},{"key":"2345_CR81","doi-asserted-by":"crossref","unstructured":"Yu, C., Gao, C., Wang, J., Yu, G., Shen, C. & Sang, N. (2021). Bisenet v2: Bilateral network with guided aggregation for real-time semantic segmentation. International Journal of Computer Vision.","DOI":"10.1007\/s11263-021-01515-2"},{"key":"2345_CR82","doi-asserted-by":"crossref","unstructured":"Yu, C., Wang, J., Peng, C., Gao, C., Yu, G. Sang, N. (2018). Bisenet: Bilateral segmentation network for real-time semantic segmentation. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"2345_CR83","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Chen, X. & Wang, J. (2020). Object-contextual representations for semantic segmentation. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58539-6_11"},{"key":"2345_CR84","unstructured":"Yuan, Y., Fu, R., Huang, L., Lin, W., Zhang, C., Chen, X. & Wang, J. (2021). Hrformer: High-resolution transformer for dense prediction. arXiv preprint."},{"key":"2345_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, L., Chen, M., Arnab, A., Xue, X. & Torr, P. H. (2022). Dynamic graph message passing networks for visual recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2022.3207500"},{"key":"2345_CR86","unstructured":"Zhang, H., Hu, W. & Wang, X. (2022). Edgeformer: Improving light-weight convnets by learning from vision transformers. arXiv preprint."},{"key":"2345_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, W., Huang, Z., Luo, G., Chen, T., Wang, X., Liu, W., Yu, G. & Shen, C. (2022). Topformer: Token pyramid transformer for mobile semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01177"},{"key":"2345_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xiang, T., Hospedales, T. M. & Lu, H. (2018). Deep mutual learning. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2018.00454"},{"key":"2345_CR89","doi-asserted-by":"crossref","unstructured":"Zhang, L., Xu, D., Arnab, A. & Torr, P. H. (2020). Dynamic graph message passing networks. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR42600.2020.00378"},{"key":"2345_CR90","doi-asserted-by":"crossref","unstructured":"Zhao, B., Cui, Q., Song, R., Qiu, Y. & Liang, J. (2022). Decoupled knowledge distillation. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01165"},{"key":"2345_CR91","doi-asserted-by":"crossref","unstructured":"Zhao, H., Qi, X., Shen, X., Shi, J. & Jia, J. (2018). Icnet for real-time semantic segmentation on high-resolution images. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-01219-9_25"},{"key":"2345_CR92","doi-asserted-by":"crossref","unstructured":"Zheng, S., Lu, J., Zhao, H., Zhu, X., Luo, Z., Wang, Y., Fu, Y., Feng, J., Xiang, T., Torr, P. H., et al. (2021). Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"2345_CR93","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A. & Torralba, A. (2017). Scene parsing through ade20k dataset. In IEEE Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2017.544"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02345-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02345-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02345-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,10]],"date-time":"2025-05-10T06:56:39Z","timestamp":1746860199000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02345-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,25]]},"references-count":93,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["2345"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02345-2","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,25]]},"assertion":[{"value":"7 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}