{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:10:12Z","timestamp":1774602612857,"version":"3.50.1"},"reference-count":58,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T00:00:00Z","timestamp":1772064000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T00:00:00Z","timestamp":1772064000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02603-3","type":"journal-article","created":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T09:29:28Z","timestamp":1772098168000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MGFNet: Meta Global Filter Network for multi-size image feature extraction"],"prefix":"10.1007","volume":"134","author":[{"given":"Hong","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiaxu","family":"Wan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianbo","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hanyang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ding","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4237-5874","authenticated-orcid":false,"given":"Yifan","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,26]]},"reference":[{"key":"2603_CR1","doi-asserted-by":"crossref","unstructured":"Caesar, H., Bankiti, V., Lang, A. H., Vora, S., Liong, V. E., Xu, Q., Krishnan, A., Pan, Y., Baldan, G., & Beijbom, O. (2020). nuscenes: A multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 11621\u201311631).","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"2603_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., & Ferrari, V. (2018). Coco-stuff: Thing and stuff classes in context. In: IEEE Conf. Comput. Vis. Pattern Recog., (pp. 1209\u20131218).","DOI":"10.1109\/CVPR.2018.00132"},{"issue":"5","key":"2603_CR3","doi-asserted-by":"publisher","first-page":"1483","DOI":"10.1109\/TPAMI.2019.2956516","volume":"43","author":"Z Cai","year":"2019","unstructured":"Cai, Z., & Vasconcelos, N. (2019). Cascade r-cnn: High quality object detection and instance segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43(5), 1483\u20131498.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2603_CR4","unstructured":"Chen, K., Wang, J., Pang, J., Cao, Y., Xiong, Y., Li, X., Sun, S., Feng, W., Liu, Z., Xu, J., Zhang, Z., Cheng, D., Zhu, C., Cheng, T., Zhao, Q., Li, B., Lu, X., Zhu, R., Wu, Y., Dai, J., Wang, J., Shi, J., Ouyang, W., Loy, C. C., & Lin, D. (2019). MMDetection: Open mmlab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155"},{"key":"2603_CR5","unstructured":"Contributors, M. (2020). MMSegmentation: OpenMMLab Semantic Segmentation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmsegmentation"},{"key":"2603_CR6","doi-asserted-by":"crossref","unstructured":"Cubuk, E. D., Zoph, B., Shlens, J., Le, Q. V. (2020). Randaugment: Practical automated data augmentation with a reduced search space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, (pp. 702\u2013703).","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"2603_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In: IEEE Conf. Comput. Vis. Pattern Recog., (pp. 248\u2013255).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2603_CR8","unstructured":"Diao, Q., Jiang, Y., Wen, B., Sun, J., & Yuan, Z. (2022). Metaformer: A unified meta framework for fine-grained recognition. arXiv preprint arXiv:2203.02751"},{"key":"2603_CR9","doi-asserted-by":"crossref","unstructured":"Ding, X., Zhang, Y., Ge, Y., Zhao, S., Song, L., Yue, X., & Shan, Y. (2024). Unireplknet: A universal perception large-kernel convnet for audio video point cloud time-series and image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 5513\u20135524).","DOI":"10.1109\/CVPR52733.2024.00527"},{"key":"2603_CR10","doi-asserted-by":"crossref","unstructured":"Ding, X., Zhang, X., Han, J., & Ding, G. (2022). Scaling up your kernels to 31x31: Revisiting large kernel design in cnns. In: IEEE Conf. Comput. Vis. Pattern Recog., (pp. 11963\u201311975).","DOI":"10.1109\/CVPR52688.2022.01166"},{"key":"2603_CR11","doi-asserted-by":"crossref","unstructured":"Dong, X., Bao, J., Chen, D., Zhang, W., Yu, N., Yuan, L., Chen, D., & Guo, B. (2022). Cswin transformer: A general vision transformer backbone with cross-shaped windows. IEEE Conf. Comput. Vis. Pattern Recog., (pp. 12124\u201312134).","DOI":"10.1109\/CVPR52688.2022.01181"},{"key":"2603_CR12","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., ... & (2020). Houlsby, N. An image is worth 16x16 words: Transformers for image recognition at scale. In: Int. Conf. Learn. Represent."},{"key":"2603_CR13","first-page":"15908","volume":"34","author":"K Han","year":"2021","unstructured":"Han, K., Xiao, A., Wu, E., Guo, J., Xu, C., & Wang, Y. (2021). Transformer in transformer. Advances in Neural Information Processing Systems, 34, 15908\u201315919.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2603_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In: Int. Conf. Comput. Vis., (pp. 2961\u20132969).","DOI":"10.1109\/ICCV.2017.322"},{"key":"2603_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: IEEE Conf. Comput. Vis. Pattern Recog., (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"2603_CR16","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"2603_CR17","unstructured":"Huang, J., & Huang, G. (2022). Bevdet4d: Exploit temporal cues in multi-camera 3d object detection. arXiv preprint arXiv:2203.17054"},{"key":"2603_CR18","doi-asserted-by":"crossref","unstructured":"Huang, Z., Zhang, Z., Lan, C., Zha, Z.-J., Lu, Y., & Guo, B. (2023). Adaptive frequency filters as efficient global token mixers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 6049\u20136059).","DOI":"10.1109\/ICCV51070.2023.00556"},{"key":"2603_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, L., Dai, B., Wu, W., & Loy, C. C. (2021). Focal frequency loss for image reconstruction and synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 13919\u201313929).","DOI":"10.1109\/ICCV48922.2021.01366"},{"key":"2603_CR20","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., & Doll\u00e1r, P. (2019). Panoptic feature pyramid networks. In: IEEE Conf. Comput. Vis. Pattern Recog., (pp. 6399\u20136408).","DOI":"10.1109\/CVPR.2019.00656"},{"key":"2603_CR21","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/s00591-010-0080-8","volume":"58","author":"F Klinker","year":"2011","unstructured":"Klinker, F. (2011). Exponential moving average versus moving exponential average. Mathematische Semesterberichte, 58, 97\u2013107.","journal-title":"Mathematische Semesterberichte"},{"key":"2603_CR22","doi-asserted-by":"crossref","unstructured":"Lee, Y., Hwang, J.-W., Lee, S., Bae, Y., & Park, J. (2019). An energy and gpu-computation efficient backbone network for real-time object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, (pp. 0\u20130).","DOI":"10.1109\/CVPRW.2019.00103"},{"key":"2603_CR23","doi-asserted-by":"publisher","unstructured":"Li, D., Hu, J., Wang, C., Li, X., She, Q., Zhu, L., Zhang, T., & Chen, Q. (2021). Involution: Inverting the inherence of convolution for visual recognition. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), (pp. 12316\u201312325). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01214","DOI":"10.1109\/CVPR46437.2021.01214"},{"key":"2603_CR24","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, W., Li, H., Xie, E., Sima, C., Lu, T., Qiao, Y., & Dai, J. (2022). Bevformer: Learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: European Conference on Computer Vision, (pp. 1\u201318). Springer.","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"2603_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In: Int. Conf. Comput. Vis., (pp. 2980\u20132988).","DOI":"10.1109\/ICCV.2017.324"},{"key":"2603_CR26","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In: Eur. Conf. Comput. Vis., (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2603_CR27","doi-asserted-by":"crossref","unstructured":"Lin, W., Wu, Z., Chen, J., Huang, J., & Jin, L. (2023). Scale-aware modulation meet transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 6015\u20136026).","DOI":"10.1109\/ICCV51070.2023.00553"},{"key":"2603_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., Hu, H., Lin, Y., Yao, Z., Xie, Z., Wei, Y., ... & Guo, B. (2022). Swin transformer v2: Scaling up capacity and resolution. Advances in Neural Information Processing Systems 12009\u201312019.","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"2603_CR29","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In: Int. Conf. Comput. Vis., (pp. 10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2603_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., & Xie, S. (2022). A convnet for the 2020s. arXiv preprint arXiv:2201.03545","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"2603_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Y., Yan, J., Jia, F., Li, S., Gao, A., Wang, T., & Zhang, X. (2023). Petrv2: A unified framework for 3d perception from multi-camera images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 3262\u20133272).","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"2603_CR32","first-page":"9204","volume":"34","author":"H Liu","year":"2021","unstructured":"Liu, H., Dai, Z., So, D., & Le, Q. V. (2021). Pay attention to mlps. Advances in Neural Information Processing Systems, 34, 9204\u20139215.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"10","key":"2603_CR33","doi-asserted-by":"publisher","first-page":"12581","DOI":"10.1109\/TPAMI.2023.3282631","volume":"45","author":"K Li","year":"2023","unstructured":"Li, K., Wang, Y., Zhang, J., Gao, P., Song, G., Liu, Y., Li, H., & Qiao, Y. (2023). Uniformer: Unifying convolution and self-attention for visual recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(10), 12581\u201312600.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"4","key":"2603_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530159","volume":"41","author":"Y Men","year":"2022","unstructured":"Men, Y., Yao, Y., Cui, M., Lian, Z., & Xie, X. (2022). Dct-net: domain-calibrated translation for portrait stylization. ACM Transactions on Graphics (TOG), 41(4), 1\u20139.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"2603_CR35","unstructured":"Odom, F. Fft-conv-pytorch. https:\/\/github.com\/fkodom\/fft-conv-pytorch"},{"key":"2603_CR36","doi-asserted-by":"crossref","unstructured":"Qin, Z., Zhang, P., Wu, F., & Li, X. (2021). Fcanet: Frequency channel attention networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 783\u2013792).","DOI":"10.1109\/ICCV48922.2021.00082"},{"issue":"9","key":"2603_CR37","doi-asserted-by":"publisher","first-page":"10960","DOI":"10.1109\/TPAMI.2023.3263824","volume":"45","author":"Y Rao","year":"2023","unstructured":"Rao, Y., Zhao, W., Zhu, Z., Zhou, J., & Lu, J. (2023). Gfnet: Global filter networks for visual recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(9), 10960\u201310973.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2603_CR38","doi-asserted-by":"publisher","unstructured":"Tatsunami, Y., & Taki, M. (2024). Fft-based dynamic token mixer for vision. In: Wooldridge, M.J., Dy, J. G., Natarajan, S. (eds.) Thirty-Eighth AAAI Conference on Artificial Intelligence, AAAI 2024, Thirty-Sixth Conference on Innovative Applications of Artificial Intelligence, IAAI 2024, Fourteenth Symposium on Educational Advances in Artificial Intelligence, EAAI 2014, February 20-27, 2024, Vancouver, Canada, pp. 15328\u201315336. AAAI Press. https:\/\/doi.org\/10.1609\/AAAI.V38I14.29457 .","DOI":"10.1609\/AAAI.V38I14.29457"},{"key":"2603_CR39","first-page":"24261","volume":"34","author":"IO Tolstikhin","year":"2021","unstructured":"Tolstikhin, I. O., Houlsby, N., Kolesnikov, A., Beyer, L., Zhai, X., Unterthiner, T., Yung, J., Steiner, A., Keysers, D., Uszkoreit, J., et al. (2021). Mlp-mixer: An all-mlp architecture for vision. Advances in Neural Information Processing Systems, 34, 24261\u201324272.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2603_CR40","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., & J\u00e9gou, H. (2022). Deit iii: Revenge of the vit. In: European Conference on Computer Vision, (pp. 516\u2013533). Springer.","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"2603_CR41","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems 30."},{"key":"2603_CR42","doi-asserted-by":"publisher","unstructured":"Wang, W., Chen, W., Qiu, Q., Chen, L., Wu, B., Lin, B., He, X., & Liu, W. (2023). Crossformer++: A versatile vision transformer hinging on cross-scale attention. IEEE Transactions on Pattern Analysis and Machine Intelligence, TPAMI. https:\/\/doi.org\/10.1109\/TPAMI.2023.3341806","DOI":"10.1109\/TPAMI.2023.3341806"},{"key":"2603_CR43","doi-asserted-by":"crossref","unstructured":"Wang, W., Dai, J., Chen, Z., Huang, Z., Li, Z., Zhu, X., ... & Qiao, Y. (2023). Internimage: Exploring large-scale vision foundation models with deformable convolutions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 14408\u201314419).","DOI":"10.1109\/CVPR52729.2023.01385"},{"issue":"3","key":"2603_CR44","doi-asserted-by":"publisher","first-page":"415","DOI":"10.1007\/s41095-022-0274-8","volume":"8","author":"W Wang","year":"2022","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2022). Pvt v2: Improved baselines with pyramid vision transformer. Computational Visual Media, 8(3), 415\u2013424.","journal-title":"Computational Visual Media"},{"key":"2603_CR45","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., & Sun, J. (2018). Unified perceptual parsing for scene understanding. In: Eur. Conf. Comput. Vis., (pp. 418\u2013434).","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"2603_CR46","doi-asserted-by":"crossref","unstructured":"Xu, K., Qin, M., Sun, F., Wang, Y., Chen, Y.-K., & Ren, F. (2020). Learning in the frequency domain. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 1740\u20131749).","DOI":"10.1109\/CVPR42600.2020.00181"},{"key":"2603_CR47","doi-asserted-by":"crossref","unstructured":"Yang, C., Chen, Y., Tian, H., Tao, C., Zhu, X., Zhang, Z., ... & Dai, J. (2023). Bevformer v2: Adapting modern image backbones to bird\u2019s-eye-view recognition via perspective supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 17830\u201317839).","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"2603_CR48","doi-asserted-by":"crossref","unstructured":"Yu, W., Zhou, P., Yan, S., & Wang, X. (2024). Inceptionnext: When inception meets convnext. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 5672\u20135683).","DOI":"10.1109\/CVPR52733.2024.00542"},{"key":"2603_CR49","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S. J., Chun, S., Choe, J., & Yoo, Y. (2019). Cutmix: Regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 6023\u20136032).","DOI":"10.1109\/ICCV.2019.00612"},{"key":"2603_CR50","doi-asserted-by":"crossref","unstructured":"Yun, G., Yoo, J., Kim, K., Lee, J., & Kim, D. H. (2023). Spanet: Frequency-balancing token mixer using spectral pooling aggregation modulation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 6113\u20136124).","DOI":"10.1109\/ICCV51070.2023.00562"},{"issue":"2","key":"2603_CR51","doi-asserted-by":"publisher","first-page":"896","DOI":"10.1109\/TPAMI.2023.3329173","volume":"46","author":"W Yu","year":"2023","unstructured":"Yu, W., Si, C., Zhou, P., Luo, M., Zhou, Y., Feng, J., Yan, S., & Wang, X. (2023). Metaformer baselines for vision. IEEE Transactions on Pattern Analysis and Machine Intelligence, 46(2), 896\u2013912.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2603_CR52","unstructured":"Zhang, H., Cisse, M., Dauphin, Y. N., Lopez-Paz, D. (2017). mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412"},{"issue":"9","key":"2603_CR53","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11263-024-02034-6","volume":"132","author":"J Zhang","year":"2024","unstructured":"Zhang, J., Li, X., Wang, Y., Wang, C., Yang, Y., Liu, Y., & Tao, D. (2024). Eatformer: Improving vision transformer inspired by evolutionary algorithm. International Journal of Computer Vision, 132(9), 1\u201328.","journal-title":"International Journal of Computer Vision"},{"key":"2603_CR54","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102455","volume":"110","author":"H Zhang","year":"2024","unstructured":"Zhang, H., Wan, J., He, Z., Song, J., Yang, Y., & Yuan, D. (2024). Sparse agent transformer for unified voxel and image feature extraction and fusion. Information Fusion, 110, Article 102455. https:\/\/doi.org\/10.1016\/j.inffus.2024.102455","journal-title":"Information Fusion"},{"key":"2603_CR55","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Li, B., Tang, L., Kuang, S., Wu, S., & Ding, S. (2022). Detecting camouflaged object in frequency domain. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 4504\u20134513).","DOI":"10.1109\/CVPR52688.2022.00446"},{"key":"2603_CR56","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Zheng, L., Kang, G., Li, S., & Yang, Y. (2020). Random erasing data augmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, (vol. 34, pp. 13001\u201313008).","DOI":"10.1609\/aaai.v34i07.7000"},{"key":"2603_CR57","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2017). Scene parsing through ade20k dataset. In: IEEE Conf. Comput. Vis. Pattern Recog., (pp. 633\u2013641).","DOI":"10.1109\/CVPR.2017.544"},{"key":"2603_CR58","doi-asserted-by":"crossref","unstructured":"Zhu, L., Wang, X., Ke, Z., Zhang, W., Lau, R. W. (2023). Biformer: Vision transformer with bi-level routing attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 10323\u201310333).","DOI":"10.1109\/CVPR52729.2023.00995"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02603-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02603-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02603-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:32:40Z","timestamp":1774600360000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02603-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,26]]},"references-count":58,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2603"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02603-3","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,26]]},"assertion":[{"value":"8 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"138"}}