{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:15:33Z","timestamp":1774602933259,"version":"3.50.1"},"reference-count":96,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T00:00:00Z","timestamp":1769212800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T00:00:00Z","timestamp":1769212800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-026-02753-y","type":"journal-article","created":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T18:57:21Z","timestamp":1769281041000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Entropy-Guided Condensing for Vision Transformer"],"prefix":"10.1007","volume":"134","author":[{"given":"Sihao","family":"Lin","sequence":"first","affiliation":[]},{"given":"Pumeng","family":"Lyu","sequence":"additional","affiliation":[]},{"given":"Dongrui","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zhihui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wenguan","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7778-8807","authenticated-orcid":false,"given":"Xiaojun","family":"Chang","sequence":"additional","affiliation":[]},{"given":"Yuhui","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,24]]},"reference":[{"key":"2753_CR1","unstructured":"Ba, J.L., Kiros, J.R., & Hinton, G.E. (2016). Layer normalization. arXiv preprint arXiv:1607.06450."},{"key":"2753_CR2","unstructured":"Bachlechner, T.C., Majumder, B.P., Mao, H.H., Cottrell, G., & McAuley, J. (2020). Rezero is all you need: Fast convergence at large depth. In Conference on Uncertainty in Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:212644626"},{"key":"2753_CR3","doi-asserted-by":"crossref","unstructured":"Bai, J., Yuan, L., Xia, S.-T., Yan, S., Li, Z., & Liu, W. (2022). Improving vision transformers by revisiting high-frequency components. In European Conference on Computer Vision, (pp. 1\u201318). Springer","DOI":"10.1007\/978-3-031-20053-3_1"},{"key":"2753_CR4","unstructured":"Bhojanapalli, S., Chakrabarti, A., Veit, A., Lukasik, M., Jain, H., Liu, F., Chang, Y.-W., & Kumar, S. (2021). Leveraging redundancy in attention with reuse transformers. arXiv preprint arXiv:2110.06821."},{"key":"2753_CR5","doi-asserted-by":"crossref","unstructured":"Bian, Y., Huang, J., Cai, X., Yuan, J., & Church, K. (2021). On attention redundancy: A comprehensive study. In Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (pp. 930\u2013945).","DOI":"10.18653\/v1\/2021.naacl-main.72"},{"key":"2753_CR6","unstructured":"Bolya, D., Fu, C.-Y., Dai, X., Zhang, P., Feichtenhofer, C., & Hoffman, J. (2023). Token merging: Your vit but faster. In Proceedings of ICLR."},{"key":"2753_CR7","unstructured":"Chandar, S., Ahn, S., Larochelle, H., Vincent, P., Tesauro, G., & Bengio, Y. (2016). Hierarchical memory networks. arXiv preprint arXiv:1605.07427."},{"key":"2753_CR8","unstructured":"Chen, T., Cheng, Y., Gan, Z., Yuan, L., Zhang, L., & Wang, Z. (2021). Chasing sparsity in vision transformers: An end-to-end exploration. In Neural Information Processing Systems. https:\/\/api.semanticscholar.org\/CorpusID:235367934."},{"key":"2753_CR9","unstructured":"Chen, B., Li, P., Li, B., Li, C., Bai, L., Lin, C., Sun, M., Yan, J., & Ouyang, W. (2021). Psvit: Better vision transformer via token pooling and attention sharing. arXiv preprint arXiv:2108.03428."},{"key":"2753_CR10","doi-asserted-by":"crossref","unstructured":"Chen, M., Shao, W., Xu, P., Lin, M., Zhang, K., Chao, F., Ji, R., Qiao, Y., & Luo, P. (2023). Diffrate: Differentiable compression rate for efficient vision transformers. arXiv preprint arXiv:2305.17997.","DOI":"10.1109\/ICCV51070.2023.01574"},{"issue":"12","key":"2753_CR11","doi-asserted-by":"publisher","first-page":"3048","DOI":"10.1109\/TPAMI.2018.2874634","volume":"41","author":"S Chen","year":"2018","unstructured":"Chen, S., & Zhao, Q. (2018). Shallowing deep networks: Layer-wise pruning based on feature representations. IEEE transactions on pattern analysis and machine intelligence, 41(12), 3048\u20133056.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2753_CR12","unstructured":"Child, R., Gray, S., Radford, A., & Sutskever, I. (2019). Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509."},{"key":"2753_CR13","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In: IEEE Conference on Computer Vision and Pattern Recognition, (pp. 248\u2013255).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2753_CR14","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (long and Short Papers), (pp. 4171\u20134186).","DOI":"10.18653\/v1\/N19-1423"},{"key":"2753_CR15","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S., et\u00a0al (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"2753_CR16","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K., Winn, J., & Zisserman, A. (2010). The pascal visual object classes (voc) challenge. International journal of computer vision, 88, 303\u2013338.","journal-title":"International journal of computer vision"},{"key":"2753_CR17","unstructured":"Fan, A., Grave, E., & Joulin, A. (2019). Reducing transformer depth on demand with structured dropout. arXiv preprint arXiv:1909.11556."},{"key":"2753_CR18","unstructured":"Fu, Y., Yang, H., Yuan, J., Li, M., Wan, C., Krishnamoorthi, R., Chandra, V., & Lin, Y. (2022). Depthshrinker: a new compression paradigm towards boosting real-hardware efficiency of compact neural networks. In: International Conference on Machine Learning, (pp. 6849\u20136862). PMLR"},{"key":"2753_CR19","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. Advances in neural information processing systems 27."},{"key":"2753_CR20","unstructured":"Guan, C., Wang, X., Zhang, Q., Chen, R., He, D., & Xie, X. (2019). Towards a deep and unified understanding of deep neural models in nlp. In: International Conference on Machine Learning. PMLR."},{"key":"2753_CR21","unstructured":"Guo, P., Lee, C.-Y., & Ulbricht, D. (2020). Learning to branch for multi-task learning. In International Conference on Machine Learning, (pp. 3854\u20133863)."},{"key":"2753_CR22","doi-asserted-by":"crossref","unstructured":"Guo, J., Wang, N., Qi, L., & Shi, Y. (2023). Aloft: A lightweight mlp-like architecture with dynamic low-frequency transform for domain generalization. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.02311"},{"key":"2753_CR23","unstructured":"Han, S., Mao, H., & Dally, W.J. (2015). Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149."},{"key":"2753_CR24","unstructured":"Han, S., Pool, J., Tran, J., & Dally, W. (2015). Learning both weights and connections for efficient neural network. Advances in neural information processing systems 28."},{"key":"2753_CR25","doi-asserted-by":"crossref","unstructured":"Han, M., Wang, Y., Li, Z., Yao, L., Chang, X., & Qiao, Y. (2023). Html: Hybrid temporal-scale multimodal learning framework for referring video object segmentation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision","DOI":"10.1109\/ICCV51070.2023.01234"},{"key":"2753_CR26","unstructured":"Han, M., Yang, L., Chang, X., & Wang, H. (2023). Shot2story20k: A new benchmark for comprehensive understanding of multi-shot videos. arXiv e-prints, 2312."},{"key":"2753_CR27","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R. (2022). Masked autoencoders are scalable vision learners. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 16000\u201316009).","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2753_CR28","unstructured":"Hou, Z., Zhang, T., Xiong, Y., Duan, H., Pu, H., Tong, R., Zhao, C., Zhu, X., Qiao, Y., Dai, J., and others (2025). Dita: Scaling diffusion transformer for generalist vision-language-action policy. Proceedings of the IEEE International Conference on Computer Vision (ICCV) ."},{"key":"2753_CR29","first-page":"137646","volume":"37","author":"T Hu","year":"2024","unstructured":"Hu, T., Li, L., Weijer, J., Gao, H., Shahbaz Khan, F., Yang, J., Cheng, M.-M., Wang, K., & Wang, Y. (2024). Token merging for training-free semantic binding in text-to-image synthesis. Advances in Neural Information Processing Systems, 37, 137646\u2013137672.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2753_CR30","unstructured":"Kitaev, N., Kaiser, \u0141., & Levskaya, A. (2020). Reformer: The efficient transformer. arXiv preprint arXiv:2001.04451."},{"key":"2753_CR31","unstructured":"Krizhevsky, A., Hinton, G., and others (2009). Learning multiple layers of features from tiny images."},{"key":"2753_CR32","doi-asserted-by":"crossref","unstructured":"Lang, K. (1995). Newsweeder: learning to filter netnews. In Proceedings of the Twelfth International Conference on International Conference on Machine Learning, (pp. 331\u2013339).","DOI":"10.1016\/B978-1-55860-377-6.50048-7"},{"key":"2753_CR33","unstructured":"Lee, N., Ajanthan, T., & Torr, P.H. (2018). Snip: Single-shot network pruning based on connection sensitivity. arXiv preprint arXiv:1810.02340."},{"key":"2753_CR34","first-page":"13851","volume":"37","author":"S-H Lee","year":"2024","unstructured":"Lee, S.-H., Wang, J., Zhang, Z., Fan, D., & Li, X. (2024). Video token merging for long video understanding. Advances in Neural Information Processing Systems, 37, 13851\u201313871.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2753_CR35","doi-asserted-by":"crossref","unstructured":"Li, C., Wang, G., Wang, B., Liang, X., Li, Z., & Chang, X. (2021). Dynamic slimmable network. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 8607\u20138617).","DOI":"10.1109\/CVPR46437.2021.00850"},{"key":"2753_CR36","unstructured":"Liang, Y., Ge, C., Tong, Z., Song, Y., Wang, J., & Xie, P. (2022). Not all patches are what you need: Expediting vision transformers via token reorganizations. In Proceedings of ICLR."},{"key":"2753_CR37","doi-asserted-by":"crossref","unstructured":"Lin, S., Lyu, P., Liu, D., Tang, T., Liang, X., Song, A., Chang, X.: Mlp can be a good transformer learner. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19489\u201319498 (2024).","DOI":"10.1109\/CVPR52733.2024.01843"},{"key":"2753_CR38","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In: European Conference on Computer Vision, (pp. 740\u2013755). Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2753_CR39","doi-asserted-by":"crossref","unstructured":"Lin, S., Zhang, Z., Huang, Z., Lu, Y., Lan, C., Chu, P., You, Q., Wang, J., Liu, Z., Parulkar, A., and others (2023). Deep frequency filtering for domain generalization. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.01135"},{"issue":"12","key":"2753_CR40","doi-asserted-by":"publisher","first-page":"3136","DOI":"10.1007\/s11263-023-01861-3","volume":"131","author":"M Lin","year":"2023","unstructured":"Lin, M., Chen, M., Zhang, Y., Shen, C., Ji, R., & Cao, L. (2023). Super vision transformer. International Journal of Computer Vision, 131(12), 3136\u20133151.","journal-title":"International Journal of Computer Vision"},{"key":"2753_CR41","unstructured":"Liu, J., & Shen, X.G. (2025). Fourier token merging: Understanding and capitalizing frequency domain for efficient image generation. Advances in Neural Information Processing Systems."},{"key":"2753_CR42","unstructured":"Liu, D., Deng, H., Cheng, X., Ren, Q., Wang, K., & Zhang, Q. (2023). Towards the difficulty for a deep neural network to learn concepts of different complexities. In NeurIPS."},{"key":"2753_CR43","doi-asserted-by":"publisher","unstructured":"Liu, Z., Hu, H., Lin, Y., Yao, Z., Xie, Z., Wei, Y., Ning, J., Cao, Y., Zhang, Z., Dong, L., Wei, F., & Guo, B. (2022). Swin transformer v2: Scaling up capacity and resolution. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11999\u201312009 . https:\/\/doi.org\/10.1109\/CVPR52688.2022.01170.","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"2753_CR44","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2024). Visual instruction tuning. Advances in neural information processing systems 36."},{"key":"2753_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, (pp. 10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2753_CR46","doi-asserted-by":"crossref","unstructured":"Liu, X., Peng, H., Zheng, N., Yang, Y., Hu, H., & Yuan, Y. (2023). Efficientvit: Memory efficient vision transformer with cascaded group attention. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 14420\u201314430).","DOI":"10.1109\/CVPR52729.2023.01386"},{"issue":"9","key":"2753_CR47","doi-asserted-by":"publisher","first-page":"9767","DOI":"10.1109\/TITS.2023.3268273","volume":"24","author":"D Liu","year":"2023","unstructured":"Liu, D., Chen, C., Xu, C., Qiu, R. C., & Chu, L. (2023). Self-supervised point cloud registration with deep versatile descriptors for intelligent driving. IEEE Transactions on Intelligent Transportation Systems, 24(9), 9767\u20139779.","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"key":"2753_CR48","unstructured":"Louizos, C., Welling, M., & Kingma, D.P. (2018). Learning sparse neural networks through $$ l_0 $$ regularization. In Proceedings of ICLR."},{"key":"2753_CR49","unstructured":"Luo, J.-H., & Wu, J. (2017). An entropy-based pruning method for cnn compression. arXiv preprint arXiv:1706.05791."},{"issue":"8","key":"2753_CR50","doi-asserted-by":"publisher","first-page":"3355","DOI":"10.1007\/s11263-024-02035-5","volume":"132","author":"J Lu","year":"2024","unstructured":"Lu, J., Zhang, J., Zhu, X., Feng, J., Xiang, T., & Zhang, L. (2024). Softmax-free linear transformers. International Journal of Computer Vision, 132(8), 3355\u20133374.","journal-title":"International Journal of Computer Vision"},{"key":"2753_CR51","unstructured":"Michel, P., Levy, O., & Neubig, G. (2019). Are sixteen heads really better than one? Advances in neural information processing systems,32."},{"key":"2753_CR52","first-page":"2046","volume":"35","author":"H NB","year":"2022","unstructured":"NB, H., Kathpalia, A., & Nagaraj, N. (2022). Causality preserving chaotic transformation and classification using neurochaos learning. Advances in Neural Information Processing Systems, 35, 2046\u20132058.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2753_CR53","doi-asserted-by":"crossref","unstructured":"Pan, Z., Zhuang, B., Liu, J., He, H., & Cai, J. (2021). Scalable vision transformers with hierarchical pooling. In Proceedings of the IEEE\/cvf International Conference on Computer Vision.","DOI":"10.1109\/ICCV48922.2021.00043"},{"key":"2753_CR54","unstructured":"Rahaman, N., Baratin, A., Arpit, D., Draxler, F., Lin, M., Hamprecht, F., Bengio, Y., & Courville, A. (2019). On the spectral bias of neural networks. In International Conference on Machine Learning, pp. 5301\u20135310. PMLR."},{"key":"2753_CR55","first-page":"13937","volume":"34","author":"Y Rao","year":"2021","unstructured":"Rao, Y., Zhao, W., Liu, B., Lu, J., Zhou, J., & Hsieh, C.-J. (2021). Dynamicvit: Efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems, 34, 13937\u201313949.","journal-title":"Advances in neural information processing systems"},{"key":"2753_CR56","doi-asserted-by":"crossref","unstructured":"Ren, S., Wei, F., Zhang, Z., & Hu, H. (2023). Tinymim: An empirical study of distilling mim pre-trained models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 3687\u20133697).","DOI":"10.1109\/CVPR52729.2023.00359"},{"issue":"2","key":"2753_CR57","doi-asserted-by":"publisher","first-page":"461","DOI":"10.1103\/PhysRevLett.85.461","volume":"85","author":"T Schreiber","year":"2000","unstructured":"Schreiber, T. (2000). Measuring information transfer. Physical review letters, 85(2), 461.","journal-title":"Physical review letters"},{"key":"2753_CR58","unstructured":"Shao, K., Tao, K., Qin, C., You, H., Sui, Y., & Wang, H. (2025). Holitom: Holistic token merging for fast video large language models. Advances in Neural Information Processing Systems."},{"key":"2753_CR59","doi-asserted-by":"crossref","unstructured":"Shi, B., Li, B., Cai, H., Lu, Y., Liu, S., Pavone, M., Kautz, J., Han, S., Darrell, T., & Molchanov, P., and others (2025). Scaling vision pre-training to 4k resolution. In Proceedings of the Computer Vision and Pattern Recognition Conference, (pp. 9631\u20139640).","DOI":"10.1109\/CVPR52734.2025.00900"},{"issue":"2","key":"2753_CR60","doi-asserted-by":"publisher","first-page":"725","DOI":"10.1137\/18M1192184","volume":"80","author":"J Sirignano","year":"2020","unstructured":"Sirignano, J., & Spiliopoulos, K. (2020). Mean field analysis of neural networks: A law of large numbers. SIAM Journal on Applied Mathematics, 80(2), 725\u2013752. https:\/\/doi.org\/10.1137\/18M1192184","journal-title":"SIAM Journal on Applied Mathematics"},{"key":"2753_CR61","unstructured":"Sun, Z., Ge, C., Wang, J., Lin, M., Chen, H., Li, H., & Sun, X. (2022). Entropy-driven mixed-precision quantization for deep network design. Advances in Neural Information Processing Systems 35."},{"key":"2753_CR62","doi-asserted-by":"crossref","unstructured":"Tang, Y., Han, K., Wang, Y., Xu, C., Guo, J., Xu, C., & Tao, D. (2022). Patch slimming for efficient vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 12165\u201312174).","DOI":"10.1109\/CVPR52688.2022.01185"},{"key":"2753_CR63","unstructured":"Tang, Y., Wang, Y., Xu, Y., Tao, D., Xu, C., Xu, C., & Xu, C. (2020). Scop: Scientific control for reliable neural network pruning. Advances in Neural Information Processing Systems 33."},{"key":"2753_CR64","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., & J\u00e9gou, H. (2022). DeiT III: Revenge of the ViT. arxiv:2204.07118.","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"2753_CR65","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2021). Training data-efficient image transformers & distillation through attention. In International Conference on Machine Learning, (pp. 10347\u201310357). PMLR"},{"key":"2753_CR66","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., & J\u00e9gou, H. (2021). Going deeper with image transformers. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), (pp. 32\u201342).","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"2753_CR67","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., Rodriguez, A., Joulin, A., Grave, E., & Lample, G. (2023).LLaMA: Open and Efficient Foundation Language Models. arxiv:2302.13971."},{"key":"2753_CR68","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S., Bikel, D., Blecher, L., Ferrer, C.C., Chen, M., Cucurull, G., Esiobu, D., Fernandes, J., Fu, J., Fu, W., Fuller, B., Gao, C., Goswami, V., Goyal, N., Hartshorn, A., Hosseini, S., Hou, R., Inan, H., Kardas, M., Kerkez, V., Khabsa, M., Kloumann, I., Korenev, A., Koura, P.S., Lachaux, M.-A., Lavril, T., Lee, J., Liskovich, D., Lu, Y., Mao, Y., Martinet, X., Mihaylov, T., Mishra, P., Molybog, I., Nie, Y., Poulton, A., Reizenstein, J., Rungta, R., Saladi, K., Schelten, A., Silva, R., Smith, E.M., Subramanian, R., Tan, X.E., Tang, B., Taylor, R., Williams, A., Kuan, J.X., Xu, P., Yan, Z., Zarov, I., Zhang, Y., Fan, A., Kambadur, M., Narang, S., Rodriguez, A., Stojnic, R., Edunov, S., & Scialom, T. (2023). Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv:2307.09288"},{"key":"2753_CR69","first-page":"30772","volume":"37","author":"C Tran","year":"2024","unstructured":"Tran, C., Nguyen, M. H., Nguyen, M.-D., Nguyen, T., Le, N., Xie, P., Sonntag, D., Zou, J. Y., Nguyen, B., & Niepert, M. (2024). Accelerating transformers with spectrum-preserving token merging. Advances in Neural Information Processing Systems, 37, 30772\u201330810.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2753_CR70","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems 30."},{"key":"2753_CR71","unstructured":"Wang, F., Yu, Y., Wei, G., Shao, W., Zhou, Y., Yuille, A., & Xie, C. (2025). Scaling laws in patchification: An image is worth 50,176 tokens and more. International Conference on Machine Learning"},{"issue":"10","key":"2753_CR72","doi-asserted-by":"publisher","first-page":"6761","DOI":"10.1109\/TPAMI.2024.3386927","volume":"46","author":"H Wang","year":"2024","unstructured":"Wang, H., Ma, S., Dong, L., Huang, S., Zhang, D., & Wei, F. (2024). Deepnet: Scaling transformers to 1,000 layers. IEEE Transactions on Pattern Analysis and Machine Intelligence, 46(10), 6761\u20136774. https:\/\/doi.org\/10.1109\/TPAMI.2024.3386927","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2753_CR73","doi-asserted-by":"crossref","unstructured":"Wei, S., Ye, T., Zhang, S., Tang, Y., & Liang, J. (2023). Joint token pruning and squeezing towards more aggressive compression of vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 2092\u20132101).","DOI":"10.1109\/CVPR52729.2023.00208"},{"key":"2753_CR74","unstructured":"Weng, Y., Han, M., He, H., Li, M., Yao, L., Chang, X., & Zhuang, B (2023). Mask propagation for efficient video semantic segmentation. In Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, New Orleans, LA, USA, December 10 - 16, 2023. http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/167bcf2af2cd08fcf75b932022db0311-Abstract-Conference.html."},{"key":"2753_CR75","unstructured":"Xu, Z.J. (2018). Understanding training and generalization in deep learning by fourier analysis. arXiv preprint arXiv:1808.04295."},{"key":"2753_CR76","doi-asserted-by":"crossref","unstructured":"Xu, X., Li, C., Chen, Y., Chang, X., Liu, J., & Wang, S. (2023). No token left behind: Efficient vision transformer via dynamic token idling. In: Australasian Joint Conference on Artificial Intelligence, (pp. 28\u201341). Springer.","DOI":"10.1007\/978-981-99-8388-9_3"},{"key":"2753_CR77","doi-asserted-by":"crossref","unstructured":"Xu, K., Wang, Z., Chen, C., Geng, X., Lin, J., Yang, X., Wu, M., Li, X., & Lin, W. (2024). Lpvit: Low-power semi-structured pruning for vision transformers. arXiv preprint arXiv:2407.02068.","DOI":"10.1007\/978-3-031-73209-6_16"},{"key":"2753_CR78","doi-asserted-by":"publisher","first-page":"2964","DOI":"10.1609\/aaai.v36i3.20202","volume":"36","author":"Y Xu","year":"2022","unstructured":"Xu, Y., Zhang, Z., Zhang, M., Sheng, K., Li, K., Dong, W., Zhang, L., Xu, C., & Sun, X. (2022). Evo-vit: Slow-fast token evolution for dynamic vision transformer. Proceedings of the AAAI Conference on Artificial Intelligence, 36, 2964\u20132972.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2753_CR79","first-page":"30008","volume":"34","author":"J Yang","year":"2021","unstructured":"Yang, J., Li, C., Zhang, P., Dai, X., Xiao, B., Yuan, L., & Gao, J. (2021). Focal attention for long-range interactions in vision transformers. Advances in Neural Information Processing Systems, 34, 30008\u201330022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2753_CR80","first-page":"25739","volume":"35","author":"X Yang","year":"2022","unstructured":"Yang, X., Zhou, D., Liu, S., Ye, J., & Wang, X. (2022). Deep model reassembly. Advances in neural information processing systems, 35, 25739\u201325753.","journal-title":"Advances in neural information processing systems"},{"key":"2753_CR81","doi-asserted-by":"crossref","unstructured":"Yin, Y., Zhao, Y., Zheng, M., Lin, K., Ou, J., Chen, R., Huang, V.S.-J., Wang, J., Tao, X., Wan, P., and others (2025). Towards precise scaling laws for video diffusion transformers. In Proceedings of the Computer Vision and Pattern Recognition Conference, (pp. 18155\u201318165).","DOI":"10.1109\/CVPR52734.2025.01692"},{"key":"2753_CR82","doi-asserted-by":"crossref","unstructured":"Yu, L., & Xiang, W. (2023). X-pruner: explainable pruning for vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24355\u201324363.","DOI":"10.1109\/CVPR52729.2023.02333"},{"key":"2753_CR83","unstructured":"Yu, S., Chen, T., Shen, J., Yuan, H., Tan, J., Yang, S., Liu, J., & Wang, Z. (2022). Unified visual transformer compression. ICLR."},{"key":"2753_CR84","doi-asserted-by":"crossref","unstructured":"Yu, W., Luo, M., Zhou, P., Si, C., Zhou, Y., Wang, X., Feng, J., & Yan, S. (2022). Metaformer is actually what you need for vision. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, (pp. 10819\u201310829).","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"2753_CR85","doi-asserted-by":"publisher","first-page":"3143","DOI":"10.1609\/aaai.v36i3.20222","volume":"36","author":"F Yu","year":"2022","unstructured":"Yu, F., Huang, K., Wang, M., Cheng, Y., Chu, W., & Cui, L. (2022). Width & depth pruning for vision transformers. Proceedings of the AAAI Conference on Artificial Intelligence, 36, 3143\u20133151.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"2","key":"2753_CR86","doi-asserted-by":"publisher","first-page":"896","DOI":"10.1109\/tpami.2023.3329173","volume":"46","author":"W Yu","year":"2024","unstructured":"Yu, W., Si, C., Zhou, P., Luo, M., Zhou, Y., Feng, J., Yan, S., & Wang, X. (2024). Metaformer baselines for vision. IEEE Transactions on Pattern Analysis and Machine Intelligence, 46(2), 896\u2013912. https:\/\/doi.org\/10.1109\/tpami.2023.3329173","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"9","key":"2753_CR87","doi-asserted-by":"publisher","first-page":"3509","DOI":"10.1007\/s11263-024-02034-6","volume":"132","author":"J Zhang","year":"2024","unstructured":"Zhang, J., Li, X., Wang, Y., Wang, C., Yang, Y., Liu, Y., & Tao, D. (2024). Eatformer: Improving vision transformer inspired by evolutionary algorithm. International Journal of Computer Vision, 132(9), 3509\u20133536.","journal-title":"International Journal of Computer Vision"},{"issue":"5","key":"2753_CR88","doi-asserted-by":"publisher","first-page":"1141","DOI":"10.1007\/s11263-022-01739-w","volume":"131","author":"Q Zhang","year":"2023","unstructured":"Zhang, Q., Xu, Y., Zhang, J., & Tao, D. (2023). Vitaev2: Vision transformer advanced by exploring inductive bias for image recognition and beyond. International Journal of Computer Vision, 131(5), 1141\u20131162.","journal-title":"International Journal of Computer Vision"},{"key":"2753_CR89","doi-asserted-by":"publisher","first-page":"13940","DOI":"10.1609\/aaai.v37i11.26632","volume":"37","author":"J Zhang","year":"2023","unstructured":"Zhang, J., Yao, W., Chen, X., & Feng, L. (2023). Transferable post-hoc calibration on pretrained transformers in noisy text classification. Proceedings of the AAAI Conference on Artificial Intelligence, 37, 13940\u201313948.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2753_CR90","doi-asserted-by":"crossref","unstructured":"Zhong, L., Wan, F., Chen, R., Quan, X., & Li, L. (2024). Blockpruner: Fine-grained pruning for large language models. arXiv preprint arXiv:2406.10594.","DOI":"10.18653\/v1\/2025.findings-acl.262"},{"key":"2753_CR91","unstructured":"Zhou, D., Kang, B., Jin, X., Yang, L., Lian, X., Jiang, Z., Hou, Q., & Feng, J. (2021). DeepViT: Towards Deeper Vision Transformer. arxiv:2103.11886."},{"key":"2753_CR92","unstructured":"Zhou, H., Zhang, H., Deng, H., Liu, D., Shen, W., Chan, S.-H., & Zhang, Q. (2023). Concept-level explanation for the generalization of a dnn. arXiv preprint arXiv:2302.13091."},{"key":"2753_CR93","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2017). Scene parsing through ade20k dataset. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) .","DOI":"10.1109\/CVPR.2017.544"},{"issue":"9","key":"2753_CR94","doi-asserted-by":"publisher","first-page":"4635","DOI":"10.1109\/TNNLS.2021.3059529","volume":"33","author":"Y Zhou","year":"2021","unstructured":"Zhou, Y., Yen, G. G., & Yi, Z. (2021). Evolutionary shallowing deep neural networks at block levels. IEEE Transactions on Neural Networks and Learning Systems, 33(9), 4635\u20134647.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"2753_CR95","unstructured":"Zhu, L., Liao, B., Zhang, Q., Wang, X., Liu, W., & Wang, X. (2024). Vision mamba: Efficient visual representation learning with bidirectional state space model. arXiv preprint arXiv:2401.09417."},{"key":"2753_CR96","unstructured":"Zhu, M., Tang, Y., & Han, K. (2021). Vision transformer pruning. arXiv preprint arXiv:2104.08500."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02753-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02753-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02753-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:41:28Z","timestamp":1774600888000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02753-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,24]]},"references-count":96,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2753"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02753-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,24]]},"assertion":[{"value":"14 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"86"}}