{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T15:15:11Z","timestamp":1780413311502,"version":"3.54.1"},"reference-count":178,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,2,13]],"date-time":"2025-02-13T00:00:00Z","timestamp":1739404800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,13]],"date-time":"2025-02-13T00:00:00Z","timestamp":1739404800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-024-02327-w","type":"journal-article","created":{"date-parts":[[2025,2,13]],"date-time":"2025-02-13T05:55:41Z","timestamp":1739426141000},"page":"3918-3950","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["An Experimental Study on Exploring Strong Lightweight Vision Transformers via Masked Image Modeling Pre-training"],"prefix":"10.1007","volume":"133","author":[{"given":"Jin","family":"Gao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shubo","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1694-9793","authenticated-orcid":false,"given":"Shaoru","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yutong","family":"Kou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zeming","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liang","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Congxuan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoqin","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yizheng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weiming","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,2,13]]},"reference":[{"key":"2327_CR1","first-page":"12980","volume":"33","author":"S Abbasi Koohpayegani","year":"2020","unstructured":"Abbasi Koohpayegani, S., Tejankar, A., & Pirsiavash, H. (2020). Compress: Self-supervised learning by compressing representations. Advances in Neural Information Processing Systems, 33, 12980\u201312992.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2327_CR2","doi-asserted-by":"crossref","unstructured":"Abdelhamed, A., Lin, S., & Brown, M.S.(2018). A high-quality denoising dataset for smartphone cameras. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2018.00182"},{"key":"2327_CR3","unstructured":"Abnar, S., Dehghani, M., & Zuidema, W.(2020). Transferring inductive biases through knowledge distillation . arXiv:2006.00555"},{"key":"2327_CR4","unstructured":"Ali, A., Touvron, H., Caron, M., Bojanowski, P., Douze, M., Joulin, A., Laptev, I., Neverova, N., Synnaeve, G., & Verbeek, J.,(2021). XCiTXcit: cross-covariance image transformers. Adv Neural Inf Process Syst, 34, 20014-20027."},{"key":"2327_CR5","unstructured":"Asano, Y.M., Rupprecht, C., & Vedaldi, A. (2020). Self-labelling via simultaneous clustering and representation learning. In Proceeding international conference learning represent."},{"key":"2327_CR6","doi-asserted-by":"crossref","unstructured":"Assran, M., Caron, M., Misra, I., Bojanowski, P., Joulin, A., Ballas, N., & Rabbat, M. (2021). Semi-supervised learning of visual features by non-parametrically predicting view assignments with support samples. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV48922.2021.00833"},{"key":"2327_CR7","unstructured":"Ba, J.L., Kiros, J.R., & Hinton, G.E.(2016). Layer normalization. arXiv:1607.06450"},{"key":"2327_CR8","unstructured":"Baevski, A., Hsu, W.-N., Xu, Q., Babu, A., Gu, J., & Auli, M. (2022). data2vec: A general framework for self-supervised learning in speech. In Proceeding international conference machine learning."},{"key":"2327_CR9","unstructured":"Bao, H., Dong, L., Piao, S., & Wei, F. (2022). BEiT: BERT pre-training of image transformers. In Proceeding international conference learning represent."},{"key":"2327_CR10","unstructured":"Bao, H., Dong, L., Wei, F., Wang, W., Yang, N., Liu, X., Wang, Y., Gao, J., Piao, S., Zhou, M., & Hon, H. (2020). UniLMv2: Pseudo-masked language models for unified language model pre-training. In Proceeding international conference machine learning."},{"key":"2327_CR11","unstructured":"Ben-Shaul, I., Shwartz-Ziv, R., Galanti, T., Dekel, S., & LeCun, Y. (2023). Reverse engineering self-supervised learning. In Advanced Neural Information Process System."},{"key":"2327_CR12","doi-asserted-by":"crossref","unstructured":"Beyer, L., Zhai, X., Royer, A., Markeeva, L., Anil, R., & Kolesnikov, A. (2022). Knowledge distillation: A good teacher is patient and consistent. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01065"},{"key":"2327_CR13","doi-asserted-by":"crossref","unstructured":"Bhat, P., Arani, E., & Zonooz, B. (2021). Distill on the go: Online knowledge distillation in self-supervised learning. In Proceeding IEEE conference computer vision pattern recognition","DOI":"10.1109\/CVPRW53098.2021.00301"},{"key":"2327_CR14","doi-asserted-by":"crossref","unstructured":"Blatter, P., Kanakis, M., Danelljan, M., & Gool, L.V.(2023). Efficient visual tracking with exemplar transformers. In IEEE\/CVF Winter Conference on Applications of Computer Vision.","DOI":"10.1109\/WACV56688.2023.00162"},{"key":"2327_CR15","unstructured":"Bojanowski, P., & Joulin, A. (2017). Unsupervised learning by predicting noise. In Proceeding international conference machine learning."},{"key":"2327_CR16","doi-asserted-by":"crossref","unstructured":"Borsuk, V., Vei, R., Kupyn, O., Martyniuk, T., Krashenyi, I., & Matas, J.(2022). FEAR: Fast, efficient, accurate and robust visual tracker. In Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-031-20047-2_37"},{"key":"2327_CR17","doi-asserted-by":"crossref","unstructured":"Bucilu\u0103, C., Caruana, R., & Niculescu-Mizil, A.(2006). Model compression. InProceeding ACM SIGKDD International Conference Know. Disco. & Data Mining.","DOI":"10.1145\/1150402.1150464"},{"key":"2327_CR18","doi-asserted-by":"crossref","unstructured":"Cai, H., Li, J., Hu, M., Gan, C., & Han, S. (2023). EfficientViT: Lightweight multi-scale attention for high-resolution dense prediction. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV51070.2023.01587"},{"key":"2327_CR19","doi-asserted-by":"crossref","unstructured":"Caron, M., Bojanowski, P., Joulin, A., & Douze, M. (2018). Deep clustering for unsupervised learning of visual features. In Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"2327_CR20","doi-asserted-by":"crossref","unstructured":"Caron, M., Bojanowski, P., Mairal, J., & Joulin, A. (2019). Unsupervised pre-training of image features on non-curated data. InProceeding international conference computer vision.","DOI":"10.1109\/ICCV.2019.00305"},{"key":"2327_CR21","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., & Joulin, A.(2021). In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 9650-9660)."},{"key":"2327_CR22","first-page":"9912","volume":"33","author":"M Caron","year":"2020","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., & Joulin, A. (2020). Unsupervised learning of visual features by contrasting cluster assignments. Advances in Neural Information Processing Systems, 33, 9912\u20139924.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2327_CR23","doi-asserted-by":"crossref","unstructured":"Chen, X., & He, K. (2021). Exploring simple siamese representation learning. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"2327_CR24","unstructured":"Chen, G., Choi, W., Yu, X., & Han, T., Chandraker, M.(2017). Learning efficient object detection models with knowledge distillation. In Advanced Neural Information Processing System."},{"key":"2327_CR25","doi-asserted-by":"crossref","unstructured":"Chen, L., Chu, X., Zhang, X., & Sun, J.(2022). Simple baselines for image restoration. In Proceeding Europe conference computer vision","DOI":"10.1007\/978-3-031-20071-7_2"},{"key":"2327_CR26","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Chen, D., Liu, M., Dong, X., Yuan, L., & Liu, Z.(2022). Mobile-Former: Bridging MobileNet and transformer. In Proceeding IEEE conference computer vision pattern recognit.","DOI":"10.1109\/CVPR52688.2022.00520"},{"key":"2327_CR27","unstructured":"Chen, X., Fan, H., Girshick, R., & He, K.(2020). Improved baselines with momentum contrastive learning . arXiv:2003.04297"},{"key":"2327_CR28","doi-asserted-by":"crossref","unstructured":"Chen, X., Kang, B., Wang, D., Li, D., & Lu, H. (2022). Efficient visual tracking via hierarchical cross-attention transformer. In Proceeding international conference computer vision workshop.","DOI":"10.1007\/978-3-031-25085-9_26"},{"key":"2327_CR29","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020). A simple framework for contrastive learning of visual representations. In Proceeding international conference machine learning."},{"key":"2327_CR30","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., & He, K. (2021). An empirical study of training self-supervised vision transformers. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 9640-9649).","DOI":"10.1109\/ICCV48922.2021.00950"},{"issue":"1","key":"2327_CR31","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/s11263-023-01852-4","volume":"132","author":"X Chen","year":"2024","unstructured":"Chen, X., Ding, M., Wang, X., Xin, Y., Mo, S., Wang, Y., Han, S., Luo, P., Zeng, G., & Wang, J. (2024). Context autoencoder for self-supervised representation learning. International Journal of Computer Vision, 132(1), 208\u2013223.","journal-title":"International Journal of Computer Vision"},{"key":"2327_CR32","doi-asserted-by":"crossref","unstructured":"Cheng, S., Wang, Y., Huang, H., Liu, D., Fan, H., Liu, S. (2021). NBNet: Noise basis learning for image denoising with subspace projection. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR46437.2021.00486"},{"key":"2327_CR33","doi-asserted-by":"crossref","unstructured":"Cho, J.H., & Hariharan, B. (2019). On the efficacy of knowledge distillation. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV.2019.00489"},{"key":"2327_CR34","first-page":"24645","volume":"34","author":"HM Choi","year":"2021","unstructured":"Choi, H. M., Kang, H., & Oh, D. (2021). Unsupervised representation transfer for small networks: I believe i can distill on-the-fly. Advances in neural information processing systems, 34, 24645\u201324658.","journal-title":"Advances in neural information processing systems"},{"key":"2327_CR35","unstructured":"Chu, X., Tian, Z., Zhang, B., Wang, X., & Shen, C. (2023). Conditional positional encodings for vision transformers. In Proceeding international conference learning represent."},{"key":"2327_CR36","first-page":"795","volume":"13","author":"C Cortes","year":"2012","unstructured":"Cortes, C., Mohri, M., & Rostamizadeh, A. (2012). Algorithms for learning kernels based on centered alignment. Journal of Machine Learning Research, 13, 795\u2013828.","journal-title":"Journal of Machine Learning Research"},{"key":"2327_CR37","doi-asserted-by":"crossref","unstructured":"Cubuk, E.D., Zoph, B., Shlens, J., & Le, Q. (2020). RandAugment: Practical automated data augmentation with a reduced search space. In Advanced Neural Information Processing System.","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"2327_CR38","unstructured":"Cui, Y., Song, T., Wu, G., & Wang, L.(2023). MixFormerV2: Efficient fully transformer tracking. In Advanced Neural Information Processing System."},{"key":"2327_CR39","doi-asserted-by":"crossref","unstructured":"Das, S., Jain, T., Reilly, D., Balaji, P., Karmakar, S., Marjit, S., Li, X., Das, A., & Ryoo, M.S.(2024). Limited data, unlimited potential: A study on vits augmented by masked autoencoders. In  Proceedings of the IEEE\/CVF winter conference on applications of computer vision (pp. 6878-6888).","DOI":"10.1109\/WACV57701.2024.00673"},{"key":"2327_CR40","doi-asserted-by":"crossref","unstructured":"d\u2019Ascoli, S., Touvron, H., Leavitt, M.L., Morcos, A.S., Biroli, G., & Sagun, L. (2021). ConViT: Improving vision transformers with soft convolutional inductive biases. In Proceeding international conference machine learning.","DOI":"10.1088\/1742-5468\/ac9830"},{"key":"2327_CR41","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L.(2009). Imagenet: A large-scale hierarchical image database. In 2009 IEEE conference on computer vision and pattern recognition(pp. 248-255).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2327_CR42","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings conference of the North American chapter of the association for computational linguistics: Human language technologies."},{"key":"2327_CR43","doi-asserted-by":"crossref","unstructured":"Ding, X., Zhang, X., Han, J., & Ding, G. (2022). Scaling up your kernels to 31$$\\times $$31: Revisiting large kernel design in CNNs. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01166"},{"key":"2327_CR44","doi-asserted-by":"crossref","unstructured":"Dong, X., Bao, J., Chen, D., Zhang, W., Yu, N., Yuan, L., Chen, D., & Guo, B. (2022). CSWin transformer: A general vision transformer backbone with cross-shaped windows. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01181"},{"key":"2327_CR45","doi-asserted-by":"crossref","unstructured":"Dong, X., Bao, J., Zhang, T., Chen, D., Zhang, W., Yuan, L., Chen, D., Wen, F., & Yu, N.(2022). Bootstrapped masked autoencoders for vision bert pretraining. In European Conference on Computer Vision (pp. 247-264). Cham: Springer Nature Switzerland","DOI":"10.1007\/978-3-031-20056-4_15"},{"key":"2327_CR46","doi-asserted-by":"crossref","unstructured":"Dong, X., Bao, J., Zhang, T., Chen, D., Zhang, W., Yuan, L., Chen, D., Wen, F., Yu, N., & Guo, B (2023). PeCo: Perceptual codebook for BERT pre-training of vision transformers. In In Proceedings of the AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v37i1.25130"},{"key":"2327_CR47","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. In Proceeding international conference learning represent."},{"key":"2327_CR48","unstructured":"Dosovitskiy, A., Springenberg, J.T., Riedmiller, M., & Brox, T. (2014). Discriminative unsupervised feature learning with convolutional neural networks. In Advance Neural Information Process System."},{"key":"2327_CR49","unstructured":"El-Nouby, A., Izacard, G., Touvron, H., Laptev, I., Jegou, H., & Grave, E. (2021). Are large-scale datasets necessary for self-supervised pre-training?. arXiv:2112.10740"},{"key":"2327_CR50","unstructured":"Ermolov, A., Siarohin, A., Sangineto, E., & Sebe, N. (2021). Whitening for self-supervised representation learning. In Proceeding international conference machine learning."},{"key":"2327_CR51","doi-asserted-by":"crossref","unstructured":"Fan, H., Lin, L., Yang, F., Chu, P., Deng, G., Yu, S., Bai, H., Xu, Y., Liao, C., & Ling, H. (2019). LaSOT: A high-quality benchmark for large-scale single object tracking. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2019.00552"},{"key":"2327_CR52","doi-asserted-by":"crossref","unstructured":"Fan, H., Xiong, B., Mangalam, K., Li, Y., Yan, Z., Malik, J., & Feichtenhofer, C. (2021). Multiscale vision transformers. In Proceeding international conference vision.","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"2327_CR53","unstructured":"Fang, Z., Wang, J., Wang, L., Zhang, L., Yang, Y., & Liu, Z.(2020). SEED: Self-supervised distillation for visual representation. In Proceeding international conference learning represent."},{"key":"2327_CR54","doi-asserted-by":"crossref","unstructured":"Filiot, A., Ghermi, R., Olivier, A., Jacob, P., Fidon, L., Kain, A.M., Saillard, C., & Schiratti, J.-B.(2023). Scaling self-supervised learning for histopathology with masked image modeling. medRxiv:2023.07.21.23292757","DOI":"10.1101\/2023.07.21.23292757"},{"key":"2327_CR55","unstructured":"Gao, P., Ma, T., Li, H., Lin, Z., Dai, J., & Qiao, Y (2022). MCMAE: Masked convolution meets masked autoencoders. In Advanced Neural Information Process System."},{"key":"2327_CR56","doi-asserted-by":"crossref","unstructured":"Gao, Y., Zhuang, J., Lin, S., Cheng, H., Sun, X., Li, K., Shen, C. (2022) DisCo: Remedying self-supervised learning on lightweight models with distilled contrastive learning. In Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-031-19809-0_14"},{"key":"2327_CR57","unstructured":"Ghasemabadi, A., Janjua, M.K., Salameh, M., Zhou, C., Sun, F., & Niu, D.(2024). CascadedGaze: Efficiency in global context extraction for image restoration. In Transactions. Machine. Learning. Research."},{"key":"2327_CR58","doi-asserted-by":"crossref","unstructured":"Gidaris, S., Bursuc, A., Komodakis, N., P\u00e9rez, P., & Cord, M. (2020). Learning representations by predicting bags of visual words. Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR42600.2020.00696"},{"key":"2327_CR59","unstructured":"Gidaris, S., Singh, P., & Komodakis, N. (2018). Unsupervised representation learning by predicting image rotations. In Proceeding international conference learning represent."},{"key":"2327_CR60","unstructured":"Goyal, P., Doll\u00e1r, P., Girshick, R., Noordhuis, P., Wesolowski, L., Kyrola, A., Tulloch, A., Jia, Y., & He, K.(2017). Accurate, large minibatch SGD: Training Imagenet in 1 hour . arXiv:1706.02677"},{"key":"2327_CR61","doi-asserted-by":"crossref","unstructured":"Graham, B., El-Nouby, A., Touvron, H., Stock, P., Joulin, A., J\u00e9gou, H., & Douze, M. (2021). LeViT: A vision transformer in ConvNet\u2019s clothing for faster inference. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"2327_CR62","first-page":"21271","volume":"33","author":"J-B Grill","year":"2020","unstructured":"Grill, J.-B., Strub, F., Altch\u00e9, F., Tallec, C., Richemond, P. H., Buchatskaya, E., Doersch, C., Pires, B. A., Guo, Z. D., Azar, M. G., Piot, B., Kavukcuoglu, K., Munos, R., & Valko, M. (2020). Bootstrap your own latent-a new approach to self-supervised learning. Advances in Neural Information Processing Systems, 33, 21271\u201321284.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2327_CR63","unstructured":"Han, K., Wang, Y., Zhang, Q., Zhang, W., Xu, C., & Zhang, T. (2020). Model Rubik\u2019s cube: Twisting resolution, depth and width for TinyNets. In Advanced Neural Information Processing System."},{"key":"2327_CR64","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R.B. (2022). Masked autoencoders are scalable vision learners. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 16000-16009).","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2327_CR65","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R.(2020). Momentum contrast for unsupervised visual representation learning. In Proceedings of the IEEE\/CVF Conference on computer Vision and Pattern Recognition  (pp. 9729-9738).","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2327_CR66","doi-asserted-by":"crossref","unstructured":"He, T., Shen, C., Tian, Z., Gong, D., Sun, C., & Yan, Y.(2019). Knowledge adaptation for efficient semantic segmentation. In: Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","DOI":"10.1109\/CVPR.2019.00067"},{"key":"2327_CR67","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2327_CR68","doi-asserted-by":"crossref","unstructured":"Heo, B., Yun, S., Han, D., Chun, S., Choe, J., & Oh, S.J. (2021). Rethinking spatial dimensions of vision transformers. In Proceeding international conference vision.","DOI":"10.1109\/ICCV48922.2021.01172"},{"key":"2327_CR69","doi-asserted-by":"crossref","unstructured":"Hinton, G., Vinyals, O., & Dean, J.(2015). Distilling the knowledge in a neural network. arXiv:1503.02531. https:\/\/doi.org\/10.1145\/1150402.1150464","DOI":"10.1145\/1150402.1150464"},{"key":"2327_CR70","unstructured":"Hou, Z., Sun, F., Chen, Y., Xie, Y., & Kung, S.(2022). MILAN: Masked image pretraining on language assisted representation. arXiv:2208.06049"},{"key":"2327_CR71","doi-asserted-by":"crossref","unstructured":"Howard, A., Sandler, M., Chu, G., Chen, L.-C., Chen, B., Tan, M., Wang, W., Zhu, Y., Pang, R., Vasudevan, V., Le, Q.V., & Adam, H. (2019). Searching for MobileNetV3. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV.2019.00140"},{"key":"2327_CR72","doi-asserted-by":"crossref","unstructured":"Hu, H., Gu, J., Zhang, Z., Dai, J., & Wei, Y.(2018). Relation networks for object detection. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2018.00378"},{"key":"2327_CR73","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhang, Z., Xie, Z., & Lin, S. (2019). Local relation networks for image recognition. In Proceeding IEEE conference computer vision pattern recognition","DOI":"10.1109\/ICCV.2019.00356"},{"key":"2327_CR74","unstructured":"Huang, J., Dong, Q., Gong, S., & Zhu, X. (2019). Unsupervised deep learning by neighbourhood discovery. In Proceeding international conference machine learning."},{"key":"2327_CR75","unstructured":"Huang, T., Huang, L., You, S., Wang, F., Qian, C., & Xu, C.(2022). LightViT: Towards light-weight convolution-free vision transformers . arXiv:2207.05557"},{"key":"2327_CR76","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Maaten, L., & Weinberger, K.Q.(2017). Densely connected convolutional networks. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2017.243"},{"key":"2327_CR77","doi-asserted-by":"crossref","unstructured":"Huang, G., Sun, Y., Liu, Z., Sedra, D., & Weinberger, K.Q. (2016). Deep networks with stochastic depth. In Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"2327_CR78","doi-asserted-by":"crossref","unstructured":"Jiao, X., Yin, Y., Shang, L., Jiang, X., Chen, X., Li, L., Wang, F., & Liu, Q.(2020). TinyBERT: Distilling BERT for natural language understanding. In: Proc. Findings of the Association for Computational Linguistics: EMNLP","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"2327_CR79","doi-asserted-by":"crossref","unstructured":"Jin, X., Peng, B., Wu, Y., Liu, Y., Liu, J., Liang, D., Yan, J., & Hu, X. (2019). Knowledge distillation via route constrained optimization. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV.2019.00143"},{"key":"2327_CR80","doi-asserted-by":"crossref","unstructured":"Kakogeorgiou, I., Gidaris, S., Psomas, B., Avrithis, Y., Bursuc, A., Karantzalos, K., & Komodakis, N. (2022). What to hide from your students: Attention-guided masked image modeling. In Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-031-20056-4_18"},{"key":"2327_CR81","doi-asserted-by":"crossref","unstructured":"Kang, B., Chen, X., Wang, D., Peng, H., & Lu, H. (2023). Exploring lightweight hierarchical vision transformers for efficient visual tracking. Proceeding Europe conference computer vision","DOI":"10.1109\/ICCV51070.2023.00881"},{"key":"2327_CR82","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R.B., He, K., & Doll\u00e1r, P. (2019). Panoptic feature pyramid networks. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2019.00656"},{"key":"2327_CR83","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., & Fei-Fei, L. (2013). 3D object representations for fine-grained categorization. In Proceeding international conference computer vision workshop.","DOI":"10.1109\/ICCVW.2013.77"},{"key":"2327_CR84","unstructured":"Krizhevsky, A., et\u00a0al.(2009). Learning multiple layers of features from tiny images. Technical Report"},{"key":"2327_CR85","doi-asserted-by":"crossref","unstructured":"Li, Y., Hu, J., Wen, Y., Evangelidis, G., Salahi, K., Wang, Y., Tulyakov, S., & Ren, J. (2023). Rethinking vision transformers for MobileNet size and speed. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV51070.2023.01549"},{"key":"2327_CR86","unstructured":"Li, X., Wang, W., Yang, L., & Yang, J.(2022). Uniform masking: Enabling MAE pre-training for pyramid-based vision transformers with locality . arXiv:2205.10063"},{"key":"2327_CR87","doi-asserted-by":"crossref","unstructured":"Li, Y., Wu, C., Fan, H., Mangalam, K., Xiong, B., Malik, J., & Feichtenhofer, C. (2022). MViTv2: Improved multiscale vision transformers for classification and detection. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"2327_CR88","unstructured":"Li, Y., Xie, S., Chen, X., Dollar, P., He, K., & Girshick, R.(2021). Benchmarking detection transfer learning with vision transformers. arXiv:2111.11429"},{"key":"2327_CR89","unstructured":"Li, C., Yang, J., Zhang, P., Gao, M., Xiao, B., Dai, X., Yuan, L., & Gao, J. (2022). Efficient self-supervised vision transformers for representation learning. In Proceeding international conference learning represent."},{"key":"2327_CR90","unstructured":"Li, G., Zheng, H., Liu, D., Wang, C., Su, B., & Zheng, C. (2022). SemMAE: Semantic-guided masking for learning masked autoencoders. In Advanced Neural Information Process System"},{"key":"2327_CR91","unstructured":"Li, J., Zhou, P., Xiong, C., & Hoi, S.C.H. (2021). Prototypical contrastive learning of unsupervised representations. In Proceeding international conference learning represent."},{"key":"2327_CR92","doi-asserted-by":"crossref","unstructured":"Lin, T., Maire, M., Belongie, S.J., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L.(2014). Microsoft COCO: Common objects in context. In Proceeding Europe Conference Computer Vision.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2327_CR93","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B.(2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceeding international conference computer vision","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2327_CR94","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., & Xie, S.(2020). A ConvNet for the,. In Proceeding IEEE conference computer vision pattern recognition (p. 2022). .","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"2327_CR95","doi-asserted-by":"crossref","unstructured":"Liu, Z., Miao, Z., Zhan, X., Wang, J., Gong, B., & Yu, S.X. (2019). Large-scale long-tailed recognition in an open world. In Proceeding IEEE conference computer vision pattern recognition","DOI":"10.1109\/CVPR.2019.00264"},{"key":"2327_CR96","doi-asserted-by":"crossref","unstructured":"Liu, X., Peng, H., Zheng, N., Yang, Y., Hu, H., & Yuan, Y. (2023). EfficientViT: Memory efficient vision transformer with cascaded group attention. In: Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52729.2023.01386"},{"key":"2327_CR97","unstructured":"Liu, H., Simonyan, K., & Yang, Y. (2019). DARTS: Differentiable architecture search. Proceeding international conference learning represent."},{"key":"2327_CR98","doi-asserted-by":"crossref","unstructured":"Liu, C., Zoph, B., Neumann, M., Shlens, J., Hua, W., Li, L., Fei-Fei, L., Yuille, A.L., Huang, J., & Murphy, K. (2018). Progressive neural architecture search. Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-030-01246-5_2"},{"key":"2327_CR99","unstructured":"Loshchilov, I., & Hutter, F. (2017). SGDR: Stochastic gradient descent with warm restarts. In Proceeding international conference learning represent."},{"key":"2327_CR100","doi-asserted-by":"crossref","unstructured":"Maaz, M., Shaker, A., Cholakkal, H., Khan, S.H., Zamir, S.W., Anwer, R.M., & Khan, F.S. (2022). EdgeNeXt: Efficiently amalgamated CNN-transformer architecture for mobile vision applications. In Proceeding Europe conference computer vision workshop.","DOI":"10.1007\/978-3-031-25082-8_1"},{"key":"2327_CR101","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., & Vedaldi, A.(2013). Fine-grained visual classification of aircraft. arXiv:1306.5151"},{"key":"2327_CR102","unstructured":"Mehta, S., & Rastegari, M.(2022). MobileViT: Light-weight, general-purpose, and mobile-friendly vision transformer. In Proceeding international conference learning represent."},{"key":"2327_CR103","unstructured":"Mehta, S., & Rastegari, M. (2023). Separable self-attention for mobile vision transformers. In Proceeding international conference learning represent."},{"key":"2327_CR104","doi-asserted-by":"crossref","unstructured":"Mirzadeh, S.I., Farajtabar, M., Li, A., Levine, N., Matsukawa, A., & Ghasemzadeh, H. (2020). Improved knowledge distillation via teacher assistant. In Proceeding AAAI Conference. Artificial. Intelligence.","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"2327_CR105","doi-asserted-by":"crossref","unstructured":"Newell, A., & Deng, J. (2020). How useful is self-supervised pretraining for visual tasks? In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR42600.2020.00737"},{"key":"2327_CR106","unstructured":"Nguyen, T., Raghu, M., & Kornblith, S. (2020). Do wide and deep networks learn the same things$$?$$ Uncovering how neural network representations vary with width and depth. Proceeding international conference learning represent."},{"key":"2327_CR107","doi-asserted-by":"crossref","unstructured":"Nilsback, M.-E., & Zisserman, A. (2008). Automated flower classification over a large number of classes. InProceeding Indian Conference on Computer Vision, Graphics and Image Processing.","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"2327_CR108","doi-asserted-by":"crossref","unstructured":"Noroozi, M., & Favaro, P. (2016). Unsupervised learning of visual representations by solving jigsaw puzzles. In Proceeding Europe conference computer vision","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"2327_CR109","unstructured":"Oord, A., Vinyals, O., & Kavukcuoglu, K.(2017). Neural discrete representation learning. InAdvanced Neural information Process System."},{"key":"2327_CR110","unstructured":"Ozbulak, U., Lee, H.J., Boga, B., Anzaku, E.T., Park, H., Messem, A.V., Neve, W.D., & Vankerschaver, J. (2023). Know your self-supervised learning: A survey on image-based generative and discriminative training. Transactions Machine Learning Research"},{"key":"2327_CR111","doi-asserted-by":"crossref","unstructured":"Pan, J., Bulat, A., Tan, F., Zhu, X., Dudziak, L., Li, H., Tzimiropoulos, G., & Mart\u00ednez, B. (2022). EdgeViTs: Competing light-weight CNNs on mobile devices with vision transformers. In Proceeding Europe conference Computer vision.","DOI":"10.1007\/978-3-031-20083-0_18"},{"key":"2327_CR112","unstructured":"Park, N., & Kim, S.(2022). How do vision transformers work? In Proceeding international conference learning represent."},{"key":"2327_CR113","unstructured":"Park, N., Kim, W., Heo, B., Kim, T., & Yun, S. (2023). What do self-supervised vision transformers learn? In Proceeding international conference learning represent."},{"key":"2327_CR114","doi-asserted-by":"crossref","unstructured":"Park, W., Kim, D., Lu, Y., & Cho, M.(2019). Relational knowledge distillation. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2019.00409"},{"key":"2327_CR115","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., & Jawahar, C.V. (2012). Cats and dogs. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"2327_CR116","doi-asserted-by":"crossref","unstructured":"Pathak, D., Kr\u00e4henb\u00fchl, P., Donahue, J., Darrell, T., & Efros, A.A.(2016). Context encoders: Feature learning by inpainting. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2536-2544).","DOI":"10.1109\/CVPR.2016.278"},{"key":"2327_CR117","unstructured":"Peng, Z., Dong, L., Bao, H., Ye, Q., & Wei, F. (2022). BEiTv2: Masked image modeling with vector-quantized visual tokenizers. arXiv:2208.06366"},{"key":"2327_CR118","unstructured":"Pham, H., Guan, M.Y., Zoph, B., Le, Q.V., Dean, J. (2018). Efficient neural architecture search via parameter sharing. In Proceeding international conference machine learning."},{"key":"2327_CR119","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. Proceeding international conference machine learning."},{"key":"2327_CR120","doi-asserted-by":"crossref","unstructured":"Radosavovic, I., Kosaraju, R.P., Girshick, R., He, K., & Doll\u00e1r, P.(2020). Designing network design spaces. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"2327_CR121","first-page":"140","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 21, 140\u2013114067.","journal-title":"Journal of Machine Learning Research"},{"key":"2327_CR122","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I.(2021). Zero-shot text-to-image generation. In International conference on machine learning (pp. 8821-8831)."},{"key":"2327_CR123","doi-asserted-by":"crossref","unstructured":"Ren, S., Wei, F., Zhang, Z., & Hu, H.(2023). TinyMIM: Tinymim: An empirical study of distilling mim pre-trained models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 3687-3697).","DOI":"10.1109\/CVPR52729.2023.00359"},{"key":"2327_CR124","doi-asserted-by":"crossref","unstructured":"Romero, A., Ballas, N., Kahou, S.E., Chassang, A., Gatta, C., & Bengio, Y. (2015). FitNets: Hints for thin deep nets. In: Proceeding international conference learning represent. https:\/\/doi.org\/10.1145\/1150402.1150464","DOI":"10.1145\/1150402.1150464"},{"key":"2327_CR125","unstructured":"Ryali, C., Hu, Y.-T., Bolya, D., Wei, C., Fan, H., Huang, P.-Y., Aggarwal, V., Chowdhury, A., Poursaeed, O., Hoffman, J., Malik, J., Li, Y., & Feichtenhofer, C.(2023). Hiera: A hierarchical vision transformer without the bells-and-whistles. In international conference on machine learning (pp. 29441-29454)."},{"key":"2327_CR126","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., & Chen, L.-C. (2018). MobileNetv2: Inverted residuals and linear bottlenecks. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2018.00474"},{"key":"2327_CR127","unstructured":"Sanh, V., Debut, L., Chaumond, J., & Wolf, T.(2019). DistilBERT, a distilled version of BERT: Smaller, faster, cheaper and lighter. arXiv:1910.01108"},{"key":"2327_CR128","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., & Vaswani, A.(2018). Self-attention with relative position representations. InProceedings conference of the North American chapter of the association for computational linguistics: human language technologies.","DOI":"10.18653\/v1\/N18-2074"},{"key":"2327_CR129","unstructured":"Shekhar, S., Bordes, F., Vincent, P., & Morcos, A.(2023). Objectives matter: Understanding the impact of self-supervised objectives on vision transformer representations . arXiv:2304.13089"},{"key":"2327_CR130","unstructured":"Shi, Y., Siddharth, N., Torr, P.H.S., & Kosiorek, A.R. (2022). Adversarial masking for self-supervised learning. In Proceeding international conference machine learning."},{"key":"2327_CR131","doi-asserted-by":"crossref","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In Proceeding international conference learning represent","DOI":"10.1109\/ICCV.2015.314"},{"key":"2327_CR132","unstructured":"Singh, S., & Shrivastava, A. (2021). CvT: Introducing convolutions to vision transformers. In Proceeding international conference computer vision."},{"key":"2327_CR133","first-page":"1393","volume":"13","author":"L Song","year":"2012","unstructured":"Song, L., Smola, A., Gretton, A., Bedo, J., & Borgwardt, K. (2012). Feature selection via dependence maximization. Journal of Machine Learning Research, 13, 1393\u20131434.","journal-title":"Journal of Machine Learning Research"},{"key":"2327_CR134","doi-asserted-by":"crossref","unstructured":"Srinivas, A., Lin, T., Parmar, N., Shlens, J., Abbeel, P., & Vaswani, A. (2021). Bottleneck transformers for visual recognition. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"2327_CR135","unstructured":"Steiner, A., Kolesnikov, A., Zhai, X., Wightman, R., Uszkoreit, J., & Beyer, L.(2022). How to train your ViT? Data, augmentation, and regularization in vision transformers transformers machine learning represent."},{"key":"2327_CR136","doi-asserted-by":"crossref","unstructured":"Sun, Z., Yu, H., Song, X., Liu, R., Yang, Y., & Zhou, D. (2020). MobileBERT: A compact task-agnostic BERT for resource-limited devices. In  Procceding annual meeting of the association for computational linguistics.","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"2327_CR137","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., & Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2016.308"},{"key":"2327_CR138","unstructured":"Tan, M., & Le, Q. (2019). Efficientnet: Rethinking model scaling for convolutional neural networks. Proceeding international conference machine learning."},{"key":"2327_CR139","unstructured":"Tian, Y., Krishnan, D., & Isola, P. (2019). Contrastive representation distillation. In Proceeding international conference learning represent."},{"key":"2327_CR140","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., & J\u00e9gou, H. (2022). DeiT III: Revenge of the ViT. In Proceeding international conference vision","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"2327_CR141","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & Jegou, H. (2021). Training data-efficient image transformers & distillation through attention. In Proceeding international conference machine learning."},{"key":"2327_CR142","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., & J\u00e9gou, H.(2021). Going deeper with image transformers. In Proceeding international conference vision.","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"2327_CR143","doi-asserted-by":"crossref","unstructured":"Van\u00a0Horn, G., Mac\u00a0Aodha, O., Song, Y., Cui, Y., Sun, C., Shepard, A., Adam, H., Perona, P., & Belongie, S. (2018). The inaturalist species classification and detection dataset. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2018.00914"},{"key":"2327_CR144","unstructured":"Vanyan, A., Barseghyan, A., Tamazyan, H., Huroyan, V., Khachatrian, H., & Danelljan, M.(2024). Analyzing local representations of self-supervised vision transformers . arXiv:2401.00463v2"},{"key":"2327_CR145","unstructured":"Vasu, P.K.A., Gabriel, J., Zhu, J., Tuzel, O., & Ranjan, A. (2023). FastViT: A fast hybrid vision transformer using structural reparameterization. In Proceeding international conference computer vision."},{"key":"2327_CR146","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. In Advanced Neural Information Process System."},{"key":"2327_CR147","unstructured":"Vishniakov, K., Shen, Z., & Liu, Z.(2024). ConvNet vs Transformer, supervised vs CLIP: Beyond ImageNet accuracy . arXiv:2311.09215v3"},{"key":"2327_CR148","unstructured":"Wadekar, S.N., & Chaurasia, A.(2022). MobileViTv3: Mobile-friendly vision transformer with simple and effective fusion of local, global and input features. arXiv:2209.15159"},{"key":"2327_CR149","unstructured":"Wan, Q., Huang, Z., Lu, J., Yu, G., & Zhang, L.(2023) SeaFormer: Squeeze-enhanced axial transformer for mobile semantic segmentation. In Proceeding international conference learning represent."},{"key":"2327_CR150","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Huang, S., Dong, L., & Wei, F.(2021). MiniLMv2: Multi-head self-attention relation distillation for compressing pretrained transformers. In Proceeding findings of the association for computational linguistics: ACL-IJCNLP.","DOI":"10.18653\/v1\/2021.findings-acl.188"},{"key":"2327_CR151","doi-asserted-by":"crossref","unstructured":"Wang, Z., Cun, X., Bao, J., Zhou, W., Liu, J., & Li, H. (2022). Uformer: A general U-shaped transformer for image restoration. In Proceeding IEEE conference computer vision pattern recognition","DOI":"10.1109\/CVPR52688.2022.01716"},{"key":"2327_CR152","unstructured":"Wang, S., Gao, J., Li, Z., Zhang, X., & Hu, W.(2023). A closer look at self-supervised lightweight vision transformers. In: Proceeding international conference machine learning."},{"key":"2327_CR153","doi-asserted-by":"crossref","unstructured":"Wang, H., Song, K., Fan, J., Wang, Y., Xie, J., & Zhang, Z.(2023). Hard patches mining for masked image modeling. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52729.2023.01000"},{"key":"2327_CR154","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2021). Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In Proceeding Europe conference computer vision.","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"2327_CR155","doi-asserted-by":"crossref","unstructured":"Wei, C., Fan, H., Xie, S., Wu, C., Yuille, A.L., & Feichtenhofer, C.(2022). Masked feature prediction for self-supervised visual pre-training. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 14668-14678).","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"2327_CR156","unstructured":"Wei, Y., Hu, H., Xie, Z., Zhang, Z., Cao, Y., Bao, J., Chen, D., & Guo, B.(2022). Contrastive learning rivals masked image modeling in fine-tuning via feature distillation . arXiv:2205.14141"},{"key":"2327_CR157","doi-asserted-by":"crossref","unstructured":"Woo, S., Debnath, S., Hu, R., Chen, X., Liu, Z., Kweon, I.S., & Xie, S. (2023). ConvNeXtV2: Co-designing and scaling ConvNets with masked autoencoders. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"2327_CR158","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., & Sun, J. (2018). Unified perceptual parsing for scene understanding. In Proceeding Europe conference computer vision.","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"2327_CR159","unstructured":"Xiao, T., Singh, M., Mintun, E., Darrell, T., Doll\u00e1r, P., & Girshick, R.B. (2021). Early convolutions help transformers see better. In Advanced Neural Information Processing System."},{"key":"2327_CR160","unstructured":"Xie, J., Girshick, R.B., & Farhadi, A.(2016). Unsupervised deep embedding for clustering analysis. In Proceeding international conference machine learning"},{"key":"2327_CR161","doi-asserted-by":"crossref","unstructured":"Xie, Z., Zhang, Z., Cao, Y., Lin, Y., Bao, J., Yao, Z., Dai, Q., & Hu, H.(2022). SimMIM: In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9643-9653)","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"2327_CR162","doi-asserted-by":"crossref","unstructured":"Xu, G., Liu, Z., Li, X., & Loy, C.C. (2020). Knowledge distillation meets self-supervision. In Proceeding international conference computer vision.","DOI":"10.1007\/978-3-030-58545-7_34"},{"key":"2327_CR163","doi-asserted-by":"crossref","unstructured":"Xu, W., Xu, Y., Chang, T.A., & Tu, Z. (2021). Co-scale conv-attentional image transformers. In Proceeding international conference computer vision.","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"2327_CR164","doi-asserted-by":"crossref","unstructured":"Yan, B., Peng, H., Wu, K., Wang, D., Fu, J., & Lu, H. (2021). LightTrack: Finding lightweight neural networks for object tracking via one-shot architecture search. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR46437.2021.01493"},{"key":"2327_CR165","doi-asserted-by":"crossref","unstructured":"Yang, J., Parikh, D., & Batra, D. (2016). Joint unsupervised learning of deep representations and image clusters. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2016.556"},{"key":"2327_CR166","doi-asserted-by":"crossref","unstructured":"Ye, B., Chang, H., Ma, B., Shan, S., & Chen, X.(2022). Joint feature learning and relation modeling for tracking: A one-stream framework. In European conference on computer vision (pp. 341-357). Cham: Springer Nature Switzerland.","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"2327_CR167","doi-asserted-by":"crossref","unstructured":"Yuan, L., Chen, Y., Wang, T., Yu, W., Shi, Y., Jiang, Z., Tay, F.E.H., Feng, J., & Yan, S. (2021). Tokens-to-token ViT: Training vision transformers from scratch on ImageNet. In Proceeding Europe conference computer vision.","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"2327_CR168","unstructured":"Yue, X., Bai, L., Wei, M., Pang, J., Liu, X., Zhou, L., & Ouyang, W.(2023). Understanding masked autoencoders from a local contrastive perspective . arXiv:2310.01994"},{"key":"2327_CR169","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., & Yoo, Y. (2019). CutMix: Regularization strategy to train strong classifiers with localizable features. In Proceeding international conference computer vision","DOI":"10.1109\/ICCV.2019.00612"},{"key":"2327_CR170","unstructured":"Zagoruyko, S., & Komodakis, N. (2017). Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. In: Proceeding international conference learning represent."},{"key":"2327_CR171","doi-asserted-by":"crossref","unstructured":"Zamir, S.W., Arora, A., Khan, S., Hayat, M., Khan, F.S., & Yang, M.-H. (2022). Restormer: Efficient transformer for high-resolution image restoration. In Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR52688.2022.00564"},{"key":"2327_CR172","unstructured":"Zbontar, J., Jing, L., Misra, I., LeCun, Y., & Deny, S. (2021). Barlow twins: Self-supervised learning via redundancy reduction. In Proceeding international conference machine learning."},{"key":"2327_CR173","unstructured":"Zhang, H., Cisse, M., Dauphin, Y.N., & Lopez-Paz, D. (2018). Mixup: Beyond empirical risk minimization. In Proceeding international conference learning represent."},{"key":"2327_CR174","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., & Efros, A.A. (2016). Colorful image colorization. In Proceeding Europe conference computer vision","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"2327_CR175","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M., & Sun, J. (2018). ShuffleNet: An extremely efficient convolutional neural network for mobile devices. Proceeding IEEE conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2018.00716"},{"key":"2327_CR176","unstructured":"Zhou, J., Wei, C., Wang, H., Shen, W., Xie, C., Yuille, A., & Kong, T. (2022). iBOT: Image BERT pre-training with online tokenizer. In Proceeding international conference learning represent."},{"key":"2327_CR177","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2017). Scene parsing through ADE20K dataset. In: Proceeding international conference computer vision pattern recognition.","DOI":"10.1109\/CVPR.2017.544"},{"key":"2327_CR178","doi-asserted-by":"crossref","unstructured":"Zhuang, C., Zhai, A.L., & Yamins, D. (2019). Local aggregation for unsupervised learning of visual embeddings. In Proceeding Europe conference computer vision.","DOI":"10.1109\/ICCV.2019.00610"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02327-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02327-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02327-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T06:01:52Z","timestamp":1749276112000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02327-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,13]]},"references-count":178,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2327"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02327-w","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,13]]},"assertion":[{"value":"26 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 December 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest Statement"}}]}}