{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,10]],"date-time":"2025-05-10T04:08:54Z","timestamp":1746850134682,"version":"3.40.5"},"reference-count":27,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,4,26]],"date-time":"2025-04-26T00:00:00Z","timestamp":1745625600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,26]],"date-time":"2025-04-26T00:00:00Z","timestamp":1745625600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["RS-2024-00453301","RS-2024-00453301","RS-2024-00453301"],"award-info":[{"award-number":["RS-2024-00453301","RS-2024-00453301","RS-2024-00453301"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002573","name":"Yonsei University","doi-asserted-by":"publisher","award":["2024-22-0161","2024-22-0161"],"award-info":[{"award-number":["2024-22-0161","2024-22-0161"]}],"id":[{"id":"10.13039\/501100002573","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s00138-025-01693-w","type":"journal-article","created":{"date-parts":[[2025,4,26]],"date-time":"2025-04-26T16:54:53Z","timestamp":1745686493000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Clean-to-clean: pretraining vision transformers without additional data"],"prefix":"10.1007","volume":"36","author":[{"given":"Hyeonjin","family":"Lee","sequence":"first","affiliation":[]},{"given":"Songkuk","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Jong-Seok","family":"Lee","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,26]]},"reference":[{"key":"1693_CR1","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. In: Proceedings of the International Conference on Learning Representations (ICLR) (2021)"},{"key":"1693_CR2","unstructured":"Krizhevsky, A., Hinton, G.: Learning multiple layers of features from tiny images. Master\u2019s thesis, Department of Computer Science, University of Toronto (2009)"},{"key":"1693_CR3","unstructured":"Gani, H., Naseer, M., Yaqub, M.: How to train vision transformer on small-scale datasets? In: Proceedings of the 33rd British Machine Vision Conference (BMVC) (2022)"},{"key":"1693_CR4","first-page":"857","volume":"35","author":"X Liu","year":"2021","unstructured":"Liu, X., Zhang, F., Hou, Z., Mian, L., Wang, Z., Zhang, J., Tang, J.: Self-supervised learning: generative or contrastive. IEEE Trans. Knowl. Data Eng. 35, 857\u2013876 (2021)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"1693_CR5","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT Pre-training of image transformers. In: Proceedings of the International Conference on Learning Representations (ICLR) (2022)"},{"key":"1693_CR6","doi-asserted-by":"crossref","unstructured":"Chen, H., Wang, Y., Guo, T., Xu, C., Deng, Y., Liu, Z., Ma, S., Xu, C., Xu, C., Gao, W.: Pre-trained image processing transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12299\u201312310 (2021)","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"1693_CR7","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"1693_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1693_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"1693_CR10","unstructured":"Atito, S., Awais, M., Kittler, J.: SiT: Self-supervised vision transformer. arXiv preprint arXiv:2104.03602 (2021)"},{"key":"1693_CR11","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT), vol. 1, pp. 4171\u20134186 (2019)"},{"key":"1693_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3505244","volume":"54","author":"S Khan","year":"2022","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: a survey. ACM Comput. Surv. 54, 1\u201341 (2022)","journal-title":"ACM Comput. Surv."},{"key":"1693_CR13","doi-asserted-by":"crossref","unstructured":"Yuan, L., Chen, Y., Wang, T., Yu, W., Shi, Y., Jiang, Z.-H., Tay, F.E., Feng, J., Yan, S.: Tokens-to-token vit: Training vision transformers from scratch on imagenet. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 558\u2013567 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"1693_CR14","doi-asserted-by":"publisher","first-page":"123212","DOI":"10.1109\/ACCESS.2022.3224044","volume":"10","author":"S Lee","year":"2022","unstructured":"Lee, S., Lee, S., Song, B.C.: Improving vision transformers to learn small-size dataset from scratch. IEEE Access 10, 123212\u2013123224 (2022)","journal-title":"IEEE Access"},{"issue":"4","key":"1693_CR15","doi-asserted-by":"publisher","first-page":"2589","DOI":"10.1007\/s00371-023-02939-2","volume":"40","author":"D Yao","year":"2024","unstructured":"Yao, D., Shao, Y.: A data efficient transformer based on swin transformer. Vis. Comput. 40(4), 2589\u20132598 (2024)","journal-title":"Vis. Comput."},{"key":"1693_CR16","unstructured":"Zhang, H.: mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412 (2017)"},{"key":"1693_CR17","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., Yoo, Y.: Cutmix: Regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6023\u20136032 (2019)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"1693_CR18","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1126\/science.1127647","volume":"313","author":"GE Hinton","year":"2006","unstructured":"Hinton, G.E., Salakhutdinov, R.R.: Reducing the dimensionality of data with neural networks. Science 313, 504\u2013507 (2006)","journal-title":"Science"},{"key":"1693_CR19","unstructured":"Steiner, A.P., Kolesnikov, A., Zhai, X., Wightman, R., Uszkoreit, J., Beyer, L.: How to train your vit? Data, augmentation, and regularization in vision transformers. Transactions on Machine Learning Research (2022) https:\/\/doi.org\/10.48550\/arXiv.2106.10270"},{"key":"1693_CR20","unstructured":"Loshchilov, I., Hutter, F.: SGDR: Stochastic gradient descent with warm restarts. In: Proceedings of the International Conference on Learning Representations (ICLR) (2017)"},{"key":"1693_CR21","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"1693_CR22","unstructured":"Coates, A., Ng, A., Lee, H.: An analysis of single-layer networks in unsupervised feature learning. In: Proceedings of the 14th International Conference on Artificial Intelligence and Statistics (AISTATS), pp. 215\u2013223 (2011)"},{"key":"1693_CR23","unstructured":"Melas-Kyriazi, L.: Do you even need attention? a stack of feed-forward layers does surprisingly well on imagenet. arXiv preprint arXiv:2105.02723 (2021)"},{"key":"1693_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109659","volume":"141","author":"BJ Kim","year":"2023","unstructured":"Kim, B.J., Choi, H., Jang, H., Lee, D.G., Jeong, W., Kim, S.W.: Improved robustness of vision transformers via prelayernorm in patch embedding. Pattern Recogn. 141, 109659 (2023)","journal-title":"Pattern Recogn."},{"key":"1693_CR25","doi-asserted-by":"crossref","unstructured":"Luong, T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1412\u20131421 (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"1693_CR26","doi-asserted-by":"crossref","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 818\u2013833 (2014)","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"1693_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-025-01693-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-025-01693-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-025-01693-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,9]],"date-time":"2025-05-09T14:30:35Z","timestamp":1746801035000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-025-01693-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,26]]},"references-count":27,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["1693"],"URL":"https:\/\/doi.org\/10.1007\/s00138-025-01693-w","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"type":"print","value":"0932-8092"},{"type":"electronic","value":"1432-1769"}],"subject":[],"published":{"date-parts":[[2025,4,26]]},"assertion":[{"value":"29 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 March 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 April 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 April 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}],"article-number":"72"}}