{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T21:51:28Z","timestamp":1757541088894,"version":"3.41.0"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031919787","type":"print"},{"value":"9783031919794","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91979-4_12","type":"book-chapter","created":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T19:07:09Z","timestamp":1748718429000},"page":"131-149","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["DailyMAE: Towards Pretraining Masked Autoencoders in\u00a0One Day"],"prefix":"10.1007","author":[{"given":"Jiantao","family":"Wu","sequence":"first","affiliation":[]},{"given":"Shentong","family":"Mo","sequence":"additional","affiliation":[]},{"given":"Sara","family":"Atito","sequence":"additional","affiliation":[]},{"given":"Zhenhua","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Josef","family":"Kittler","sequence":"additional","affiliation":[]},{"given":"Muhammad","family":"Awais","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"12_CR1","unstructured":"Atito, S., Awais, M., Kittler, J.: SiT: self-supervised vIsion Transformer. ArXiv preprint abs\/2104.03602 (2021). https:\/\/arxiv.org\/abs\/2104.03602"},{"key":"12_CR2","unstructured":"Atito, S., Awais, M., Kittler, J.: GMML is All you Need, May 2022. http:\/\/arxiv.org\/abs\/2205.14986, arXiv:2205.14986 [cs]"},{"key":"12_CR3","unstructured":"Baevski, A., Babu, A., Hsu, W.N., Auli, M.: Efficient Self-supervised Learning with Contextualized Target Representations for Vision, Speech and Language, June 2023. http:\/\/arxiv.org\/abs\/2212.07525, arXiv:2212.07525 [cs, eess]"},{"key":"12_CR4","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT Pre-Training of Image Transformers, September 2022. http:\/\/arxiv.org\/abs\/2106.08254, arXiv:2106.08254 [cs]"},{"key":"12_CR5","unstructured":"Bordes, F., Balestriero, R., Vincent, P.: Towards Democratizing Joint-Embedding Self-Supervised Learning, March 2023. http:\/\/arxiv.org\/abs\/2303.01986, arXiv:2303.01986 [cs]"},{"key":"12_CR6","doi-asserted-by":"publisher","unstructured":"Brown, T.B., et al.: Language Models are Few-Shot Learners, July 2020. https:\/\/doi.org\/10.48550\/arXiv.2005.14165, http:\/\/arxiv.org\/abs\/2005.14165, arXiv:2005.14165 [cs]","DOI":"10.48550\/arXiv.2005.14165"},{"key":"12_CR7","unstructured":"Cao, Y.H., Wu, J.: Rethinking Self-Supervised Learning: Small is Beautiful, March 2021. http:\/\/arxiv.org\/abs\/2103.13559, arXiv:2103.13559 [cs]"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging Properties in Self-Supervised Vision Transformers, May 2021. http:\/\/arxiv.org\/abs\/2104.14294, arXiv:2104.14294 [cs]","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"12_CR9","unstructured":"Chen, M., Radford, A., Child, R., Wu, J., Jun, H., Luan, D., Sutskever, I.: Generative pretraining from pixels. In: International Conference on Machine Learning, pp. 1691\u20131703. PMLR (2020)"},{"key":"12_CR10","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.E.: A simple framework for contrastive learning of visual representations. In: Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event. Proceedings of Machine Learning Research, vol.\u00a0119, pp. 1597\u20131607. PMLR (2020). http:\/\/proceedings.mlr.press\/v119\/chen20j.html"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An Empirical Study of Training Self-Supervised Vision Transformers, August 2021. http:\/\/arxiv.org\/abs\/2104.02057, arXiv:2104.02057 [cs]","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"12_CR12","doi-asserted-by":"publisher","unstructured":"Chen, Y., et al.: SdAE: self-distillated masked autoencoder. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022, vol. 13690, pp. 108\u2013124. Springer, Cham (2022).https:\/\/doi.org\/10.1007\/978-3-031-20056-4_7, https:\/\/link.springer.com\/10.1007\/978-3-031-20056-4_7, series Title: Lecture Notes in Computer Science","DOI":"10.1007\/978-3-031-20056-4_7"},{"key":"12_CR13","doi-asserted-by":"publisher","unstructured":"Cubuk, E.D., Zoph, B., Shlens, J., Le, Q.V.: Randaugment: Practical automated data augmentation with a reduced search space. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), pp. 3008\u20133017. IEEE, Seattle, June 2020. https:\/\/doi.org\/10.1109\/CVPRW50498.2020.00359, https:\/\/ieeexplore.ieee.org\/document\/9150790\/","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"12_CR14","unstructured":"Dao, T., Fu, D., Ermon, S., Rudra, A., R\u00e9, C.: FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. Advances in Neural Information Processing Systems 35, 16344\u201316359 (2022). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/hash\/67d57c32e20fd0a7a302cb81d36e40d5-Abstract-Conference.html"},{"key":"12_CR15","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (2019).https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"12_CR16","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"12_CR17","doi-asserted-by":"publisher","unstructured":"Feng, Z., Zhang, S.: Evolved part masking for self-supervised learning. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10386\u201310395. IEEE, Vancouver, BC, Canada, June 2023. https:\/\/doi.org\/10.1109\/CVPR52729.2023.01001. https:\/\/ieeexplore.ieee.org\/document\/10205356\/","DOI":"10.1109\/CVPR52729.2023.01001"},{"key":"12_CR18","unstructured":"Grill, J.B., et al.: Bootstrap your own latent - a new approach to self-supervised learning. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M.F., Lin, H.T. (eds.) Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/f3ada80d5c4ee70142b17b8192b2958e-Abstract.html"},{"key":"12_CR19","unstructured":"Haopeng, L., Qiuhong, K., Mingming, G., Drummond, T.: Progressive Video Summarization via Multimodal Self-supervised Learning, October 2022. http:\/\/arxiv.org\/abs\/2201.02494, arXiv:2201.02494 [cs]"},{"key":"12_CR20","doi-asserted-by":"publisher","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Dollar, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15979\u201315988. IEEE, New Orleans, LA, USA, June 2022. https:\/\/doi.org\/10.1109\/CVPR52688.2022.01553, https:\/\/ieeexplore.ieee.org\/document\/9879206\/","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"12_CR21","doi-asserted-by":"publisher","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.B.: Momentum contrast for unsupervised visual representation learning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13-19, 2020, pp. 9726\u20139735. IEEE (2020).https:\/\/doi.org\/10.1109\/CVPR42600.2020.00975","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"12_CR22","unstructured":"Howard, J.: fast.ai - Now anyone can train Imagenet in 18 minutes, August 2018. https:\/\/www.fast.ai\/posts\/2018-08-10-fastai-diu-imagenet.html"},{"key":"12_CR23","unstructured":"Huang, L., You, S., Zheng, M., Wang, F., Qian, C., Yamasaki, T.: Green hierarchical vision transformer for masked image modeling. In: Advances in Neural Information Processing Systems, vol.\u00a035 (2022)"},{"key":"12_CR24","unstructured":"Irandoust, S., Durand, T., Rakhmangulova, Y., Zi, W., Hajimirsadeghi, H.: Training a vision transformer from scratch in less than 24 hours with 1 gpu. arXiv preprint arXiv:2211.05187 (2022)"},{"key":"12_CR25","doi-asserted-by":"publisher","unstructured":"Jerrish, D.J., et al.: Deep learning approaches for lyme disease detection: leveraging progressive resizing and self-supervised learning models. Multimedia Tools and Applications, August 2023. https:\/\/doi.org\/10.1007\/s11042-023-16306-9","DOI":"10.1007\/s11042-023-16306-9"},{"key":"12_CR26","doi-asserted-by":"publisher","unstructured":"Leclerc, G., Ilyas, A., Engstrom, L., Park, S.M., Salman, H., Madry, A.: FFCV: Accelerating Training by Removing Data Bottlenecks, June 2023. https:\/\/doi.org\/10.48550\/arXiv.2306.12517, http:\/\/arxiv.org\/abs\/2306.12517, arXiv:2306.12517 [cs]","DOI":"10.48550\/arXiv.2306.12517"},{"key":"12_CR27","unstructured":"Li, X., Wang, Z., Xie, C.: An Inverse Scaling Law for CLIP Training, October 2023. http:\/\/arxiv.org\/abs\/2305.07017, arXiv:2305.07017 [cs]"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer V2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, p.\u00a011 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"12_CR29","unstructured":"Micikevicius, P., et al.: Mixed Precision Training, February 2018. http:\/\/arxiv.org\/abs\/1710.03740, arXiv:1710.03740 [cs, stat]"},{"key":"12_CR30","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation Learning with Contrastive Predictive Coding. CoRR abs\/1807.03748 (2018). http:\/\/arxiv.org\/abs\/1807.03748, arXiv: 1807.03748"},{"key":"12_CR31","unstructured":"Ridnik, T., Lawen, H., Ben-Baruch, E., Noy, A.: Solving ImageNet: a unified scheme for training any backbone to top results, May 2022. http:\/\/arxiv.org\/abs\/2204.03475, arXiv:2204.03475 [cs]"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vision 115, 211\u2013252 (2015), publisher: Springer","DOI":"10.1007\/s11263-015-0816-y"},{"key":"12_CR33","doi-asserted-by":"publisher","unstructured":"Ryali, C., et al.: Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles, June 2023. https:\/\/doi.org\/10.48550\/arXiv.2306.00989, http:\/\/arxiv.org\/abs\/2306.00989, arXiv:2306.00989 [cs]","DOI":"10.48550\/arXiv.2306.00989"},{"key":"12_CR34","unstructured":"Tan, M., Le, Q.V.: EfficientNetV2: smaller models and faster training, June 2021. http:\/\/arxiv.org\/abs\/2104.00298, arXiv:2104.00298 [cs] version: 3"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Jegou, H.: DeiT III: revenge of the ViT. In: European Conference on Computer Vision, p.\u00a027 (2022)","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"12_CR36","unstructured":"Touvron, H., Vedaldi, A., Douze, M., J\u00e9gou, H.: Fixing the train-test resolution discrepancy, January 2022, http:\/\/arxiv.org\/abs\/1906.06423, arXiv:1906.06423 [cs]"},{"key":"12_CR37","unstructured":"Wang, S., Gao, J., Li, Z., Zhang, X., Hu, W.: A closer look at self-supervised lightweight vision transformers. In: Proceedings of the 40th International Conference on Machine Learning. pp. 35624\u201335641. PMLR, July 2023. https:\/\/proceedings.mlr.press\/v202\/wang23e.html, iSSN: 2640-3498"},{"key":"12_CR38","doi-asserted-by":"publisher","unstructured":"Wu, K., Zhang, J., Peng, H., Liu, M., Xiao, B., Fu, J., Yuan, L.: TinyViT: fast pretraining distillation for small vision transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022, pp. 68\u201385. LNCS. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19803-8_5","DOI":"10.1007\/978-3-031-19803-8_5"},{"key":"12_CR39","doi-asserted-by":"publisher","unstructured":"Xie, Z., Zhang, Z., Cao, Y., Lin, Y., Bao, J., Yao, Z., Dai, Q., Hu, H.: SimMIM: a simple framework for masked image modeling. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9643\u20139653. IEEE, New Orleans, June 2022. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00943, https:\/\/ieeexplore.ieee.org\/document\/9880205\/","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"12_CR40","unstructured":"Zhou, J., et al.: iBOT: Image BERT Pre-Training with Online Tokenizer, January 2022. http:\/\/arxiv.org\/abs\/2111.07832, arXiv:2111.07832 [cs]"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91979-4_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T19:07:18Z","timestamp":1748718438000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91979-4_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031919787","9783031919794"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91979-4_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}