{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:39Z","timestamp":1765357719477,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031731150"},{"type":"electronic","value":"9783031731167"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73116-7_16","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:15:38Z","timestamp":1730301338000},"page":"272-289","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Emerging Property of\u00a0Masked Token for\u00a0Effective Pre-training"],"prefix":"10.1007","author":[{"given":"Hyesong","family":"Choi","sequence":"first","affiliation":[]},{"given":"Hunsang","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Seyoung","family":"Joung","sequence":"additional","affiliation":[]},{"given":"Hyejin","family":"Park","sequence":"additional","affiliation":[]},{"given":"Jiyeong","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Dongbo","family":"Min","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"16_CR1","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of image transformers. In: International Conference on Learning Representations (2021)"},{"key":"16_CR2","unstructured":"Bao, H., et\u00a0al.: UniLMv2: pseudo-masked language models for unified language model pre-training. In: International conference on machine learning, pp. 642\u2013652. PMLR (2020)"},{"key":"16_CR3","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR4","unstructured":"Cao, S., Xu, P., Clifton, D.A.: How to understand masked autoencoders. arXiv preprint arXiv:2202.03670 (2022)"},{"key":"16_CR5","unstructured":"Chen, M., et al.: Generative pretraining from pixels. In: International Conference on Machine Learning, pp. 1691\u20131703. PMLR (2020)"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Cheng, Y., Wang, S., Gan, Z., Wang, Z., Liu, J.: EarlyBERT: Efficient BERT training via early-bird lottery tickets. arXiv preprint arXiv:2101.00063 (2020)","DOI":"10.18653\/v1\/2021.acl-long.171"},{"key":"16_CR7","unstructured":"Clark, K., Luong, M.T., Le, Q.V., Manning, C.D.: ELECTRA: Pre-training text encoders as discriminators rather than generators. arXiv preprint arXiv:2003.10555 (2020)"},{"key":"16_CR8","unstructured":"Conneau, A., Lample, G.: Cross-lingual language model pretraining. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"16_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"16_CR10","doi-asserted-by":"publisher","unstructured":"Dong, X., et al.: Bootstrapped masked autoencoders for vision BERT pretraining. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022. ECCV 2022. LNCS, vol. 13690, pp. 247\u2013264. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20056-4_15","DOI":"10.1007\/978-3-031-20056-4_15"},{"key":"16_CR11","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Ghazvininejad, M., Levy, O., Liu, Y., Zettlemoyer, L.: Mask-predict: Parallel decoding of conditional masked language models. arXiv preprint arXiv:1904.09324 (2019)","DOI":"10.18653\/v1\/D19-1633"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"16_CR14","unstructured":"Hou, Z., Sun, F., Chen, Y.K., Xie, Y., Kung, S.Y.: MILAN: Masked image pretraining on language assisted representation. arXiv preprint arXiv:2208.06049 (2022)"},{"key":"16_CR15","unstructured":"Huang, L., You, S., Zheng, M., Wang, F., Qian, C., Yamasaki, T.: Green hierarchical vision transformer for masked image modeling. arXiv preprint arXiv:2205.13515 (2022)"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Kakogeorgiou, I., et al.: What to hide from your students: Attention-guided masked image modeling. arXiv preprint arXiv:2203.12719 (2022)","DOI":"10.1007\/978-3-031-20056-4_18"},{"key":"16_CR17","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. In: International Conference on Learning Representations"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Lee, M., Park, J.H., Kim, J., Kim, K.M., Lee, S.: Efficient pre-training of masked language model via concept-based curriculum masking. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.502"},{"key":"16_CR19","doi-asserted-by":"publisher","unstructured":"Li, X., Ge, Y., Yi, K., Hu, Z., Shan, Y., Duan, L.Y.: mc-BEiT: multi-choice discretization for image BERT pre-training. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022. ECCV 2022. LNCS, vol. 13690, pp. 231\u2013246. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20056-4_14","DOI":"10.1007\/978-3-031-20056-4_14"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Liao, B., Thulke, D., Hewavitharana, S., Ney, H., Monz, C.: Mask more and mask later: efficient pre-training of masked language models by disentangling the [MASK] token. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.106"},{"key":"16_CR21","unstructured":"Liu, J., Huang, X., Liu, Y., Li, H.: MixMIM: Mixed and masked image modeling for efficient visual representation learning. arXiv preprint arXiv:2205.13137 (2022)"},{"key":"16_CR22","unstructured":"Liu, Y., et al.: RoBERTa: A robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin transformer v2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"16_CR25","unstructured":"Pan, J., Zhou, P., Yan, S.: Towards understanding why mask-reconstruction pretraining helps in downstream tasks. arXiv preprint arXiv:2206.03826 (2022)"},{"key":"16_CR26","unstructured":"Peng, Z., Dong, L., Bao, H., Ye, Q., Wei, F.: BEiT v2: Masked image modeling with vector-quantized visual tokenizers. arXiv preprint arXiv:2208.06366 (2022)"},{"key":"16_CR27","unstructured":"Peng, Z., Dong, L., Bao, H., Ye, Q., Wei, F.: A unified view of masked image modeling. arXiv preprint arXiv:2210.10615 (2022)"},{"key":"16_CR28","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"issue":"1","key":"16_CR29","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"16_CR30","unstructured":"Song, K., Tan, X., Qin, T., Lu, J., Liu, T.Y.: MASS: masked sequence to sequence pre-training for language generation. In: International Conference on Machine Learning, pp. 5926\u20135936. PMLR (2019)"},{"key":"16_CR31","first-page":"16857","volume":"33","author":"K Song","year":"2020","unstructured":"Song, K., Tan, X., Qin, T., Lu, J., Liu, T.Y.: MPNet: masked and permuted pre-training for language understanding. Adv. Neural. Inf. Process. Syst. 33, 16857\u201316867 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR32","unstructured":"Taylor, R., et al.: Galactica: A large language model for science. arXiv preprint arXiv:2211.09085 (2022)"},{"key":"16_CR33","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Wettig, A., Gao, T., Zhong, Z., Chen, D.: Should you mask 15% in masked language modeling? arXiv preprint arXiv:2202.08005 (2022)","DOI":"10.18653\/v1\/2023.eacl-main.217"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Wettig, A., Gao, T., Zhong, Z., Chen, D.: Should you mask 15% in masked language modeling? In: Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pp. 2977\u20132992 (2023)","DOI":"10.18653\/v1\/2023.eacl-main.217"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"v.\u00a0Wintzingerode, F., G\u00f6bel, U.B., Stackebrandt, E.: Determination of microbial diversity in environmental samples: pitfalls of PCR-based rRNA analysis. FEMS Microbiol. Rev. 21(3), 213\u2013229 (1997)","DOI":"10.1111\/j.1574-6976.1997.tb00351.x"},{"key":"16_CR37","unstructured":"Wu, J., Mo, S.: Object-wise masked autoencoders for fast pre-training. arXiv preprint arXiv:2205.14338 (2022)"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"Xie, Z., et al.: SimMIM: a simple framework for masked image modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9653\u20139663 (2022)","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"16_CR39","doi-asserted-by":"crossref","unstructured":"Xue, H., et al.: Stare at what you see: masked image modeling without reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22732\u201322741 (2023)","DOI":"10.1109\/CVPR52729.2023.02177"},{"key":"16_CR40","unstructured":"Yi, K., et al.: Masked image modeling with denoising contrast. arXiv preprint arXiv:2205.09616 (2022)"},{"key":"16_CR41","unstructured":"Zaken, E.B., Ravfogel, S., Goldberg, Y.: BitFit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199 (2021)"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, C., Zhang, C., Song, J., Yi, J.S.K., Zhang, K., Kweon, I.S.: A survey on masked autoencoder for self-supervised learning in vision and beyond. arXiv preprint arXiv:2208.00173 (2022)","DOI":"10.24963\/ijcai.2023\/762"},{"key":"16_CR43","unstructured":"Zhang, Q., Wang, Y., Wang, Y.: How mask matters: Towards theoretical understandings of masked autoencoders. arXiv preprint arXiv:2210.08344 (2022)"},{"key":"16_CR44","unstructured":"Zhang, X., et al.: HiViT: a simpler and more efficient design of hierarchical vision transformer. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"16_CR45","unstructured":"Zhou, J., et al.: iBOT: Image BERT pre-training with online tokenizer. arXiv preprint arXiv:2111.07832 (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73116-7_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:24:26Z","timestamp":1730301866000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73116-7_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031731150","9783031731167"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73116-7_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}