{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:26:53Z","timestamp":1763922413076,"version":"3.45.0"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032093677","type":"print"},{"value":"9783032093684","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-09368-4_4","type":"book-chapter","created":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:11Z","timestamp":1763921651000},"page":"53-70","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Masked Self-supervised Pre-training for\u00a0Text Recognition Transformers on\u00a0Large-Scale Datasets"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6853-0508","authenticated-orcid":false,"given":"Martin","family":"Ki\u0161\u0161","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6364-129X","authenticated-orcid":false,"given":"Michal","family":"Hradi\u0161","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"4_CR1","unstructured":"Aberdam, A., Ganz, R., Mazor, S., Litman, R.: Multimodal semi-supervised learning for text recognition (2022)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Aberdam, A., et al.: Sequence-to-sequence contrastive learning for text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15302\u201315312 (2021)","DOI":"10.1109\/CVPR46437.2021.01505"},{"key":"4_CR3","unstructured":"Assran, M., et al.: Self-supervised learning from images with a joint-embedding predictive architecture"},{"key":"4_CR4","unstructured":"Baevski, A., Babu, A., Hsu, W.N., Auli, M.: Efficient self-supervised learning with contextualized target representations for vision, speech and language. In: Proceedings of the 40th International Conference on Machine Learning, pp. 1416\u20131429. PMLR (2023). ISSN: 2640-3498"},{"key":"4_CR5","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 12449\u201312460. Curran Associates, Inc. (2020)"},{"key":"4_CR6","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of image transformers"},{"key":"4_CR7","unstructured":"Bardes, A., Ponce, J., LeCun, Y.: VICReg: variance-invariance-covariance regularization for self-supervised learning"},{"key":"4_CR8","first-page":"8799","volume":"35","author":"A Bardes","year":"2022","unstructured":"Bardes, A., Ponce, J., LeCun, Y.: VICRegL: self-supervised learning of local visual features. Adv. Neural. Inf. Process. Syst. 35, 8799\u20138810 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4_CR9","unstructured":"Berthelot, D., Carlini, N., Goodfellow, I., Papernot, N., Oliver, A., Raffel, C.A.: MixMatch: a holistic approach to semi-supervised learning. In: Advances in Neural Information Processing Systems, vol.\u00a032. Curran Associates, Inc. (2019)"},{"key":"4_CR10","unstructured":"Betker, J., et al.: Improving image generation with better captions (2020)"},{"key":"4_CR11","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901. Curran Associates, Inc. (2020)"},{"key":"4_CR12","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 9912\u20139924. Curran Associates, Inc. (2020)"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Top. Sig. Process. 16(6), 1505\u20131518 (2022).","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"4_CR15","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations"},{"key":"4_CR16","unstructured":"Chiu, C.C., Qin, J., Zhang, Y., Yu, J., Wu, Y.: Self-supervised learning with random-projection quantizer for speech recognition. In: Proceedings of the 39th International Conference on Machine Learning, pp. 3915\u20133924. PMLR (2022). ISSN: 2640-3498"},{"key":"4_CR17","doi-asserted-by":"publisher","unstructured":"Cl\u00e9rice, T., et al.: CATMuS medieval: a multilingual large-scale cross-century dataset in Latin script for handwritten text recognition and beyond. In: Barney\u00a0Smith, E.H., Liwicki, M., Peng, L. (eds.) Document Analysis and Recognition - ICDAR 2024, pp. 174\u2013194. Springer Nature Switzerland (2024). https:\/\/doi.org\/10.1007\/978-3-031-70543-4_11","DOI":"10.1007\/978-3-031-70543-4_11"},{"key":"4_CR18","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. CoRR abs\/1810.04805 (2018)"},{"key":"4_CR19","unstructured":"Esser, P., et al.: Scaling rectified flow transformers for high-resolution image synthesis"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Fischer, A., Frinken, V., Forn\u00e9s, A., Bunke, H.: Transcription alignment of Latin manuscripts using hidden Markov models. In: Proceedings of the 2011 Workshop on Historical Document Imaging and Processing, pp. 29\u201336. HIP 2011, Association for Computing Machinery (2011)","DOI":"10.1145\/2037342.2037348"},{"issue":"7","key":"4_CR21","doi-asserted-by":"publisher","first-page":"934","DOI":"10.1016\/j.patrec.2011.09.009","volume":"33","author":"A Fischer","year":"2012","unstructured":"Fischer, A., Keller, A., Frinken, V., Bunke, H.: Lexicon-free handwritten word spotting using character HMMs. Pattern Recogn. Lett. 33(7), 934\u2013942 (2012)","journal-title":"Pattern Recogn. Lett."},{"key":"4_CR22","unstructured":"Gao, Z., et al.: Self-supervised pre-training with symmetric superimposition modeling for scene text recognition"},{"key":"4_CR23","unstructured":"Grattafiori, A., Dubey, A., Jauhri, A., et al.: The llama 3 herd of models"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Guan, T., Shen, W., Yang, X., Feng, Q., Jiang, Z., Yang, X.: Self-supervised character-to-character distillation for text recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19473\u201319484 (2023)","DOI":"10.1109\/ICCV51070.2023.01784"},{"key":"4_CR25","unstructured":"Han, M., et al.: NEST-RQ: next token prediction for speech self-supervised pre-training"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 29, 3451\u20133460 (2021)","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"4_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"463","DOI":"10.1007\/978-3-030-86337-1_31","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"M Ki\u0161\u0161","year":"2021","unstructured":"Ki\u0161\u0161, M., Bene\u0161, K., Hradi\u0161, M.: AT-ST: self-training adaptation strategy for OCR in domains with limited transcriptions. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12824, pp. 463\u2013477. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86337-1_31"},{"key":"4_CR28","doi-asserted-by":"publisher","unstructured":"Ki\u0161\u0161, M., Hradi\u0161, M.: Self-supervised pre-training of text recognizers. In: Barney\u00a0Smith, E.H., Liwicki, M., Peng, L. (eds.) Document Analysis and Recognition - ICDAR 2024, pp. 218\u2013235. Springer Nature Switzerland (2024). https:\/\/doi.org\/10.1007\/978-3-031-70546-5_13","DOI":"10.1007\/978-3-031-70546-5_13"},{"key":"4_CR29","doi-asserted-by":"publisher","unstructured":"Ki\u0161\u0161, M., Hradi\u0161, M., Bene\u0161, K., Buchal, P., Kula, M.: SoftCTC\u2013semi-supervised learning for text recognition using soft pseudo-labels. Int. J. Document Anal. Recognit. (IJDAR) (2023). https:\/\/doi.org\/10.1007\/s10032-023-00452-9","DOI":"10.1007\/s10032-023-00452-9"},{"key":"4_CR30","doi-asserted-by":"publisher","unstructured":"Kodym, O., Hradi\u0161, M.: Page layout analysis system for unconstrained historic documents. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) Document Analysis and Recognition \u2013 ICDAR 2021, pp. 492\u2013506. Springer International Publishing, Lecture Notes in Computer Science (2021). https:\/\/doi.org\/10.1007\/978-3-030-86331-9_32","DOI":"10.1007\/978-3-030-86331-9_32"},{"key":"4_CR31","unstructured":"Kurakin, A., et al.: ReMixMatch: semi-supervised learning with distribution matching and augmentation anchoring. In: ICLR (2020)"},{"key":"4_CR32","unstructured":"Lee, D.H.: Pseudo-label : The simple and efficient semi-supervised learning method for deep neural networks. ICML 2013 Workshop : Challenges in Representation Learning (WREPL) (2013)"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Li, J., Xu, Y., Lv, T., Cui, L., Zhang, C., Wei, F.: DiT: Self-supervised pre-training for document image transformer. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 3530\u20133539. MM 2022, Association for Computing Machinery (2022)","DOI":"10.1145\/3503161.3547911"},{"key":"4_CR34","unstructured":"Liu, A.H., et al.: UniWav: towards unified pre-training for speech representation learning and generation. In: The Thirteenth International Conference on Learning Representations (2024)"},{"key":"4_CR35","unstructured":"Lyu, P., et al.: MaskOCR: text recognition with masked encoder-decoder pretraining"},{"key":"4_CR36","unstructured":"OpenAI, Achiam, J., Adler, S., Agarwal, S., et al.: GPT-4 technical report"},{"key":"4_CR37","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision"},{"key":"4_CR38","doi-asserted-by":"publisher","unstructured":"Parres, D., Anitei, D., Paredes, R., S\u00e1nchez, J.A., Bened\u00ed, J.M.: Speed-up pre-trained vision encoder\u2013decoder transformers by leveraging lightweight mixer layers for text recognition. In: Sfikas, G., Retsinas, G. (eds.) Document Analysis Systems. pp. 277\u2013294. Springer Nature Switzerland (2024). https:\/\/doi.org\/10.1007\/978-3-031-70442-0_17","DOI":"10.1007\/978-3-031-70442-0_17"},{"key":"4_CR39","unstructured":"Penarrubia, C., Garrido-Munoz, C., Valero-Mas, J.J., Calvo-Zaragoza, J.: Spatial context-based self-supervised learning for handwritten text recognition"},{"key":"4_CR40","unstructured":"Peng, Z., Dong, L., Bao, H., Ye, Q., Wei, F.: BEiT v2: masked image modeling with vector-quantized visual tokenizers"},{"key":"4_CR41","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision (2022)"},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4_CR43","doi-asserted-by":"publisher","unstructured":"Scius-Bertrand, A., Str\u00f6bel, P., Volk, M., Hodel, T., Fischer, A.: The bullinger dataset: a writer adaptation challenge. In: Fink, G.A., Jain, R., Kise, K., Zanibbi, R. (eds.) Document Analysis and Recognition - ICDAR 2023, pp. 397\u2013410. Springer Nature Switzerland (2023). https:\/\/doi.org\/10.1007\/978-3-031-41676-7_23","DOI":"10.1007\/978-3-031-41676-7_23"},{"key":"4_CR44","doi-asserted-by":"crossref","unstructured":"Souibgui, M.A., et al.: Text-DIAE: a self-supervised degradation invariant autoencoder for text recognition and document enhancement. In: Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence and Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence and Thirteenth Symposium on Educational Advances in Artificial Intelligence. AAAI 2023\/IAAI 2023\/EAAI 2023, vol.\u00a037, pp. 2330\u20132338. AAAI Press (2023)","DOI":"10.1609\/aaai.v37i2.25328"},{"key":"4_CR45","doi-asserted-by":"crossref","unstructured":"S\u00e1nchez, J.A., Romero, V., Toselli, A.H., Vidal, E.: ICFHR2014 competition on Handwritten Text Recognition on Transcriptorium Datasets (HTRtS). In: 2014 14th International Conference on Frontiers in Handwriting Recognition, pp. 785\u2013790 (2014). iSSN: 2167-6445","DOI":"10.1109\/ICFHR.2014.137"},{"key":"4_CR46","doi-asserted-by":"crossref","unstructured":"S\u00e1nchez, J.A., Romero, V., Toselli, A.H., Villegas, M., Vidal, E.: ICDAR2017 competition on handwritten text recognition on the READ dataset. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a001, pp. 1383\u20131388 (2017). ISSN: 2379-2140","DOI":"10.1109\/ICDAR.2017.226"},{"key":"4_CR47","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Image as a foreign language: BEiT pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19175\u201319186 (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"4_CR48","doi-asserted-by":"crossref","unstructured":"Xie, Q., Luong, M.T., Hovy, E., Le, Q.V.: Self-training with noisy student improves ImageNet classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01070"},{"key":"4_CR49","doi-asserted-by":"crossref","unstructured":"Yang, M., et al.: Reading and writing: discriminative and generative modeling for self-supervised text recognition. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4214\u20134223. MM 2022, Association for Computing Machinery (2022)","DOI":"10.1145\/3503161.3547784"},{"key":"4_CR50","unstructured":"Zhou, J., et al.: iBOT: image BERT pre-training with online tokenizer"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-09368-4_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:21Z","timestamp":1763921661000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-09368-4_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"ISBN":["9783032093677","9783032093684"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-09368-4_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"24 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}