{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T18:37:23Z","timestamp":1775932643467,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733436","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:39Z","timestamp":1750876299000},"page":"1644-1653","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SSCD: Self-Supervised Coherence Discrimination Representation Learning for Scene Text Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-3154-7756","authenticated-orcid":false,"given":"Zhi-Yuan","family":"Xue","sequence":"first","affiliation":[{"name":"Shandong University, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6400-6014","authenticated-orcid":false,"given":"Li-Jun","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9682-9787","authenticated-orcid":false,"given":"Jia-Ying","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6901-5476","authenticated-orcid":false,"given":"Xin","family":"Luo","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9972-7370","authenticated-orcid":false,"given":"Xin-Shun","family":"Xu","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01505"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01851"},{"key":"e_1_3_2_1_4_1","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume":"119","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey E. Hinton. 2020a. A simple framework for contrastive learning of visual representations. In Proc. Int. Conf. Mach. Learn., Vol. 119. 1597--1607.","journal-title":"Proc. Int. Conf. Mach. Learn."},{"key":"e_1_3_2_1_5_1","first-page":"22243","article-title":"Big self-supervised models are strong semi-supervised learners","volume":"33","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Kevin Swersky, Mohammad Norouzi, and Geoffrey E Hinton. 2020b. Big self-supervised models are strong semi-supervised learners. Proc. Adv. Neural Inf. Process. Syst., Vol. 33 (2020), 22243--22255.","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.157"},{"key":"e_1_3_2_1_7_1","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. 1422--1430","author":"Doersch Carl","unstructured":"Carl Doersch, Abhinav Gupta, and Alexei A. Efros. 2015. Unsupervised visual representation learning by context prediction. In Proc. IEEE\/CVF Int. Conf. Comput. Vis. 1422--1430."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/124"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_43"},{"key":"e_1_3_2_1_11_1","first-page":"21271","article-title":"Bootstrap your own latent-a new approach to self-supervised learning","volume":"33","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al. 2020. Bootstrap your own latent-a new approach to self-supervised learning. Proc. Adv. Neural Inf. Process. Syst., Vol. 33 (2020), 21271--21284.","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01784"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_14_1","first-page":"5679","article-title":"Self-supervised co-training for video representation learning","volume":"33","author":"Han Tengda","year":"2020","unstructured":"Tengda Han, Weidi Xie, and Andrew Zisserman. 2020. Self-supervised co-training for video representation learning. Proc. Adv. Neural Inf. Process. Syst., Vol. 33 (2020), 5679--5690.","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_16_1","volume-title":"Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227","author":"Jaderberg Max","year":"2014","unstructured":"Max Jaderberg, Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2014. Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227 (2014)."},{"key":"e_1_3_2_1_17_1","volume-title":"Discourse-based objectives for fast unsupervised sentence representation learning. arXiv preprint arXiv:1705.00557","author":"Jernite Yacine","year":"2017","unstructured":"Yacine Jernite, Samuel R Bowman, and David Sontag. 2017. Discourse-based objectives for fast unsupervised sentence representation learning. arXiv preprint arXiv:1705.00557 (2017)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"e_1_3_2_1_20_1","volume-title":"ICDAR 2015 competition on robust reading. In Proc. Int. Conf. Document Anal. Recog. 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In Proc. Int. Conf. Document Anal. Recog. 1156--1160."},{"key":"e_1_3_2_1_21_1","volume-title":"ICDAR 2013 robust reading competition. In Proc. Int. Conf. Document Anal. Recog. 1484--1493","author":"Karatzas Dimosthenis","year":"2013","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, Masakazu Iwamura, Lluis Gomez i Bigorda, Sergi Robles Mestre, Joan Mas, David Fernandez Mota, Jon Almazan Almazan, and Lluis Pere De Las Heras. 2013. ICDAR 2013 robust reading competition. In Proc. Int. Conf. Document Anal. Recog. 1484--1493."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20062"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681350"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3492705"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.02.002"},{"key":"e_1_3_2_1_26_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov I","year":"2017","unstructured":"I Loshchilov. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3509678"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00111"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.26.127"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"e_1_3_2_1_31_1","article-title":"Know your self-supervised learning: A survey on image-based generative and discriminative training","volume":"2023","author":"Ozbulak Utku","year":"2023","unstructured":"Utku Ozbulak, Hyun Jung Lee, Beril Boga, Esla Timothy Anzaku, Ho-min Park, Arnout Van Messem, Wesley De Neve, and Joris Vankerschaver. 2023. Know your self-supervised learning: A survey on image-based generative and discriminative training. Trans. Mach. Learn. Res., Vol. 2023 (2023).","journal-title":"Trans. Mach. Learn. Res."},{"key":"e_1_3_2_1_32_1","volume-title":"Spatial context-based self-supervised learning for handwritten text recognition. arXiv preprint arXiv:2404.11585","author":"Penarrubia Carlos","year":"2024","unstructured":"Carlos Penarrubia, Carlos Garrido-Munoz, Jose J Valero-Mas, and Jorge Calvo-Zaragoza. 2024. Spatial context-based self-supervised learning for handwritten text recognition. arXiv preprint arXiv:2404.11585 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.76"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01354"},{"key":"e_1_3_2_1_35_1","volume-title":"Proc. Int. Conf. Mach. Learn. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proc. Int. Conf. Mach. Learn. 8748--8763."},{"key":"e_1_3_2_1_36_1","volume-title":"LEGO: Self-supervised representation learning for scene text images. arXiv preprint arXiv:2408.02036","author":"Ren Yujin","year":"2024","unstructured":"Yujin Ren, Jiaxin Zhang, and Lianwen Jin. 2024. LEGO: Self-supervised representation learning for scene text images. arXiv preprint arXiv:2408.02036 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"e_1_3_2_1_40_1","volume-title":"Retentive network: A successor to transformer for large language models. arXiv preprint arXiv:2307.08621","author":"Sun Yutao","year":"2023","unstructured":"Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei. 2023. Retentive network: A successor to transformer for large language models. arXiv preprint arXiv:2307.08621 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01382"},{"key":"e_1_3_2_1_42_1","volume-title":"Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140","author":"Veit Andreas","year":"2016","unstructured":"Andreas Veit, Tomas Matera, Lukas Neumann, Jiri Matas, and Serge Belongie. 2016. Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140 (2016)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390294"},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. 1457--1464","author":"Wang Kai","year":"2011","unstructured":"Kai Wang, Boris Babenko, and Serge Belongie. 2011. End-to-end scene text recognition. In Proc. IEEE\/CVF Int. Conf. Comput. Vis. 1457--1464."},{"key":"e_1_3_2_1_45_1","volume-title":"Structbert: Incorporating language structures into pre-training for deep language understanding. arXiv preprint arXiv:1908.04577","author":"Wang Wei","year":"2019","unstructured":"Wei Wang, Bin Bi, Ming Yan, Chen Wu, Zuyi Bao, Jiangnan Xia, Liwei Peng, and Luo Si. 2019. Structbert: Incorporating language structures into pre-training for deep language understanding. arXiv preprint arXiv:1908.04577 (2019)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611769"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446874"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28402"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3523799"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547784"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01213"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2021.3116216"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612247"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.111229"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20245"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3512354"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/195"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","location":"Chicago IL USA","acronym":"ICMR '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733436","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:14:11Z","timestamp":1755749651000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733436"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":58,"alternative-id":["10.1145\/3731715.3733436","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733436","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}