{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T21:47:08Z","timestamp":1774648028393,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Nature Science Foundation of China","award":["62121002,62232006,62102384"],"award-info":[{"award-number":["62121002,62232006,62102384"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB3104700"],"award-info":[{"award-number":["2022YFB3104700"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611769","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"509-518","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":29,"title":["Symmetrical Linguistic Feature Distillation with CLIP for Scene Text Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0009-5033","authenticated-orcid":false,"given":"Zixiao","family":"Wang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6249-5315","authenticated-orcid":false,"given":"Hongtao","family":"Xie","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0228-6220","authenticated-orcid":false,"given":"Yuxin","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6398-3364","authenticated-orcid":false,"given":"Jianjun","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5314-4054","authenticated-orcid":false,"given":"Boqiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1151-1792","authenticated-orcid":false,"given":"Yongdong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CLIPTER: Looking at the Bigger Picture in Scene Text Recognition. arXiv preprint arXiv:2301.07464","author":"Aberdam Aviad","year":"2023","unstructured":"Aviad Aberdam, David Bensa\u00efd, Alona Golts, Roy Ganz, Oren Nuriel, Royee Tichauer, Shai Mazor, and Ron Litman. 2023. CLIPTER: Looking at the Bigger Picture in Scene Text Recognition. arXiv preprint arXiv:2301.07464 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86549-8_21"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00481"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00313"},{"key":"e_1_3_2_1_5_1","first-page":"33781","article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","volume":"35","author":"Bangalath Hanoona","year":"2022","unstructured":"Hanoona Bangalath, Muhammad Maaz, Muhammad Uzair Khattak, Salman H Khan, and Fahad Shahbaz Khan. 2022. Bridging the gap between object and image-level representations for open-vocabulary detection. Advances in Neural Information Processing Systems, Vol. 35 (2022), 33781--33794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","volume-title":"Tel Aviv","author":"Bautista Darwin","year":"2022","unstructured":"Darwin Bautista and Rowel Atienza. 2022. Scene text recognition with permuted autoregressive sequence models. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXVIII. Springer, 178--196."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00252"},{"key":"e_1_3_2_1_8_1","volume-title":"Open-Vocabulary Panoptic Segmentation with MaskCLIP. arXiv preprint arXiv:2208.08984","author":"Ding Zheng","year":"2022","unstructured":"Zheng Ding, Jieke Wang, and Zhuowen Tu. 2022. Open-Vocabulary Panoptic Segmentation with MaskCLIP. arXiv preprint arXiv:2208.08984 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022. Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"Abinet: Autonomous, bidirectional and iterative language modeling for scene text spotting","author":"Fang Shancheng","year":"2022","unstructured":"Shancheng Fang, Zhendong Mao, Hongtao Xie, Yuxin Wang, Chenggang Yan, and Yongdong Zhang. 2022. Abinet: Autonomous, bidirectional and iterative language modeling for scene text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_14_1","volume-title":"Understanding and Improving the Role of Projection Head in Self-Supervised Learning. arXiv preprint arXiv:2212.11491","author":"Gupta Kartik","year":"2022","unstructured":"Kartik Gupta, Thalaiyasingam Ajanthan, Anton van den Hengel, and Stephen Gould. 2022. Understanding and Improving the Role of Projection Head in Self-Supervised Learning. arXiv preprint arXiv:2212.11491 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00201"},{"key":"e_1_3_2_1_16_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_17_1","volume-title":"Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227","author":"Jaderberg Max","year":"2014","unstructured":"Max Jaderberg, Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2014. Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227 (2014)."},{"key":"e_1_3_2_1_18_1","volume-title":"ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160."},{"key":"e_1_3_2_1_19_1","volume-title":"ICDAR 2013 robust reading competition. In 2013 12th international conference on document analysis and recognition. IEEE, 1484--1493","author":"Karatzas Dimosthenis","year":"2013","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, Masakazu Iwamura, Lluis Gomez i Bigorda, Sergi Robles Mestre, Joan Mas, David Fernandez Mota, Jon Almazan Almazan, and Lluis Pere De Las Heras. 2013. ICDAR 2013 robust reading competition. In 2013 12th international conference on document analysis and recognition. IEEE, 1484--1493."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_21_1","volume-title":"Dual-Stream Knowledge-Preserving Hashing for Unsupervised Video Retrieval. In European Conference on Computer Vision. Springer, 181--197","author":"Li Pandeng","year":"2022","unstructured":"Pandeng Li, Hongtao Xie, Jiannan Ge, Lei Zhang, Shaobo Min, and Yongdong Zhang. 2022b. Dual-Stream Knowledge-Preserving Hashing for Unsupervised Video Retrieval. In European Conference on Computer Vision. Springer, 181--197."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3203612"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018714"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_25_1","volume-title":"Maskocr: Text recognition with masked encoder-decoder pretraining. arXiv preprint arXiv:2206.00311","author":"Lyu Pengyuan","year":"2022","unstructured":"Pengyuan Lyu, Chengquan Zhang, Shanshan Liu, Meina Qiao, Yangliu Xu, Liang Wu, Kun Yao, Junyu Han, Errui Ding, and Jingdong Wang. 2022. Maskocr: Text recognition with masked encoder-decoder pretraining. arXiv preprint arXiv:2206.00311 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Linearly mapping from image to text space. arXiv preprint arXiv:2209.15162","author":"Merullo Jack","year":"2022","unstructured":"Jack Merullo, Louis Castricato, Carsten Eickhoff, and Ellie Pavlick. 2022. Linearly mapping from image to text space. arXiv preprint arXiv:2209.15162 (2022)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Anand Mishra Karteek Alahari and CV Jawahar. 2012. Scene text recognition using higher order language priors. In BMVC-British machine vision conference. BMVA.","DOI":"10.5244\/C.26.127"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_12"},{"key":"e_1_3_2_1_29_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.76"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01354"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2910412"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_1_36_1","volume-title":"An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition","author":"Shi Baoguang","year":"2016","unstructured":"Baoguang Shi, Xiang Bai, and Cong Yao. 2016. An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 11 (2016), 2298--2304."},{"key":"e_1_3_2_1_37_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_38_1","volume-title":"Clip models are few-shot learners: Empirical studies on vqa and visual entailment. arXiv preprint arXiv:2203.07190","author":"Song Haoyu","year":"2022","unstructured":"Haoyu Song, Li Dong, Wei-Nan Zhang, Ting Liu, and Furu Wei. 2022. Clip models are few-shot learners: Empirical studies on vqa and visual entailment. arXiv preprint arXiv:2203.07190 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140","author":"Veit Andreas","year":"2016","unstructured":"Andreas Veit, Tomas Matera, Lukas Neumann, Jiri Matas, and Serge Belongie. 2016. Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140 (2016)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6891"},{"key":"e_1_3_2_1_41_1","volume-title":"2D-CTC for scene text recognition. arXiv preprint arXiv:1907.09705","author":"Wan Zhaoyi","year":"2019","unstructured":"Zhaoyi Wan, Fengming Xie, Yibo Liu, Xiang Bai, and Cong Yao. 2019. 2D-CTC for scene text recognition. arXiv preprint arXiv:1907.09705 (2019)."},{"key":"e_1_3_2_1_42_1","volume-title":"2011 International conference on computer vision. IEEE, 1457--1464","author":"Wang Kai","year":"2011","unstructured":"Kai Wang, Boris Babenko, and Serge Belongie. 2011. End-to-end scene text recognition. In 2011 International conference on computer vision. IEEE, 1457--1464."},{"key":"e_1_3_2_1_43_1","volume-title":"Knowledge distillation and student-teacher learning for visual intelligence: A review and new outlooks","author":"Wang Lin","year":"2021","unstructured":"Lin Wang and Kuk-Jin Yoon. 2021. Knowledge distillation and student-teacher learning for visual intelligence: A review and new outlooks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2021)."},{"key":"e_1_3_2_1_44_1","volume-title":"Tel Aviv","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, Cheng Da, and Cong Yao. 2022b. Multi-granularity Prediction for Scene Text Recognition. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXVIII. Springer, 339--355."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6903"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3197981"},{"key":"e_1_3_2_1_48_1","volume-title":"Clip-td: Clip targeted distillation for vision-language tasks. arXiv preprint arXiv:2201.05729","author":"Wang Zhecan","year":"2022","unstructured":"Zhecan Wang, Noel Codella, Yen-Chun Chen, Luowei Zhou, Jianwei Yang, Xiyang Dai, Bin Xiao, Haoxuan You, Shih-Fu Chang, and Lu Yuan. 2022a. Clip-td: Clip targeted distillation for vision-language tasks. arXiv preprint arXiv:2201.05729 (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"Contrastive learning rivals masked image modeling in fine-tuning via feature distillation. arXiv preprint arXiv:2205.14141","author":"Wei Yixuan","year":"2022","unstructured":"Yixuan Wei, Han Hu, Zhenda Xie, Zheng Zhang, Yue Cao, Jianmin Bao, Dong Chen, and Baining Guo. 2022. Contrastive learning rivals masked image modeling in fine-tuning via feature distillation. arXiv preprint arXiv:2205.14141 (2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_18"},{"key":"e_1_3_2_1_51_1","volume-title":"Stare at What You See: Masked Image Modeling without Reconstruction. arXiv preprint arXiv:2211.08887","author":"Xue Hongwei","year":"2022","unstructured":"Hongwei Xue, Peng Gao, Hongyang Li, Yu Qiao, Hao Sun, Houqiang Li, and Jiebo Luo. 2022. Stare at What You See: Masked Image Modeling without Reconstruction. arXiv preprint arXiv:2211.08887 (2022)."},{"key":"e_1_3_2_1_52_1","volume-title":"Turning a CLIP Model into a Scene Text Detector. arXiv preprint arXiv:2302.14338","author":"Yu Wenwen","year":"2023","unstructured":"Wenwen Yu, Yuliang Liu, Wei Hua, Deqiang Jiang, Bo Ren, and Xiang Bai. 2023. Turning a CLIP Model into a Scene Text Detector. arXiv preprint arXiv:2302.14338 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"UK","author":"Yue Xiaoyu","year":"2020","unstructured":"Xiaoyu Yue, Zhanghui Kuang, Chenhao Lin, Hongbin Sun, and Wayne Zhang. 2020. Robustscanner: Dynamically enhancing positional clues for robust text recognition. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XIX. Springer, 135--151."},{"key":"e_1_3_2_1_54_1","volume-title":"Linguistic More: Taking a Further Step toward Efficient and Accurate Scene Text Recognition. arXiv preprint arXiv:2305.05140","author":"Zhang Boqiang","year":"2023","unstructured":"Boqiang Zhang, Hongtao Xie, Yuxin Wang, Jianjun Xu, and Yongdong Zhang. 2023. Linguistic More: Taking a Further Step toward Efficient and Accurate Scene Text Recognition. arXiv preprint arXiv:2305.05140 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20245"},{"key":"e_1_3_2_1_56_1","volume-title":"SUNw: Scene Understanding Workshop-CVPR","volume":"2017","author":"Zhang Ying","year":"2017","unstructured":"Ying Zhang, Lionel Gueguen, Ilya Zharkov, Peter Zhang, Keith Seifert, and Ben Kadlec. 2017. Uber-text: A large-scale dataset for optical character recognition from street-level imagery. In SUNw: Scene Understanding Workshop-CVPR, Vol. 2017. 5."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611769","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611769","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:12:50Z","timestamp":1755821570000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611769"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3611769","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611769","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}