{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T02:58:29Z","timestamp":1764212309548,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["U1936205"],"award-info":[{"award-number":["U1936205"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612488","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"7530-7539","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["STIRER: A Unified Model for Low-Resolution Scene Text Image Recovery and Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7720-806X","authenticated-orcid":false,"given":"Minyi","family":"Zhao","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5381-6281","authenticated-orcid":false,"given":"Shijie","family":"Xuyang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2313-7635","authenticated-orcid":false,"given":"Jihong","family":"Guan","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1949-2768","authenticated-orcid":false,"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Fudan University, 
Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86549-8_21"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Fan Bai Zhanzhan Cheng Yi Niu Shiliang Pu and Shuigeng Zhou. 2018. Edit probability for scene text recognition. In CVPR. 1508--1516.","DOI":"10.1109\/CVPR.2018.00163"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Jingye Chen Bin Li and Xiangyang Xue. 2021b. Scene Text Telescope: Text-Focused Scene Image Super-Resolution. In CVPR. 12026--12035.","DOI":"10.1109\/CVPR46437.2021.01185"},{"key":"e_1_3_2_2_5_1","volume-title":"Benchmarking Chinese Text Recognition: Datasets, Baselines, and an Empirical Study. arXiv preprint arXiv:2112.15093","author":"Chen Jingye","year":"2021","unstructured":"Jingye Chen, Haiyang Yu, Jianqi Ma, Mengnan Guan, Xixi Xu, Xiaocong Wang, Shaobo Qu, Bin Li, and Xiangyang Xue. 2021c. Benchmarking Chinese Text Recognition: Datasets, Baselines, and an Empirical Study. arXiv preprint arXiv:2112.15093 (2021)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19904"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3440756"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Zhanzhan Cheng Fan Bai Yunlu Xu Gang Zheng Shiliang Pu and Shuigeng Zhou. 2017. Focusing attention: Towards accurate text recognition in natural images. In ICCV. 5076--5084.","DOI":"10.1109\/ICCV.2017.543"},{"key":"e_1_3_2_2_9_1","volume-title":"Aon: Towards arbitrarily-oriented text recognition. In CVPR. 5571--5579.","author":"Cheng Zhanzhan","year":"2018","unstructured":"Zhanzhan Cheng, Yangliu Xu, Fan Bai, Yi Niu, Shiliang Pu, and Shuigeng Zhou. 2018. Aon: Towards arbitrarily-oriented text recognition. In CVPR. 
5571--5579."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Tao Dai Jianrui Cai Yongbing Zhang Shu-Tao Xia and Lei Zhang. 2019. Second-order attention network for single image super-resolution. In CVPR. 11065--11074.","DOI":"10.1109\/CVPR.2019.01132"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2439281"},{"key":"e_1_3_2_2_12_1","volume-title":"SVTR: Scene Text Recognition with a Single Visual Model. arXiv preprint arXiv:2205.00159","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022. SVTR: Scene Text Recognition with a Single Visual Model. arXiv preprint arXiv:2205.00159 (2022)."},{"key":"e_1_3_2_2_13_1","volume-title":"JMLR","volume":"20","author":"Elsken Thomas","year":"2019","unstructured":"Thomas Elsken, Jan Hendrik Metzen, and Frank Hutter. 2019. Neural architecture search: A survey. JMLR, Vol. 20, 1 (2019), 1997--2017."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.05.060"},{"key":"e_1_3_2_2_15_1","unstructured":"Shancheng Fang Hongtao Xie Yuxin Wang Zhendong Mao and Yongdong Zhang. 2021a. Read Like Humans: Autonomous Bidirectional and Iterative Language Modeling for Scene Text Recognition. In CVPR. 7098--7107."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Alex Graves Santiago Fern\u00e1ndez Faustino Gomez and J\u00fcrgen Schmidhuber. 2006. Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In ICML. 369--376.","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6735"},{"key":"e_1_3_2_2_19_1","volume-title":"Synthetic data and artificial neural networks for natural scene text recognition. 
arXiv preprint arXiv:1406.2227","author":"Jaderberg Max","year":"2014","unstructured":"Max Jaderberg, Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2014. Synthetic data and artificial neural networks for natural scene text recognition. arXiv preprint arXiv:1406.2227 (2014)."},{"key":"e_1_3_2_2_20_1","volume-title":"IFR: Iterative Fusion Based Recognizer for Low Quality Scene Text Recognition. In Pattern Recognition and Computer Vision: 4th Chinese Conference, PRCV","author":"Jia Zhiwei","year":"2021","unstructured":"Zhiwei Jia, Shugong Xu, Shiyi Mu, Yue Tao, Shan Cao, and Zhiyong Chen. 2021. IFR: Iterative Fusion Based Recognizer for Low Quality Scene Text Recognition. In Pattern Recognition and Computer Vision: 4th Chinese Conference, PRCV 2021, Beijing, China, October 29-November 1, 2021, Proceedings, Part II 4. Springer, 180--191."},{"key":"e_1_3_2_2_21_1","volume-title":"ICDAR 2015 competition on robust reading. In ICDAR. IEEE, 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In ICDAR. IEEE, 1156--1160."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Christian Ledig Lucas Theis Ferenc Husz\u00e1r Jose Caballero Andrew Cunningham Alejandro Acosta Andrew Aitken Alykhan Tejani Johannes Totz Zehan Wang et al. 2017. Photo-realistic single image super-resolution using a generative adversarial network. In CVPR. 4681--4690.","DOI":"10.1109\/CVPR.2017.19"},{"key":"e_1_3_2_2_23_1","volume-title":"On efficient transformer and image pre-training for low-level vision. arXiv preprint arXiv:2112.10175","author":"Li Wenbo","year":"2021","unstructured":"Wenbo Li, Xin Lu, Jiangbo Lu, Xiangyu Zhang, and Jiaya Jia. 2021. On efficient transformer and image pre-training for low-level vision. 
arXiv preprint arXiv:2112.10175 (2021)."},{"key":"e_1_3_2_2_24_1","volume-title":"Learning Generative Structure Prior for Blind Text Image Super-resolution. arXiv preprint arXiv:2303.14726","author":"Li Xiaoming","year":"2023","unstructured":"Xiaoming Li, Wangmeng Zuo, and Chen Change Loy. 2023. Learning Generative Structure Prior for Blind Text Image Super-resolution. arXiv preprint arXiv:2303.14726 (2023)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00210"},{"key":"e_1_3_2_2_26_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_27_1","first-page":"109","article-title":"Moran: A multi-object rectified attention network for scene text recognition","volume":"90","author":"Luo Canjie","year":"2019","unstructured":"Canjie Luo, Lianwen Jin, and Zenghui Sun. 2019. Moran: A multi-object rectified attention network for scene text recognition. PR, Vol. 90 (2019), 109--118.","journal-title":"PR"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3237002"},{"key":"e_1_3_2_2_29_1","unstructured":"Jianqi Ma Zhetong Liang and Lei Zhang. 2022. A Text Attention Network for Spatial Deformation Robust Scene Text Image Super-resolution. In CVPR. 5911--5920."},{"key":"e_1_3_2_2_30_1","volume-title":"Docvqa: A dataset for vqa on document images. In CVPR. 2200--2209.","author":"Mathew Minesh","year":"2021","unstructured":"Minesh Mathew, Dimosthenis Karatzas, and CV Jawahar. 2021. Docvqa: A dataset for vqa on document images. In CVPR. 2200--2209."},{"volume-title":"Top-down and bottom-up cues for scene text recognition","author":"Mishra Anand","key":"e_1_3_2_2_31_1","unstructured":"Anand Mishra, Karteek Alahari, and CV Jawahar. 2012. 
Top-down and bottom-up cues for scene text recognition. In CVPR. IEEE, 2687--2694."},{"key":"e_1_3_2_2_32_1","volume-title":"Plugnet: Degradation aware scene text recognition supervised by a pluggable super-resolution unit","author":"Mou Yongqiang","year":"2020","unstructured":"Yongqiang Mou, Lei Tan, Hui Yang, Jingying Chen, Leyuan Liu, Rui Yan, and Yaohong Huang. 2020. Plugnet: Degradation aware scene text recognition supervised by a pluggable super-resolution unit. In ECCV. Springer, 158--174."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Shimon Nakaune Satoshi Iizuka and Kazuhiro Fukui. 2021. Skeleton-aware Text Image Super-Resolution. (2021).","DOI":"10.5244\/C.35.404"},{"key":"e_1_3_2_2_34_1","unstructured":"Ram Krishna Pandey K Vignesh AG Ramakrishnan et al. 2018. Binary document image super resolution for improved readability and OCR performance. arXiv preprint arXiv:1812.02475 (2018)."},{"key":"e_1_3_2_2_35_1","unstructured":"Trung Quy Phan Palaiahnakote Shivakumara Shangxuan Tian and Chew Lim Tan. 2013. Recognizing text with perspective distortion in natural scenes. In ICCV. 569--576."},{"key":"e_1_3_2_2_36_1","volume-title":"Seed: Semantics enhanced encoder-decoder framework for scene text recognition. In CVPR. 13528--13537.","author":"Qiao Zhi","year":"2020","unstructured":"Zhi Qiao, Yu Zhou, Dongbao Yang, Yucan Zhou, and Weiping Wang. 2020. Seed: Semantics enhanced encoder-decoder framework for scene text recognition. In CVPR. 13528--13537."},{"key":"e_1_3_2_2_37_1","volume-title":"Scene Text Image Super-Resolution via Content Perceptual Loss and Criss-Cross Transformer Blocks. arXiv preprint arXiv:2210.06924","author":"Qin Rui","year":"2022","unstructured":"Rui Qin, Bin Wang, and Yu-Wing Tai. 2022. Scene Text Image Super-Resolution via Content Perceptual Loss and Criss-Cross Transformer Blocks. 
arXiv preprint arXiv:2210.06924 (2022)."},{"volume-title":"Roadtext-1k: Text detection & recognition dataset for driving videos","author":"Reddy Sangeeth","key":"e_1_3_2_2_38_1","unstructured":"Sangeeth Reddy, Minesh Mathew, Lluis Gomez, Mar\u00e7al Rusi\u00f1ol, Dimosthenis Karatzas, and CV Jawahar. 2020. Roadtext-1k: Text detection & recognition dataset for driving videos. In ICRA. IEEE, 11074--11080."},{"key":"e_1_3_2_2_39_1","volume-title":"NRTR: A no-recurrence sequence-to-sequence model for scene text recognition","author":"Sheng Fenfen","year":"2019","unstructured":"Fenfen Sheng, Zhineng Chen, and Bo Xu. 2019. NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In ICDAR. IEEE, 781--786."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"e_1_3_2_2_42_1","volume-title":"Meet Shah, Yu Jiang, Xinlei Chen, Dhruv Batra, Devi Parikh, and Marcus Rohrbach.","author":"Singh Amanpreet","year":"2019","unstructured":"Amanpreet Singh, Vivek Natarajan, Meet Shah, Yu Jiang, Xinlei Chen, Dhruv Batra, Devi Parikh, and Marcus Rohrbach. 2019. Towards vqa models that can read. In CVPR. 8317--8326."},{"key":"e_1_3_2_2_43_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"volume-title":"End-to-end scene text recognition","author":"Wang Kai","key":"e_1_3_2_2_44_1","unstructured":"Kai Wang, Boris Babenko, and Serge Belongie. 2011. End-to-end scene text recognition. In ICCV. 
IEEE, 1457--1464."},{"volume-title":"Scene text image super-resolution in the wild","author":"Wang Wenjia","key":"e_1_3_2_2_45_1","unstructured":"Wenjia Wang, Enze Xie, Xuebo Liu, Wenhai Wang, Ding Liang, Chunhua Shen, and Xiang Bai. 2020. Scene text image super-resolution in the wild. In ECCV. Springer, 650--666."},{"key":"e_1_3_2_2_46_1","volume-title":"Textsr: Content-aware text super-resolution guided by recognition. arXiv preprint arXiv:1909.07113","author":"Wang Wenjia","year":"2019","unstructured":"Wenjia Wang, Enze Xie, Peize Sun, Wenhai Wang, Lixun Tian, Chunhua Shen, and Ping Luo. 2019. Textsr: Content-aware text super-resolution guided by recognition. arXiv preprint arXiv:1909.07113 (2019)."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Yuxin Wang Hongtao Xie Shancheng Fang Jing Wang Shenggao Zhu and Yongdong Zhang. 2021. From two to one: A new scene text recognizer with visual language modeling network. In ICCV. 14194--14203.","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01716"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Xiangyu Xu Deqing Sun Jinshan Pan Yujin Zhang Hanspeter Pfister and Ming-Hsuan Yang. 2017. Learning to super-resolve blurry face and text images. In ICCV. 251--260.","DOI":"10.1109\/ICCV.2017.36"},{"key":"e_1_3_2_2_51_1","unstructured":"Deli Yu Xuan Li Chengquan Zhang Tao Liu Junyu Han Jingtuo Liu and Errui Ding. 2020. Towards accurate scene text recognition with semantic reasoning networks. In CVPR. 12113--12122."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Hui Zhang Quanming Yao Mingkun Yang Yongchao Xu and Xiang Bai. 2020. AutoSTR: Efficient Backbone Search for Scene Text Recognition. 
In ECCV.","DOI":"10.1007\/978-3-030-58586-0_44"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"crossref","unstructured":"Yulun Zhang Kunpeng Li Kai Li Lichen Wang Bineng Zhong and Yun Fu. 2018. Image super-resolution using very deep residual channel attention networks. In ECCV. 286--301.","DOI":"10.1007\/978-3-030-01234-2_18"},{"key":"e_1_3_2_2_54_1","volume-title":"Zhijun Ding, Jun Wu, Fumin Shen, and Heng Tao Shen.","author":"Zhao Cairong","year":"2021","unstructured":"Cairong Zhao, Shuyang Feng, Brian Nlong Zhao, Zhijun Ding, Jun Wu, Fumin Shen, and Heng Tao Shen. 2021. Scene Text Image Super-Resolution via Parallelly Contextual Attention Network. In MM. 2908--2917."},{"key":"e_1_3_2_2_55_1","volume-title":"Towards Video Text Visual Question Answering: Benchmark and Baseline. In Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Zhao Minyi","year":"2022","unstructured":"Minyi Zhao, Bingjia Li, Jie Wang, Wanqing Li, Wenjing Zhou, Lan Zhang, Shijie Xuyang, Zhihang Yu, Xinkun Yu, Guangze Li, et al. 2022a. Towards Video Text Visual Question Answering: Benchmark and Baseline. In Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"crossref","unstructured":"Minyi Zhao Miao Wang Fan Bai Bingjia Li Jie Wang and Shuigeng Zhou. 2022b. C3-STISR: Scene Text Image Super-resolution with Triple Clues. In IJCAI. 1707--1713.","DOI":"10.24963\/ijcai.2022\/238"},{"key":"e_1_3_2_2_57_1","volume-title":"Improving Scene Text Image Super-Resolution via Dual Prior Modulation Network. arXiv preprint arXiv:2302.10414","author":"Zhu Shipeng","year":"2023","unstructured":"Shipeng Zhu, Zuoyan Zhao, Pengfei Fang, and Hui Xue. 2023. Improving Scene Text Image Super-Resolution via Dual Prior Modulation Network. 
arXiv preprint arXiv:2302.10414 (2023)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612488","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612488","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:58:28Z","timestamp":1755820708000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612488"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":57,"alternative-id":["10.1145\/3581783.3612488","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612488","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}