{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:19:07Z","timestamp":1750220347057,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,8,24]],"date-time":"2021-08-24T00:00:00Z","timestamp":1629763200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The National Key RD Program of China","award":["2017YFB1200700"],"award-info":[{"award-number":["2017YFB1200700"]}]},{"name":"National Nature Science Foundation of China","award":["62072006 and 62072010"],"award-info":[{"award-number":["62072006 and 62072010"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,8,24]]},"DOI":"10.1145\/3460426.3463639","type":"proceedings-article","created":{"date-parts":[[2021,9,1]],"date-time":"2021-09-01T22:50:28Z","timestamp":1630536628000},"page":"385-393","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Scene Text Recognition with Cascade Attention Network"],"prefix":"10.1145","author":[{"given":"Min","family":"Zhang","sequence":"first","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"given":"Meng","family":"Ma","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Ping","family":"Wang","sequence":"additional","affiliation":[{"name":"Ministry of Education, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,9]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"J. Baek G. Kim J. Lee and etal 2019. What is wrong with scene text recognition model comparisons? dataset and model analysis. In ICCV. 4715--4723.  J. Baek G. Kim J. Lee and et al. 2019. What is wrong with scene text recognition model comparisons? dataset and model analysis. In ICCV. 4715--4723.","DOI":"10.1109\/ICCV.2019.00481"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Z. Cheng F. Bai Y. Xu and etal 2017. Focusing attention: Towards accurate text recognition in natural images. In ICCV . 5076--5084.  Z. Cheng F. Bai Y. Xu and et al. 2017. Focusing attention: Towards accurate text recognition in natural images. In ICCV . 5076--5084.","DOI":"10.1109\/ICCV.2017.543"},{"key":"e_1_3_2_1_3_1","unstructured":"Shancheng Fang Hongtao Xie Zheng-Jun Zha and etal 2018. Attention and language ensemble for scene text recognition with convolutional sequence modeling. In ACM Multimedia. 248--256.  Shancheng Fang Hongtao Xie Zheng-Jun Zha and et al. 2018. Attention and language ensemble for scene text recognition with convolutional sequence modeling. In ACM Multimedia. 248--256."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.01.094"},{"key":"e_1_3_2_1_5_1","volume-title":"ICDAR","volume":"1","author":"Ghosh S.K.","unstructured":"S.K. Ghosh , E. Valveny , and A.D. Bagdanov . 2017. Visual attention models for scene text recognition . In ICDAR , Vol. 1 . IEEE, 943--948. S.K. Ghosh, E. Valveny, and A.D. Bagdanov. 2017. Visual attention models for scene text recognition. In ICDAR, Vol. 1. IEEE, 943--948."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"A. Gupta A. Vedaldi and A. Zisserman. 2016. Synthetic data for text localisation in natural images. In CVPR. 2315--2324.  A. Gupta A. Vedaldi and A. Zisserman. 2016. Synthetic data for text localisation in natural images. In CVPR. 2315--2324.","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.10.010"},{"key":"e_1_3_2_1_8_1","unstructured":"M. Jaderberg K. Simonyan A. Vedaldi and etal 2015a. Deep structured output learning for unconstrained text recognition. ICLR (2015).  M. Jaderberg K. Simonyan A. Vedaldi and et al. 2015a. Deep structured output learning for unconstrained text recognition. ICLR (2015)."},{"key":"e_1_3_2_1_9_1","unstructured":"M. Jaderberg K. Simonyan A. Vedaldi and A. Zisserman. 2014. Synthetic data and artificial neural networks for natural scene text recognition. In NIPS DLW .  M. Jaderberg K. Simonyan A. Vedaldi and A. Zisserman. 2014. Synthetic data and artificial neural networks for natural scene text recognition. In NIPS DLW ."},{"key":"e_1_3_2_1_10_1","unstructured":"Max Jaderberg Karen Simonyan Andrew Zisserman and etal 2015b. Spatial transformer networks. In NeurIPS. 2017--2025.  Max Jaderberg Karen Simonyan Andrew Zisserman and et al. 2015b. Spatial transformer networks. In NeurIPS. 2017--2025."},{"volume-title":"ICDAR 2015 competition on robust reading. In ICDAR. 1156--1160","author":"Karatzas D.","key":"e_1_3_2_1_11_1","unstructured":"D. Karatzas , L. Gomez-Bigorda , A. Nicolaou, and et al. 2015 . ICDAR 2015 competition on robust reading. In ICDAR. 1156--1160 . D. Karatzas, L. Gomez-Bigorda, A. Nicolaou, and et al. 2015. ICDAR 2015 competition on robust reading. In ICDAR. 1156--1160."},{"volume-title":"ICDAR 2013 robust reading competition. In ICDAR. IEEE, 1484--1493","author":"Karatzas D.","key":"e_1_3_2_1_12_1","unstructured":"D. Karatzas , F. Shafait , S. Uchida, and et al. 2013 . ICDAR 2013 robust reading competition. In ICDAR. IEEE, 1484--1493 . D. Karatzas, F. Shafait, S. Uchida, and et al. 2013. ICDAR 2013 robust reading competition. In ICDAR. IEEE, 1484--1493."},{"volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition . 2231--2239","author":"Lee C.","key":"e_1_3_2_1_13_1","unstructured":"C. Lee and S. Osindero . 2016. Recursive recurrent nets with attention modeling for ocr in the wild . In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition . 2231--2239 . C. Lee and S. Osindero. 2016. Recursive recurrent nets with attention modeling for ocr in the wild. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition . 2231--2239."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018714"},{"key":"e_1_3_2_1_15_1","first-page":"4","article-title":"Char-Net: A Character-Aware Neural Network for Distorted Scene Text Recognition","volume":"1","author":"Liu Wei","year":"2018","unstructured":"Wei Liu , Chaofeng Chen , and Kwan-Yee K Wong . 2018 . Char-Net: A Character-Aware Neural Network for Distorted Scene Text Recognition .. In AAAI , Vol. 1. 4 . Wei Liu, Chaofeng Chen, and Kwan-Yee K Wong. 2018. Char-Net: A Character-Aware Neural Network for Distorted Scene Text Recognition.. In AAAI , Vol. 1. 4.","journal-title":"AAAI"},{"volume-title":"ICDAR 2003 robust reading competitions. In ICDAR. 682--687","author":"Lucas S.M.","key":"e_1_3_2_1_16_1","unstructured":"S.M. Lucas , A. Panaretos , L. Sosa, and et al. 2003 . ICDAR 2003 robust reading competitions. In ICDAR. 682--687 . S.M. Lucas, A. Panaretos, L. Sosa, and et al. 2003. ICDAR 2003 robust reading competitions. In ICDAR. 682--687."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"P. Lyu M. Liao C. Yao and etal 2018. Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. In ECCV. 67--83.  P. Lyu M. Liao C. Yao and et al. 2018. Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. In ECCV. 67--83.","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"A. Mishra K. Alahari and C. Jawahar. 2012a. Scene text recognition using higher order language priors. In BMVC. 127.1--127.11.  A. Mishra K. Alahari and C. Jawahar. 2012a. Scene text recognition using higher order language priors. In BMVC. 127.1--127.11.","DOI":"10.5244\/C.26.127"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Anand Mishra Karteek Alahari and CV Jawahar. 2012b. Top-down and bottom-up cues for scene text recognition. In CVPR . 2687--2694.  Anand Mishra Karteek Alahari and CV Jawahar. 2012b. Top-down and bottom-up cues for scene text recognition. In CVPR . 2687--2694.","DOI":"10.1109\/CVPR.2012.6247990"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Tatiana Novikova Olga Barinova Pushmeet Kohli and etal 2012. Large-lexicon attribute-consistent text recognition in natural images. In ECCV. Springer 752--765.  Tatiana Novikova Olga Barinova Pushmeet Kohli and et al. 2012. Large-lexicon attribute-consistent text recognition in natural images. In ECCV. Springer 752--765.","DOI":"10.1007\/978-3-642-33783-3_54"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"T. Quy Phan P. Shivakumara S. Tian and etal 2013. Recognizing text with perspective distortion in natural scenes. In ICCV . 569--576.  T. Quy Phan P. Shivakumara S. Tian and et al. 2013. Recognizing text with perspective distortion in natural scenes. In ICCV . 569--576.","DOI":"10.1109\/ICCV.2013.76"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"B. Shi X. Wang P. Lyu and etal 2016b. Robust scene text recognition with automatic rectification. In CVPR . 4168--4176.  B. Shi X. Wang P. Lyu and et al. 2016b. Robust scene text recognition with automatic rectification. In CVPR . 4168--4176.","DOI":"10.1109\/CVPR.2016.452"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"e_1_3_2_1_26_1","unstructured":"A. Vaswani N. Shazeer N. Parmar and etal 2017. Attention is all you need. In NeurIPS. 5998--6008.  A. Vaswani N. Shazeer N. Parmar and et al. 2017. Attention is all you need. In NeurIPS. 5998--6008."},{"key":"e_1_3_2_1_27_1","unstructured":"J. Wang and X. Hu. 2017. Gated recurrent convolution neural network for ocr. In NeurIPS. 335--344.  J. Wang and X. Hu. 2017. Gated recurrent convolution neural network for ocr. In NeurIPS. 335--344."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"K. Wang B. Babenko and S. Belongie. 2011. End-to-end scene text recognition. In ICCV. 1457--1464.  K. Wang B. Babenko and S. Belongie. 2011. End-to-end scene text recognition. In ICCV. 1457--1464.","DOI":"10.1109\/ICCV.2011.6126402"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Kai Wang and Serge Belongie. 2010. Word spotting in the wild. In ECCV. 591--604.  Kai Wang and Serge Belongie. 2010. Word spotting in the wild. In ECCV. 591--604.","DOI":"10.1007\/978-3-642-15549-9_43"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2009.38"},{"key":"e_1_3_2_1_31_1","volume-title":"TOMM","volume":"15","author":"Xie Hongtao","year":"2019","unstructured":"Hongtao Xie , Shancheng Fang , Zheng-Jun Zha , and et al. 2019. Convolutional attention networks for scene text recognition . TOMM , Vol. 15 , 1s ( 2019 ), 1--17. Hongtao Xie, Shancheng Fang, Zheng-Jun Zha, and et al. 2019. Convolutional attention networks for scene text recognition. TOMM , Vol. 15, 1s (2019), 1--17."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"D. Yu X. Li C. Zhang and etal 2020. Towards accurate scene text recognition with semantic reasoning networks. In CVPR . 12113--12122.  D. Yu X. Li C. Zhang and et al. 2020. Towards accurate scene text recognition with semantic reasoning networks. In CVPR . 12113--12122.","DOI":"10.1109\/CVPR42600.2020.01213"},{"key":"e_1_3_2_1_33_1","volume-title":"Esir: End-to-end scene text recognition via iterative image rectification. In CVPR . 2059--2068.","author":"Zhan F.","year":"2019","unstructured":"F. Zhan and S. Lu . 2019 . Esir: End-to-end scene text recognition via iterative image rectification. In CVPR . 2059--2068. F. Zhan and S. Lu. 2019. Esir: End-to-end scene text recognition via iterative image rectification. In CVPR . 2059--2068."}],"event":{"name":"ICMR '21: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Taipei Taiwan","acronym":"ICMR '21"},"container-title":["Proceedings of the 2021 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3460426.3463639","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3460426.3463639","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:04Z","timestamp":1750191424000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3460426.3463639"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,24]]},"references-count":33,"alternative-id":["10.1145\/3460426.3463639","10.1145\/3460426"],"URL":"https:\/\/doi.org\/10.1145\/3460426.3463639","relation":{},"subject":[],"published":{"date-parts":[[2021,8,24]]},"assertion":[{"value":"2021-09-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}