{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:13:37Z","timestamp":1750220017767,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592214","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"353-361","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning and Fusing Multi-Scale Representations for Accurate Arbitrary-Shaped Scene Text Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-8831-1191","authenticated-orcid":false,"given":"Mingjun","family":"Li","sequence":"first","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1394-3550","authenticated-orcid":false,"given":"Shuo","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8426-9634","authenticated-orcid":false,"given":"Feng","family":"Su","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Dataset and Model Analysis. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 4714\u20134722","author":"Baek Jeonghun","year":"2019","unstructured":"Jeonghun Baek, Geewook Kim, Junyeop Lee, Sungrae Park, Dongyoon Han, Sangdoo Yun, Seong\u00a0Joon Oh, and Hwalsuk Lee. 2019. What Is Wrong With Scene Text Recognition Model Comparisons? Dataset and Model Analysis. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). 4714\u20134722."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2555080"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00102"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.102"},{"key":"e_1_3_2_1_5_1","volume-title":"Dynamic Convolution: Attention Over Convolution Kernels. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 11027\u201311036","author":"Chen Yinpeng","year":"2020","unstructured":"Yinpeng Chen, Xiyang Dai, Mengchen Liu, Dongdong Chen, Lu Yuan, and Zicheng Liu. 2020. Dynamic Convolution: Attention Over Convolution Kernels. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 11027\u201311036."},{"key":"e_1_3_2_1_6_1","volume-title":"Focusing Attention: Towards Accurate Text Recognition in Natural Images. In 2017 IEEE International Conference on Computer Vision (ICCV). 5086\u20135094","author":"Cheng Zhanzhan","year":"2017","unstructured":"Zhanzhan Cheng, Fan Bai, Yunlu Xu, Gang Zheng, Shiliang Pu, and Shuigeng Zhou. 2017. Focusing Attention: Towards Accurate Text Recognition in Natural Images. In 2017 IEEE International Conference on Computer Vision (ICCV). 5086\u20135094."},{"key":"e_1_3_2_1_7_1","volume-title":"AON: Towards Arbitrarily-Oriented Text Recognition. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5571\u20135579","author":"Cheng Zhanzhan","year":"2018","unstructured":"Zhanzhan Cheng, Yangliu Xu, Fan Bai, Yi Niu, Shiliang Pu, and Shuigeng Zhou. 2018. AON: Towards Arbitrarily-Oriented Text Recognition. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5571\u20135579."},{"key":"e_1_3_2_1_8_1","volume-title":"Bidirectional and Iterative Language Modeling for Scene Text Recognition. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7094\u20137103","author":"Fang Shancheng","year":"2021","unstructured":"Shancheng Fang, Hongtao Xie, Yuxin Wang, Zhendong Mao, and Yongdong Zhang. 2021. Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7094\u20137103."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_10_1","volume-title":"Synthetic Data for Text Localisation in Natural Images. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2315\u20132324","author":"Gupta Ankush","year":"2016","unstructured":"Ankush Gupta, Andrea Vedaldi, and Andrew Zisserman. 2016. Synthetic Data for Text Localisation in Natural Images. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2315\u20132324."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0823-z"},{"key":"e_1_3_2_1_12_1","unstructured":"Max Jaderberg Karen Simonyan Andrew Zisserman and Koray Kavukcuoglu. 2015. Spatial Transformer Networks. In NIPS. 2017\u20132025."},{"key":"e_1_3_2_1_13_1","volume-title":"Zurich","author":"Jaderberg Max","year":"2014","unstructured":"Max Jaderberg, Andrea Vedaldi, and Andrew Zisserman. 2014. Deep Features for Text Spotting. In Computer Vision - ECCV 2014 - 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part IV. 512\u2013528."},{"key":"e_1_3_2_1_14_1","volume-title":"ICDAR 2015 competition on Robust Reading. In 13th International Conference on Document Analysis and Recognition (ICDAR). 1156\u20131160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman\u00a0K. Ghosh, Andrew\u00a0D. Bagdanov, Masakazu Iwamura, iri Matas, Lukas Neumann, Vijay\u00a0Ramaseshan Chandrasekhar, Shijian Lu, Faisal Shafait, Seiichi Uchida, and Ernest Valveny. 2015. ICDAR 2015 competition on Robust Reading. In 13th International Conference on Document Analysis and Recognition (ICDAR). 1156\u20131160."},{"key":"e_1_3_2_1_15_1","volume-title":"ICDAR 2013 Robust Reading Competition. In 12th International Conference on Document Analysis and Recognition (ICDAR). 1484\u20131493","author":"Karatzas Dimosthenis","year":"2013","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, Masakazu Iwamura, Lluis\u00a0Gomez i Bigorda, Sergi\u00a0Robles Mestre, Joan Mas, avid Fern\u00e1ndez\u00a0Mota, Jon Almaz\u00e1n, and Llu\u00eds-Pere de\u00a0las Heras. 2013. ICDAR 2013 Robust Reading Competition. In 12th International Conference on Document Analysis and Recognition (ICDAR). 1484\u20131493."},{"key":"e_1_3_2_1_16_1","volume-title":"Recursive Recurrent Nets with Attention Modeling for OCR in the Wild. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2231\u20132239","author":"Lee Chen-Yu","year":"2016","unstructured":"Chen-Yu Lee and Simon Osindero. 2016. Recursive Recurrent Nets with Attention Modeling for OCR in the Wild. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2231\u20132239."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00281"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018610"},{"key":"e_1_3_2_1_19_1","volume-title":"TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models. CoRR abs\/2109.10282","author":"Li Minghao","year":"2021","unstructured":"Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei A.\u00a0F. Flor\u00eancio, Cha Zhang, Zhoujun Li, and Furu Wei. 2021. TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models. CoRR abs\/2109.10282 (2021). arXiv:2109.10282"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2937086"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018714"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1080\/757582976"},{"volume-title":"SCATTER: Selective Context Attentional Scene Text Recognizer. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 11959\u201311969","author":"Litman Ron","key":"e_1_3_2_1_23_1","unstructured":"Ron Litman, Oron Anschel, Shahar Tsiper, Roee Litman, Shai Mazor, and R. Manmatha. 2020. SCATTER: Selective Context Attentional Scene Text Recognizer. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 11959\u201311969."},{"key":"e_1_3_2_1_24_1","volume-title":"ICDAR 2003 Robust Reading Competitions. In 7th International Conference on Document Analysis and Recognition (ICDAR). 682\u2013687","author":"Lucas M.","year":"2003","unstructured":"Simon\u00a0M. Lucas, Alex Panaretos, Luis Sosa, Anthony Tang, Shirley Wong, and Robert Young. 2003. ICDAR 2003 Robust Reading Competitions. In 7th International Conference on Document Analysis and Recognition (ICDAR). 682\u2013687."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.01.020"},{"key":"e_1_3_2_1_26_1","volume-title":"MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining. CoRR abs\/2206.00311","author":"Lyu Pengyuan","year":"2022","unstructured":"Pengyuan Lyu, Chengquan Zhang, Shanshan Liu, Meina Qiao, Yangliu Xu, Liang Wu, Kun Yao, Junyu Han, Errui Ding, and Jingdong Wang. 2022. MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining. CoRR abs\/2206.00311 (2022). arXiv:2206.00311"},{"key":"e_1_3_2_1_27_1","volume-title":"Pointer Sentinel Mixture Models. CoRR abs\/1609.07843","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2016. Pointer Sentinel Mixture Models. CoRR abs\/1609.07843 (2016). arXiv:1609.07843"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.26.127"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248097"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2496234"},{"key":"e_1_3_2_1_31_1","volume-title":"Recognizing Text with Perspective Distortion in Natural Scenes. In 2013 IEEE International Conference on Computer Vision (ICCV). 569\u2013576","author":"Phan Trung\u00a0Quy","year":"2013","unstructured":"Trung\u00a0Quy Phan, Palaiahnakote Shivakumara, Shangxuan Tian, and Chew\u00a0Lim Tan. 2013. Recognizing Text with Perspective Distortion in Natural Scenes. In 2013 IEEE International Conference on Computer Vision (ICCV). 569\u2013576."},{"key":"e_1_3_2_1_32_1","volume-title":"SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13525\u201313534","author":"Qiao Zhi","year":"2020","unstructured":"Zhi Qiao, Yu Zhou, Dongbao Yang, Yucan Zhou, and Weiping Wang. 2020. SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13525\u201313534."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_1_34_1","volume-title":"NRTR: A No-Recurrence Sequence-to-Sequence Model for Scene Text Recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR). 781\u2013786","author":"Sheng Fenfen","year":"2019","unstructured":"Fenfen Sheng, Zhineng Chen, and Bo Xu. 2019. NRTR: A No-Recurrence Sequence-to-Sequence Model for Scene Text Recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR). 781\u2013786."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"e_1_3_2_1_37_1","volume-title":"Scene Text Recognition Using Part-Based Tree-Structured Character Detection. In 2013 IEEE Conference on Computer Vision and Pattern Recognition. 2961\u20132968","author":"Shi Cunzhao","year":"2013","unstructured":"Cunzhao Shi, Chunheng Wang, Baihua Xiao, Yang Zhang, Song Gao, and Zhong Zhang. 2013. Scene Text Recognition Using Part-Based Tree-Structured Character Detection. In 2013 IEEE Conference on Computer Vision and Pattern Recognition. 2961\u20132968."},{"key":"e_1_3_2_1_38_1","volume-title":"Adams Wai-Kin Kong, and Jung-Jae Kim","author":"Tan Yew\u00a0Lee","year":"2022","unstructured":"Yew\u00a0Lee Tan, Adams Wai-Kin Kong, and Jung-Jae Kim. 2022. Pure Transformer with Integrated Experts for Scene Text Recognition. In Computer Vision \u2013 ECCV 2022. 481\u2013497."},{"key":"e_1_3_2_1_39_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017. 5998\u20136008."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6891"},{"key":"e_1_3_2_1_41_1","volume-title":"End-to-End Scene Text Recognition. In ICCV","author":"Wang Kai","year":"2011","unstructured":"Kai Wang, Boris Babenko, and Serge Belongie. 2011. End-to-End Scene Text Recognition. In ICCV 2011. 1457\u20131464."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6903"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_9"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00216"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_27"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Thessaloniki Greece","acronym":"ICMR '23"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592214","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592214","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:51:22Z","timestamp":1750182682000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592214"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":45,"alternative-id":["10.1145\/3591106.3592214","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592214","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}