{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:50:19Z","timestamp":1772905819659,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of Hubei Province","award":["2023AFB775"],"award-info":[{"award-number":["2023AFB775"]}]},{"name":"Natural Science Foundation of Chongqing","award":["CSTB2023NSCQ-MSX0879"],"award-info":[{"award-number":["CSTB2023NSCQ-MSX0879"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681551","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1741-1750","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Trust Prophet or Not? Taking a Further Verification Step toward Accurate Scene Text Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9328-7068","authenticated-orcid":false,"given":"Anna","family":"Zhu","sequence":"first","affiliation":[{"name":"Wuhan University of Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3959-4520","authenticated-orcid":false,"given":"Ke","family":"Xiao","sequence":"additional","affiliation":[{"name":"Wuhan University of Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9228-5235","authenticated-orcid":false,"given":"Bo","family":"Zhou","sequence":"additional","affiliation":[{"name":"Wuhan University of Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9687-9918","authenticated-orcid":false,"given":"Runmin","family":"Wang","sequence":"additional","affiliation":[{"name":"Hunan Normal University, Changsha, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Joint Visual Semantic Reasoning: Multi- Stage Decoder for Text Recognition. In International Conference on Computer Vision (ICCV).","author":"Bhunia Ayan Kumar","year":"2021","unstructured":"Ayan Kumar Bhunia, Aneeshan Sain, Amandeep Kumar, Shuvozit Ghose, Pinaki Nath Chowdhury, and Yi-Zhe Song. 2021. Joint Visual Semantic Reasoning: Multi- Stage Decoder for Text Recognition. In International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_2_2_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry AmandaAskell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger T. J. Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeff Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (NeurIPS). 1877--1901."},{"key":"e_1_3_2_2_3_1","volume-title":"LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition. In International Conference on Computer Vision (ICCV). 12120--12127","author":"Cheng Changxu","year":"2023","unstructured":"Changxu Cheng, Peng Wang, Cheng Da, Qi Zheng, and Cong Yao. 2023. LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition. In International Conference on Computer Vision (ICCV). 12120--12127."},{"key":"e_1_3_2_2_4_1","volume-title":"Focusing Attention: Towards Accurate Text Recognition in Natural Images. In International Conference on Computer Vision (ICCV). 5086--5094","author":"Cheng Zhanzhan","year":"2017","unstructured":"Zhanzhan Cheng, Fan Bai, Yunlu Xu, Gang Zheng, Shiliang Pu, and Shuigeng Zhou. 2017. Focusing Attention: Towards Accurate Text Recognition in Natural Images. In International Conference on Computer Vision (ICCV). 5086--5094."},{"key":"e_1_3_2_2_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American Chapter of the Association for Computational Linguistics (NAACL)."},{"key":"e_1_3_2_2_6_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16Words Transformers for Image Recognition at Scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/124"},{"key":"e_1_3_2_2_8_1","volume-title":"Bidirectional and Iterative Language Modeling for Scene Text Recognition. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7094--7103","author":"Fang Shancheng","year":"2021","unstructured":"Shancheng Fang, Hongtao Xie, Yuxin Wang, Zhendong Mao, and Yongdong Zhang. 2021. Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7094--7103."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475362"},{"key":"e_1_3_2_2_10_1","volume-title":"Synthetic Data for Text Localisation in Natural Images. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 2315--2324","author":"Gupta Ankush","year":"2016","unstructured":"Ankush Gupta, Andrea Vedaldi, and Andrew Zisserman. 2016. Synthetic Data for Text Localisation in Natural Images. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 2315--2324."},{"key":"e_1_3_2_2_11_1","volume-title":"Visual Semantics Allow for Textual Reasoning Better in Scene Text Recognition. In AAAI Conference on Artificial Intelligence (AAAI). 888--896","author":"He Yang","year":"2022","unstructured":"Yang He, Chen Chen, Jing Zhang, Juhua Liu, Fengxiang He, Chaoyue Wang, and Bo Du. 2022. Visual Semantics Allow for Textual Reasoning Better in Scene Text Recognition. In AAAI Conference on Artificial Intelligence (AAAI). 888--896."},{"key":"e_1_3_2_2_12_1","volume-title":"AAAI Conference on Artificial Intelligence (AAAI). 8610--8617","author":"Hui Li Chunhua Shen","year":"2019","unstructured":"Chunhua Shen Hui Li, PengWang and Guyu Zhang. 2019. Show, Attend and Read: A simple and strong baseline for irregular text recognition. In AAAI Conference on Artificial Intelligence (AAAI). 8610--8617."},{"key":"e_1_3_2_2_13_1","volume-title":"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition. ArXiv abs\/1406.2227","author":"Jaderberg Max","year":"2014","unstructured":"Max Jaderberg, Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2014. Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition. ArXiv abs\/1406.2227 (2014)."},{"key":"e_1_3_2_2_14_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 546--547","author":"Seong Joon Oh Seonghyeon Jeonghun Baek","year":"2020","unstructured":"Jeonghun Baek Seong Joon Oh Seonghyeon Kim Junyeop Lee, Sungrae Park and Hwalsuk Lee. 2020. On recognizing texts of arbitrary shapes with 2d selfattention. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 546--547."},{"key":"e_1_3_2_2_15_1","volume-title":"ICDAR 2015 competition on Robust Reading. In International Conference on Document Analysis and Recognition (ICDAR). 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Llu\u00eds G\u00f3mez i Bigorda, Anguelos Nicolaou, Suman K. Ghosh, Andrew D. Bagdanov, M. Iwamura, Jiri Matas, Luk\u00e1s Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, Faisal Shafait, Seiichi Uchida, and Ernest Valveny. 2015. ICDAR 2015 competition on Robust Reading. In International Conference on Document Analysis and Recognition (ICDAR). 1156--1160."},{"key":"e_1_3_2_2_16_1","volume-title":"ICDAR 2013 Robust Reading Competition. In International Conference on Document Analysis and Recognition (ICDAR). 1484--1493","author":"Karatzas Dimosthenis","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, M. Iwamura, Llu\u00eds G\u00f3mez i Bigorda, Sergi Robles Mestre, Joan Mas Romeu, David Fern\u00e1ndez Mota, Jon Almaz\u00e1n, and Llu\u00eds-Pere de las Heras. 2013. ICDAR 2013 Robust Reading Competition. In International Conference on Document Analysis and Recognition (ICDAR). 1484--1493."},{"key":"e_1_3_2_2_17_1","volume-title":"Mask TextSpotter: An End-to-End Trainable Neural Network for Spotting Text with Arbitrary Shapes","author":"Liao Minghui","year":"2021","unstructured":"Minghui Liao, Pengyuan Lyu, Minghang He, Cong Yao, Wenhao Wu, and Xiang Bai. 2021. Mask TextSpotter: An End-to-End Trainable Neural Network for Spotting Text with Arbitrary Shapes. IEEE Trans Pattern Anal Mach Intell (2021), 532--548."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107980"},{"key":"e_1_3_2_2_19_1","volume-title":"British Machine Vision Conference (BMVC). 1--11","author":"Mishra Anand","unstructured":"Anand Mishra, Alahari Karteek, and C. V. Jawahar. 2009. Scene Text Recognition using Higher Order Language Priors. In British Machine Vision Conference (BMVC). 1--11."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_26"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Matthew E. Peters Mark Neumann Mohit Iyyer Matt Gardner Christopher Clark Kenton Lee and Luke Zettlemoyer. 2018. Deep Contextualized Word Representations. In North American Chapter of the Association for Computational Linguistics (NAACL). 2227--2237.","DOI":"10.18653\/v1\/N18-1202"},{"key":"e_1_3_2_2_22_1","volume-title":"Recognizing Text with Perspective Distortion in Natural Scenes. In International Conference on Computer Vision (ICCV). 569--576","author":"Phan Trung Quy","year":"2013","unstructured":"Trung Quy Phan, Palaiahnakote Shivakumara, Shangxuan Tian, and Chew Lim Tan. 2013. Recognizing Text with Perspective Distortion in Natural Scenes. In International Conference on Computer Vision (ICCV). 569--576."},{"key":"e_1_3_2_2_23_1","volume-title":"Iterative and Mimicking Network for Scene Text Recognition. In ACM International Conference on Multimedia (ACM MM). 2046--2055","author":"Qiao Zhi","year":"2021","unstructured":"Zhi Qiao, Yu Zhou, Jin Wei, Wei Wang, Yuanqing Zhang, Ning Jiang, Hongbin Wang, andWeipingWang. 2021. PIMNet: A Parallel, Iterative and Mimicking Network for Scene Text Recognition. In ACM International Conference on Multimedia (ACM MM). 2046--2055."},{"key":"e_1_3_2_2_24_1","volume-title":"SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13525-- 13534","author":"Qiao Zhi","year":"2020","unstructured":"Zhi Qiao, Yu Zhou, Dongbao Yang, Yucan Zhou, and Weiping Wang. 2020. SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13525-- 13534."},{"key":"e_1_3_2_2_25_1","volume-title":"Toeplitz Neural Network for Sequence Modeling. In International Conference on Learning Representations (ICLR).","author":"Qin Zhen","year":"2023","unstructured":"Zhen Qin, Xiaodong Han, Weixuan Sun, Bowen He, Dong Li, Dongxu Li, Yuchao Dai, Lingpeng Kong, and Yiran Zhong. 2023. Toeplitz Neural Network for Sequence Modeling. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_26_1","volume-title":"Revisiting Scene Text Recognition: A Data Perspective. In International Conference on Computer Vision (ICCV).","author":"Chongyu Liu Lianwen Jin Dezhi Peng","year":"2023","unstructured":"Dezhi Peng Chongyu Liu Lianwen Jin Qing Jiang, Jiapeng Wang. 2023. Revisiting Scene Text Recognition: A Data Perspective. In International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_2_28_1","volume-title":"NRTR:ANo-Recurrence Sequenceto- Sequence Model for Scene Text Recognition. In International Conference on Document Analysis and Recognition (ICDAR). 781--786","author":"Sheng Fenfen","year":"2019","unstructured":"Fenfen Sheng, Zhineng Chen, and Bo Xu. 2019. NRTR:ANo-Recurrence Sequenceto- Sequence Model for Scene Text Recognition. In International Conference on Document Analysis and Recognition (ICDAR). 781--786."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"e_1_3_2_2_31_1","volume-title":"Le","author":"Sutskever Ilya","year":"2014","unstructured":"Ilya Sutskever, Oriol Vinyals, and Quoc V. Le. 2014. Sequence to Sequence Learning with Neural Networks. In Advances in Neural Information Processing Systems (NeurIPS). 3104--3112."},{"key":"e_1_3_2_2_32_1","volume-title":"Pure Transformer with Integrated Experts for Scene Text Recognition. In European Conference on Computer Vision (ECCV). 481--497","author":"Tan Yew Lee","year":"2022","unstructured":"Yew Lee Tan, Adams Wai-Kin Kong, and Jung-Jae Kim. 2022. Pure Transformer with Integrated Experts for Scene Text Recognition. In European Conference on Computer Vision (ECCV). 481--497."},{"key":"e_1_3_2_2_33_1","unstructured":"Ashish Vaswani Noam M. Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In Advances in neural information processing systems (NeurIPS)."},{"key":"e_1_3_2_2_34_1","volume-title":"TextScanner: Reading Characters in Order for Robust Scene Text Recognition. In AAAI Conference on Artificial Intelligence (AAAI). 12120--12127","author":"Wan Zhaoyi","year":"2020","unstructured":"Zhaoyi Wan, Minghang He, Haoran Chen, Xiang Bai, and Cong Yao. 2020. TextScanner: Reading Characters in Order for Robust Scene Text Recognition. In AAAI Conference on Artificial Intelligence (AAAI). 12120--12127."},{"key":"e_1_3_2_2_35_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4556-- 4565","author":"Bai Xiang","year":"2021","unstructured":"HaoWang, Xiang Bai, Mingkun Yang, Shenggao Zhu, JingWang, andWenyu Liu. 2021. Scene Text Retrieval via Joint Text Detection and Similarity Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4556-- 4565."},{"key":"e_1_3_2_2_36_1","volume-title":"Towards Robust Visual Information Extraction in Real World: New Dataset and Novel Solution. In AAAI Conference on Artificial Intelligence (AAAI). 2738--2745","author":"Wang Jiapeng","year":"2021","unstructured":"Jiapeng Wang, Chongyu Liu, Lianwen Jin, Guozhi Tang, Jiaxin Zhang, Shuaitao Zhang, Qianying Wang, Y. Wu, and Mingxiang Cai. 2021. Towards Robust Visual Information Extraction in Real World: New Dataset and Novel Solution. In AAAI Conference on Artificial Intelligence (AAAI). 2738--2745."},{"key":"e_1_3_2_2_37_1","volume-title":"International Conference on Computer Vision (ICCV). 1457--1464","author":"Wang Kai","unstructured":"Kai Wang, Boris Babenko, and Serge J. Belongie. 2011. End-to-end scene text recognition. In International Conference on Computer Vision (ICCV). 1457--1464."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6903"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548103"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-023-3935-8"},{"key":"e_1_3_2_2_42_1","volume-title":"Toward UnderstandingWordArt: Corner-Guided Transformer for Scene Text Recognition. In European Conference on Computer Vision (ECCV). 303--321","author":"Xie Xudong","year":"2022","unstructured":"Xudong Xie, Ling Fu, Zhifei Zhang, ZhaowenWang, and Xiang Bai. 2022. Toward UnderstandingWordArt: Corner-Guided Transformer for Scene Text Recognition. In European Conference on Computer Vision (ECCV). 303--321."},{"key":"e_1_3_2_2_43_1","volume-title":"ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training. Findings of the Association for Computational Linguistics","author":"Yan Yu","year":"2020","unstructured":"Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, and Ming Zhou. 2020. ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training. Findings of the Association for Computational Linguistics (2020), 2401--2410."},{"key":"e_1_3_2_2_44_1","volume-title":"Towards Accurate Scene Text Recognition With Semantic Reasoning Networks. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 12110--12119","author":"Yu Deli","year":"2020","unstructured":"Deli Yu, Xuan Li, Chengquan Zhang, Junyu Han, Jingtuo Liu, and Errui Ding. 2020. Towards Accurate Scene Text Recognition With Semantic Reasoning Networks. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 12110--12119."},{"key":"e_1_3_2_2_45_1","volume-title":"RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition. In European Conference on Computer Vision (ECCV). 135--151","author":"Yue Xiaoyu","year":"2020","unstructured":"Xiaoyu Yue, Zhanghui Kuang, Chenhao Lin, Hongbin Sun, and Wayne Zhang. 2020. RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition. In European Conference on Computer Vision (ECCV). 135--151."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/189"},{"key":"e_1_3_2_2_47_1","unstructured":"Shuai Zhao Xiaohan Wang Linchao Zhu and Yezhou Yang. 2023. CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model."},{"key":"e_1_3_2_2_48_1","volume-title":"CDistNet: Perceiving Multi-domain Character Distance for Robust Text Recognition. Int J Comput Vis","author":"Zheng Tianlun","year":"2023","unstructured":"Tianlun Zheng, Zhineng Chen, Shancheng Fang, Hongtao Xie, and Yu-Gang Jiang. 2023. CDistNet: Perceiving Multi-domain Character Distance for Robust Text Recognition. Int J Comput Vis (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681551","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681551","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681551"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":48,"alternative-id":["10.1145\/3664647.3681551","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681551","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}