{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T16:02:16Z","timestamp":1758124936873,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":85,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"State Key Laboratory of Communication Content Cognition, People?s Daily Online","award":["2020YFB1406902 and A12003"],"award-info":[{"award-number":["2020YFB1406902 and A12003"]}]},{"name":"the Key Research Program of Frontier Sciences, CAS","award":["ZDBS-LY-7024"],"award-info":[{"award-number":["ZDBS-LY-7024"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612383","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"1851-1862","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Perceiving Ambiguity and Semantics without Recognition: An Efficient and Effective Ambiguous Scene Text Detector"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5544-9425","authenticated-orcid":false,"given":"Yan","family":"Shu","sequence":"first","affiliation":[{"name":"State Key Laboratory of Communication Content Cognition, People's Daily Online &amp; Harbin Institute of Technology, Beijing; Harbin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6847-469X","authenticated-orcid":false,"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4188-9953","authenticated-orcid":false,"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1810-5412","authenticated-orcid":false,"given":"Shaohui","family":"Liu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2173-6327","authenticated-orcid":false,"given":"Aoting","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8628-411X","authenticated-orcid":false,"given":"Dongbao","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8618-4992","authenticated-orcid":false,"given":"Weipinng","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Youngmin Baek Bado Lee Dongyoon Han Sangdoo Yun and Hwalsuk Lee. 2019. Character region awareness for text detection. In CVPR. 9365--9374.","DOI":"10.1109\/CVPR.2019.00959"},{"volume-title":"Large-scale machine learning with stochastic gradient descent","author":"Bottou L\u00e9on","key":"e_1_3_2_1_2_1","unstructured":"L\u00e9on Bottou. 2010. Large-scale machine learning with stochastic gradient descent. In COMPSTAT. Springer, 177--186."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Jingye Chen Bin Li and Xiangyang Xue. 2021. Scene text telescope: Text-focused scene image super-resolution. In CVPR. 12026--12035.","DOI":"10.1109\/CVPR46437.2021.01185"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Mengjun Cheng Yipeng Sun Longchao Wang Xiongwei Zhu Kun Yao Jie Chen Guoli Song Junyu Han Jingtuo Liu Errui Ding et al. 2022. ViSTA: vision and scene text aggregation for cross-modal retrieval. In CVPR. 5184--5193.","DOI":"10.1109\/CVPR52688.2022.00512"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12269"},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_7_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"I3CL: Intra-and Inter-Instance Collaborative Learning for Arbitrary-shaped Scene Text Detection. IJCV","author":"Du Bo","year":"2022","unstructured":"Bo Du, Jian Ye, Jing Zhang, Juhua Liu, and Dacheng Tao. 2022b. I3CL: Intra-and Inter-Instance Collaborative Learning for Arbitrary-shaped Scene Text Detection. IJCV (2022), 1--17."},{"key":"e_1_3_2_1_9_1","volume-title":"SVTR: Scene Text Recognition with a Single Visual Model. In IJCAI. 884--890.","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022a. SVTR: Scene Text Recognition with a Single Visual Model. In IJCAI. 884--890."},{"key":"e_1_3_2_1_10_1","volume-title":"A3S: Adversarial learning of semantic representations for Scene-Text Spotting. arXiv preprint arXiv:2302.10641","author":"Fujitake Masato","year":"2023","unstructured":"Masato Fujitake. 2023. A3S: Adversarial learning of semantic representations for Scene-Text Spotting. arXiv preprint arXiv:2302.10641 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Anton Van den Hengel, and Qi Wu","author":"Gao Chenyu","year":"2021","unstructured":"Chenyu Gao, Qi Zhu, Peng Wang, Hui Li, Yuliang Liu, Anton Van den Hengel, and Qi Wu. 2021. Structured multimodal attentions for textvqa. IEEE TPAMI (2021)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Albert Gordo Jon Almaz\u00e1n Naila Murray and Florent Perronin. 2015. LEWIS: latent embeddings for word images and their semantics. In ICCV. 1242--1250.","DOI":"10.1109\/ICCV.2015.147"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"A. Gupta A. Vedaldi and Andrew Zisserman. 2016. Synthetic Data for Text Localisation in Natural Images. In CVPR. 2315--2324.","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_14_1","unstructured":"Kaiming He Georgia Gkioxari Piotr Doll\u00e1r and Ross Girshick. 2017. Mask r-cnn. In ICCV. 2961--2969."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","first-page":"5406","article-title":"Multi-Oriented and Multi-Lingual Scene Text Detection With Direct Regression","volume":"27","author":"He Wenhao","year":"2018","unstructured":"Wenhao He, Xu-Yao Zhang, Fei Yin, and Cheng-Lin Liu. 2018. Multi-Oriented and Multi-Lingual Scene Text Detection With Direct Regression. IEEE TIP, Vol. 27, 11 (2018), 5406--5419.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_17_1","volume-title":"Swintextspotter: Scene text spotting via better synergy between text detection and text recognition. In CVPR. 4593--4603.","author":"Huang Mingxin","year":"2022","unstructured":"Mingxin Huang, Yuliang Liu, Zhenghao Peng, Chongyu Liu, Dahua Lin, Shenggao Zhu, Nicholas Yuan, Kai Ding, and Lianwen Jin. 2022. Swintextspotter: Scene text spotting via better synergy between text detection and text recognition. In CVPR. 4593--4603."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Zhicheng Huang Zhaoyang Zeng Yupan Huang Bei Liu Dongmei Fu and Jianlong Fu. 2021. Seeing out of the box: End-to-end pre-training for vision-language representation learning. In CVPR. 12976--12985.","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_1_19_1","volume-title":"Adam: A Method for Stochastic Optimization. Computer Science","author":"Kingma D.","year":"2014","unstructured":"D. Kingma and J. Ba. 2014. Adam: A Method for Stochastic Optimization. Computer Science (2014)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Yair Kittenplon Inbal Lavi Sharon Fogel Yarin Bar R Manmatha and Pietro Perona. 2022. Towards weakly-supervised text spotting using a multi-task transformer. In CVPR. 4604--4613.","DOI":"10.1109\/CVPR52688.2022.00456"},{"volume-title":"Word spotting and recognition using deep embedding","author":"Krishnan Praveen","key":"e_1_3_2_1_21_1","unstructured":"Praveen Krishnan, Kartik Dutta, and CV Jawahar. 2018. Word spotting and recognition using deep embedding. In DAS. IEEE, 1--6."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_23_1","volume-title":"Shape Robust Text Detection With Progressive Scale Expansion Network. CVPR","author":"Li Xiang","year":"2019","unstructured":"Xiang Li, Wenhai Wang, Wenbo Hou, Ruo-Ze Liu, Tong Lu, and Jian Yang. 2019. Shape Robust Text Detection With Progressive Scale Expansion Network. CVPR (2019), 9328--9337."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Xiaoni Li Yu Zhou Yifei Zhang Aoting Zhang Wei Wang Ning Jiang Haiying Wu and Weiping Wang. 2021. Dense semantic contrast for self-supervised visual representation learning. In ACM MM. 1368--1376.","DOI":"10.1145\/3474085.3475551"},{"key":"e_1_3_2_1_25_1","first-page":"3676","article-title":"Textboxes: A single-shot oriented scene text detector","volume":"27","author":"Liao Minghui","year":"2018","unstructured":"Minghui Liao, Baoguang Shi, and Xiang Bai. 2018. Textboxes: A single-shot oriented scene text detector. IEEE TIP, Vol. 27, 8 (2018), 3676--3690.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Minghui Liao Baoguang Shi X. Bai Xinggang Wang and Wenyu Liu. 2017. TextBoxes: A Fast Text Detector with a Single Deep Neural Network. In AAAI.","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"e_1_3_2_1_28_1","unstructured":"Tsung-Yi Lin Piotr Doll\u00e1r Ross Girshick Kaiming He Bharath Hariharan and Serge Belongie. 2017. Feature pyramid networks for object detection. In CVPR. 2117--2125."},{"key":"e_1_3_2_1_29_1","volume-title":"SSD: Single Shot MultiBox Detector. In ECCV.","author":"Liu W.","year":"2016","unstructured":"W. Liu, Dragomir Anguelov, D. Erhan, Christian Szegedy, Scott E. Reed, Cheng-Yang Fu, and A. Berg. 2016. SSD: Single Shot MultiBox Detector. In ECCV."},{"key":"e_1_3_2_1_30_1","volume-title":"Fots: Fast oriented text spotting with a unified network. In CVPR. 5676--5685.","author":"Liu Xuebo","year":"2018","unstructured":"Xuebo Liu, Ding Liang, Shi Yan, Dagui Chen, Yu Qiao, and Junjie Yan. 2018. Fots: Fast oriented text spotting with a unified network. In CVPR. 5676--5685."},{"key":"e_1_3_2_1_31_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_32_1","first-page":"8048","article-title":"Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting","volume":"44","author":"Liu Yuliang","year":"2021","unstructured":"Yuliang Liu, Chunhua Shen, Lianwen Jin, Tong He, Peng Chen, Chongyu Liu, and Hao Chen. 2021. Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting. IEEE TPAMI, Vol. 44, 11 (2021), 8048--8064.","journal-title":"IEEE TPAMI"},{"key":"e_1_3_2_1_33_1","unstructured":"Yuliang Liu Jiaxin Zhang Dezhi Peng Mingxin Huang Xinyu Wang Jingqun Tang Can Huang Dahua Lin Chunhua Shen Xiang Bai et al. 2023. SPTS v2: Single-Point Scene Text Spotting. arXiv preprint arXiv:2301.01635 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Textsnake: A flexible representation for detecting text of arbitrary shapes. In ECCV. 20--36.","author":"Long Shangbang","year":"2018","unstructured":"Shangbang Long, Jiaqiang Ruan, Wenjie Zhang, Xin He, Wenhao Wu, and Cong Yao. 2018. Textsnake: A flexible representation for detecting text of arbitrary shapes. In ECCV. 20--36."},{"key":"e_1_3_2_1_35_1","volume-title":"NIPS","volume":"32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. NIPS, Vol. 32."},{"key":"e_1_3_2_1_36_1","unstructured":"Pengyuan Lyu Minghui Liao Cong Yao Wenhao Wu and Xiang Bai. 2018. Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. In ECCV. 67--83."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Yiwei Ma Guohai Xu Xiaoshuai Sun Ming Yan Ji Zhang and Rongrong Ji. 2022. X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. In ACM MM. 638--647.","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_38_1","volume-title":"Vspw: A large-scale dataset for video scene parsing in the wild. In CVPR. 4133--4143.","author":"Miao Jiaxu","year":"2021","unstructured":"Jiaxu Miao, Yunchao Wei, Yu Wu, Chen Liang, Guangrui Li, and Yi Yang. 2021. Vspw: A large-scale dataset for video scene parsing in the wild. In CVPR. 4133--4143."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Dezhi Peng Xinyu Wang Yuliang Liu Jiaxin Zhang Mingxin Huang Songxuan Lai Jing Li Shenggao Zhu Dahua Lin Chunhua Shen et al. 2022. SPTS: single-point text spotting. In ACM MM. 4272--4281.","DOI":"10.1145\/3503161.3547942"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Zhi Qiao Yu Zhou Jin Wei Wei Wang Yuan Zhang Ning Jiang Hongbin Wang and Weiping Wang. 2021. PIMNet: a parallel iterative and mimicking network for scene text recognition. In ACM MM. 2046--2055.","DOI":"10.1145\/3474085.3475238"},{"key":"e_1_3_2_1_41_1","volume-title":"Seed: Semantics enhanced encoder-decoder framework for scene text recognition. In CVPR. 13528--13537.","author":"Qiao Zhi","year":"2020","unstructured":"Zhi Qiao, Yu Zhou, Dongbao Yang, Yucan Zhou, and Weiping Wang. 2020. Seed: Semantics enhanced encoder-decoder framework for scene text recognition. In CVPR. 13528--13537."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Yu Zhou Youhui Guo Dayan Wu Zhihong Tian Ning Jiang Hongbin Wang and Weiping Wang. 2021b. Mask is all you need: Rethinking mask R-CNN for dense and arbitrary-shaped scene text detection. In ACM MM. 414--423.","DOI":"10.1145\/3474085.3475178"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Yu Zhou Youhui Guo Dayan Wu and Weiping Wang. 2021a. Fc 2 rn: a fully convolutional corner refinement network for accurate multi-oriented scene text detection. In ICASSP. 4350--4354.","DOI":"10.1109\/ICASSP39728.2021.9413821"},{"key":"e_1_3_2_1_44_1","volume-title":"NIPS","volume":"28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. NIPS, Vol. 28."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Huawen Shen Xiang Gao Jin Wei Liang Qiao Yu Zhou Qiang Li and Zhanzhan Cheng. 2023. Divide Rows and Conquer Cells: Towards Structure Recognition for Large Tables. In IJCAI.","DOI":"10.24963\/ijcai.2023\/152"},{"key":"e_1_3_2_1_46_1","first-page":"335","article-title":"Centripetaltext: An efficient text instance representation for scene text detection","volume":"34","author":"Sheng Tao","year":"2021","unstructured":"Tao Sheng, Jie Chen, and Zhouhui Lian. 2021. Centripetaltext: An efficient text instance representation for scene text detection. NIPS, Vol. 34, 335--346.","journal-title":"NIPS"},{"key":"e_1_3_2_1_47_1","unstructured":"Baoguang Shi Xiang Bai and Serge Belongie. 2017. Detecting oriented text in natural images by linking segments. In CVPR. 2550--2558."},{"key":"e_1_3_2_1_48_1","volume-title":"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition","author":"Shi Baoguang","year":"2015","unstructured":"Baoguang Shi, Xiang Bai, and Cong Yao. 2015. An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition. IEEE TPAMI (2015)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Abhinav Shrivastava Abhinav Gupta and Ross Girshick. 2016. Training region-based object detectors with online hard example mining. In CVPR. 761--769.","DOI":"10.1109\/CVPR.2016.89"},{"key":"e_1_3_2_1_51_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Sibo Song Jianqiang Wan Zhibo Yang Jun Tang Wenqing Cheng Xiang Bai and Cong Yao. 2022. Vision-Language Pre-Training for Boosting Scene Text Detectors. In CVPR. 15681--15691.","DOI":"10.1109\/CVPR52688.2022.01523"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Jingqun Tang Su Qiao Benlei Cui Yuhang Ma Sheng Zhang and Dimitrios Kanoulas. 2022. You Can even Annotate Text with Voice: Transcription-only-Supervised Text Spotting. In ACM MM. 4154--4163.","DOI":"10.1145\/3503161.3547787"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Zhi Tian Weilin Huang Tong He Pan He and Yu Qiao. 2016. Detecting Text in Natural Image with Connectionist Text Proposal Network. In ECCV.","DOI":"10.1007\/978-3-319-46484-8_4"},{"key":"e_1_3_2_1_55_1","volume-title":"Fcos: Fully convolutional one-stage object detection. In ICCV. 9627--9636.","author":"Tian Zhi","year":"2019","unstructured":"Zhi Tian, Chunhua Shen, Hao Chen, and Tong He. 2019. Fcos: Fully convolutional one-stage object detection. In ICCV. 9627--9636."},{"key":"e_1_3_2_1_56_1","first-page":"1","article-title":"Fuzzy Semantics for Arbitrary-Shaped Scene Text Detection","volume":"32","author":"Wang Fangfang","year":"2022","unstructured":"Fangfang Wang, Xiaogang Xu, Yifeng Chen, and Xi Li. 2022b. Fuzzy Semantics for Arbitrary-Shaped Scene Text Detection. IEEE TIP, Vol. 32 (2022), 1--12.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Hao Wang Xiang Bai Mingkun Yang Shenggao Zhu Jing Wang and Wenyu Liu. 2021a. Scene text retrieval via joint text detection and similarity learning. In CVPR. 4558--4567.","DOI":"10.1109\/CVPR46437.2021.00453"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Qingqing Wang Liqiang Xiao Yue Lu Yaohui Jin and Hao He. 2021b. Towards reasoning ability in scene text visual question answering. In ACM MM. 2281--2289.","DOI":"10.1145\/3474085.3475390"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Wenhai Wang Xuebo Liu Xiaozhong Ji Enze Xie Ding Liang ZhiBo Yang Tong Lu Chunhua Shen and Ping Luo. 2020. Ae textspotter: Learning visual and linguistic representation for ambiguous text spotting. In ECCV. 457--473.","DOI":"10.1007\/978-3-030-58568-6_27"},{"key":"e_1_3_2_1_60_1","first-page":"5349","article-title":"Pan: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text","volume":"44","author":"Wang Wenhai","year":"2021","unstructured":"Wenhai Wang, Enze Xie, Xiang Li, Xuebo Liu, Ding Liang, Zhibo Yang, Tong Lu, and Chunhua Shen. 2021c. Pan: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE TPAMI, Vol. 44, 9 (2021), 5349--5367.","journal-title":"IEEE TPAMI"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Wenhai Wang Enze Xie Xiaoge Song Y. Zang Wenjia Wang T. Lu G. Yu and Chunhua Shen. 2019. Efficient and Accurate Arbitrary-Shaped Text Detection With Pixel Aggregation Network. In ICCV. 8439--8448.","DOI":"10.1109\/ICCV.2019.00853"},{"key":"e_1_3_2_1_62_1","volume-title":"Tpsnet: Reverse thinking of thin plate splines for arbitrary shape scene text representation. In ACM MM. 5014--5025.","author":"Wang Wei","year":"2022","unstructured":"Wei Wang, Yu Zhou, Jiahao Lv, Dayan Wu, Guoqing Zhao, Ning Jiang, and Weipinng Wang. 2022c. Tpsnet: Reverse thinking of thin plate splines for arbitrary shape scene text representation. In ACM MM. 5014--5025."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Yuxin Wang Hongtao Xie Mengting Xing Jing Wang Shenggao Zhu and Yongdong Zhang. 2022a. Detecting Tampered Scene Text in the Wild. In ECCV. 215--232.","DOI":"10.1007\/978-3-031-19815-1_13"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548051"},{"volume-title":"Semantic and verbatim word spotting using deep neural networks","author":"Wilkinson Tomas","key":"e_1_3_2_1_65_1","unstructured":"Tomas Wilkinson and Anders Brun. 2016. Semantic and verbatim word spotting using deep neural networks. In ICFHR. IEEE, 307--312."},{"key":"e_1_3_2_1_66_1","unstructured":"Xingxing Xie Gong Cheng Jiabao Wang Xiwen Yao and Junwei Han. 2021. Oriented R-CNN for object detection. In ICCV. 3520--3529."},{"key":"e_1_3_2_1_67_1","volume-title":"Philip HS Torr, and Song Bai","author":"Xue Chuhui","year":"2022","unstructured":"Chuhui Xue, Wenqing Zhang, Yu Hao, Shijian Lu, Philip HS Torr, and Song Bai. 2022. Language Matters: A Weakly Supervised Vision-Language Pre-training Approach for Scene Text Detection and Spotting. In ECCV. 284--302."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3209022"},{"key":"e_1_3_2_1_69_1","volume-title":"Tap: Text-aware pre-training for text-vqa and text-caption. In CVPR. 8751--8761.","author":"Yang Zhengyuan","year":"2021","unstructured":"Zhengyuan Yang, Yijuan Lu, Jianfeng Wang, Xi Yin, Dinei Florencio, Lijuan Wang, Cha Zhang, Lei Zhang, and Jiebo Luo. 2021. Tap: Text-aware pre-training for text-vqa and text-caption. In CVPR. 8751--8761."},{"key":"e_1_3_2_1_70_1","first-page":"4737","article-title":"A unified framework for multioriented text detection and recognition","volume":"23","author":"Yao Cong","year":"2014","unstructured":"Cong Yao, Xiang Bai, and Wenyu Liu. 2014. A unified framework for multioriented text detection and recognition. IEEE TIP, Vol. 23, 11 (2014), 4737--4749.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Cong Yao Xiang Bai Wenyu Liu Yi Ma and Zhuowen Tu. 2012. Detecting texts of arbitrary orientations in natural images. In CVPR. 1083--1090.","DOI":"10.1109\/CVPR.2012.6247787"},{"key":"e_1_3_2_1_72_1","first-page":"516","article-title":"TextFuseNet: Scene Text Detection with Richer Fused Features","volume":"20","author":"Ye Jian","year":"2020","unstructured":"Jian Ye, Zhe Chen, Juhua Liu, and Bo Du. 2020. TextFuseNet: Scene Text Detection with Richer Fused Features.. In IJCAI, Vol. 20. 516--522.","journal-title":"IJCAI"},{"key":"e_1_3_2_1_73_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Gangyan Zeng Yuan Zhang Yu Zhou and Xiaomeng Yang. 2021. Beyond OCR VQA: Involving OCR into the flow for robust and accurate textVQA. In ACM MM. 376--385.","DOI":"10.1145\/3474085.3475606"},{"key":"e_1_3_2_1_75_1","first-page":"6967","article-title":"A robust attentional framework for license plate recognition in the wild","volume":"22","author":"Zhang Linjiang","year":"2020","unstructured":"Linjiang Zhang, Peng Wang, Hui Li, Zhen Li, Chunhua Shen, and Yanning Zhang. 2020a. A robust attentional framework for license plate recognition in the wild. IEEE ITITS, Vol. 22, 11 (2020), 6967--6976.","journal-title":"IEEE ITITS"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Rui Zhang Yongsheng Zhou Qianyi Jiang Qi Song Nan Li Kai Zhou Lei Wang Dong Wang Minghui Liao Mingkun Yang et al. 2019. Icdar 2019 robust reading challenge on reading chinese text on signboard. In ICDAR. 1577--1581.","DOI":"10.1109\/ICDAR.2019.00253"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"S. Zhang Xiaobin Zhu Jie-Bo Hou Chang Liu C. Yang Hongfa Wang and XuCheng Yin. 2020b. Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection. In CVPR. 9696--9705.","DOI":"10.1109\/CVPR42600.2020.00972"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3176122"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Xiang Zhang Yongwen Su Subarna Tripathi and Zhuowen Tu. 2022b. Text spotting transformers. In CVPR. 9519--9528.","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"e_1_3_2_1_80_1","volume-title":"Beyond Instance Discrimination: Relation-aware Contrastive Self-supervised Learning. arXiv preprint arXiv:2211.01796","author":"Zhang Yifei","year":"2022","unstructured":"Yifei Zhang, Chang Liu, Yu Zhou, Weiping Wang, Qixiang Ye, and Xiangyang Ji. 2022a. Beyond Instance Discrimination: Relation-aware Contrastive Self-supervised Learning. arXiv preprint arXiv:2211.01796 (2022)."},{"key":"e_1_3_2_1_81_1","volume-title":"Deep neural network for semantic-based text recognition in images. arXiv preprint arXiv:1908.01403","author":"Zheng Yi","year":"2019","unstructured":"Yi Zheng, Qitong Wang, and Margrit Betke. 2019. Deep neural network for semantic-based text recognition in images. arXiv preprint arXiv:1908.01403 (2019)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","unstructured":"Xu Zhong Jianbin Tang and Antonio Jimeno Yepes. 2019. Publaynet: largest dataset ever for document layout analysis. In ICDAR. 1015--1022.","DOI":"10.1109\/ICDAR.2019.00166"},{"key":"e_1_3_2_1_83_1","volume-title":"EAST: An Efficient and Accurate Scene Text Detector. In CVPR. 2642--2651.","author":"Zhou Xinyu","year":"2017","unstructured":"Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, and Jiajun Liang. 2017. EAST: An Efficient and Accurate Scene Text Detector. In CVPR. 2642--2651."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"crossref","unstructured":"Yiqin Zhu Jianyong Chen Lingyu Liang Zhanghui Kuang Lianwen Jin and Wayne Zhang. 2021. Fourier contour embedding for arbitrary-shaped text detection. In CVPR. 3123--3131.","DOI":"10.1109\/CVPR46437.2021.00314"},{"key":"e_1_3_2_1_85_1","first-page":"107336","article-title":"Textmountain: Accurate scene text detection via instance segmentation","volume":"110","author":"Zhu Yixing","year":"2021","unstructured":"Yixing Zhu and Jun Du. 2021. Textmountain: Accurate scene text detection via instance segmentation. PR, Vol. 110 (2021), 107336.","journal-title":"PR"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612383","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612383","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:55:49Z","timestamp":1755820549000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612383"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":85,"alternative-id":["10.1145\/3581783.3612383","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612383","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}