{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T00:45:54Z","timestamp":1768524354149,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":100,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"the Key Research Program of Frontier Sciences, CAS","award":["ZDBS-LY-7024"],"award-info":[{"award-number":["ZDBS-LY-7024"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611801","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"2025-2034","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["Towards Robust Real-Time Scene Text Detection: From Semantic to Instance Representation Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3130-3220","authenticated-orcid":false,"given":"Xugong","family":"Qin","sequence":"first","affiliation":[{"name":"School of Cyber Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3153-8519","authenticated-orcid":false,"given":"Pengyuan","family":"Lyu","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8254-5773","authenticated-orcid":false,"given":"Chengquan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4188-9953","authenticated-orcid":false,"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7155-4076","authenticated-orcid":false,"given":"Kun","family":"Yao","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9518-5914","authenticated-orcid":false,"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9137-5528","authenticated-orcid":false,"given":"Hailun","family":"Lin","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8618-4992","authenticated-orcid":false,"given":"Weiping","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Youngmin Baek Bado Lee Dongyoon Han Sangdoo Yun and Hwalsuk Lee. 2019. Character region awareness for text detection. In CVPR. 9365--9374.","DOI":"10.1109\/CVPR.2019.00959"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"e_1_3_2_1_3_1","volume-title":"Total-text: A comprehensive dataset for scene text detection and recognition. In ICDAR. 935--942.","author":"Ch'ng Chee Kheng","year":"2017","unstructured":"Chee Kheng Ch'ng and Chee Seng Chan. 2017. Total-text: A comprehensive dataset for scene text detection and recognition. In ICDAR. 935--942."},{"key":"e_1_3_2_1_4_1","unstructured":"Pengwen Dai Sanyi Zhang Hua Zhang and Xiaochun Cao. 2021. Progressive Contour Regression for Arbitrary-Shape Scene Text Detection. In CVPR. 7393--7402."},{"key":"e_1_3_2_1_5_1","volume-title":"Pixellink: Detecting scene text via instance segmentation. In AAAI. 6773--6780.","author":"Deng Dan","year":"2018","unstructured":"Dan Deng, Haifeng Liu, Xuelong Li, and Deng Cai. 2018. Pixellink: Detecting scene text via instance segmentation. In AAAI. 6773--6780."},{"key":"e_1_3_2_1_6_1","volume-title":"SVTR: Scene Text Recognition with a Single Visual Model. In IJCAI. 884--890.","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022. SVTR: Scene Text Recognition with a Single Visual Model. In IJCAI. 884--890."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Youhui Guo Yu Zhou Xugong Qin and Weiping Wang. 2021. Which and Where to Focus: A Simple yet Accurate Framework for Arbitrary-Shaped Nearby Text Detection in Scene Images. In ICANN. 271--283.","DOI":"10.1007\/978-3-030-86383-8_22"},{"key":"e_1_3_2_1_8_1","volume-title":"UNITS: Unsupervised Intermediate Training Stage for Scene Text Detection. In ICME. 1--6.","author":"Guo Youhui","year":"2022","unstructured":"Youhui Guo, Yu Zhou, Xugong Qin, Enze Xie, and Weiping Wang. 2022. UNITS: Unsupervised Intermediate Training Stage for Scene Text Detection. In ICME. 1--6."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Ankush Gupta Andrea Vedaldi and Andrew Zisserman. 2016. Synthetic data for text localisation in natural images. In CVPR. 2315--2324.","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_10_1","unstructured":"Dafang He Xiao Yang Chen Liang Zihan Zhou Alexander G Ororbi Daniel Kifer and C Lee Giles. 2017c. Multi-scale FCN with cascaded instance aware segmentation for arbitrary oriented word spotting in the wild. In CVPR. 3519--3528."},{"key":"e_1_3_2_1_11_1","unstructured":"Kaiming He Georgia Gkioxari Piotr Doll\u00e1r and Ross Girshick. 2017a. Mask R-CNN. In ICCV. 2980--2988."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","volume-title":"MOST: A Multi-Oriented Scene Text Detector with Localization Refinement. In CVPR. 8813--8822.","author":"He Minghang","year":"2021","unstructured":"Minghang He, Minghui Liao, Zhibo Yang, Humen Zhong, Jun Tang, Wenqing Cheng, Cong Yao, Yongpan Wang, and Xiang Bai. 2021. MOST: A Multi-Oriented Scene Text Detector with Localization Refinement. In CVPR. 8813--8822."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Pan He Weilin Huang Tong He Qile Zhu Yu Qiao and Xiaolin Li. 2017b. Single shot text detector with regional attention. In ICCV. 3047--3055.","DOI":"10.1109\/ICCV.2017.331"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Wenhao He Xu-Yao Zhang Fei Yin and Cheng-Lin Liu. 2017d. Deep direct regression for multi-oriented scene text detection. In ICCV. 745--753.","DOI":"10.1109\/ICCV.2017.87"},{"key":"e_1_3_2_1_16_1","volume-title":"Gtc: Guided training of ctc towards efficient and accurate scene text recognition. In AAAI. 11005--11012.","author":"Hu Wenyang","year":"2020","unstructured":"Wenyang Hu, Xiaocong Cai, Jun Hou, Shuai Yi, and Zhiping Lin. 2020. Gtc: Guided training of ctc towards efficient and accurate scene text recognition. In AAAI. 11005--11012."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Weilin Huang Yu Qiao and Xiaoou Tang. 2014. Robust scene text detection with convolution neural network induced mser trees. In ECCV. 497--511.","DOI":"10.1007\/978-3-319-10593-2_33"},{"key":"e_1_3_2_1_18_1","volume-title":"Tinne Tuytelaars, and Luc V Gool.","author":"Jia Xu","year":"2016","unstructured":"Xu Jia, Bert De Brabandere, Tinne Tuytelaars, and Luc V Gool. 2016. Dynamic filter networks. In NeurIPS. 667--675."},{"key":"e_1_3_2_1_19_1","volume-title":"ICDAR 2015 competition on robust reading. In ICDAR. 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In ICDAR. 1156--1160."},{"key":"e_1_3_2_1_20_1","volume-title":"Mask textSpotter: An end-to-end trainable neural network for spotting text with arbitrary shapes","author":"Liao Minghui","year":"2021","unstructured":"Minghui Liao, Pengyuan Lyu, Minghang He, Cong Yao, Wenhao Wu, and Xiang Bai. 2021. Mask textSpotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. IEEE TPAMI (2021), 532--548."},{"key":"e_1_3_2_1_21_1","volume-title":"Textboxes: A single-shot oriented scene text detector","author":"Liao Minghui","year":"2018","unstructured":"Minghui Liao, Baoguang Shi, and Xiang Bai. 2018a. Textboxes: A single-shot oriented scene text detector. IEEE TIP (2018), 3676--3690."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Minghui Liao Zhaoyi Wan Cong Yao Kai Chen and Xiang Bai. 2020. Real-time scene text detection with differentiable binarization. In AAAI. 11474--11481.","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Minghui Liao Zhen Zhu Baoguang Shi Gui-song Xia and Xiang Bai. 2018b. In CVPR. 5909--5918.","DOI":"10.1109\/CVPR.2018.00619"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3155612"},{"key":"e_1_3_2_1_25_1","unstructured":"Tsung-Yi Lin Piotr Doll\u00e1r Ross Girshick Kaiming He Bharath Hariharan and Serge Belongie. 2017. Feature pyramid networks for object detection. In CVPR. 2117--2125."},{"key":"e_1_3_2_1_26_1","volume-title":"SSD: Single shot multibox detector. In ECCV. 21--37.","author":"Liu Wei","year":"2016","unstructured":"Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C Berg. 2016. SSD: Single shot multibox detector. In ECCV. 21--37."},{"key":"e_1_3_2_1_27_1","unstructured":"Yuliang Liu Hao Chen Chunhua Shen Tong He Lianwen Jin and Liangwei Wang. 2020a. ABCNet: Real-time scene text spotting with adaptive bezier-curve network. In CVPR. 9806--9815."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Yuliang Liu and Lianwen Jin. 2017. Deep matching prior network: Toward tighter multi-oriented text detection. In CVPR. 3454--3461.","DOI":"10.1109\/CVPR.2017.368"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2954218"},{"key":"e_1_3_2_1_30_1","volume-title":"Curved scene text detection via transverse and longitudinal sequence connection. PR","author":"Liu Yuliang","year":"2019","unstructured":"Yuliang Liu, Lianwen Jin, Shuaitao Zhang, Canjie Luo, and Sheng Zhang. 2019a. Curved scene text detection via transverse and longitudinal sequence connection. PR (2019), 337--345."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Yuliang Liu Sheng Zhang Lianwen Jin Lele Xie Yaqiang Wu and Zhepeng Wang. 2019c. Omnidirectional scene text detection with sequential-free box discretization. In IJCAI. 3052--3058.","DOI":"10.24963\/ijcai.2019\/423"},{"key":"e_1_3_2_1_32_1","unstructured":"Zichuan Liu Guosheng Lin Sheng Yang Fayao Liu Weisi Lin and Wang Ling Goh. 2019b. Towards robust curve text detection with conditional spatial expansion. In CVPR. 7269--7278."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Jonathan Long Evan Shelhamer and Trevor Darrell. 2015. Fully convolutional networks for semantic segmentation. In CVPR. 3431--3440.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01369-0"},{"key":"e_1_3_2_1_35_1","unstructured":"Shangbang Long Siyang Qin Dmitry Panteleev Alessandro Bissacco Yasuhisa Fujii and Michalis Raptis. 2022. Towards end-to-end unified scene text detection and layout analysis. In CVPR. 1049--1059."},{"key":"e_1_3_2_1_36_1","volume-title":"Textsnake: A flexible representation for detecting text of arbitrary shapes. In ECCV. 20--36.","author":"Long Shangbang","year":"2018","unstructured":"Shangbang Long, Jiaqiang Ruan, Wenjie Zhang, Xin He, Wenhao Wu, and Cong Yao. 2018. Textsnake: A flexible representation for detecting text of arbitrary shapes. In ECCV. 20--36."},{"key":"e_1_3_2_1_37_1","unstructured":"Pengyuan Lyu Cong Yao Wenhao Wu Shuicheng Yan and Xiang Bai. 2018. Multi-oriented scene text detection via corner localization and region segmentation. In CVPR. 7553--7563."},{"key":"e_1_3_2_1_38_1","volume-title":"ReLaText: Exploiting visual relationships for arbitrary-shaped scene text detection with graph convolutional networks. PR","author":"Ma Chixiang","year":"2021","unstructured":"Chixiang Ma, Lei Sun, Zhuoyao Zhong, and Qiang Huo. 2021. ReLaText: Exploiting visual relationships for arbitrary-shaped scene text detection with graph convolutional networks. PR (2021), 337--345."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2818020"},{"key":"e_1_3_2_1_40_1","volume-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 3DV. 565--571.","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 3DV. 565--571."},{"key":"e_1_3_2_1_41_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Zhi Qiao Xugong Qin Yu Zhou Fei Yang and Weiping Wang. 2020a. Gaussian Constrained Attention Network for Scene Text Recognition. In ICPR. 3328--3335.","DOI":"10.1109\/ICPR48806.2021.9412806"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Zhi Qiao Yu Zhou Jin Wei Wei Wang Yuan Zhang Ning Jiang Hongbin Wang and Weiping Wang. 2021. PIMNet: A Parallel Iterative and Mimicking Network for Scene Text Recognition. In ACM MM. 2046--2055.","DOI":"10.1145\/3474085.3475238"},{"key":"e_1_3_2_1_44_1","volume-title":"SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition. In CVPR. 13525--13534.","author":"Qiao Zhi","year":"2020","unstructured":"Zhi Qiao, Yu Zhou, Dongbao Yang, Yucan Zhou, and Weiping Wang. 2020b. SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition. In CVPR. 13525--13534."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Yu Zhou Youhui Guo Dayan Wu Zhihong Tian Ning Jiang Hongbin Wang and Weiping Wang. 2021b. Mask is all you need: Rethinking mask R-CNN for dense and arbitrary-shaped scene text detection. In ACM MM. 414--423.","DOI":"10.1145\/3474085.3475178"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Yu Zhou Youhui Guo Dayan Wu and Weiping Wang. 2021a. FC(2)RN: A Fully Convolutional Corner Refinement Network for Accurate Multi-Oriented Scene Text Detection. In ICASSP. 4350--4354.","DOI":"10.1109\/ICASSP39728.2021.9413821"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Xugong Qin Yu Zhou Dongbao Yang and Weiping Wang. 2019. Curved Text Detection in Natural Scene Images with Semi- and Weakly-Supervised Learning. In ICDAR. 559--564.","DOI":"10.1109\/ICDAR.2019.00095"},{"key":"e_1_3_2_1_48_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster R-CNN: Towards real-time object detection with region proposal networks. In NeurIPS. 91--99."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Huawen Shen Xiang Gao Jin Wei Liang Qiao Yu Zhou Qiang Li and Zhanzhan Cheng. 2023. Divide Rows and Conquer Cells: Towards Structure Recognition for Large Tables. In IJCAI.","DOI":"10.24963\/ijcai.2023\/152"},{"key":"e_1_3_2_1_50_1","volume-title":"Centripetaltext: An efficient text instance representation for scene text detection. In NeurIPS. 335--346.","author":"Sheng Tao","year":"2021","unstructured":"Tao Sheng, Jie Chen, and Zhouhui Lian. 2021. Centripetaltext: An efficient text instance representation for scene text detection. In NeurIPS. 335--346."},{"key":"e_1_3_2_1_51_1","unstructured":"Baoguang Shi Xiang Bai and Serge Belongie. 2017a. Detecting oriented text in natural images by linking segments. In CVPR. 2550--2558."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Sibo Song Jianqiang Wan Zhibo Yang Jun Tang Wenqing Cheng Xiang Bai and Cong Yao. 2022. Vision-language pre-training for boosting scene text detectors. In CVPR. 15681--15691.","DOI":"10.1109\/CVPR52688.2022.01523"},{"key":"e_1_3_2_1_55_1","volume-title":"SegLink: Detecting dense and arbitrary-shaped scene text by instance-aware component grouping. PR","author":"Tang Jun","year":"2019","unstructured":"Jun Tang, Zhibo Yang, Yongpan Wang, Qi Zheng, Yongchao Xu, and Xiang Bai. 2019. SegLink: Detecting dense and arbitrary-shaped scene text by instance-aware component grouping. PR (2019), 106954."},{"key":"e_1_3_2_1_56_1","volume-title":"Few Could Be Better Than All: Feature Sampling and Grouping for Scene Text Detection","author":"Tang Jingqun","unstructured":"Jingqun Tang, Wenqing Zhang, Hongye Liu, Mingkun Yang, Bo Jiang, Guanglong Hu, and Xiang Bai. 2022. Few Could Be Better Than All: Feature Sampling and Grouping for Scene Text Detection. In CVPR. IEEE."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Zhi Tian Weilin Huang Tong He Pan He and Yu Qiao. 2016. Detecting text in natural image with connectionist text proposal network. In ECCV. 56--72.","DOI":"10.1007\/978-3-319-46484-8_4"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Zhuotao Tian Michelle Shu Pengyuan Lyu Ruiyu Li Chao Zhou Xiaoyong Shen and Jiaya Jia. 2019. Learning shape-aware embedding for scene text detection. In CVPR. 4234--4243.","DOI":"10.1109\/CVPR.2019.00436"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/129902.129906"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Qi Wan Haoqin Ji and Linlin Shen. 2021. Self-attention based text knowledge mining for text detection. In CVPR. 5983--5992.","DOI":"10.1109\/CVPR46437.2021.00592"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Fangfang Wang Yifeng Chen Fei Wu and Xi Li. 2020a. TextRay: Contour-based geometric modeling for arbitrary-shaped scene text detection. In ACM MM. 111--119.","DOI":"10.1145\/3394171.3413819"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Fangfang Wang Liming Zhao Xi Li Xinchao Wang and Dacheng Tao. 2018. Geometry-aware scene text detection with instance transformation network. In CVPR. 1381--1389.","DOI":"10.1109\/CVPR.2018.00150"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Hao Wang Junchao Liao Tianheng Cheng Zewen Gao Hao Liu Bo Ren Xiang Bai and Wenyu Liu. 2022a. Knowledge mining with scene text for fine-grained recognition. In CVPR. 4624--4633.","DOI":"10.1109\/CVPR52688.2022.00458"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Pengfei Wang Chengquan Zhang Fei Qi Zuming Huang Mengyi En Junyu Han Jingtuo Liu Errui Ding and Guangming Shi. 2019d. A single-shot arbitrarily-shaped text detector based on context attended multi-task learning. In ACM MM. 1277--1285.","DOI":"10.1145\/3343031.3350988"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Wenhai Wang Enze Xie Xiang Li Wenbo Hou Tong Lu Gang Yu and Shuai Shao. 2019b. Shape robust text detection with progressive scale expansion network. In CVPR. 9336--9345.","DOI":"10.1109\/CVPR.2019.00956"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Wenhai Wang Enze Xie Xiaoge Song Yuhang Zang Wenjia Wang Tong Lu Gang Yu and Chunhua Shen. 2019c. Efficient and accurate arbitrary-shaped text detection with pixel aggregation network. In ICCV. 8440--8449.","DOI":"10.1109\/ICCV.2019.00853"},{"key":"e_1_3_2_1_67_1","volume-title":"Tpsnet: Reverse thinking of thin plate splines for arbitrary shape scene text representation. In ACM MM. 5014--5025.","author":"Wang Wei","year":"2022","unstructured":"Wei Wang, Yu Zhou, Jiahao Lv, Dayan Wu, Guoqing Zhao, Ning Jiang, and Weipinng Wang. 2022c. Tpsnet: Reverse thinking of thin plate splines for arbitrary shape scene text representation. In ACM MM. 5014--5025."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Xiaobing Wang Yingying Jiang Zhenbo Luo Cheng-Lin Liu Hyunsoo Choi and Sungjin Kim. 2019a. Arbitrary shape scene text detection with adaptive text region representation. In CVPR. 6449--6458.","DOI":"10.1109\/CVPR.2019.00661"},{"key":"e_1_3_2_1_69_1","volume-title":"Solov2: Dynamic and fast instance segmentation. NeurIPS","author":"Wang Xinlong","year":"2020","unstructured":"Xinlong Wang, Rufeng Zhang, Tao Kong, Lei Li, and Chunhua Shen. 2020c. Solov2: Dynamic and fast instance segmentation. NeurIPS (2020), 17721--17732."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"crossref","unstructured":"Yuxin Wang Hongtao Xie Mengting Xing Jing Wang Shenggao Zhu and Yongdong Zhang. 2022b. Detecting Tampered Scene Text in the Wild. In ECCV. 215--232.","DOI":"10.1007\/978-3-031-19815-1_13"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Yuxin Wang Hongtao Xie Zheng-Jun Zha Mengting Xing Zilong Fu and Yongdong Zhang. 2020b. ContourNet: Taking a further step toward accurate arbitrary-shaped scene text detection. In CVPR. 11750--11759.","DOI":"10.1109\/CVPR42600.2020.01177"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"crossref","unstructured":"Jin Wei Yuan Zhang Yu Zhou Gangyan Zeng Zhi Qiao Youhui Guo Haiying Wu Hongbin Wang and Weiping Wang. 2022. TextBlock: Towards Scene Text Spotting without Fine-Grained Detection. In ACM MM. 5892--5902.","DOI":"10.1145\/3503161.3548051"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"crossref","unstructured":"Yue Wu and Prem Natarajan. 2017. Self-organized text detection with minimal post-processing via border learning. In ICCV. 5000--5009.","DOI":"10.1109\/ICCV.2017.535"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Shanyu Xiao Liangrui Peng Yan Ruijie An Keyu Yao Gang and Min Jaesik. 2020. Sequential deformation for accurate scene text detection. In ECCV. 108--124.","DOI":"10.1007\/978-3-030-58526-6_7"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"crossref","unstructured":"Enze Xie Yuhang Zang Shuai Shao Gang Yu Cong Yao and Guangyao Li. 2019. Scene text detection with supervised pyramid context network. In AAAI. 9038--9045.","DOI":"10.1609\/aaai.v33i01.33019038"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Linjie Xing Zhi Tian Weilin Huang and Matthew R Scott. 2019. Convolutional character networks. In ICCV. 9126--9136.","DOI":"10.1109\/ICCV.2019.00922"},{"key":"e_1_3_2_1_77_1","volume-title":"TextField: Learning a deep direction field for irregular scene text detection","author":"Xu Yongchao","year":"2019","unstructured":"Yongchao Xu, Yukang Wang, Wei Zhou, Yongpan Wang, Zhibo Yang, and Xiang Bai. 2019. TextField: Learning a deep direction field for irregular scene text detection. IEEE TIP (2019), 5566--5579."},{"key":"e_1_3_2_1_78_1","unstructured":"Chuhui Xue Shijian Lu and Fangneng Zhan. 2018. Accurate scene text detection through border semantics awareness and bootstrapping. In ECCV. 355--372."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Chuhui Xue Shijian Lu and Wei Zhang. 2019. MSR: multi-scale shape regression for scene text detection. In IJCAI. 989--995.","DOI":"10.24963\/ijcai.2019\/139"},{"key":"e_1_3_2_1_80_1","volume-title":"Philip HS Torr, and Song Bai","author":"Xue Chuhui","year":"2022","unstructured":"Chuhui Xue, Wenqing Zhang, Yu Hao, Shijian Lu, Philip HS Torr, and Song Bai. 2022. Language matters: A weakly supervised vision-language pre-training approach for scene text detection and spotting. In ECCV. 284--302."},{"key":"e_1_3_2_1_81_1","volume-title":"Inceptext: A new inception-text module with deformable PSROI pooling for multi-oriented scene text detection. In IJCAI. 1071--1077.","author":"Yang Qiangpeng","year":"2018","unstructured":"Qiangpeng Yang, Mengli Cheng, Wenmeng Zhou, Yan Chen, Minghui Qiu, and Wei Lin. 2018. Inceptext: A new inception-text module with deformable PSROI pooling for multi-oriented scene text detection. In IJCAI. 1071--1077."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2014.2353813"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"crossref","unstructured":"Cong Yao Xiang Bai Wenyu Liu Yi Ma and Zhuowen Tu. 2012. Detecting texts of arbitrary orientations in natural images. In CVPR. 1083--1090.","DOI":"10.1109\/CVPR.2012.6247787"},{"key":"e_1_3_2_1_84_1","volume-title":"Scene text detection via holistic, multi-channel prediction. arXiv preprint arXiv:1606.09002","author":"Yao Cong","year":"2016","unstructured":"Cong Yao, Xiang Bai, Nong Sang, Xinyu Zhou, Shuchang Zhou, and Zhimin Cao. 2016. Scene text detection via holistic, multi-channel prediction. arXiv preprint arXiv:1606.09002 (2016)."},{"key":"e_1_3_2_1_85_1","volume-title":"Strokelets: A learned multi-scale representation for scene text recognition. In CVPR. 4042--4049.","author":"Yao Cong","year":"2014","unstructured":"Cong Yao, Xiang Bai, Baoguang Shi, and Wenyu Liu. 2014b. Strokelets: A learned multi-scale representation for scene text recognition. In CVPR. 4042--4049."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"Jian Ye Zhe Chen Juhua Liu and Bo Du. 2020. TextFuseNet: Scene text detection with richer fused features. In IJCAI. 516--522.","DOI":"10.24963\/ijcai.2020\/72"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2366765"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"crossref","unstructured":"Wenwen Yu Yuliang Liu Wei Hua Deqiang Jiang Bo Ren and Xiang Bai. 2023. Turning a CLIP Model into a Scene Text Detector. CVPR 6978--6988.","DOI":"10.1109\/CVPR52729.2023.00674"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"crossref","unstructured":"Gangyan Zeng Yuan Zhang Yu Zhou and Xiaomeng Yang. 2021. Beyond OCR VQA: Involving OCR into the Flow for Robust and Accurate TextVQA. In ACM MM. 376--385.","DOI":"10.1145\/3474085.3475606"},{"key":"e_1_3_2_1_90_1","first-page":"109337","article-title":"Beyond OCR VQA: Towards end-to-end reading and reasoning for robust and accurate textvqa","volume":"138","author":"Zeng Gangyan","year":"2023","unstructured":"Gangyan Zeng, Yuan Zhang, Yu Zhou, Xiaomeng Yang, Ning Jiang, Guoqing Zhao, Weiping Wang, and Xu-Cheng Yin. 2023. Beyond OCR VQA: Towards end-to-end reading and reasoning for robust and accurate textvqa. PR, Vol. 138 (2023), 109337.","journal-title":"PR"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"crossref","unstructured":"Chengquan Zhang Borong Liang Zuming Huang Mengyi En Junyu Han Errui Ding and Xinghao Ding. 2019. Look more than once: An accurate detector for text of arbitrary shapes. In CVPR. 10552--10561.","DOI":"10.1109\/CVPR.2019.01080"},{"key":"e_1_3_2_1_92_1","volume-title":"Li","author":"Zhang Shifeng","year":"2020","unstructured":"Shifeng Zhang, Cheng Chi, Yongqiang Yao, Zhen Lei, and Stan Z. Li. 2020a. Bridging the gap between anchor-based and anchor-free detection via adaptive training sample selection. In CVPR. 9756--9765."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"crossref","unstructured":"Shi-Xue Zhang Xiaobin Zhu Jie-Bo Hou Chang Liu Chun Yang Hongfa Wang and Xu-Cheng Yin. 2020b. Deep relational reasoning graph network for arbitrary shape text detection. In CVPR. 9696--9705.","DOI":"10.1109\/CVPR42600.2020.00972"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"crossref","unstructured":"Shi-Xue Zhang Xiaobin Zhu Chun Yang Hongfa Wang and Xu-Cheng Yin. 2021. Adaptive Boundary Proposal Network for Arbitrary Shape Text Detection. In ICCV. 1305--1314.","DOI":"10.1109\/ICCV48922.2021.00134"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"crossref","unstructured":"Zheng Zhang Chengquan Zhang Wei Shen Cong Yao Wenyu Liu and Xiang Bai. 2016. Multi-oriented text detection with fully convolutional networks. In CVPR. 4159--4167.","DOI":"10.1109\/CVPR.2016.451"},{"key":"e_1_3_2_1_96_1","volume-title":"EAST: An efficient and accurate scene text detector. In CVPR. 2642--2651.","author":"Zhou Xinyu","year":"2017","unstructured":"Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, and Jiajun Liang. 2017. EAST: An efficient and accurate scene text detector. In CVPR. 2642--2651."},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"crossref","unstructured":"Yu Zhou Hongtao Xie Shancheng Fang Yan Li and Yongdong Zhang. 2020. CRNet: A center-aware representation for detecting text of arbitrary shapes. In ACM MM. 2571--2580.","DOI":"10.1145\/3394171.3413565"},{"key":"e_1_3_2_1_98_1","unstructured":"Xizhou Zhu Han Hu Stephen Lin and Jifeng Dai. 2019. Deformable convnets v2: More deformable better results. In CVPR. 9308--9316."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"crossref","unstructured":"Yiqin Zhu Jianyong Chen Lingyu Liang Zhanghui Kuang Lianwen Jin and Wayne Zhang. 2021. Fourier Contour Embedding for Arbitrary-Shaped Text Detection. In CVPR. 3123--3131.","DOI":"10.1109\/CVPR46437.2021.00314"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-015-4488-0"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611801","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611801","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:10:58Z","timestamp":1755821458000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611801"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":100,"alternative-id":["10.1145\/3581783.3611801","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611801","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}