{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T15:21:51Z","timestamp":1758122511577,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176060"],"award-info":[{"award-number":["62176060"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"publisher","award":["20511100400"],"award-info":[{"award-number":["20511100400"]}],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611755","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"2898-2907","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Scene Text Segmentation with Text-Focused Transformers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1717-0474","authenticated-orcid":false,"given":"Haiyang","family":"Yu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3735-721X","authenticated-orcid":false,"given":"Xiaocong","family":"Wang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0181-924X","authenticated-orcid":false,"given":"Ke","family":"Niu","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9633-0033","authenticated-orcid":false,"given":"Bin","family":"Li","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4897-9209","authenticated-orcid":false,"given":"Xiangyang","family":"Xue","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer Normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"Bo Bai Fei Yin and Cheng Lin Liu. 2014. A Seed-Based Segmentation Method for Scene Text Extraction. In DAS. https:\/\/doi.org\/10.1109\/das.2014.34","DOI":"10.1109\/das.2014.34"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0242-8"},{"volume-title":"Pixel-Level Annotations Based on Weak Supervision for Scene Text Segmentation","author":"Bonechi Simone","key":"e_1_3_2_1_4_1","unstructured":"Simone Bonechi, Paolo Andreini, Monica Bianchini, and Franco Scarselli. 2019. COCO_TS Dataset: Pixel-Level Annotations Based on Weak Supervision for Scene Text Segmentation. In ICANN. Springer, 238--250."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2020.06.023"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Jingye Chen Haiyang Yu Jianqi Ma Bin Li and Xiangyang Xue. 2022b. Text gestalt: Stroke-aware scene text image super-resolution. In AAAI. 285--293.","DOI":"10.1609\/aaai.v36i1.19904"},{"key":"e_1_3_2_1_7_1","volume-title":"Text Image Editing Method Based on Font and Character Attribute Guidance. Journal of Computer Applications","author":"Chen Jing-Tiao","year":"2022","unstructured":"Jing-Tiao Chen, Shu-Gong Xu, and You-Dong Ding. 2022a. Text Image Editing Method Based on Font and Character Attribute Guidance. Journal of Computer Applications (2022), 0."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"e_1_3_2_1_9_1","unstructured":"Liang-Chieh Chen George Papandreou Florian Schroff and Hartwig Adam. 2017b. Rethinking Atrous Convolution for Semantic Image Segmentation."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Liang-Chieh Chen Yukun Zhu George Papandreou Florian Schroff and Hartwig Adam. 2018. Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation. In ECCV. 801--818.","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"e_1_3_2_1_11_1","first-page":"17864","article-title":"Per-Pixel Classification Is Not All You Need for Semantic Segmentation","volume":"34","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Alex Schwing, and Alexander Kirillov. 2021. Per-Pixel Classification Is Not All You Need for Semantic Segmentation. NeurIPS, Vol. 34 (2021), 17864--17875.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.157"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12269"},{"key":"e_1_3_2_1_14_1","volume-title":"Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2020. An Image Is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"volume-title":"Few\/Zero-Shot Chinese Character Style Transfer via Radical Decomposition and Rendering","author":"Huang Yaoxiong","key":"e_1_3_2_1_16_1","unstructured":"Yaoxiong Huang, Mengchao He, Lianwen Jin, and Yongpan Wang. 2020. RD-GAN: Few\/Zero-Shot Chinese Character Style Transfer via Radical Decomposition and Rendering. In ECCV. Springer, 156--172."},{"key":"e_1_3_2_1_17_1","volume-title":"OneFormer: One Transformer to Rule Universal Image Segmentation. arXiv preprint arXiv:2211.06220","author":"Jain Jitesh","year":"2022","unstructured":"Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, and Humphrey Shi. 2022. OneFormer: One Transformer to Rule Universal Image Segmentation. arXiv preprint arXiv:2211.06220 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"R2CNN: Rotational region CNN for orientation robust scene text detection. arXiv preprint arXiv:1706.09579","author":"Jiang Yingying","year":"2017","unstructured":"Yingying Jiang, Xiangyu Zhu, Xiaobing Wang, Shuli Yang, Wei Li, Hua Wang, Pei Fu, and Zhenbo Luo. 2017. R2CNN: Rotational region CNN for orientation robust scene text detection. arXiv preprint arXiv:1706.09579 (2017)."},{"key":"e_1_3_2_1_19_1","volume-title":"ICDAR 2013 Robust Reading Competition. In ICDAR. IEEE, 1484--1493","author":"Karatzas Dimosthenis","year":"2013","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, Masakazu Iwamura, Lluis Gomez i Bigorda, Sergi Robles Mestre, Joan Mas, David Fernandez Mota, Jon Almazan Almazan, and Lluis Pere De Las Heras. 2013. ICDAR 2013 Robust Reading Competition. In ICDAR. IEEE, 1484--1493."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Chenhao Li Yuta Taniguchi Min Lu and Shin'ichi Konomi. 2021. Few-Shot Font Style Transfer between Different Languages. In WACV. 433--442.","DOI":"10.1109\/WACV48630.2021.00048"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2018.2825107"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","unstructured":"Xiaoqing Liu and Jagath Samarabandu. 2007. Multiscale Edge-Based Text Extraction from Complex Images. In ICME. https:\/\/doi.org\/10.1109\/icme.2006.262882","DOI":"10.1109\/icme.2006.262882"},{"key":"e_1_3_2_1_24_1","unstructured":"Yuliang Liu Hao Chen Chunhua Shen Tong He Lianwen Jin and Liangwei Wang. 2020. ABCNet: Real-Time Scene Text Spotting with Adaptive Bezier-Curve Network. In CVPR. 9809--9818."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Jonathan Long Evan Shelhamer and Trevor Darrell. 2015. Fully Convolutional Networks for Semantic Segmentation. In CVPR. 3431--3440.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_2_1_26_1","volume-title":"Decoupled Weight Decay Regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled Weight Decay Regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Andreas Lugmayr Martin Danelljan Andres Romero Fisher Yu Radu Timofte and Luc Van Gool. 2022. RePaint: Inpainting Using Denoising Diffusion Probabilistic Models. In CVPR. 11461--11471.","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2018.2818020"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2004.02.006"},{"key":"e_1_3_2_1_30_1","volume-title":"ICDAR2017 Robust Reading Challenge on Multi-lingual Scene Text Detection and Script Identification-Rrc-Mlt. In ICDAR","volume":"1","author":"Nayef Nibal","year":"2017","unstructured":"Nibal Nayef, Fei Yin, Imen Bizid, Hyunsoo Choi, Yuan Feng, Dimosthenis Karatzas, Zhenbo Luo, Umapada Pal, Christophe Rigaud, Joseph Chazalon, et al. 2017. ICDAR2017 Robust Reading Challenge on Multi-lingual Scene Text Detection and Script Identification-Rrc-Mlt. In ICDAR, Vol. 1. IEEE, 1454--1459."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/tsmc.1979.4310076"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","unstructured":"Bolan Su Shijian Lu and Chew Lim Tan. 2010. Binarization of Historical Document Images Using the Local Maximum and Minimum. In DAS. https:\/\/doi.org\/10.1145\/1815330.1815351","DOI":"10.1145\/1815330.1815351"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Jingqun Tang Wenqing Zhang Hongye Liu MingKun Yang Bo Jiang Guanglong Hu and Xiang Bai. 2022. Few Could Be Better Than All: Feature Sampling and Grouping for Scene Text Detection. In CVPR. 4563--4572.","DOI":"10.1109\/CVPR52688.2022.00452"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3125260"},{"key":"e_1_3_2_1_36_1","unstructured":"Hugo Touvron Matthieu Cord Matthijs Douze Francisco Massa Alexandre Sablayrolles and Herv\u00e9 J\u00e9gou. 2021. Training Data-Efficient Image Transformers & Distillation Through Attention. In ICML. PMLR 10347--10357."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2021.3113157"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2983686"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Wenhai Wang Enze Xie Xiang Li Deng-Ping Fan Kaitao Song Ding Liang Tong Lu Ping Luo and Ling Shao. 2021a. Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction Without Convolutions. In ICCV. 568--578.","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Qi Wei Lei Feng Haoliang Sun Ren Wang Chenhui Guo and Yilong Yin. 2023. Fine-Grained Classification with Noisy Labels. In CVPR. 11651--11660.","DOI":"10.1109\/CVPR52729.2023.01121"},{"volume-title":"Self-Filtering: A Noise-Aware Sample Selection for Label Noise with Confidence Penalization","author":"Wei Qi","key":"e_1_3_2_1_41_1","unstructured":"Qi Wei, Haoliang Sun, Xiankai Lu, and Yilong Yin. 2022. Self-Filtering: A Noise-Aware Sample Selection for Label Noise with Confidence Penalization. In ECCV. Springer, 516--532."},{"key":"e_1_3_2_1_42_1","first-page":"12077","article-title":"Segformer","volume":"34","author":"Xie Enze","year":"2021","unstructured":"Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M Alvarez, and Ping Luo. 2021. Segformer: Simple and Efficient Design for Semantic Segmentation with Transformers. NeurIPS, Vol. 34 (2021), 12077--12090.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_43_1","volume-title":"BTS: A Bi-lingual Benchmark for Text Segmentation in the Wild. In CVPR. 19152--19162.","author":"Xu Xixi","year":"2022","unstructured":"Xixi Xu, Zhongang Qi, Jianqi Ma, Honglun Zhang, Ying Shan, and Xiaohu Qie. 2022. BTS: A Bi-lingual Benchmark for Text Segmentation in the Wild. In CVPR. 19152--19162."},{"key":"e_1_3_2_1_44_1","unstructured":"Xingqian Xu Zhifei Zhang Zhaowen Wang Brian Price Zhonghao Wang and Humphrey Shi. 2021. Rethinking Text Segmentation: A Novel Dataset and A Text-Specific Refinement Approach. In CVPR. 12045--12055."},{"key":"e_1_3_2_1_45_1","volume-title":"Multi-channel Prediction. arXiv preprint arXiv:1606.09002","author":"Yao Cong","year":"2016","unstructured":"Cong Yao, Xiang Bai, Nong Sang, Xinyu Zhou, Shuchang Zhou, and Zhimin Cao. 2016. Scene Text Detection via Holistic, Multi-channel Prediction. arXiv preprint arXiv:1606.09002 (2016)."},{"key":"e_1_3_2_1_46_1","volume-title":"Benchmarking Chinese Text Recognition: Datasets, Baselines, and an Empirical Study. arXiv preprint arXiv:2112.15093","author":"Yu Haiyang","year":"2021","unstructured":"Haiyang Yu, Jingye Chen, Bin Li, Jianqi Ma, Mengnan Guan, Xixi Xu, Xiaocong Wang, Shaobo Qu, and Xiangyang Xue. 2021. Benchmarking Chinese Text Recognition: Datasets, Baselines, and an Empirical Study. arXiv preprint arXiv:2112.15093 (2021)."},{"key":"e_1_3_2_1_47_1","volume-title":"Chinese Character Recognition with Radical-Structured Stroke Trees. arXiv preprint arXiv:2211.13518","author":"Yu Haiyang","year":"2022","unstructured":"Haiyang Yu, Jingye Chen, Bin Li, and Xiangyang Xue. 2022. Chinese Character Recognition with Radical-Structured Stroke Trees. arXiv preprint arXiv:2211.13518 (2022)."},{"volume-title":"Object-Contextual Representations for Semantic Segmentation","author":"Yuan Yuhui","key":"e_1_3_2_1_48_1","unstructured":"Yuhui Yuan, Xilin Chen, and Jingdong Wang. 2020. Object-Contextual Representations for Semantic Segmentation. In ECCV. Springer, 173--190."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Jan Zdenek and Hideki Nakayama. 2020. Erasing Scene Text with Weak Supervision. In WACV. 2238--2246.","DOI":"10.1109\/WACV45572.2020.9093544"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Chengquan Zhang Borong Liang Zuming Huang Mengyi En Junyu Han Errui Ding and Xinghao Ding. 2019a. Look More Than Once: An Accurate Detector for Text of Arbitrary Shapes. In CVPR. 10552--10561.","DOI":"10.1109\/CVPR.2019.01080"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301801"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Zheng Zhang Chengquan Zhang Wei Shen Cong Yao Wenyu Liu and Xiang Bai. 2016. Multi-oriented Text Detection with Fully Convolutional Networks. In CVPR. 4159--4167.","DOI":"10.1109\/CVPR.2016.451"},{"key":"e_1_3_2_1_53_1","unstructured":"Hengshuang Zhao Jianping Shi Xiaojuan Qi Xiaogang Wang and Jiaya Jia. 2017. Pyramid Scene Parsing Network. In CVPR. 2881--2890."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","unstructured":"Sixiao Zheng Jiachen Lu Hengshuang Zhao Xiatian Zhu Zekun Luo Yabiao Wang Yanwei Fu Jianfeng Feng Tao Xiang Philip H.S. Torr and Li Zhang. 2021. Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers. In CVPR. https:\/\/doi.org\/10.1109\/cvpr46437.2021.00681","DOI":"10.1109\/cvpr46437.2021.00681"},{"key":"e_1_3_2_1_55_1","volume-title":"EAST: An Efficient and Accurate Scene Text Detector. In CVPR. 5551--5560.","author":"Zhou Xinyu","year":"2017","unstructured":"Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, and Jiajun Liang. 2017. EAST: An Efficient and Accurate Scene Text Detector. In CVPR. 5551--5560."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2995062"},{"key":"e_1_3_2_1_57_1","volume-title":"Weakly-Supervised Text Instance Segmentation. arXiv preprint arXiv:2303.10848","author":"Zu Xinyan","year":"2023","unstructured":"Xinyan Zu, Haiyang Yu, Bin Li, and Xiangyang Que. 2023. Weakly-Supervised Text Instance Segmentation. arXiv preprint arXiv:2303.10848 (2023)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547827"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611755","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611755","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:11:33Z","timestamp":1755821493000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611755"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3611755","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611755","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}