{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T09:58:00Z","timestamp":1769507880573,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Priority Academic Program Development of Jiangsu Higher Education Institutions"},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2018YFA0701700, 2018YFA0701701"],"award-info":[{"award-number":["2018YFA0701700, 2018YFA0701701"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100017052","name":"National Natural Science Foundation of China-Liaoning Joint Fund","doi-asserted-by":"publisher","award":["No.61672364, No.62176172, No.62002253"],"award-info":[{"award-number":["No.61672364, No.62176172, No.62002253"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680981","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"10134-10143","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["DNTextSpotter: Arbitrary-Shaped Scene Text Spotting via Improved Denoising Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6154-1411","authenticated-orcid":false,"given":"Qian","family":"Qiao","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Soochow University &amp; bilibili Inc., Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1695-9370","authenticated-orcid":false,"given":"Yu","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University &amp; bilibili Inc., Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1287-7472","authenticated-orcid":false,"given":"Jun","family":"Gao","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hongkong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4712-1241","authenticated-orcid":false,"given":"Tianxiang","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7676-8830","authenticated-orcid":false,"given":"Shaoyao","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5659-7457","authenticated-orcid":false,"given":"Jiaqing","family":"Fan","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1077-9033","authenticated-orcid":false,"given":"Ziqiang","family":"Cao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5266-8367","authenticated-orcid":false,"given":"Zili","family":"Wang","sequence":"additional","affiliation":[{"name":"Inf Tech, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5936-6410","authenticated-orcid":false,"given":"Yue","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Soochow University, Suzhou, Jiangsu, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings, Part XXIX 16","author":"Baek Youngmin","year":"2020","unstructured":"Youngmin Baek, Seung Shin, Jeonghun Baek, Sungrae Park, Junyeop Lee, Daehyun Nam, and Hwalsuk Lee. 2020. Character region attention for text spotting. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXIX 16. Springer, 504--521."},{"key":"e_1_3_2_1_2_1","volume-title":"SRRV: A novel document object detector based on spatial-related relation and vision","author":"Bi Hengyue","year":"2022","unstructured":"Hengyue Bi, Canhui Xu, Cao Shi, Guozhu Liu, Yuteng Li, Honghong Zhang, and Jing Qu. 2022. SRRV: A novel document object detector based on spatial-related relation and vision. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_3_1","volume-title":"HGR-Net: Hierarchical Graph Reasoning Network for Arbitrary Shape Scene Text Detection","author":"Bi Hengyue","year":"2023","unstructured":"Hengyue Bi, Canhui Xu, Cao Shi, Guozhu Liu, Honghong Zhang, Yuteng Li, and Junyu Dong. 2023. HGR-Net: Hierarchical Graph Reasoning Network for Arbitrary Shape Scene Text Detection. IEEE Transactions on Image Processing (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.24792"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3440756"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10032-019-00334-z"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00917"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Alex Graves Santiago Fern\u00e1ndez Faustino Gomez and J\u00fcrgen Schmidhuber. 2006. Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In ICML.","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00455"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01786"},{"key":"e_1_3_2_1_15_1","volume-title":"ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01461"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00456"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.560"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2937086"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings, Part XI 16","author":"Liao Minghui","year":"2020","unstructured":"Minghui Liao, Guan Pang, Jing Huang, Tal Hassner, and Xiang Bai. 2020. Mask textspotter v3: Segmentation proposal network for robust scene text spotting. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XI 16. Springer, 706--722."},{"key":"e_1_3_2_1_23_1","unstructured":"Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Doll\u00e1r. 2017. Focal loss for dense object detection. In ICCV."},{"key":"e_1_3_2_1_24_1","volume-title":"Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329","author":"Liu Shilong","year":"2022","unstructured":"Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, and Lei Zhang. 2022. Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00595"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.02.002"},{"key":"e_1_3_2_1_28_1","first-page":"8048","article-title":"Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting","volume":"44","author":"Liu Yuliang","year":"2021","unstructured":"Yuliang Liu, Chunhua Shen, Lianwen Jin, Tong He, Peng Chen, Chongyu Liu, and Hao Chen. 2021. Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 11 (2021), 8048--8064.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_29_1","unstructured":"Yuliang Liu Jiaxin Zhang Dezhi Peng Mingxin Huang Xinyu Wang Jingqun Tang Can Huang Dahua Lin Chunhua Shen Xiang Bai et al. 2023. SPTS v2: single-point scene text spotting. arXiv preprint arXiv:2301.01635 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01369-0"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547942"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6864"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Roi Ronen Shahar Tsiper Oron Anschel Inbal Lavi Amir Markovitz and R Manmatha. 2022. GLASS: global to local attention for scene-text spotting. In ECCV.","DOI":"10.1007\/978-3-031-19815-1_15"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01144"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6896"},{"key":"e_1_3_2_1_39_1","volume-title":"PGNET: Real-time arbitrarily-shaped text spotting with point gathering network. In AAAI.","author":"Wang Pengfei","year":"2021","unstructured":"Pengfei Wang, Chengquan Zhang, Fei Qi, Shanshan Liu, Xiaoqiang Zhang, Pengyuan Lyu, Junyu Han, Jingtuo Liu, Errui Ding, and Guangming Shi. 2021. PGNET: Real-time arbitrarily-shaped text spotting with point gathering network. In AAAI."},{"key":"e_1_3_2_1_40_1","first-page":"5349","article-title":"Pan: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text","volume":"44","author":"Wang Wenhai","year":"2021","unstructured":"Wenhai Wang, Enze Xie, Xiang Li, Xuebo Liu, Ding Liang, Zhibo Yang, Tong Lu, and Chunhua Shen. 2021. Pan: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 9 (2021), 5349--5367.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547882"},{"key":"e_1_3_2_1_42_1","unstructured":"Jingjing Wu Pengyuan Lyu Guangming Lu Chengquan Zhang Kun Yao and Wenjie Pei. 2022. Decoupling Recognition from Detection: Single Shot Self-Reliant Scene Text Spotter. In ACM MM."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Linjie Xing Zhi Tian Weilin Huang and Matthew R Scott. 2019. Convolutional character networks. In ICCV.","DOI":"10.1109\/ICCV.2019.00922"},{"key":"e_1_3_2_1_44_1","volume-title":"Vitae: Vision transformer advanced by exploring intrinsic inductive bias. Advances in neural information processing systems","author":"Xu Yufei","year":"2021","unstructured":"Yufei Xu, Qiming Zhang, Jing Zhang, and Dacheng Tao. 2021. Vitae: Vision transformer advanced by exploring intrinsic inductive bias. Advances in neural information processing systems, Vol. 34 (2021), 28522--28535."},{"key":"e_1_3_2_1_45_1","unstructured":"Maoyuan Ye Jing Zhang Shanshan Zhao Juhua Liu Bo Du and Dacheng Tao. 2023. DPText-DETR: Towards Better Scene Text Detection with Dynamic Points in Transformer. In AAAI."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"e_1_3_2_1_47_1","volume-title":"Text detection and recognition in imagery: A survey","author":"Ye Qixiang","year":"2014","unstructured":"Qixiang Ye and David Doermann. 2014. Text detection and recognition in imagery: A survey. IEEE transactions on pattern analysis and machine intelligence, Vol. 37, 7 (2014), 1480--1500."},{"key":"e_1_3_2_1_48_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605","author":"Zhang Hao","year":"2022","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M Ni, and Heung-Yeung Shum. 2022. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605 (2022)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01739-w"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"e_1_3_2_1_51_1","volume-title":"Arts: Eliminating inconsistency between text detection and recognition with auto-rectification text spotter. arXiv preprint arXiv:2110.10405","author":"Zhong Humen","year":"2021","unstructured":"Humen Zhong, Jun Tang, Wenhai Wang, Zhibo Yang, Cong Yao, and Tong Lu. 2021. Arts: Eliminating inconsistency between text detection and recognition with auto-rectification text spotter. arXiv preprint arXiv:2110.10405 (2021)."},{"key":"e_1_3_2_1_52_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680981","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680981","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:35Z","timestamp":1750295855000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680981"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":52,"alternative-id":["10.1145\/3664647.3680981","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680981","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}