{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:40:48Z","timestamp":1755826848367,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Project of China","award":["2022YFF0902000"],"award-info":[{"award-number":["2022YFF0902000"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658075","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"275-284","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TWIST: Text-only Weakly Supervised Scene Text Spotting Using Pseudo Labels"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9378-5932","authenticated-orcid":false,"given":"Lilong","family":"Wen","sequence":"first","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University &amp; Hangzhou High-Tech Zone (Binjiang) Institute of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8611-0283","authenticated-orcid":false,"given":"Xiu","family":"Tang","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University &amp; Hangzhou High-Tech Zone (Binjiang) Institute of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9964-2470","authenticated-orcid":false,"given":"Dongxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University &amp; Hangzhou High-Tech Zone (Binjiang) Institute of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2878899"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings, Part I 16","author":"Carion Nicolas","year":"2020","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I 16. Springer, 213--229."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.543"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.157"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2005.137"},{"key":"e_1_3_2_1_6_1","volume-title":"UNITS: Unsupervised Intermediate Training Stage for Scene Text Detection. In 2022 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6.","author":"Guo Youhui","year":"2022","unstructured":"Youhui Guo, Yu Zhou, Xugong Qin, Enze Xie, and Weiping Wang. 2022. UNITS: Unsupervised Intermediate Training Stage for Scene Text Detection. In 2022 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.254"},{"key":"e_1_3_2_1_8_1","volume-title":"ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In 2015 13th international conference on document analysis and recognition (ICDAR). IEEE, 1156--1160."},{"key":"e_1_3_2_1_9_1","volume-title":"ICDAR 2013 robust reading competition. In 2013 12th international conference on document analysis and recognition. IEEE, 1484--1493","author":"Karatzas Dimosthenis","year":"2013","unstructured":"Dimosthenis Karatzas, Faisal Shafait, Seiichi Uchida, Masakazu Iwamura, Lluis Gomez i Bigorda, Sergi Robles Mestre, Joan Mas, David Fernandez Mota, Jon Almazan Almazan, and Lluis Pere De Las Heras. 2013. ICDAR 2013 robust reading competition. In 2013 12th international conference on document analysis and recognition. IEEE, 1484--1493."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00456"},{"key":"e_1_3_2_1_11_1","volume-title":"The Hungarian method for the assignment problem. Naval research logistics quarterly","author":"Kuhn Harold W","year":"1955","unstructured":"Harold W Kuhn. 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly, Vol. 2, 1--2 (1955), 83--97."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41019-022-00200-9"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings, Part XI 16","author":"Liao Minghui","year":"2020","unstructured":"Minghui Liao, Guan Pang, Jing Huang, Tal Hassner, and Xiang Bai. 2020a. Mask textspotter v3: Segmentation proposal network for robust scene text spotting. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XI 16. Springer, 706--722."},{"key":"e_1_3_2_1_14_1","volume-title":"Mask TextSpotter v3: Segmentation Proposal Network for Robust Scene Text Spotting. ArXiv","author":"Liao Minghui","year":"2020","unstructured":"Minghui Liao, Guan Pang, Jing Huang, Tal Hassner, and Xiang Bai. 2020b. Mask TextSpotter v3: Segmentation Proposal Network for Robust Scene Text Spotting. ArXiv , Vol. abs\/2007.09482 (2020)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-019-2737-0"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3155612"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.02.002"},{"key":"e_1_3_2_1_21_1","first-page":"8048","article-title":"Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting","volume":"44","author":"Liu Yuliang","year":"2021","unstructured":"Yuliang Liu, Chunhua Shen, Lianwen Jin, Tong He, Peng Chen, Chongyu Liu, and Hao Chen. 2021. Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 11 (2021), 8048--8064.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_22_1","unstructured":"Yuliang Liu Jiaxin Zhang Dezhi Peng Mingxin Huang Xinyu Wang Jingqun Tang Can Huang Dahua Lin Chunhua Shen Xiang Bai et al. 2023. SPTS v2: Single-Point Scene Text Spotting. arXiv preprint arXiv:2301.01635 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_2"},{"key":"e_1_3_2_1_24_1","volume-title":"Unrealtext: Synthesizing realistic scene text images from the unreal world. arXiv preprint arXiv:2003.10608","author":"Long Shangbang","year":"2020","unstructured":"Shangbang Long and Cong Yao. 2020. Unrealtext: Synthesizing realistic scene text images from the unreal world. arXiv preprint arXiv:2003.10608 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_1_26_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","volume-title":"Weakly Supervised Scene Text Detection using Deep Reinforcement Learning. arXiv preprint arXiv:2201.04866","author":"Metzenthin Emanuel","year":"2022","unstructured":"Emanuel Metzenthin, Christian Bartz, and Christoph Meinel. 2022. Weakly Supervised Scene Text Detection using Deep Reinforcement Learning. arXiv preprint arXiv:2201.04866 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547942"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16348"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412806"},{"key":"e_1_3_2_1_31_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196577"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.61"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.02.014"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547787"},{"key":"e_1_3_2_1_36_1","volume-title":"Seglink: Detecting dense and arbitrary-shaped scene text by instance-aware component grouping. Pattern recognition","author":"Tang Jun","year":"2019","unstructured":"Jun Tang, Zhibo Yang, Yongpan Wang, Qi Zheng, Yongchao Xu, and Xiang Bai. 2019. Seglink: Detecting dense and arbitrary-shaped scene text by instance-aware component grouping. Pattern recognition , Vol. 96 (2019), 106954."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00452"},{"key":"e_1_3_2_1_38_1","volume-title":"A Scene-Text Synthesis Engine Achieved Through Learning from Decomposed Real-World Data. arXiv preprint arXiv:2209.02397","author":"Tang Zhengmi","year":"2022","unstructured":"Zhengmi Tang, Tomo Miyazaki, and Shinichiro Omachi. 2022a. A Scene-Text Synthesis Engine Achieved Through Learning from Decomposed Real-World Data. arXiv preprint arXiv:2209.02397 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"Domain Adaptive Scene Text Detection via Subcategorization. arXiv preprint arXiv:2212.00377","author":"Tian Zichen","year":"2022","unstructured":"Zichen Tian, Chuhui Xue, Jingyi Zhang, and Shijian Lu. 2022. Domain Adaptive Scene Text Detection via Subcategorization. arXiv preprint arXiv:2212.00377 (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41019-023-00208-9"},{"key":"e_1_3_2_1_41_1","first-page":"5349","article-title":"Pan: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text","volume":"44","author":"Wang Wenhai","year":"2021","unstructured":"Wenhai Wang, Enze Xie, Xiang Li, Xuebo Liu, Ding Liang, Zhibo Yang, Tong Lu, and Chunhua Shen. 2021. Pan: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 9 (2021), 5349--5367.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570428"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Wu Weijia","year":"2020","unstructured":"Weijia Wu, Ning Lu, Enze Xie, Yuxing Wang, Wenwen Yu, Cheng Yang, and Hong Zhou. 2020a. Synthetic-to-real unsupervised domain adaptation for scene text detection in the wild. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1155\/2020\/3871897"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00922"},{"key":"e_1_3_2_1_46_1","volume-title":"Philip HS Torr, and Song Bai","author":"Xue Chuhui","year":"2022","unstructured":"Chuhui Xue, Wenqing Zhang, Yu Hao, Shijian Lu, Philip HS Torr, and Song Bai. 2022. Language matters: A weakly supervised vision-language pre-training approach for scene text detection and spotting. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXVIII. Springer, 284--302."},{"key":"e_1_3_2_1_47_1","volume-title":"Dptext-detr: Towards better scene text detection with dynamic points in transformer. arXiv preprint arXiv:2207.04491","author":"Ye Maoyuan","year":"2022","unstructured":"Maoyuan Ye, Jing Zhang, Shanshan Zhao, Juhua Liu, Bo Du, and Dacheng Tao. 2022a. Dptext-detr: Towards better scene text detection with dynamic points in transformer. arXiv preprint arXiv:2207.04491 (2022)."},{"key":"e_1_3_2_1_48_1","volume-title":"DeepSolo: Let Transformer Decoder with Explicit Points Solo for Text Spotting. arXiv preprint arXiv:2211.10772","author":"Ye Maoyuan","year":"2022","unstructured":"Maoyuan Ye, Jing Zhang, Shanshan Zhao, Juhua Liu, Tongliang Liu, Bo Du, and Dacheng Tao. 2022b. DeepSolo: Let Transformer Decoder with Explicit Points Solo for Text Spotting. arXiv preprint arXiv:2211.10772 (2022)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/1878803.1878894"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-89188-6_11"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_16"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00920"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00129"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00134"},{"key":"e_1_3_2_1_55_1","volume-title":"Arbitrary shape text detection via boundary transformer. arXiv preprint arXiv:2205.05320","author":"Zhang Shi-Xue","year":"2022","unstructured":"Shi-Xue Zhang, Xiaobin Zhu, Chun Yang, and Xu-Cheng Yin. 2022b. Arbitrary shape text detection via boundary transformer. arXiv preprint arXiv:2205.05320 (2022)."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings, Part IV. Springer, 79--94","author":"Zhang Wenqing","year":"2021","unstructured":"Wenqing Zhang, Yang Qiu, Minghui Liao, Rui Zhang, Xiaolin Wei, and Xiang Bai. 2021a. Scene text detection with scribble line. In Document Analysis and Recognition--ICDAR 2021: 16th International Conference, Lausanne, Switzerland, September 5--10, 2021, Proceedings, Part IV. Springer, 79--94."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3197987"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"e_1_3_2_1_60_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021","author":"Zhu Xizhou","year":"2021","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2021b. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3--7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=gZ9hCDWe6ke"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00314"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658075","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658075","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:53:08Z","timestamp":1755766388000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658075"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":61,"alternative-id":["10.1145\/3652583.3658075","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658075","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}