{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:05:51Z","timestamp":1775325951820,"version":"3.50.1"},"reference-count":80,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2022ZD0118700"],"award-info":[{"award-number":["2022ZD0118700"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62225603"],"award-info":[{"award-number":["62225603"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206104"],"award-info":[{"award-number":["62206104"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61936003"],"award-info":[{"award-number":["61936003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhuhai Industry Core and Key Technology Research Project","award":["2220004002350"],"award-info":[{"award-number":["2220004002350"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1109\/tpami.2023.3312285","type":"journal-article","created":{"date-parts":[[2023,9,5]],"date-time":"2023-09-05T17:47:12Z","timestamp":1693936032000},"page":"15665-15679","source":"Crossref","is-referenced-by-count":44,"title":["SPTS v2: Single-Point Scene Text Spotting"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3037-173X","authenticated-orcid":false,"given":"Yuliang","family":"Liu","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, Hubei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2144-9053","authenticated-orcid":false,"given":"Jiaxin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3263-3449","authenticated-orcid":false,"given":"Dezhi","family":"Peng","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5736-959X","authenticated-orcid":false,"given":"Mingxin","family":"Huang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9082-094X","authenticated-orcid":false,"given":"Xinyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2577-0119","authenticated-orcid":false,"given":"Jingqun","family":"Tang","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9126-3069","authenticated-orcid":false,"given":"Can","family":"Huang","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8865-7896","authenticated-orcid":false,"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8648-8718","authenticated-orcid":false,"given":"Chunhua","family":"Shen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3449-5940","authenticated-orcid":false,"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, Hubei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5456-0957","authenticated-orcid":false,"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, Guangdong, China"}]}],"member":"263","reference":[{"key":"ref13","first-page":"20","article-title":"TextSnake: A flexible representation for detecting text of arbitrary shapes","author":"long","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref57","first-page":"83","article-title":"TextNet: Irregular text reading from images with an end-to-end trainable network","author":"sun","year":"2018","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00314"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00853"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.166"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6864"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2937086"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6896"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00527"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00595"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413819"},{"key":"ref55","doi-asserted-by":"crossref","first-page":"5349","DOI":"10.1109\/TPAMI.2021.3095916","article-title":"PAN++: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text","volume":"44","author":"wang","year":"2022","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00917"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00480"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.529"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12242"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547942"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00959"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.242"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2825107"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.254"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2848939"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"ref42","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.126"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3095916"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/34.24792"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.371"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_4"},{"key":"ref6","first-page":"1083","article-title":"Detecting texts of arbitrary orientations in natural images","author":"yao","year":"2012","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.283"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58526-6_30"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0823-z"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547787"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548266"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_19"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3206615"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15549-9_43"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2011.295"},{"key":"ref75","first-page":"10524","article-title":"On layer normalization in the Transformer architecture","author":"xiong","year":"2020","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16348"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00815"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00252"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_15"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00254"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16383"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.368"},{"key":"ref1","first-page":"935","article-title":"Total-Text: A comprehensive dataset for scene text detection and recognition","author":"ch\u2019ng","year":"2017","journal-title":"Proc IAPR Int Conf Document Anal Recognit"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10593-2_34"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.102"},{"key":"ref71","first-page":"1","article-title":"DAB-DETR: Dynamic anchor boxes are better queries for DETR","author":"liu","year":"2022","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"ref73","first-page":"1","article-title":"Decoupled weight decay regularization","author":"loshchilov","year":"2019","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref72","first-page":"1454","article-title":"ICDAR 2017 robust reading challenge on multi-lingual scene text detection and script identification-RRC-MLT","author":"nayef","year":"2017","journal-title":"Proc IAPR Int Conf Document Anal Recognit"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2006.479"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00456"},{"key":"ref23","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2013.221"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1002\/nav.3800020109"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.560"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00455"},{"key":"ref22","first-page":"1","article-title":"Pix2Seq: A language modeling framework for object detection","author":"chen","year":"2022","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3223908"},{"key":"ref21","first-page":"706","article-title":"Mask TextSpotter v3: Segmentation proposal network for robust scene text spotting","author":"liao","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref65","first-page":"1","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"zhu","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25430"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.02.002"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00922"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2021.3075225"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10308548\/10239535.pdf?arnumber=10239535","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,27]],"date-time":"2023-11-27T19:54:09Z","timestamp":1701114849000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10239535\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":80,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3312285","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12]]}}}