{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T09:18:55Z","timestamp":1778059135674,"version":"3.51.4"},"reference-count":62,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,2,7]],"date-time":"2024-02-07T00:00:00Z","timestamp":1707264000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,7]],"date-time":"2024-02-07T00:00:00Z","timestamp":1707264000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach. Intell. Res."],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s11633-023-1460-6","type":"journal-article","created":{"date-parts":[[2024,2,7]],"date-time":"2024-02-07T11:02:06Z","timestamp":1707303726000},"page":"704-717","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["TextFormer: A Query-based End-to-end Text Spotter with Mixed 
Supervision"],"prefix":"10.1007","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2312-8651","authenticated-orcid":false,"given":"Yukun","family":"Zhai","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoqiang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiameng","family":"Qin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9386-9677","authenticated-orcid":false,"given":"Sanyuan","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingping","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianbing","family":"Shen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,2,7]]},"reference":[{"key":"1460_CR1","doi-asserted-by":"publisher","unstructured":"R. Datta, D. Joshi, J. Li, J. Z. Wang. Image retrieval: Ideas, influences, and trends of the new age. ACM Computing Surveys, vol.40, no. 2, Article number 5, 2008. DOI: https:\/\/doi.org\/10.1145\/1348246.1348248.","DOI":"10.1145\/1348246.1348248"},{"issue":"2","key":"1460_CR2","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1007\/s11633-019-1216-5","volume":"17","author":"H Reddy","year":"2020","unstructured":"H. Reddy, N. Raj, M. Gala, A. Basava. Text-mining-based fake news detection using ensemble methods. International Journal of Automation and Computing, vol.17, no.2, pp.210\u2013221, 2020. DOI: https:\/\/doi.org\/10.1007\/s11633-019-1216-5.","journal-title":"International Journal of Automation and Computing"},{"key":"1460_CR3","doi-asserted-by":"publisher","unstructured":"L. Wu, C. Q. Zhang, J. M. Liu, J. Y. Han, J. T. Liu, E. R. Ding, X. Bai. 
Editing text in the wild. In Proceedings of the 27th ACM International Conference on Multimedia, Nice, France, pp. 1500\u20131508, 2019. DOI: https:\/\/doi.org\/10.1145\/3343031.3350929.","DOI":"10.1145\/3343031.3350929"},{"key":"1460_CR4","doi-asserted-by":"publisher","unstructured":"X. J. Rong, B. Li, J. P. Munoz, J. Z. Xiao, A. Arditi, Y. L. Tian. Guided text spotting for assistive blind navigation in unfamiliar indoor environments. In Proceedings of the 12th International Symposium on Visual Computing, Las Vegas, USA, pp. 11\u201322, 2016. DOI: https:\/\/doi.org\/10.1007\/978-3-319-50832-0_2.","DOI":"10.1007\/978-3-319-50832-0_2"},{"key":"1460_CR5","doi-asserted-by":"publisher","unstructured":"S. Antol, A. Agrawal, J. S. Lu, M. Mitchell, D. Batra, C. L. Zitnick, D. Parikh. VQA: Visual question answering. In Proceedings of IEEE International Conference on Computer Vision, Santiago, Chile, pp. 2425\u20132433, 2015. DOI: https:\/\/doi.org\/10.1109\/iccv.2015.279.","DOI":"10.1109\/iccv.2015.279"},{"key":"1460_CR6","doi-asserted-by":"publisher","unstructured":"Y. L. Li, Y. X. Qian, Y. C. Yu, X. M. Qin, C. Q. Zhang, Y. Liu, K. Yao, J. Y. Han, J. T. Liu, E. R. Ding. StrucTexT: Structured text understanding with multi-modal transformers. In Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920, 2021. DOI: https:\/\/doi.org\/10.1145\/3474085.3475345.","DOI":"10.1145\/3474085.3475345"},{"key":"1460_CR7","unstructured":"Y. C. Yu, Y. L. Li, C. Q. Zhang, X. Q. Zhang, Z. Y. Guo, X. M. Qin, K. Yao, J. Y. Han, E. R. Ding, J. D. Wang. StrucTexTv2: Masked visual-textual prediction for document image pre-training, [Online], Available: https:\/\/arxiv.org\/abs\/2303.00289, 2023."},{"key":"1460_CR8","doi-asserted-by":"crossref","unstructured":"M. L. Zhai, Y. L. Li, X. M. Qin, C. Yi, Q. Y. Xie, C. Q. Zhang, K. Yao, Y. W. Wu, Y. D. Jia. 
Fast-StrucTexT: An efficient hourglass transformer with modality-guided dynamic token merge for document understanding, [Online], Available: https:\/\/arxiv.org\/abs\/2305.11392, 2023.","DOI":"10.24963\/ijcai.2023\/585"},{"issue":"1","key":"1460_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11263-015-0823-z","volume":"116","author":"M Jaderberg","year":"2016","unstructured":"M. Jaderberg, K. Simonyan, A. Vedaldi, A. Zisserman. Reading text in the wild with convolutional neural networks. International Journal of Computer Vision, vol. 116, no. 1, pp. 1\u201320, 2016. DOI: https:\/\/doi.org\/10.1007\/s11263-015-0823-z.","journal-title":"International Journal of Computer Vision"},{"key":"1460_CR10","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1016\/j.patcog.2017.04.027","volume":"70","author":"L Gomez","year":"2017","unstructured":"L. Gomez, D. Karatzas. TextProposals: A text-specific selective search algorithm for word spotting in the wild. Pattern Recognition, vol.70, pp.60\u201374, 2017. DOI: https:\/\/doi.org\/10.1016\/j.patcog.2017.04.027.","journal-title":"Pattern Recognition"},{"issue":"9","key":"1460_CR11","doi-asserted-by":"publisher","first-page":"1872","DOI":"10.1109\/TPAMI.2015.2496234","volume":"38","author":"L Neumann","year":"2016","unstructured":"L. Neumann, J. Matas. Real-time lexicon-free scene text localization and recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.38, no.9, pp.1872\u20131885, 2016. DOI: https:\/\/doi.org\/10.1109\/tpami.2015.2496234.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1460_CR12","doi-asserted-by":"publisher","unstructured":"P. Lyu, M. H. Liao, C. Yao, W. H. Wu, X. Bai. Mask TextSpotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. In Proceedings of the 15th European Conference on Computer Vision, Munich, Germany, pp. 71\u201388, 2018. 
DOI: https:\/\/doi.org\/10.1007\/978-3-030-01264-9_5.","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"1460_CR13","doi-asserted-by":"publisher","unstructured":"Y. L. Liu, H. Chen, C. H. Shen, T. He, L. W. Jin, L. W. Wang. ABCNet: Real-time scene text spotting with adaptive Bezier-curve network. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Seattle, USA, pp. 9806\u20139815, 2020. DOI: https:\/\/doi.org\/10.1109\/cvpr42600.2020.00983.","DOI":"10.1109\/cvpr42600.2020.00983"},{"issue":"9","key":"1460_CR14","doi-asserted-by":"publisher","first-page":"5349","DOI":"10.1109\/tpami.2021.3077555","volume":"44","author":"W H Wang","year":"2022","unstructured":"W. H. Wang, E. Z. Xie, X. Li, X. B. Liu, D. Liang, Z. B. Yang, T. Lu, C. H. Shen. PAN++: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.44, no.9, pp.5349\u20135367, 2022. DOI: https:\/\/doi.org\/10.1109\/tpami.2021.3077555.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1460_CR15","doi-asserted-by":"publisher","unstructured":"S. Y. Qin, A. Bissaco, M. Raptis, Y. Fujii, Y. Xiao. Towards unconstrained end-to-end text spotting. In Proceedings of IEEE\/CVF International Conference on Computer Vision, Seoul, Republic of Korea, pp.4703\u20134713, 2019. DOI: https:\/\/doi.org\/10.1109\/iccv.2019.00480.","DOI":"10.1109\/iccv.2019.00480"},{"key":"1460_CR16","doi-asserted-by":"publisher","unstructured":"L. Qiao, Y. Chen, Z. Z. Cheng, Y. L. Xu, Y. Niu, S. L. Pu, F. Wu. MANGO: A mask attention guided one-stage scene text spotter. In Proceedings of the 35th AAAI Conference on Artificial Intelligence, pp. 2467\u20132476, 2021. DOI: https:\/\/doi.org\/10.1609\/aaai.v35i3.16348.","DOI":"10.1609\/aaai.v35i3.16348"},{"key":"1460_CR17","doi-asserted-by":"publisher","unstructured":"L. Qiao, S. L. Tang, Z. Z. Cheng, Y. L. Xu, Y. Niu, S. L. Pu, F. Wu. 
Text perceptron: Towards end-to-end arbitrary-shaped text spotting. In Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, USA, pp. 11899\u201311907, 2020. DOI: https:\/\/doi.org\/10.1609\/aaai.v34i07.6864.","DOI":"10.1609\/aaai.v34i07.6864"},{"key":"1460_CR18","doi-asserted-by":"publisher","unstructured":"H. Wang, P. Lu, H. Zhang, M. K. Yang, X. Bai, Y. C. Xu, M. C. He, Y. P. Wang, W. Y. Liu. All you need is boundary: Toward arbitrary-shaped text spotting. In Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, USA, pp. 12160\u201312167, 2020. DOI: https:\/\/doi.org\/10.1609\/aaai.v34i07.6896.","DOI":"10.1609\/aaai.v34i07.6896"},{"key":"1460_CR19","doi-asserted-by":"publisher","unstructured":"L. J. Xing, Z. Tian, W. L. Huang, M. Scott. Convolutional character networks. In Proceedings of IEEE\/CVF International Conference on Computer Vision, Seoul, Republic of Korea, pp. 9125\u20139135, 2019. DOI: https:\/\/doi.org\/10.1109\/iccv.2019.00922.","DOI":"10.1109\/iccv.2019.00922"},{"key":"1460_CR20","doi-asserted-by":"publisher","unstructured":"P. F. Wang, C. Q. Zhang, F. Qi, S. S. Liu, X. Q. Zhang, P. Lyu, J. Y. Han, J. T. Liu, E. R. Ding, G. M. Shi. PGNet: Real-time arbitrarily-shaped text spotting with point gathering network. In Proceedings of the 35th AAAI Conference on Artificial Intelligence, pp. 2782\u20132790, 2021. DOI: https:\/\/doi.org\/10.1609\/aaai.v35i4.16383.","DOI":"10.1609\/aaai.v35i4.16383"},{"key":"1460_CR21","doi-asserted-by":"publisher","unstructured":"W. H. Wang, X. B. Liu, X. Z. Ji, E. Z. Xie, D. Liang, Z. B. Yang, T. Lu, C. H. Shen, P. Luo. AE TextSpotter: Learning visual and linguistic representation for ambiguous text spotting. In Proceedings of the 16th European Conference on Computer Vision, Glasgow, UK, pp.457\u2013473, 2020. DOI: https:\/\/doi.org\/10.1007\/978-3-030-58568-6_27.","DOI":"10.1007\/978-3-030-58568-6_27"},{"key":"1460_CR22","doi-asserted-by":"publisher","unstructured":"X. B. 
Liu, D. Liang, S. Yan, D. G. Chen, Y. Qiao, J. J. Yan. FOTS: Fast oriented text spotting with a unified network. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Salt Lake City, USA, pp. 5676\u20135685, 2018. DOI: https:\/\/doi.org\/10.1109\/cvpr.2018.00595.","DOI":"10.1109\/cvpr.2018.00595"},{"issue":"1","key":"1460_CR23","doi-asserted-by":"publisher","first-page":"91","DOI":"10.2307\/468410","volume":"5","author":"P Ricoeur","year":"1973","unstructured":"P. Ricoeur. The model of the text: Meaningful action considered as a text. New Literary History, vol.5, no.1, pp.91\u2013117, 1973. DOI: https:\/\/doi.org\/10.2307\/468410.","journal-title":"New Literary History"},{"key":"1460_CR24","doi-asserted-by":"publisher","unstructured":"N. Carion, F. Massa, G. Synnaeve, N. Usunier, A. Kirillov, S. Zagoruyko. End-to-end object detection with transformers. In Proceedings of the 16th European Conference on Computer Vision, Glasgow, UK, pp.213\u2013229, 2020. DOI: https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1460_CR25","unstructured":"X. Z. Zhu, W. J. Su, L. W. Lu, B. Li, X. G. Wang, J. F. Dai. Deformable DETR: Deformable transformers for end-to-end object detection. In Proceedings of the 9th International Conference on Learning Representations, 2021."},{"key":"1460_CR26","doi-asserted-by":"publisher","unstructured":"B. W. Cheng, I. Misra, A. G. Schwing, A. Kirillov, R. Girdhar. Masked-attention mask transformer for universal image segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 1280\u20131289, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.00135.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"1460_CR27","doi-asserted-by":"publisher","unstructured":"D. Z. Peng, X. Y. Wang, Y. L. Liu, J. X. Zhang, M. X. Huang, S. X. Lai, J. Li, S. G. Zhu, D. H. Lin, C. H. Shen, X. Bai, L. W. Jin. SPTS: Single-point text spotting. 
In Proceedings of the 30th ACM International Conference on Multimedia, Lisboa, Portugal, pp.4272\u20134281, 2022. DOI: https:\/\/doi.org\/10.1145\/3503161.3547942.","DOI":"10.1145\/3503161.3547942"},{"key":"1460_CR28","doi-asserted-by":"publisher","unstructured":"J. Q. Tang, S. Qiao, B. L. Cui, Y. H. Ma, S. Zhang, D. Kanoulas. You can even annotate text with voice: Transcription-only-supervised text spotting. In Proceedings of the 30th ACM International Conference on Multimedia, Lisboa, Portugal, pp.4154\u20134163, 2022. DOI: https:\/\/doi.org\/10.1145\/3503161.3547787.","DOI":"10.1145\/3503161.3547787"},{"key":"1460_CR29","doi-asserted-by":"publisher","unstructured":"Y. P. Sun, Z. H. Ni, C. K. Chng, Y. L. Liu, C. J. Luo, C. C. Ng, J. Y. Han, E. R. Ding, J. T. Liu, D. Karatzas, C. S. Chan, L. W. Jin. ICDAR 2019 competition on large-scale street view text with partial labeling\u2013RRC-LSVT. In Proceedings of International Conference on Document Analysis and Recognition, Sydney, Australia, pp. 1557\u20131562, 2019. DOI: https:\/\/doi.org\/10.1109\/icdar.2019.00250.","DOI":"10.1109\/icdar.2019.00250"},{"key":"1460_CR30","doi-asserted-by":"publisher","unstructured":"H. Li, P. Wang, C. H. Shen. Towards end-to-end text spotting with convolutional recurrent neural networks. In Proceedings of IEEE International Conference on Computer Vision, Venice, Italy, pp. 5248\u20135256, 2017. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.560.","DOI":"10.1109\/iccv.2017.560"},{"issue":"6","key":"1460_CR31","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Q Ren","year":"2017","unstructured":"S. Q. Ren, K. M. He, R. Girshick, J. Sun. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.39, no.6, pp.1137\u20131149, 2017. 
DOI: https:\/\/doi.org\/10.1109\/tpami.2016.2577031.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1460_CR32","doi-asserted-by":"publisher","unstructured":"M. Busta, L. Neumann, J. Matas. Deep TextSpotter: An end-to-end trainable scene text localization and recognition framework. In Proceedings of IEEE International Conference on Computer Vision, Venice, Italy, pp. 2223\u20132231, 2017. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.242.","DOI":"10.1109\/iccv.2017.242"},{"key":"1460_CR33","doi-asserted-by":"publisher","unstructured":"T. He, Z. Tian, W. L. Huang, C. H. Shen, Y. Qiao, C. M. Sun. An end-to-end TextSpotter with explicit alignment and attention. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Salt Lake City, USA, pp. 5020\u20135029, 2018. DOI: https:\/\/doi.org\/10.1109\/cvpr.2018.00527.","DOI":"10.1109\/cvpr.2018.00527"},{"key":"1460_CR34","doi-asserted-by":"publisher","unstructured":"K. M. He, G. Gkioxari, P. Dollar, R. Girshick. Mask R-CNN. In Proceedings of IEEE International Conference on Computer Vision, Venice, Italy, pp. 2980\u20132988, 2017. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.322.","DOI":"10.1109\/iccv.2017.322"},{"key":"1460_CR35","doi-asserted-by":"publisher","unstructured":"W. Feng, W. H. He, F. Yin, X. Y. Zhang, C. L. Liu. TextDragon: An end-to-end framework for arbitrary shaped text spotting. In Proceedings of IEEE\/CVF International Conference on Computer Vision, Seoul, Republic of Korea, pp. 9075\u20139084, 2019. DOI: https:\/\/doi.org\/10.1109\/iccv.2019.00917.","DOI":"10.1109\/iccv.2019.00917"},{"issue":"6","key":"1460_CR36","doi-asserted-by":"publisher","first-page":"567","DOI":"10.1109\/34.24792","volume":"11","author":"F L Bookstein","year":"1989","unstructured":"F. L. Bookstein. Principal warps: Thin-plate splines and the decomposition of deformations. 
IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.11, no.6, pp.567\u2013585, 1989. DOI: https:\/\/doi.org\/10.1109\/34.24792.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"11","key":"1460_CR37","doi-asserted-by":"publisher","first-page":"8048","DOI":"10.1109\/tpami.2021.3107437","volume":"44","author":"Y L Liu","year":"2022","unstructured":"Y. L. Liu, C. H. Shen, L. W. Jin, T. He, P. Chen, C. Y. Liu, H. Chen. ABCNet v2: Adaptive Bezier-curve network for real-time end-to-end text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.44, no. 11, pp.8048\u20138064, 2022. DOI: https:\/\/doi.org\/10.1109\/tpami.2021.3107437.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1460_CR38","doi-asserted-by":"publisher","unstructured":"Z. Raisi, M. A. Naiel, G. Younes, S. Wardell, J. S. Zelek. Transformer-based text detection in the wild. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, Nashville, USA, pp. 3156\u20133165, 2021. DOI: https:\/\/doi.org\/10.1109\/cvprw53098.2021.00353.","DOI":"10.1109\/cvprw53098.2021.00353"},{"key":"1460_CR39","doi-asserted-by":"publisher","unstructured":"J. Q. Tang, W. Q. Zhang, H. Y. Liu, M. K. Yang, B. Jiang, G. L. Hu, X. Bai. Few could be better than all: Feature sampling and grouping for scene text detection. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp.4553\u20134562, 2022. DOI: https:\/\/doi.org\/10.1109\/cvpr52688.2022.00452.","DOI":"10.1109\/cvpr52688.2022.00452"},{"key":"1460_CR40","doi-asserted-by":"publisher","unstructured":"J. Bozic, D. Tabernik, D. Skocaj. Mixed supervision for surface-defect detection: From weakly to fully supervised learning. Computers in Industry, vol.129, Article number 103459, 2021. 
DOI: https:\/\/doi.org\/10.1016\/j.compind.2021.103459.","DOI":"10.1016\/j.compind.2021.103459"},{"key":"1460_CR41","doi-asserted-by":"publisher","unstructured":"P. Mlynarski, H. Delingette, A. Criminisi, N. Ayache. Deep learning with mixed supervision for brain tumor segmentation. Journal of Medical Imaging, vol. 6, no. 3, Article number 034002, 2019. DOI: https:\/\/doi.org\/10.1117\/1.jmi.6.3.034002.","DOI":"10.1117\/1.jmi.6.3.034002"},{"key":"1460_CR42","doi-asserted-by":"publisher","unstructured":"S. X. Tian, S. J. Lu, C. S. Li. WeText: Scene text detection under weak supervision. In Proceedings of IEEE International Conference on Computer Vision, Venice, Italy, pp. 1501\u20131509, 2017. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.166.","DOI":"10.1109\/iccv.2017.166"},{"key":"1460_CR43","doi-asserted-by":"publisher","unstructured":"T. Y. Lin, P. Dollar, R. Girshick, K. M. He, B. Hariharan, S. Belongie. Feature pyramid networks for object detection. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, USA, pp. 936\u2013944, 2017. DOI: https:\/\/doi.org\/10.1109\/cvpr.2017.106.","DOI":"10.1109\/cvpr.2017.106"},{"key":"1460_CR44","doi-asserted-by":"publisher","unstructured":"F. Milletari, N. Navab, S. A. Ahmadi. V-Net: Fully convolutional neural networks for volumetric medical image segmentation. In Proceedings of the 4th International Conference on 3D Vision, Stanford, USA, pp. 565\u2013571, 2016. DOI: https:\/\/doi.org\/10.1109\/3dv.2016.79.","DOI":"10.1109\/3dv.2016.79"},{"key":"1460_CR45","doi-asserted-by":"publisher","unstructured":"T. Y. Lin, P. Goyal, R. Girshick, K. M. He, P. Dollar. Focal loss for dense object detection. In Proceedings of IEEE International Conference on Computer Vision, Venice, Italy, pp. 2999\u20133007, 2017. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.324.","DOI":"10.1109\/iccv.2017.324"},{"key":"1460_CR46","doi-asserted-by":"publisher","unstructured":"D. Karatzas, L. Gomez-Bigorda, A. Nicolaou, S. 
Ghosh, A. Bagdanov, M. Iwamura, J. Matas, L. Neumann, V. R. Chandrasekhar, S. J. Lu, F. Shafait, S. Uchida, E. Valveny. ICDAR 2015 competition on Robust Reading. In Proceedings of the 13th International Conference on Document Analysis and Recognition, Tunis, Tunisia, pp. 1156\u20131160, 2015. DOI: https:\/\/doi.org\/10.1109\/icdar.2015.7333942.","DOI":"10.1109\/icdar.2015.7333942"},{"key":"1460_CR47","doi-asserted-by":"publisher","unstructured":"C. K. Ch\u2019ng, C. S. Chan. Total-text: A comprehensive dataset for scene text detection and recognition. In Proceedings of the 14th IAPR International Conference on Document Analysis and Recognition, Kyoto, Japan, pp. 935\u2013942, 2017. DOI: https:\/\/doi.org\/10.1109\/icdar.2017.157.","DOI":"10.1109\/icdar.2017.157"},{"key":"1460_CR48","doi-asserted-by":"publisher","unstructured":"R. Zhang, Y. S. Zhou, Q. Y. Jiang, Q. Song, N. Li, K. Zhou, L. Wang, D. Wang, M. H. Liao, M. K. Yang, X. Bai, B. G. Shi, D. Karatzas, S. J. Lu, C. V. Jawahar. ICDAR 2019 robust reading challenge on reading Chinese text on signboard. In Proceedings of International Conference on Document Analysis and Recognition, Sydney, Australia, pp. 1577\u20131581, 2019. DOI: https:\/\/doi.org\/10.1109\/icdar.2019.00253.","DOI":"10.1109\/icdar.2019.00253"},{"issue":"6","key":"1460_CR49","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"A. Krizhevsky, I. Sutskever, G. E. Hinton. ImageNet classification with deep convolutional neural networks. Communications of the ACM, vol.60, no.6, pp.84\u201390, 2017. DOI: https:\/\/doi.org\/10.1145\/3065386.","journal-title":"Communications of the ACM"},{"key":"1460_CR50","unstructured":"I. Loshchilov, F. Hutter. Decoupled weight decay regularization. 
In Proceedings of the 7th International Conference on Learning Representations, New Orleans, USA, 2019."},{"issue":"4","key":"1460_CR51","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"L C Chen","year":"2018","unstructured":"L. C. Chen, G. Papandreou, I. Kokkinos, K. Murphy, A. L. Yuille. DeepLab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.40, no.4, pp.834\u2013848, 2018. DOI: https:\/\/doi.org\/10.1109\/tpami.2017.2699184.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1460_CR52","doi-asserted-by":"publisher","unstructured":"D. Karatzas, F. Shafait, S. Uchida, M. Iwamura, L. G. I. Bigorda, S. R. Mestre, J. Mas, D. F. Mota, J. A. Almazan, L. P. de las Heras. ICDAR 2013 robust reading competition. In Proceedings of the 12th International Conference on Document Analysis and Recognition, Washington, USA, pp. 1484\u20131493, 2013. DOI: https:\/\/doi.org\/10.1109\/icdar.2013.221.","DOI":"10.1109\/icdar.2013.221"},{"key":"1460_CR53","doi-asserted-by":"publisher","unstructured":"N. Nayef, F. Yin, I. Bizid, H. Choi, Y. Feng, D. Karatzas, Z. B. Luo, U. Pal, C. Rigaud, J. Chazalon, W. Khlif, M. M. Luqman, J. C. Burie, C. L. Liu, J. M. Ogier. ICDAR2017 robust reading challenge on multi-lingual scene text detection and script identification - RRC-MLT. In Proceedings of the 14th IAPR International Conference on Document Analysis and Recognition, Kyoto, Japan, pp. 1454\u20131459, 2017. DOI: https:\/\/doi.org\/10.1109\/icdar.2017.237.","DOI":"10.1109\/icdar.2017.237"},{"key":"1460_CR54","unstructured":"A. Veit, T. Matera, L. Neumann, J. Matas, S. Belongie. 
COCO-Text: Dataset and benchmark for text detection and recognition in natural images, [Online], Available: https:\/\/arxiv.org\/abs\/1601.07140, 2016."},{"key":"1460_CR55","doi-asserted-by":"publisher","unstructured":"C. K. Chng, Y. L. Liu, Y. P. Sun, C. C. Ng, C. J. Luo, Z. H. Ni, C. M. Fang, S. T. Zhang, J. Y. Han, E. R. Ding, J. T. Liu, D. Karatzas, C. S. Seng Chan, L. W. Jin. ICDAR 2019 robust reading challenge on arbitrary-shaped text - RRC-ArT. In Proceedings of International Conference on Document Analysis and Recognition, Sydney, Australia, pp. 1571\u20131576, 2019. DOI: https:\/\/doi.org\/10.1109\/icdar.2019.00252.","DOI":"10.1109\/icdar.2019.00252"},{"key":"1460_CR56","doi-asserted-by":"publisher","unstructured":"Y. P. Sun, C. Q. Zhang, Z. M. Huang, J. M. Liu, J. Y. Han, E. R. Ding. TextNet: Irregular text reading from images with an end-to-end trainable network. In Proceedings of the 14th Asian Conference on Computer Vision, Perth, Australia, pp. 83\u201399, 2018. DOI: https:\/\/doi.org\/10.1007\/978-3-030-20893-6_6.","DOI":"10.1007\/978-3-030-20893-6_6"},{"key":"1460_CR57","doi-asserted-by":"publisher","unstructured":"M. H. Liao, G. Pang, J. Huang, T. Hassner, X. Bai. Mask TextSpotter v3: Segmentation proposal network for robust scene text spotting. In Proceedings of the 16th European Conference on Computer Vision, Glasgow, UK, pp. 706\u2013722, 2020. DOI: https:\/\/doi.org\/10.1007\/978-3-030-58621-8_41.","DOI":"10.1007\/978-3-030-58621-8_41"},{"key":"1460_CR58","doi-asserted-by":"publisher","first-page":"6200","DOI":"10.1109\/TIP.2022.3206615","volume":"31","author":"P Lu","year":"2022","unstructured":"P. Lu, H. Wang, S. G. Zhu, J. Wang, X. Bai, W. Y. Liu. Boundary TextSpotter: Toward arbitrary-shaped scene text spotting. IEEE Transactions on Image Processing, vol.31, pp. 6200\u20136212, 2022. 
DOI: https:\/\/doi.org\/10.1109\/tip.2022.3206615.","journal-title":"IEEE Transactions on Image Processing"},{"issue":"6","key":"1460_CR59","doi-asserted-by":"publisher","first-page":"7123","DOI":"10.1109\/TPAMI.2022.3223908","volume":"45","author":"S C Fang","year":"2023","unstructured":"S. C. Fang, Z. D. Mao, H. T. Xie, Y. X. Wang, C. G. Yan, Y. D. Zhang. ABINet++: Autonomous, bidirectional and iterative language modeling for scene text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol.45, no.6, pp.7123\u20137141, 2023. DOI: https:\/\/doi.org\/10.1109\/tpami.2022.3223908.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1460_CR60","doi-asserted-by":"publisher","unstructured":"X. Y. Zhou, C. Yao, H. Wen, Y. Z. Wang, S. C. Zhou, W. R. He, J. J. Liang. EAST: An efficient and accurate scene text detector. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, USA, pp. 2642\u20132651, 2017. DOI: https:\/\/doi.org\/10.1109\/cvpr.2017.283.","DOI":"10.1109\/cvpr.2017.283"},{"key":"1460_CR61","doi-asserted-by":"publisher","unstructured":"W. H. Wang, E. Z. Xie, X. Li, W. B. Hou, T. Lu, G. Yu, S. Shao. Shape robust text detection with progressive scale expansion network. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA, pp. 9328\u20139337, 2019. DOI: https:\/\/doi.org\/10.1109\/cvpr.2019.00956.","DOI":"10.1109\/cvpr.2019.00956"},{"key":"1460_CR62","doi-asserted-by":"publisher","unstructured":"A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber. Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd International Conference on Machine Learning, Pittsburgh, USA, pp. 369\u2013376, 2006. 
DOI: https:\/\/doi.org\/10.1145\/1143844.1143891.","DOI":"10.1145\/1143844.1143891"}],"container-title":["Machine Intelligence Research"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-023-1460-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11633-023-1460-6","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-023-1460-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T08:44:47Z","timestamp":1778057087000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11633-023-1460-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,7]]},"references-count":62,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["1460"],"URL":"https:\/\/doi.org\/10.1007\/s11633-023-1460-6","relation":{},"ISSN":["2731-538X","2731-5398"],"issn-type":[{"value":"2731-538X","type":"print"},{"value":"2731-5398","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,7]]},"assertion":[{"value":"16 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 February 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declared that they have no conflicts of interest to this work.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations of conflict of interest"}}]}}