{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,19]],"date-time":"2026-04-19T04:13:20Z","timestamp":1776572000790,"version":"3.51.2"},"reference-count":103,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,4,3]],"date-time":"2025-04-03T00:00:00Z","timestamp":1743638400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,3]],"date-time":"2025-04-03T00:00:00Z","timestamp":1743638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["IJDAR"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s10032-025-00523-z","type":"journal-article","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T14:28:38Z","timestamp":1743776918000},"page":"681-700","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["An adaptive multi-head self-attention coupled with attention filtered LSTM for advanced scene text recognition"],"prefix":"10.1007","volume":"28","author":[{"given":"Prabu","family":"Selvam","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"S. N.","family":"Kumar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"S.","family":"Kannadhasan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"key":"523_CR1","doi-asserted-by":"publisher","first-page":"100895","DOI":"10.1109\/ACCESS.2022.3207469","volume":"10","author":"P Selvam","year":"2022","unstructured":"Selvam, P., Koilraj, J.A.S., Romero, C.A.T., Alharbi, M., Mehbodniya, A., Webber, J.L., Sengan, S.: A transformer-based framework for scene text recognition. IEEE Access 10, 100895\u2013100910 (2022)","journal-title":"IEEE Access"},{"key":"523_CR2","doi-asserted-by":"crossref","unstructured":"S. Albawi, T. A. Mohammed and S. Al-Zawi, Understanding of a convolutional neural network, In the proceedings of the international conference on engineering and technology, Antalya, Turkey, 2017, pp. 1\u20136.","DOI":"10.1109\/ICEngTechnol.2017.8308186"},{"issue":"8","key":"523_CR3","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"523_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.physd.2019.132306","volume":"404","author":"A Sherstinsky","year":"2020","unstructured":"Sherstinsky, A.: Fundamentals of recurrent neural network (RNN) and long short-term memory (LSTM) network. Physica D: Non-linear Phenomena 404, 132306 (2020)","journal-title":"Physica D: Non-linear Phenomena"},{"issue":"1","key":"523_CR5","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1109\/MASSP.1986.1165342","volume":"3","author":"L Rabiner","year":"1986","unstructured":"Rabiner, L., Juang, B.: An introduction to hidden Markov models. IEEE ASSP Mag. 3(1), 4\u201316 (1986)","journal-title":"IEEE ASSP Mag."},{"key":"523_CR6","doi-asserted-by":"crossref","unstructured":"A. Ray, A. Chandawala and S. Chaudhury, Character Recognition using conditional random field based recognition engine, 2013 12th international conference on document analysis and recognition, Washington, DC, USA, pp. 18\u201322, 2013.","DOI":"10.1109\/ICDAR.2013.13"},{"key":"523_CR7","doi-asserted-by":"crossref","unstructured":"W. Hu, X. Cai, J. Hou, S. Yi, and Z. Lin, GTC: Guided training of CTC towards efficient and accurate scene text recognition, In the proceedings of the AAAI conference on artificial intelligence, New York, USA, vol. 34, no. 07, 2020, pp. 11005\u201311012.","DOI":"10.1609\/aaai.v34i07.6735"},{"key":"523_CR8","doi-asserted-by":"crossref","unstructured":"T. Wang, Y. Zhu, I. Jin, C. Luo, X. Chen, Y. Wu, Q. Wang, and M. Cai, Decoupled attention network for text recognition, Proceedings of the AAAI conference on artificial intelligence, 2020, pp. 12216\u201312224.","DOI":"10.1609\/aaai.v34i07.6903"},{"key":"523_CR9","doi-asserted-by":"crossref","unstructured":"M. Liao, J. Zhang, Z. Wang, F. Xie, J. Liang, P. Lyu, C. Yao, and X. Bai, Scene text recognition from two-dimensional perspective, In Proceedings of the AAAI conference on artificial intelligence, vol. 33, no. 01, 2018, pp. 8714-8721","DOI":"10.1609\/aaai.v33i01.33018714"},{"key":"523_CR10","doi-asserted-by":"crossref","unstructured":"Y. Liu, Sequence Recognition of Scene Text Based on CRNN and CTPN Models, In the proceedings of the 2022 6th international conference on electronic information technology and computer engineering, Xiamen, China, 2023, pp. 196\u2013200.","DOI":"10.1145\/3573428.3573462"},{"key":"523_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10462-023-10530-3","volume":"56","author":"M Ghosh","year":"2023","unstructured":"Ghosh, M., Mukherjee, H., Obaidullah, S.M., Gao, X.Z., Roy, K.: Scene text understanding: recapitulating the past decade. Artif. Intell. Rev. 56, 1\u201373 (2023)","journal-title":"Artif. Intell. Rev."},{"issue":"14","key":"523_CR12","doi-asserted-by":"publisher","first-page":"20255","DOI":"10.1007\/s11042-022-12693-7","volume":"81","author":"F Naiemi","year":"2022","unstructured":"Naiemi, F., Ghods, V., Khalesi, H.: Scene text detection and recognition: a survey. Multimedia Tools Appl. 81(14), 20255\u201320290 (2022)","journal-title":"Multimedia Tools Appl."},{"issue":"2","key":"523_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3440756","volume":"54","author":"X Chen","year":"2021","unstructured":"Chen, X., Jin, L., Zhu, Y., Luo, C., Wang, T.: Text recognition in the wild: a survey. ACM Comput. Surv. 54(2), 1\u201335 (2021)","journal-title":"ACM Comput. Surv."},{"key":"523_CR14","doi-asserted-by":"crossref","unstructured":"Bissacco, M. Cummins, Y. Netzer and H. Neven, PhotoOCR: Reading Text in Uncontrolled Conditions, 2013 IEEE international conference on computer vision, Sydney, NSW, Australia, 2013, pp. 785\u2013792.","DOI":"10.1109\/ICCV.2013.102"},{"key":"523_CR15","doi-asserted-by":"crossref","unstructured":"Z. R. Tan, S. Tian and C. L. Tan, Using pyramid of histogram of oriented gradients on natural scene text recognition, 2014 IEEE international conference on image processing (ICIP), Paris, France, 2014, pp. 2629\u20132633.","DOI":"10.1109\/ICIP.2014.7025532"},{"key":"523_CR16","unstructured":"T. Wang, D. J. Wu, A. Coates and A. Y. Ng, End-to-end text recognition with convolutional neural networks,\u201d Proceedings of the 21st international conference on pattern recognition (ICPR2012), Tsukuba, Japan, 2012, pp. 3304\u20133308."},{"key":"523_CR17","doi-asserted-by":"crossref","unstructured":"L. Neumann and J. Matas, On Combining Multiple Segmentations in Scene Text Recognition, 2013 12th international conference on document analysis and recognition, Washington, DC, USA, 2013, pp. 523\u2013527.","DOI":"10.1109\/ICDAR.2013.110"},{"key":"523_CR18","doi-asserted-by":"crossref","unstructured":"X. Wang et al., \u201cEnd-to-End Scene Text Recognition in Videos Based on Multi Frame Tracking, 2017 14th IAPR international conference on document analysis and recognition (ICDAR), Kyoto, Japan, 2017, pp. 1255\u20131260.","DOI":"10.1109\/ICDAR.2017.207"},{"issue":"01","key":"523_CR19","first-page":"8714","volume":"33","author":"M Liao","year":"2019","unstructured":"Liao, M., et al.: Scene text recognition from two-dimensional perspective. Proc. Conf. AAAI Artif. Intell. 33(01), 8714\u20138721 (2019)","journal-title":"Proc. Conf. AAAI Artif. Intell."},{"key":"523_CR20","doi-asserted-by":"crossref","unstructured":"B. Shi, X. Bai and C. Yao, An End-to-End Trainable Neural Network for Image-Based Sequence Recognition and Its Application to Scene Text Recognition, In IEEE transactions on pattern analysis and machine intelligence, vol. 39, no. 11, pp. 2298\u20132304.","DOI":"10.1109\/TPAMI.2016.2646371"},{"key":"523_CR21","doi-asserted-by":"publisher","first-page":"397","DOI":"10.1016\/j.patcog.2016.10.016","volume":"63","author":"B Su","year":"2017","unstructured":"Su, B., Lu, S.: Accurate recognition of words in scenes without character segmentation using recurrent neural network. Pattern Recognit. 63, 397\u2013405 (2017)","journal-title":"Pattern Recognit."},{"key":"523_CR22","first-page":"1","volume":"30","author":"J Wang","year":"2017","unstructured":"Wang, J., Hu, X.: Gated recurrent convolution neural network for ocr. Adv. Neural. Inf. Process. Syst. 30, 1\u201310 (2017)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"1","key":"523_CR23","doi-asserted-by":"publisher","first-page":"2676780","DOI":"10.1155\/2021\/2676780","volume":"2021","author":"MP Kantipudi","year":"2021","unstructured":"Kantipudi, M.P., Kumar, S., Kumar Jha, A.: Scene text recognition based on bidirectional LSTM and deep neural network. Comput. Intell. Neurosci. 2021(1), 2676780 (2021)","journal-title":"Comput. Intell. Neurosci."},{"issue":"5","key":"523_CR24","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1109\/TPAMI.2008.137","volume":"31","author":"A Graves","year":"2009","unstructured":"Graves, A., Liwicki, M., Fern\u00e1ndez, S., Bertolami, R., Bunke, H., Schmidhuber, J.: A Novel connectionist system for unconstrained handwriting recognition. IEEE Trans. Pattern Anal. Mach. Intell. 31(5), 855\u2013868 (2009)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"523_CR25","doi-asserted-by":"crossref","unstructured":"F. Zheng, C. Chen, K. Wang and W. Wang, A New Strategy for Improving the Accuracy in Scene Text Recognition, 2023 4th international conference on electronic communication and artificial intelligence (ICECAI), Guangzhou, China, 2023, pp. 319\u2013323.","DOI":"10.1109\/ICECAI58670.2023.10176817"},{"issue":"7","key":"523_CR26","doi-asserted-by":"publisher","first-page":"369","DOI":"10.3390\/info14070369","volume":"14","author":"W Yu","year":"2023","unstructured":"Yu, W., Ibrayim, M., Hamdulla, A.: Scene text recognition based on improved CRNN. Information (Basel) 14(7), 369 (2023)","journal-title":"Information (Basel)"},{"key":"523_CR27","doi-asserted-by":"crossref","unstructured":"Z. Qiao, Y. Zhou, D. Yang, Y. Zhou and W. Wang, SEED: semantics enhanced encoder-decoder framework for Scene Text Recognition, 2020 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Seattle, WA, USA, 2020, pp. 13525\u201313534.","DOI":"10.1109\/CVPR42600.2020.01354"},{"issue":"3","key":"523_CR28","first-page":"3549","volume":"76","author":"BH Nayef","year":"2023","unstructured":"Nayef, B.H., Sheikh Abdullah, S.N.H., Sulaiman, R., Saeed, A.M.: Text extraction with optimal bi-LSTM\u201d. Comput. Mater. Contin. 76(3), 3549\u20133567 (2023)","journal-title":"Comput. Mater. Contin."},{"key":"523_CR29","doi-asserted-by":"crossref","unstructured":"R. Litman, O. Anschel, S. Tsiper, R. Litman, S. Mazor and R. Manmatha, SCATTER: selective context attentional scene text recognizer, 2020 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Seattle, WA, USA, 2020, pp. 11959\u201311969.","DOI":"10.1109\/CVPR42600.2020.01198"},{"key":"523_CR30","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1016\/j.neucom.2019.01.094","volume":"339","author":"Y Gao","year":"2019","unstructured":"Gao, Y., Chen, Y., Wang, J., Tang, M., Lu, H.: Reading scene text with fully convolutional sequence modeling. Neurocomputing 339, 161\u2013170 (2019)","journal-title":"Neurocomputing"},{"key":"523_CR31","doi-asserted-by":"publisher","first-page":"159906","DOI":"10.1109\/ACCESS.2020.3020387","volume":"8","author":"G Wei","year":"2020","unstructured":"Wei, G., Rong, W., Liang, Y., Xiao, X., Liu, X.: Toward arbitrary-shaped text spotting based on end-to-end. IEEE Access 8, 159906\u2013159914 (2020)","journal-title":"IEEE Access"},{"issue":"1s","key":"523_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3231737","volume":"15","author":"H Xie","year":"2019","unstructured":"Xie, H., Fang, S., Zha, Z.-J., Yang, Y., Li, Y., Zhang, Y.: Convolutional attention networks for scene text recognition. ACM Trans. Multimed. Comput. Commun. Appl. 15(1s), 1\u201317 (2019)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"523_CR33","doi-asserted-by":"crossref","unstructured":"S. Fang, H. Xie, Z.-J. Zha, N. Sun, J. Tan, and Y. Zhang, Attention and language ensemble for scene text recognition with convolutional sequence modeling, In\u00a0proceedings of the 26th ACM international conference on Multimedia, 2018, pp. 248\u2013256.","DOI":"10.1145\/3240508.3240571"},{"key":"523_CR34","doi-asserted-by":"crossref","unstructured":"A. Graves, S. Fern\u00e1ndez, F. Gomez, and J. Schmidhuber, Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks, in\u00a0Proceedings of the 23rd international conference on Machine learning - ICML \u201906, 2006, pp. 369 - 376.","DOI":"10.1145\/1143844.1143891"},{"key":"523_CR35","unstructured":"D. Bahdanau, K. Cho, and Y. Bengio, Neural machine translation by jointly learning to align and translate, In Proceedings of the 3rd international conference on learning representations (ICLR\u201915), 2015, pp. 1\u201315."},{"key":"523_CR36","doi-asserted-by":"crossref","unstructured":"X. Qi, Y. Chen, R. Xiao, C. -G. Li, Q. Zou and S. Cui, A Novel Joint Character Categorization and Localization Approach for Character-Level Scene Text Recognition, 2019 international conference on document analysis and recognition workshops (ICDARW), Sydney, NSW, Australia, 2019, pp. 83\u201390.","DOI":"10.1109\/ICDARW.2019.40086"},{"key":"523_CR37","doi-asserted-by":"crossref","unstructured":"Y. Gao, Y. Chen, J. Wang, M. Tang and H. Lu, Dense Chained attention network for scene text recognition, 2018 25th IEEE international conference on image processing (ICIP), Athens, Greece, 2018, pp. 679\u2013683.","DOI":"10.1109\/ICIP.2018.8451273"},{"key":"523_CR38","doi-asserted-by":"publisher","first-page":"62616","DOI":"10.1109\/ACCESS.2019.2916616","volume":"7","author":"L-Q Zuo","year":"2019","unstructured":"Zuo, L.-Q., Sun, H.-M., Mao, Q.-C., Qi, R., Jia, R.-S.: Natural scene text recognition based on encoder-decoder framework. IEEE Access 7, 62616\u201362623 (2019)","journal-title":"IEEE Access"},{"key":"523_CR39","doi-asserted-by":"crossref","unstructured":"Y. Wang and J. -E. Ha, Scene Text Recognition with Multi-Encoders, 2022 22nd international conference on control, automation and systems (ICCAS), Jeju, Korea, Republic of, 2022, pp. 1615\u20131620.","DOI":"10.23919\/ICCAS55662.2022.10003838"},{"key":"523_CR40","doi-asserted-by":"crossref","unstructured":"B. Shi, X. Wang, P. Lyu, et al., Robust scene text recognition with automatic rectification, IEEE conference on computer vision and pattern recognition (CVPR), 2016, pp. 4168\u20134176.","DOI":"10.1109\/CVPR.2016.452"},{"key":"523_CR41","unstructured":"L. Wei, C. Chen, K.Y. Wong, et al., STAR-Net: A SpaTial Attention Residue Network for Scene Text Recognition, British Machine Vision Conference, 2016, pp. 1\u20137."},{"key":"523_CR42","doi-asserted-by":"crossref","unstructured":"H. Zhong, Z. Yang, Z. Li, P. Wang, J. Tang, W. Cheng, and C. Yao VL-Reader: Vision and Language Reconstructor is an Effective Scene Text Recognizer, In proceedings of the 32nd ACM international conference on multimedia, 2024, pp. 4207\u20134216.","DOI":"10.1145\/3664647.3681271"},{"key":"523_CR43","doi-asserted-by":"crossref","unstructured":"D. Bautista and R. Atienza, Scene text recognition with permuted autoregressive sequence models, in Proceedings of the European conference on computer vision, Cham: Springer Nature Switzerland, 2022, pp. 178\u2013196.","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"523_CR44","doi-asserted-by":"crossref","unstructured":"S. Fang, H. Xie, Y. Wang, Z. Mao, and Y. Zhang, \u201cRead like humans: Autonomous, bidirectional and iterative language modeling for scene text recognition,\u201d in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 2021, pp. 7098\u20137107.","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"523_CR45","doi-asserted-by":"crossref","unstructured":"T. Zheng, Z. Chen, B. Huang, W. Zhang and Y. -G. Jiang, MRN: multiplexed routing network for incremental multilingual text recognition, 2023 IEEE\/CVF international conference on computer vision (ICCV), Paris, France, 2023, pp. 18598\u201318607.","DOI":"10.1109\/ICCV51070.2023.01709"},{"key":"523_CR46","doi-asserted-by":"crossref","unstructured":"Q. Jiang, J. Wang, D. Peng, C. Liu, and L. Jin, \u201cRevisiting scene text recognition: A data perspective, In proceedings of the IEEE\/CVF international conference on computer vision, 2023, pp. 20543\u201320554.","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"523_CR47","doi-asserted-by":"crossref","unstructured":"T. Guan, W. Shen, X. Yang, Q. Feng, Z. Jiang and X. Yang, Self-supervised character-to-character distillation for text recognition, 2023 IEEE\/CVF international conference on computer vision (ICCV), Paris, France, 2023, pp. 19416\u201319427.","DOI":"10.1109\/ICCV51070.2023.01784"},{"key":"523_CR48","doi-asserted-by":"crossref","unstructured":"B. Na, Y. Kim, and S. Park, Multi-modal text recognition networks: Interactive enhancements between visual and semantic features, In European conference on computer vision, 2022, pp. 446\u2013463.","DOI":"10.1007\/978-3-031-19815-1_26"},{"key":"523_CR49","doi-asserted-by":"crossref","unstructured":"M. Rang, Z. Bi, C. Liu, Y. Wang and K. Han, An Empirical Study of Scaling Law for Scene Text Recognition, 2024 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Seattle, WA, USA, 2024, pp. 15619\u201315629.","DOI":"10.1109\/CVPR52733.2024.01479"},{"key":"523_CR50","doi-asserted-by":"crossref","unstructured":"Z. Zhao et al., Multi-modal In-Context Learning Makes an Ego-evolving scene text recognizer, 2024 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Seattle, WA, USA, 2024, pp. 15567\u201315576.","DOI":"10.1109\/CVPR52733.2024.01474"},{"key":"523_CR51","doi-asserted-by":"crossref","unstructured":"J. Xu, Y. Wang, H. Xie and Y. Zhang, OTE: Exploring Accurate scene text recognition using one token, 2024 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Seattle, WA, USA, 2024, pp. 28327\u201328336.","DOI":"10.1109\/CVPR52733.2024.02676"},{"key":"523_CR52","doi-asserted-by":"crossref","unstructured":"L. Zhao, Z. Wu, X. Wu, G. Wilsbacher, and S. Wang, Background-insensitive scene text recognition with text semantic segmentation, In european conference on computer vision, 2022, pp. 163\u2013182.","DOI":"10.1007\/978-3-031-19806-9_10"},{"key":"523_CR53","doi-asserted-by":"crossref","unstructured":"D. Zhong, S. Lyu, P. Shivakumara, B. Yin, J. Wu, U. Pal, and Y. Lu, Sgbanet: Semantic gan and balanced attention network for arbitrarily oriented scene text recognition, In European conference on computer vision, 2022, pp. 464\u2013480.","DOI":"10.1007\/978-3-031-19815-1_27"},{"key":"523_CR54","doi-asserted-by":"crossref","unstructured":"A. Aberdam et al., CLIPTER: Looking at the Bigger Picture in Scene Text Recognition, 2023 IEEE\/CVF international conference on computer vision (ICCV), Paris, France, 2023, pp. 21649\u201321660.","DOI":"10.1109\/ICCV51070.2023.01984"},{"key":"523_CR55","doi-asserted-by":"crossref","unstructured":"C. Cheng, P. Wang, C. Da, Q. Zheng and C. Yao, LISTER: neighbor decoding for length-insensitive scene text recognition, 2023 IEEE\/CVF international conference on computer vision (ICCV), Paris, France, 2023, pp. 19484\u201319494.","DOI":"10.1109\/ICCV51070.2023.01790"},{"key":"523_CR56","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/j.patcog.2019.01.020","volume":"90","author":"C Luo","year":"2019","unstructured":"Luo, C., Jin, L., Sun, Z.: MORAN: A multi-object rectified attention network for scene text recognition. Pattern Recogn. 90, 109\u2013118 (2019)","journal-title":"Pattern Recogn."},{"issue":"6","key":"523_CR57","doi-asserted-by":"publisher","first-page":"2789","DOI":"10.1109\/TIP.2016.2555080","volume":"25","author":"X Bai","year":"2016","unstructured":"Bai, X., Yao, C., Liu, W.: Strokelets: a learned multi-scale mid-level representation for scene text recognition. IEEE Trans. Image Process. 25(6), 2789\u20132802 (2016)","journal-title":"IEEE Trans. Image Process."},{"key":"523_CR58","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1016\/j.patcog.2017.04.027","volume":"70","author":"L Gomez","year":"2017","unstructured":"Gomez, L., Karatzas, D.: Textproposals: a text-specific selective search algorithm for word spotting in the wild. Pattern Recogn. 70, 60\u201374 (2017)","journal-title":"Pattern Recogn."},{"key":"523_CR59","doi-asserted-by":"publisher","first-page":"2853","DOI":"10.1016\/j.patcog.2014.03.023","volume":"47","author":"C Shi","year":"2021","unstructured":"Shi, C., Wang, C., Xiao, B., Gao, S., Hu, J.: End-to-end scene text recognition using tree-structured models. Pattern Recognit. 47, 2853\u20132866 (2021)","journal-title":"Pattern Recognit."},{"key":"523_CR60","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.patcog.2020.107692","volume":"111","author":"Q Lin","year":"2021","unstructured":"Lin, Q., Luo, C., Jin, L., Lai, S.: STAN: A sequential transformation attention-based network for scene text recognition. Pattern Recognit. 111, 1\u20139 (2021)","journal-title":"Pattern Recognit."},{"key":"523_CR61","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.patcog.2023.110206","volume":"149","author":"A Banerjee","year":"2024","unstructured":"Banerjee, A., Shivakumara, P., Bhattacharya, S., Pal, U., Liu, C.L.: An end-to-end model for multi-view scene text recognition. Pattern Recognit. 149, 1\u201334 (2024)","journal-title":"Pattern Recognit."},{"key":"523_CR62","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1016\/j.patrec.2021.07.016","volume":"151","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., Fu, Z., Huang, F., Liu, Y.: PMMN: Pre-trained multi-model network for scene text recognition. Pattern Recognit. Lett. 151, 103\u2013111 (2021)","journal-title":"Pattern Recognit. Lett."},{"key":"523_CR63","first-page":"1","volume":"178","author":"S Tian","year":"2023","unstructured":"Tian, S., Zhu, K.X., Qin, H.B., Yang, C.: Dynamic receptive field adaptation for scene text recognition. Pattern Recognit. Lett. 178, 1\u201310 (2023)","journal-title":"Pattern Recognit. Lett."},{"key":"523_CR64","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1016\/j.neucom.2019.11.049","volume":"381","author":"X Chen","year":"2023","unstructured":"Chen, X., Wang, T., Zhu, Y., Jin, L., Luo, C.: Adaptive embedding gate for attention-based scene text recognition. Neurocomputing 381, 261\u2013271 (2023)","journal-title":"Neurocomputing"},{"key":"523_CR65","doi-asserted-by":"publisher","first-page":"410","DOI":"10.1016\/j.neucom.2020.06.071","volume":"413","author":"H Li","year":"2022","unstructured":"Li, H., Yang, D., Huang, S., Lam, K.M., Jin, L., Zhuang, Z.: Two-dimensional multi-scale perceptive context for scene text recognition. Neurocomputing 413, 410\u2013421 (2022)","journal-title":"Neurocomputing"},{"key":"523_CR66","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.knosys.2023.110964","volume":"280","author":"X Yan","year":"2023","unstructured":"Yan, X., Fang, Z., Jin, Y.: An adaptive n-gram transformer for multi-scale scene text recognition. Knowledge-Based Syst. 280, 1\u20139 (2023)","journal-title":"Knowledge-Based Syst."},{"key":"523_CR67","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.eswa.2023.122769","volume":"243","author":"Q Zhou","year":"2024","unstructured":"Zhou, Q., Gao, J., Yuan, Y., Wang, Q.: RRTrN: A lightweight and effective backbone for scene text recognition. Expert Syst. Appl. 243, 1\u201310 (2024)","journal-title":"Expert Syst. Appl."},{"key":"523_CR68","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.jvcir.2021.103289","volume":"80","author":"Y Wu","year":"2021","unstructured":"Wu, Y., Fan, J., Tao, R., Wang, J., Qin, H., Liu, A., Liu, X.: Sequential alignment attention model for scene text recognition. J. Vis. Commun. Image Represent. 80, 1\u20138 (2021)","journal-title":"J. Vis. Commun. Image Represent."},{"key":"523_CR69","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.inffus.2023.102080","volume":"102","author":"L Diao","year":"2024","unstructured":"Diao, L., Tang, X., Wang, J., Xie, G., Hu, J.: Hierarchical visual-semantic interaction for scene text recognition. Infor. Fusion 102, 1\u20139 (2024)","journal-title":"Infor. Fusion"},{"issue":"2","key":"523_CR70","doi-asserted-by":"publisher","first-page":"2071","DOI":"10.32604\/iasc.2023.029105","volume":"35","author":"S Prabu","year":"2023","unstructured":"Prabu, S., Sundar, K.J.A.: Enhanced attention-based encoder-decoder framework for text recognition. Intell. Automat. Soft Comput. 35(2), 2071\u20132086 (2023)","journal-title":"Intell. Automat. Soft Comput."},{"key":"523_CR71","doi-asserted-by":"crossref","unstructured":"S. Prabu and K. J. Abraham Sundar, K.J, DocPresRec Doctor\u2019s Handwritten Prescription Recognition Using Deep Learning Algorithm, Artificial Intelligence in Telemedicine: Artificial Intelligence in Telemedicine Processing of Biosignals and Medical Images, 2023, . 33\u201348.","DOI":"10.1201\/9781003307778-4"},{"key":"523_CR72","first-page":"2017","volume":"28","author":"M Jaderberg","year":"2015","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., Kavukcuoglu, K.: Spatial transformer networks. In Proc. Adv. Neural Info. Proc. Syst. 28, 2017\u20132025 (2015)","journal-title":"In Proc. Adv. Neural Info. Proc. Syst."},{"key":"523_CR73","doi-asserted-by":"crossref","unstructured":"B. Shi, X. Wang, P. Lyu, C. Yao and X. Bai, Robust scene text recognition with automatic rectification, In proceedings of the computer vision and pattern recognition, Las Vegas, USA, 2016, pp. 4168\u20134176.","DOI":"10.1109\/CVPR.2016.452"},{"key":"523_CR74","doi-asserted-by":"crossref","unstructured":"T. Zheng, Z. Chen, J. Bai, H. Xie, and Y. G. Jiang, TPS++: Attention-Enhanced Thin-Plate Spline for Scene Text Recognition, In the proceedings of the thirty-second international joint conference on artificial Intelligence, Macao, SAR, 2023, pp. 1777- 1785.","DOI":"10.24963\/ijcai.2023\/197"},{"key":"523_CR75","unstructured":"K. Simonyan, and A. Zisserman, Very deep convolutional networks for large-scale image recognition, In the proceedings of the 3rd international conference on learning representations, ICLR 2015, San Diego, CA, USA, 2015, pp. 1\u201314."},{"key":"523_CR76","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, and J. Sun, Deep residual learning for image recognition, In proceedings of the IEEE conference on computer vision and pattern recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"1","key":"523_CR77","first-page":"1","volume":"6","author":"Z Raisi","year":"2021","unstructured":"Raisi, Z., Naiel, M.A., Fieguth, P., Wardell, S., Zelek, J.: 2D positional embedding-based transformer for scene text recognition. J. Comput. Vision Imag. Syst. 6(1), 1\u20134 (2021)","journal-title":"J. Comput. Vision Imag. Syst."},{"key":"523_CR78","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, Jakob Uszkoreit et al., Attention is all you need, In proceedings of the 31st conference on neural information processing systems (NIPS 2017), Long Beach, CA, USA, 2027, pp. 1\u201311."},{"issue":"11","key":"523_CR79","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"523_CR80","doi-asserted-by":"crossref","unstructured":"A. Gupta, A. Vedaldi, and A. Zisserman, Synthetic data for text localization in natural images, In proceedings of computer vision and pattern recognition, 2016, pp. 2315\u20132324.","DOI":"10.1109\/CVPR.2016.254"},{"key":"523_CR81","unstructured":"M. Jaderberg, K. Simonyan, A. Vedaldi, and A. Zisserman, Synthetic data and artificial neural networks for natural scene text recognition, In proceedings of advances in neural information processing systems Workshop, 2014, pp. 1\u201310."},{"key":"523_CR82","doi-asserted-by":"crossref","unstructured":"M. Jaderberg, A. Vedaldi, and A. Zisserman, Deep features for text spotting, In proceedings of 13th European conference, Zurich, Switzerland, 2014, pp. 512\u2013528.","DOI":"10.1007\/978-3-319-10593-2_34"},{"key":"523_CR83","doi-asserted-by":"crossref","unstructured":"R. Zhang et al., ICDAR 2019 Robust Reading Challenge on Reading Chinese Text on Signboard, 2019 International conference on document analysis and recognition (ICDAR), Sydney, NSW, Australia, 2019, pp. 1577\u20131581.","DOI":"10.1109\/ICDAR.2019.00253"},{"key":"523_CR84","doi-asserted-by":"crossref","unstructured":"N. Nayef et al., ICDAR2019 Robust reading challenge on multi-lingual scene text detection and recognition \u2014 RRC-MLT-2019, 2019 international conference on document analysis and recognition (ICDAR), Sydney, NSW, Australia, 2019, pp. 1582\u20131587.","DOI":"10.1109\/ICDAR.2019.00254"},{"key":"523_CR85","doi-asserted-by":"crossref","unstructured":"Y. Sun et al., ICDAR 2019 Competition on Large-Scale Street View Text with Partial Labeling - RRC-LSVT, 2019 international conference on document analysis and recognition (ICDAR), Sydney, NSW, Australia, 2019, pp. 1557\u20131562.","DOI":"10.1109\/ICDAR.2019.00250"},{"key":"523_CR86","doi-asserted-by":"crossref","unstructured":"C. K. Chng et al., ICDAR2019 Robust Reading Challenge on Arbitrary-Shaped Text - RRC-ArT, 2019 international conference on document analysis and recognition (ICDAR), Sydney, NSW, Australia, 2019, pp. 1571\u20131576.","DOI":"10.1109\/ICDAR.2019.00252"},{"key":"523_CR87","unstructured":"Y. Zhang, L. Gueguen, I. Zharkov, P. Zhang, K. Seifert, and B. Kadlec, Uber-text: A large-scale dataset for optical character recognition from street-level imagery, In SUNw: Scene Understanding Workshop-CVPR, vol. 2017. 5."},{"key":"523_CR88","doi-asserted-by":"crossref","unstructured":"B. Shi et al., ICDAR2017 Competition on Reading Chinese Text in the Wild (RCTW-17), 2017 14th IAPR international conference on document analysis and recognition (ICDAR), Kyoto, Japan, 2017, pp. 1429\u20131434.","DOI":"10.1109\/ICDAR.2017.233"},{"key":"523_CR89","unstructured":"A. Veit, T. Matera, L. Neumann, J. Matas, and S. Belongie, COCO-text: Dataset and benchmark for text detection and recognition in natural images, arXiv [cs.CV], 2016, pp. 1\u20138."},{"key":"523_CR90","doi-asserted-by":"crossref","unstructured":"A. Mishra, K. Alahari, and C. V. Jawahar, Scene Text Recognition using Higher Order Language Priors, In proceedings of the british machine vision conference, Surrey, UK, 2012, pp. 127.1\u2013127.11.","DOI":"10.5244\/C.26.127"},{"key":"523_CR91","doi-asserted-by":"crossref","unstructured":"K. Wang, B. Babenko, and S. Belongie, End-to-end scene text recognition, In proceedings of the 2011 international conference on computer vision, barcelona, Spain, 2011, pp. 1457\u20131464.","DOI":"10.1109\/ICCV.2011.6126402"},{"issue":"2\u20133","key":"523_CR92","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/s10032-004-0134-3","volume":"7","author":"SM Lucas","year":"2005","unstructured":"Lucas, S.M., Panaretos, A., Sosa, L., Tang, A., Wong, S., Young, R., Ashida, K., Na- Gai, H., Okamoto, M., Yamamoto, H., et al.: ICDAR 2003 robust reading competitions: entries, results, and future directions. Int. J. Document Anal. Recognit. (IJDAR) 7(2\u20133), 105\u2013122 (2005)","journal-title":"Int. J. Document Anal. Recognit. (IJDAR)"},{"key":"523_CR93","doi-asserted-by":"crossref","unstructured":"D. Karatzas, F. Shafait, S. Uchida, M. Iwamura, L.G. I Bigorda, S.R. Mestre, J. Mas, D.F. Mota, J.A. Almazan, L.P. De Las Heras, \u201cICDAR 2013 robust reading competition, In proceedings of the international conference on document analysis and recognition (ICDAR), Washington, DC, USA, 2013, pp. 1484\u20131493.","DOI":"10.1109\/ICDAR.2013.221"},{"key":"523_CR94","doi-asserted-by":"crossref","unstructured":"D. Karatzas, L. Gomez-Bigorda, A. Nicolaou, S. Ghosh, A. Bagdanov, M. Iwa-mura, J. Matas, L. Neumann, V.R. Chandrasekhar, S. Lu, et al., \u201cICDAR 2015 competition on robust reading, In proceedings of the international conference on document analysis and recognition (ICDAR), Tunis, Tunisia, 2015, pp. 1156\u20131160.","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"523_CR95","doi-asserted-by":"crossref","unstructured":"T. Q. Phan, P. Shivakumara, S. Tian, and C. L. Tan, Recognizing Text with Perspective Distortion in Natural Scenes, In proceedings of the IEEE international conference on computer vision, Sydney, NSW, Australia, 2013, pp. 569\u2013576.","DOI":"10.1109\/ICCV.2013.76"},{"issue":"18","key":"523_CR96","doi-asserted-by":"publisher","first-page":"8027","DOI":"10.1016\/j.eswa.2014.07.008","volume":"41","author":"A Risnumawan","year":"2014","unstructured":"Risnumawan, A., Shivakumara, P., Chan, C.S., Tan, C.L.: A robust arbitrary text detection system for natural scene images. Expert Syst. Appl. 41(18), 8027\u20138048 (2014)","journal-title":"Expert Syst. Appl."},{"key":"523_CR97","unstructured":"A. Mao, M. Mohri, and Y. Zhong, Cross-Entropy Loss Functions: Theoretical Analysis and Applications, in the Proceedings of the 40th international conference on machine learning, vol. 202, 2023, pp. 23803\u201323828."},{"key":"523_CR98","unstructured":"D. P. Kingma and J. L. Ba, Adam: A method for stochastic optimization, In proceedings of the 3rd international conference learning representations, Banff, AB, Canada, 2014, pp. 1\u201315."},{"key":"523_CR99","first-page":"1","volume":"20","author":"X Gao","year":"2024","unstructured":"Gao, X., Pang, Y., Liu, Y., Han, M., Yu, J., Wang, W., Chen, Y.: Multimodal visual-semantic representations learning for scene text recognition. ACM Transact. Multimedia Comput., Commun. Appl. 20, 1\u20138 (2024)","journal-title":"ACM Transact. Multimedia Comput., Commun. Appl."},{"key":"523_CR100","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110244","volume":"149","author":"M Yang","year":"2024","unstructured":"Yang, M., Yang, B., Liao, M., Zhu, Y., Bai, X.: Class-Aware Mask-guided feature refinement for scene text recognition. Pattern Recogn. 149, 110244 (2024)","journal-title":"Pattern Recogn."},{"key":"523_CR101","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.111178","volume":"284","author":"HE Kai","year":"2024","unstructured":"Kai, H.E., Jinlong, T.A.N.G., Zikang, L.I.U., Ziqi, Y.A.N.G.: HAFE: a hierarchical awareness and feature enhancement network for scene text recognition. Knowl.-Based Syst. 284, 111178 (2024)","journal-title":"Knowl.-Based Syst."},{"issue":"3","key":"523_CR102","doi-asserted-by":"publisher","first-page":"3229","DOI":"10.1007\/s40747-022-00916-1","volume":"9","author":"H Heng","year":"2023","unstructured":"Heng, H., Li, P., Guan, T., Yang, T.: Scene text recognition via context modeling for low-quality image in logistics industry. Complex Intell. Syst. 9(3), 3229\u20133248 (2023)","journal-title":"Complex Intell. Syst."},{"key":"523_CR103","doi-asserted-by":"publisher","first-page":"61892","DOI":"10.1109\/ACCESS.2023.3280547","volume":"11","author":"R Mahadshetti","year":"2023","unstructured":"Mahadshetti, R., Lee, G.-S., Choi, D.-J.: RMFPN: end-to-end scene text recognition using multi-feature pyramid network. IEEE Access 11, 61892\u201361900 (2023)","journal-title":"IEEE Access"}],"container-title":["International Journal on Document Analysis and Recognition (IJDAR)"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-025-00523-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10032-025-00523-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-025-00523-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T07:02:06Z","timestamp":1764313326000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10032-025-00523-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,3]]},"references-count":103,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["523"],"URL":"https:\/\/doi.org\/10.1007\/s10032-025-00523-z","relation":{},"ISSN":["1433-2833","1433-2825"],"issn-type":[{"value":"1433-2833","type":"print"},{"value":"1433-2825","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,3]]},"assertion":[{"value":"12 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have  no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}