{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,15]],"date-time":"2026-07-15T13:02:45Z","timestamp":1784120565863,"version":"3.55.0"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,5,31]],"date-time":"2022-05-31T00:00:00Z","timestamp":1653955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,5,31]],"date-time":"2022-05-31T00:00:00Z","timestamp":1653955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2023,2]]},"DOI":"10.1007\/s10489-022-03728-5","type":"journal-article","created":{"date-parts":[[2022,5,31]],"date-time":"2022-05-31T10:02:47Z","timestamp":1653991367000},"page":"3444-3458","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["STR Transformer: A Cross-domain Transformer for Scene Text Recognition"],"prefix":"10.1007","volume":"53","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5331-022X","authenticated-orcid":false,"given":"Xing","family":"Wu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bin","family":"Tang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ming","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianjia","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yike","family":"Guo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,5,31]]},"reference":[{"key":"3728_CR1","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1016\/j.neucom.2014.12.089","volume":"161","author":"JI Olszewska","year":"2015","unstructured":"Olszewska J I (2015) Active contour based optical character recognition for automated scene understanding. Neurocomputing 161:65\u201371","journal-title":"Neurocomputing"},{"issue":"5","key":"3728_CR2","doi-asserted-by":"publisher","first-page":"1063","DOI":"10.1109\/TMM.2016.2638622","volume":"19","author":"S Karaoglu","year":"2016","unstructured":"Karaoglu S, Tao R, Gevers T, Smeulders Arnold WM (2016) Words matter: Scene text for image classification and retrieval. IEEE Trans Multimed 19(5):1063\u20131076","journal-title":"IEEE Trans Multimed"},{"key":"3728_CR3","doi-asserted-by":"crossref","unstructured":"Singh A, Natarajan V, Shah M, Jiang Y, Chen X, Batra D, Parikh D, Rohrbach M (2019) Towards vqa models that can read. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8317\u20138326","DOI":"10.1109\/CVPR.2019.00851"},{"key":"3728_CR4","unstructured":"Wei L, Chen C, Wong K Y, Su Z, Han J (2016) Star-net: A spatial attention residue network for scene text recognition. In: British Machine Vision Conference 2016"},{"issue":"9","key":"3728_CR5","doi-asserted-by":"publisher","first-page":"2035","DOI":"10.1109\/TPAMI.2018.2848939","volume":"41","author":"B Shi","year":"2018","unstructured":"Shi B, Yang M, Wang X, Lyu P, Yao C, Bai X (2018) Aster: An attentional scene text recognizer with flexible rectification. IEEE Trans Pattern Anal Mach Intell 41(9):2035\u20132048","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"3728_CR6","doi-asserted-by":"crossref","unstructured":"Yu D, Li X, Zhang C, Liu T, Ding E (2020) Towards accurate scene text recognition with semantic reasoning networks. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR42600.2020.01213"},{"key":"3728_CR7","doi-asserted-by":"crossref","unstructured":"Fang S, Xie H, Wang Y, Mao Z, Zhang Y (2021) Read like humans: Autonomous, bidirectional and iterative language modeling for scene text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 7098\u20137107","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"3728_CR8","doi-asserted-by":"crossref","unstructured":"Chen Y, Zhuang T, Guo K (2021) Memory network with hierarchical multi-head attention for aspect-based sentiment analysis","DOI":"10.1007\/s10489-020-02069-5"},{"key":"3728_CR9","doi-asserted-by":"crossref","unstructured":"Sun S- (2021) Self-attention enhanced cnns with average margin loss for chinese zero pronoun resolution","DOI":"10.1007\/s10489-021-02697-5"},{"key":"3728_CR10","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, Gulcehre C, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using rnn encoder-decoder for statistical machine translation. In: Conference on Empirical Methods in Natural Language Processing (EMNLP 2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"3728_CR11","doi-asserted-by":"crossref","unstructured":"Neumann L, Matas J (2012) Real-time scene text localization and recognition. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2012.6248097"},{"issue":"18","key":"3728_CR12","doi-asserted-by":"publisher","first-page":"8027","DOI":"10.1016\/j.eswa.2014.07.008","volume":"41","author":"A Risnumawan","year":"2014","unstructured":"Risnumawan A, Shivakumara P, Chan C S, Tan C L (2014) A robust arbitrary text detection system for natural scene images. Expert Syst Appl 41(18):8027\u20138048","journal-title":"Expert Syst Appl"},{"key":"3728_CR13","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1016\/j.ins.2019.08.059","volume":"508","author":"X Wu","year":"2020","unstructured":"Wu X, Zhong M, Guo Y, Fujita H (2020) The assessment of small bowel motility with attentive deformable neural network. Inf Sci 508:22\u201332","journal-title":"Inf Sci"},{"key":"3728_CR14","doi-asserted-by":"crossref","unstructured":"Wu X, Chen C, Zhong M, Wang J (2021) Hal: Hybrid active learning for efficient labeling in medical domain","DOI":"10.1016\/j.neucom.2020.10.115"},{"key":"3728_CR15","doi-asserted-by":"publisher","first-page":"101913","DOI":"10.1016\/j.media.2020.101913","volume":"68","author":"X Wu","year":"2021","unstructured":"Wu X, Chen C, Zhong M, Wang J, Shi J (2021) Covid-al: The diagnosis of covid-19 with deep active learning. Med Image Anal 68:101913","journal-title":"Med Image Anal"},{"key":"3728_CR16","doi-asserted-by":"crossref","unstructured":"Baek J, Kim G, Lee J, Park S, Han D, Yun S, Oh S J, Lee H (2019) What is wrong with scene text recognition model comparisons? dataset and model analysis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4715\u2013 4723","DOI":"10.1109\/ICCV.2019.00481"},{"key":"3728_CR17","doi-asserted-by":"crossref","unstructured":"Su B, Lu S (2014) Accurate scene text recognition based on recurrent neural network","DOI":"10.1007\/978-3-319-16865-4_3"},{"issue":"11","key":"3728_CR18","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2016","unstructured":"Shi B, Xiang B, Cong Y (2016) An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Trans Pattern Anal Mach Intell 39(11):2298\u20132304","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"3728_CR19","doi-asserted-by":"publisher","first-page":"397","DOI":"10.1016\/j.patcog.2016.10.016","volume":"63","author":"B Su","year":"2017","unstructured":"Su B, Lu S (2017) Accurate recognition of words in scenes without character segmentation using recurrent neural network. Pattern Recogn 63:397\u2013405","journal-title":"Pattern Recogn"},{"key":"3728_CR20","doi-asserted-by":"crossref","unstructured":"Li W, Wang Q, Wu J, Yu Z (2021) Piecewise convolutional neural networks with position attention and similar bag attention for distant supervision relation extraction","DOI":"10.1007\/s10489-021-02632-8"},{"key":"3728_CR21","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1016\/j.knosys.2017.01.023","volume":"121","author":"M Pei","year":"2017","unstructured":"Pei M, Wu X, Guo Y, Fujita H (2017) Small bowel motility assessment based on fully convolutional networks and long short-term memory. Knowl-Based Syst 121:163\u2013172","journal-title":"Knowl-Based Syst"},{"key":"3728_CR22","doi-asserted-by":"publisher","first-page":"8610","DOI":"10.1609\/aaai.v33i01.33018610","volume":"33","author":"H Li","year":"2019","unstructured":"Li H, Wang P, Shen C, Zhang G (2019) Show, attend and read: A simple and strong baseline for irregular text recognition. Proceedings of the AAAI Conference on Artificial Intelligence 33:8610\u20138617","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"3728_CR23","doi-asserted-by":"crossref","unstructured":"Cheng Z, Xu Y, Fan B, Yi N, Zhou S (2018) Aon: Towards arbitrarily-oriented text recognition. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2018.00584"},{"key":"3728_CR24","doi-asserted-by":"crossref","unstructured":"Li H, Wang P, Shen C (2017) Towards end-to-end text spotting with convolutional recurrent neural networks. In: Proceedings of the IEEE international conference on computer vision, pp 5238\u20135246","DOI":"10.1109\/ICCV.2017.560"},{"key":"3728_CR25","doi-asserted-by":"crossref","unstructured":"Wang P, Li H, Shen C (2021) Towards end-to-end text spotting in natural scenes","DOI":"10.1109\/TPAMI.2021.3095916"},{"key":"3728_CR26","doi-asserted-by":"crossref","unstructured":"Lyu P, Liao M, Yao C, Wu W, Bai X (2018) Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 67\u201383","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"3728_CR27","doi-asserted-by":"crossref","unstructured":"Xing L, Tian Z, Huang W, Scott M R (2019) Convolutional character networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 9126\u20139136","DOI":"10.1109\/ICCV.2019.00922"},{"key":"3728_CR28","doi-asserted-by":"crossref","unstructured":"He P, Huang W, Qiao Y, Loy C C, Tang X (2016) Reading scene text in deep convolutional sequences. In: Thirtieth AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v30i1.10465"},{"key":"3728_CR29","doi-asserted-by":"crossref","unstructured":"Ma X, He K, Zhang D, Li D (2021) Pieed: Position information enhanced encoder-decoder framework for scene text recognition","DOI":"10.1007\/s10489-021-02219-3"},{"key":"3728_CR30","doi-asserted-by":"crossref","unstructured":"Yin G, Chen F, Dong Y, Li G (2021) Knowledge-aware recommendation model with dynamic co-attention and attribute regularize","DOI":"10.1007\/s10489-021-02598-7"},{"key":"3728_CR31","doi-asserted-by":"crossref","unstructured":"Lee C Y, Osindero S (2016) Recursive recurrent nets with attention modeling for ocr in the wild. In: IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2016.245"},{"key":"3728_CR32","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. In: International Conference on Learning Representations"},{"key":"3728_CR33","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles A, J\u00e9gou H (2021) Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, PMLR, pp 10347\u201310357"},{"key":"3728_CR34","doi-asserted-by":"crossref","unstructured":"Atienza R (2021) Vision transformer for fast and efficient scene text recognition. In: International Conference on Document Analysis and Recognition, Springer, pp 319\u2013334","DOI":"10.1007\/978-3-030-86549-8_21"},{"key":"3728_CR35","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"3728_CR36","doi-asserted-by":"crossref","unstructured":"Al-Rfou R, Choe D, Constant N, Guo M, Jones L (2019) Character-level language modeling with deeper self-attention. In: Proceedings of the AAAI conference on artificial intelligence, vol 33, pp 3159\u20133166","DOI":"10.1609\/aaai.v33i01.33013159"},{"key":"3728_CR37","doi-asserted-by":"crossref","unstructured":"Wang T, Zhu Y, Jin L, Luo C, Chen X, Wu Y, Wang Q, Cai M (2020) Decoupled attention network for text recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol 34, pp 12216\u201312224","DOI":"10.1609\/aaai.v34i07.6903"},{"key":"3728_CR38","unstructured":"Jaderberg M, Simonyan K, Vedaldi A, Zisserman A (2014) Synthetic data and artificial neural networks for natural scene text recognition. Neural Information Processing Systems"},{"key":"3728_CR39","doi-asserted-by":"crossref","unstructured":"Gupta A, Vedaldi A, Zisserman A (2016) Synthetic data for text localisation in natural images. In: IEEE Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2016.254"},{"key":"3728_CR40","doi-asserted-by":"crossref","unstructured":"Karatzas D, Shafait F, Uchida S, Iwamura M, Bigorda L G I, Mestre S R, Romeu J M, Mota D F, Almaz\u00e1n J, Heras L D L (2013) Icdar 2013 robust reading competition","DOI":"10.1109\/ICDAR.2013.221"},{"key":"3728_CR41","unstructured":"Kai W, Babenko B, Belongie S (2012) End-to-end scene text recognition. In: IEEE International Conference on Computer Vision"},{"key":"3728_CR42","doi-asserted-by":"crossref","unstructured":"Mishra A, Alahari K, Jawahar CV (2012) Scene text recognition using higher order language priors. In: BMVC-British Machine Vision Conference, BMVA","DOI":"10.5244\/C.26.127"},{"key":"3728_CR43","doi-asserted-by":"crossref","unstructured":"Karatzas D, Bigorda L G I, Nicolaou A, Ghosh S, Bagdanov A D, Iwamura M, Matas J, Neumann L, Chandrasekhar V, Lu S, Shafait F, Uchida S, Valveny E (2015) Icdar 2015 competition on robust reading","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"3728_CR44","doi-asserted-by":"crossref","unstructured":"Phan T Q, Shivakumara P, Tian S, Tan C L (2014) Recognizing text with perspective distortion in natural scenes. In: IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV.2013.76"},{"key":"3728_CR45","doi-asserted-by":"crossref","unstructured":"Cubuk E D, Zoph B, Shlens J, Le Q V (2020) Randaugment: Practical automated data augmentation with a reduced search space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp 702\u2013703","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"3728_CR46","doi-asserted-by":"crossref","unstructured":"Shi B, Wang X, Lyu P, Cong Y, Xiang B (2016) Robust scene text recognition with automatic rectification. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2016.452"},{"key":"3728_CR47","unstructured":"Wang J, Hu X (2017) Gated recurrent convolution neural network for ocr. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp 334\u2013343"},{"key":"3728_CR48","doi-asserted-by":"crossref","unstructured":"Borisyuk F, Gordo A, Sivakumar V (2018) Rosetta: Large scale system for text detection and recognition in images. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 71\u201379","DOI":"10.1145\/3219819.3219861"},{"key":"3728_CR49","doi-asserted-by":"crossref","unstructured":"Atienza R (2021) Vision transformer for fast and efficient scene text recognition. In: International Conference on Document Analysis and Recognition, Springer, pp 319\u2013334","DOI":"10.1007\/978-3-030-86549-8_21"},{"key":"3728_CR50","unstructured":"Zhang Y, Gueguen L, Zharkov I, Zhang P, Seifert K, Kadlec B (2017) Uber-text: A large-scale dataset for optical character recognition from street-level imagery. In: SUNw: Scene Understanding Workshop-CVPR, vol 2017, p 5"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-03728-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-022-03728-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-03728-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,26]],"date-time":"2024-09-26T04:21:44Z","timestamp":1727324504000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-022-03728-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,31]]},"references-count":50,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,2]]}},"alternative-id":["3728"],"URL":"https:\/\/doi.org\/10.1007\/s10489-022-03728-5","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,5,31]]},"assertion":[{"value":"6 May 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 May 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}