{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:44:14Z","timestamp":1778082254478,"version":"3.51.4"},"reference-count":94,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,4,15]],"date-time":"2025-04-15T00:00:00Z","timestamp":1744675200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,15]],"date-time":"2025-04-15T00:00:00Z","timestamp":1744675200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s11263-025-02428-0","type":"journal-article","created":{"date-parts":[[2025,4,15]],"date-time":"2025-04-15T14:53:33Z","timestamp":1744728813000},"page":"5281-5301","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["SwinTextSpotter v2: Towards Better Synergy for Scene Text Spotting"],"prefix":"10.1007","volume":"133","author":[{"given":"Mingxin","family":"Huang","sequence":"first","affiliation":[]},{"given":"Dezhi","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Hongliang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zhenghao","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Chongyu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3037-173X","authenticated-orcid":false,"given":"Yuliang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[]},{"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,15]]},"reference":[{"key":"2428_CR1","doi-asserted-by":"crossref","unstructured":"Baek, Y., Shin, S., Baek, J., Park, S., Lee, J., Nam, D. & Lee, H. (2020). Character region attention for text spotting. In: European Conference on Computer Vision, Springer, pp 504\u2013521","DOI":"10.1007\/978-3-030-58526-6_30"},{"key":"2428_CR2","doi-asserted-by":"crossref","unstructured":"Bissacco, A., Cummins, M., Netzer, Y. & Neven, H. (2013). Photoocr: Reading text in uncontrolled conditions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 785\u2013792","DOI":"10.1109\/ICCV.2013.102"},{"issue":"6","key":"2428_CR3","doi-asserted-by":"publisher","first-page":"567","DOI":"10.1109\/34.24792","volume":"11","author":"FL Bookstein","year":"1989","unstructured":"Bookstein, F. L. (1989). Principal warps: Thin-plate splines and the decomposition of deformations. IEEE Transactions on Pattern Analysis and Machine Intelligence, 11(6), 567\u2013585.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2428_CR4","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A. & Zagoruyko, S. (2020). End-to-end object detection with transformers. In: European Conference on Computer Vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2428_CR5","first-page":"1","volume":"2022","author":"T Chen","year":"2022","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D. J., & Hinton, G. (2022). Pix2Seq: A language modeling framework for object detection. ICLR, 2022, 1\u20139.","journal-title":"ICLR"},{"key":"2428_CR6","doi-asserted-by":"crossref","unstructured":"Chng, CK., Liu, Y., Sun, Y., Ng, CC., Luo, C., Ni, Z., Fang, C., Zhang, S., Han, J., Ding, E., et\u00a0al. (2019). Icdar2019 robust reading challenge on arbitrary-shaped text-rrc-art. In: International Conference on Document Analysis and Recognition, IEEE, pp 1571\u20131576","DOI":"10.1109\/ICDAR.2019.00252"},{"issue":"1","key":"2428_CR7","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1007\/s10032-019-00334-z","volume":"23","author":"CK Ch\u2019ng","year":"2020","unstructured":"Ch\u2019ng, C. K., Chan, C. S., & Liu, C. L. (2020). Total-text: toward orientation robustness in scene text detection. International Journal on Document Analysis and Recognition, 23(1), 31\u201352.","journal-title":"International Journal on Document Analysis and Recognition"},{"key":"2428_CR8","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H. & Wei, Y. (2017). Deformable convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 764\u2013773","DOI":"10.1109\/ICCV.2017.89"},{"key":"2428_CR9","doi-asserted-by":"crossref","unstructured":"Das, A., Biswas, S., Pal, U., Llad\u00f3s, J. & Bhattacharya, S. (2024). Fasttextspotter: A high-efficiency transformer for multilingual scene text spotting. In: International Conference on Pattern Recognition, Springer, pp 135\u2013150","DOI":"10.1007\/978-3-031-78498-9_10"},{"key":"2428_CR10","doi-asserted-by":"crossref","unstructured":"Fang, S., Xie, H., Wang, Y., Mao, Z. & Zhang, Y. (2021). Read like humans: Autonomous, bidirectional and iterative language modeling for scene text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7098\u20137107","DOI":"10.1109\/CVPR46437.2021.00702"},{"issue":"6","key":"2428_CR11","doi-asserted-by":"publisher","first-page":"7123","DOI":"10.1109\/TPAMI.2022.3223908","volume":"45","author":"S Fang","year":"2022","unstructured":"Fang, S., Mao, Z., Xie, H., Wang, Y., Yan, C., & Zhang, Y. (2022). Abinet++: Autonomous, bidirectional and iterative language modeling for scene text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(6), 7123\u20137141.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2428_CR12","doi-asserted-by":"crossref","unstructured":"Feng, W., He, W., Yin, F., Zhang, XY. & Liu, CL. (2019). Textdragon: An end-to-end framework for arbitrary shaped text spotting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 9076\u20139085","DOI":"10.1109\/ICCV.2019.00917"},{"key":"2428_CR13","doi-asserted-by":"publisher","first-page":"619","DOI":"10.1007\/s11263-020-01388-x","volume":"129","author":"W Feng","year":"2021","unstructured":"Feng, W., Yin, F., Zhang, X. Y., He, W., & Liu, C. L. (2021). Residual dual scale scene text spotting by fusing bottom-up and top-down processing. International Journal of Computer Vision, 129, 619\u2013637.","journal-title":"International Journal of Computer Vision"},{"key":"2428_CR14","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1016\/j.patcog.2017.04.027","volume":"70","author":"L G\u00f3mez","year":"2017","unstructured":"G\u00f3mez, L., & Karatzas, D. (2017). Textproposals: a text-specific selective search algorithm for word spotting in the wild. Pattern Recognition, 70, 60\u201374.","journal-title":"Pattern Recognition"},{"key":"2428_CR15","doi-asserted-by":"crossref","unstructured":"Gupta, A., Vedaldi, A. & Zisserman, A. (2016). Synthetic data for text localisation in natural images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2315\u20132324","DOI":"10.1109\/CVPR.2016.254"},{"key":"2428_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P. & Girshick, R. (2017). Mask r-cnn. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 2961\u20132969","DOI":"10.1109\/ICCV.2017.322"},{"key":"2428_CR17","doi-asserted-by":"crossref","unstructured":"He, T., Tian, Z., Huang, W., Shen, C., Qiao, Y. & Sun, C. (2018). An end-to-end textspotter with explicit alignment and attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5020\u20135029","DOI":"10.1109\/CVPR.2018.00527"},{"key":"2428_CR18","unstructured":"Hu, J., Cao, L., Lu, Y., Zhang, S., Wang, Y., Li, K., Huang, F., Shao, L. & Ji, R. (2021). Istr: End-to-end instance segmentation with transformers. arXiv preprint arXiv:2105.00637"},{"key":"2428_CR19","doi-asserted-by":"crossref","unstructured":"Huang, M., Liu, Y., Peng, Z., Liu, C., Lin, D., Zhu, S., Yuan, N., Ding, K. & Jin, L. (2022). Swintextspotter: Scene text spotting via better synergy between text detection and text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4593\u20134603","DOI":"10.1109\/CVPR52688.2022.00455"},{"issue":"1","key":"2428_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11263-015-0823-z","volume":"116","author":"M Jaderberg","year":"2016","unstructured":"Jaderberg, M., Simonyan, K., Vedaldi, A., & Zisserman, A. (2016). Reading text in the wild with convolutional neural networks. International Journal of Computer Vision, 116(1), 1\u201320.","journal-title":"International Journal of Computer Vision"},{"key":"2428_CR21","volume-title":"Spatial transformer networks","author":"M Jaderberg","year":"2015","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., & Kavukcuoglu, K. (2015). Spatial transformer networks. Cambridge: MIT Press."},{"key":"2428_CR22","doi-asserted-by":"crossref","unstructured":"Jia, D., Yuan, Y., He, H., Wu, X., Yu, H., Lin, W., Sun, L., Zhang, C. & Hu, H. (2022). Detrs with hybrid matching. arXiv preprint arXiv:2207.13080","DOI":"10.1109\/CVPR52729.2023.01887"},{"key":"2428_CR23","first-page":"667","volume":"29","author":"X Jia","year":"2016","unstructured":"Jia, X., De Brabandere, B., Tuytelaars, T., & Gool, L. V. (2016). Dynamic filter networks. Advances in Neural Information Processing Systems, 29, 667\u2013675.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2428_CR24","doi-asserted-by":"crossref","unstructured":"Karatzas, D., Gomez-Bigorda, L., Nicolaou, A., Ghosh, S., Bagdanov, A., Iwamura, M., Matas, J., Neumann, L., Chandrasekhar, VR., Lu, S., et\u00a0al. (2015). Icdar 2015 competition on robust reading. In: International Conference on Document Analysis and Recognition, IEEE, pp 1156\u20131160","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"2428_CR25","doi-asserted-by":"crossref","unstructured":"Karatzas, D., Shafait, F., Uchida, S., Iwamura, M., i\u00a0Bigorda, LG., Mestre, SR., Mas, J., Mota, DF., Almazan, JA. & De\u00a0Las\u00a0Heras, LP. (2013). Icdar 2013 robust reading competition. In: International Conference on Document Analysis and Recognition, IEEE, pp 1484\u20131493","DOI":"10.1109\/ICDAR.2013.221"},{"key":"2428_CR26","doi-asserted-by":"crossref","unstructured":"Kil, T., Kim, S., Seo, S., Kim, Y. & Kim, D. (2023). Towards unified scene text spotting based on sequence generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 15223\u201315232","DOI":"10.1109\/CVPR52729.2023.01461"},{"key":"2428_CR27","doi-asserted-by":"crossref","unstructured":"Kittenplon, Y., Lavi, I., Fogel, S., Bar, Y., Manmatha, R. & Perona, P. (2022). Towards weakly-supervised text spotting using a multi-task transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4604\u20134613","DOI":"10.1109\/CVPR52688.2022.00456"},{"key":"2428_CR28","doi-asserted-by":"crossref","unstructured":"Li, H., Wang, P. & Shen, C. (2017). Towards end-to-end text spotting with convolutional recurrent neural networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 5238\u20135246","DOI":"10.1109\/ICCV.2017.560"},{"key":"2428_CR29","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Liu, S., Guo, J., Ni, LM. & Zhang, L. (2022). Dn-detr: Accelerate detr training by introducing query denoising. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13619\u201313627","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"2428_CR30","doi-asserted-by":"crossref","unstructured":"Liao, M., Pang, G., Huang, J., Hassner, T. & Bai, X. (2020). Mask textspotter v3: Segmentation proposal network for robust scene text spotting. In: European Conference on Computer Vision, Springer, pp 706\u2013722","DOI":"10.1007\/978-3-030-58621-8_41"},{"key":"2428_CR31","doi-asserted-by":"crossref","unstructured":"Liao, M., Shi, B., Bai, X., Wang, X. & Liu, W. (2017). Textboxes: A fast text detector with a single deep neural network. In: Thirty-first AAAI Conference on Artificial Intelligence","DOI":"10.1609\/aaai.v31i1.11196"},{"issue":"2","key":"2428_CR32","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1109\/TPAMI.2019.2937086","volume":"43","author":"M Liao","year":"2021","unstructured":"Liao, M., Lyu, P., He, M., Yao, C., Wu, W., & Bai, X. (2021). Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43(2), 532\u2013548. https:\/\/doi.org\/10.1109\/TPAMI.2019.2937086","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"8","key":"2428_CR33","doi-asserted-by":"publisher","first-page":"3676","DOI":"10.1109\/TIP.2018.2825107","volume":"27","author":"M Liao","year":"2018","unstructured":"Liao, M., Shi, B., & Bai, X. (2018). Textboxes++: A single-shot oriented scene text detector. IEEE Transactions on Image Processing, 27(8), 3676\u20133690.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2428_CR34","doi-asserted-by":"crossref","unstructured":"Lin TY, Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B. & Belongie, S. (2017). Feature pyramid networks for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"key":"2428_CR35","doi-asserted-by":"crossref","unstructured":"Lin TY, Goyal, P., Girshick, R., He, K. & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 2980\u20132988","DOI":"10.1109\/ICCV.2017.324"},{"key":"2428_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, H., Shen, C., He, T., Jin, L. & Wang, L. (2020). Abcnet: Real-time scene text spotting with adaptive bezier-curve network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9809\u20139818","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"2428_CR37","unstructured":"Liu, S., Li, F., Zhang, H., Yang, X., Qi, X., Su, H., Zhu, J. & Zhang, L. (2022). DAB-DETR: Dynamic anchor boxes are better queries for DETR. In: ICLR"},{"key":"2428_CR38","doi-asserted-by":"crossref","unstructured":"Liu, X., Liang, D., Yan, S., Chen, D., Qiao, Y. & Yan, J. (2018). Fots: Fast oriented text spotting with a unified network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5676\u20135685","DOI":"10.1109\/CVPR.2018.00595"},{"key":"2428_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S. & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2428_CR40","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhang, J., Peng, D., Huang, M., Wang, X., Tang, J., Huang, C., Lin, D., Shen, C., Bai, X., et\u00a0al. (2023). Spts v2: single-point scene text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence","DOI":"10.1109\/TPAMI.2023.3312285"},{"key":"2428_CR41","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1016\/j.patcog.2019.02.002","volume":"90","author":"Y Liu","year":"2019","unstructured":"Liu, Y., Jin, L., Zhang, S., Luo, C., & Zhang, S. (2019). Curved scene text detection via transverse and longitudinal sequence connection. Pattern Recognition, 90, 337\u2013345.","journal-title":"Pattern Recognition"},{"issue":"11","key":"2428_CR42","first-page":"8048","volume":"44","author":"Y Liu","year":"2021","unstructured":"Liu, Y., Shen, C., Jin, L., He, T., Chen, P., Liu, C., & Chen, H. (2021). Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(11), 8048\u20138064.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2428_CR43","doi-asserted-by":"crossref","unstructured":"Lyu, P., Liao, M., Yao, C., Wu, W. & Bai, X. (2018). Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. In: European Conference on Computer Vision, pp 67\u201383","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"2428_CR44","doi-asserted-by":"crossref","unstructured":"Meng, D., Chen, X., Fan, Z., Zeng, G., Li, H., Yuan, Y., Sun, L. & Wang, J. (2021). Conditional detr for fast training convergence. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 3651\u20133660","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"2428_CR45","doi-asserted-by":"crossref","unstructured":"Milletari, F., Navab, N. & Ahmadi, SA. (2016). V-net: Fully convolutional neural networks for volumetric medical image segmentation. In: Fourth International Conference on 3D Vision, IEEE, pp 565\u2013571","DOI":"10.1109\/3DV.2016.79"},{"key":"2428_CR46","first-page":"1454","volume":"1","author":"N Nayef","year":"2017","unstructured":"Nayef, N., Yin, F., Bizid, I., Choi, H., Feng, Y., Karatzas, D., Luo, Z., Pal, U., Rigaud, C., Chazalon, J., et al. (2017). Icdar 2017 robust reading challenge on multi-lingual scene text detection and script identification-rrc-mlt. International Conference on Document Analysis and Recognition, IEEE, 1, 1454\u20131459.","journal-title":"International Conference on Document Analysis and Recognition, IEEE"},{"issue":"9","key":"2428_CR47","doi-asserted-by":"publisher","first-page":"1872","DOI":"10.1109\/TPAMI.2015.2496234","volume":"38","author":"L Neumann","year":"2015","unstructured":"Neumann, L., & Matas, J. (2015). Real-time lexicon-free scene text localization and recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 38(9), 1872\u20131885.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2428_CR48","doi-asserted-by":"crossref","unstructured":"Nguyen, N., Nguyen, T., Tran, V., Tran, MT., Ngo, TD., Nguyen, TH. & Hoai, M. (2021). Dictionary-guided scene text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7383\u20137392","DOI":"10.1109\/CVPR46437.2021.00730"},{"key":"2428_CR49","unstructured":"Ouyang-Zhang, J., Cho, JH., Zhou, X. & Kr\u00e4henb\u00fchl, P. (2022). Nms strikes back. arXiv preprint arXiv:2212.06137"},{"key":"2428_CR50","unstructured":"Pan, Z., Cai, J. & Zhuang, B. (2022). Fast vision transformers with hilo attention. In: NeurIPS"},{"key":"2428_CR51","doi-asserted-by":"crossref","unstructured":"Peng, D., Wang, X., Liu, Y., Zhang, J., Huang, M., Lai, S., Li, J., Zhu, S., Lin, D., Shen, C., et\u00a0al. (2022). Spts: single-point text spotting. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4272\u20134281","DOI":"10.1145\/3503161.3547942"},{"issue":"11","key":"2428_CR52","doi-asserted-by":"publisher","first-page":"2623","DOI":"10.1007\/s11263-022-01654-0","volume":"130","author":"D Peng","year":"2022","unstructured":"Peng, D., Jin, L., Liu, Y., Luo, C., & Lai, S. (2022). Pagenet: Towards end-to-end weakly supervised page-level handwritten chinese text recognition. International Journal of Computer Vision, 130(11), 2623\u20132645.","journal-title":"International Journal of Computer Vision"},{"key":"2428_CR53","doi-asserted-by":"crossref","unstructured":"Qiao, L., Chen, Y., Cheng, Z., Xu, Y., Niu, Y., Pu, S. & Wu, F. (2021). Mango: A mask attention guided one-stage scene text spotter. In: Proceedings of the Thirty-Fifth AAAI Conference on Artificial Intelligence, pp 2467\u20132476","DOI":"10.1609\/aaai.v35i3.16348"},{"key":"2428_CR54","doi-asserted-by":"publisher","first-page":"11899","DOI":"10.1609\/aaai.v34i07.6864","volume":"34","author":"L Qiao","year":"2020","unstructured":"Qiao, L., Tang, S., Cheng, Z., Xu, Y., Niu, Y., Pu, S., & Wu, F. (2020). Text perceptron: Towards end-to-end arbitrary-shaped text spotting. Proceedings of the AAAI Conference on Artificial Intelligence, 34, 11899\u201311907.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2428_CR55","doi-asserted-by":"crossref","unstructured":"Qin, S., Bissacco, A., Raptis, M., Fujii, Y. & Xiao, Y. (2019). Towards unconstrained end-to-end text spotting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4704\u20134714","DOI":"10.1109\/ICCV.2019.00480"},{"key":"2428_CR56","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in Neural Information Processing Systems, 28, 91\u201399.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2428_CR57","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I. & Savarese, S. (2019). Generalized intersection over union: A metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 658\u2013666","DOI":"10.1109\/CVPR.2019.00075"},{"key":"2428_CR58","doi-asserted-by":"crossref","unstructured":"Ronen, R., Tsiper, S., Anschel, O., Lavi, I., Markovitz, A. & Manmatha, R. (2022). Glass: Global to local attention for scene-text spotting. In: European Conference on Computer Vision, Springer, pp 249\u2013266","DOI":"10.1007\/978-3-031-19815-1_15"},{"key":"2428_CR59","doi-asserted-by":"crossref","unstructured":"Rong, X., Li, B., Munoz JP, Xiao, J., Arditi, A. & Tian, Y. (2016). Guided text spotting for assistive blind navigation in unfamiliar indoor environments. In: International Symposium on Visual Computing, Springer, pp 11\u201322","DOI":"10.1007\/978-3-319-50832-0_2"},{"issue":"11","key":"2428_CR60","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2016","unstructured":"Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 39(11), 2298\u20132304.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2428_CR61","doi-asserted-by":"crossref","unstructured":"Singh, A., Pang, G., Toh, M., Huang, J., Galuba, W. & Hassner, T. (2021). Textocr: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8802\u20138812","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"2428_CR62","doi-asserted-by":"crossref","unstructured":"Stewart, R., Andriluka, M. & Ng, AY. (2016). End-to-end people detection in crowded scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2325\u20132333","DOI":"10.1109\/CVPR.2016.255"},{"key":"2428_CR63","doi-asserted-by":"crossref","unstructured":"Sun, Y., Ni, Z., Chng, CK., Liu, Y., Luo, C., Ng CC, Han, J., Ding, E., Liu, J., Karatzas, D., et\u00a0al. (2019). Icdar 2019 competition on large-scale street view text with partial labeling-rrc-lsvt. In: International Conference on Document Analysis and Recognition, IEEE, pp 1557\u20131562","DOI":"10.1109\/ICDAR.2019.00250"},{"key":"2428_CR64","doi-asserted-by":"crossref","unstructured":"Sun, P., Zhang, R., Jiang, Y., Kong, T., Xu, C., Zhan, W., Tomizuka, M., Li, L., Yuan, Z., Wang, C., et\u00a0al. (2021). Sparse r-cnn: End-to-end object detection with learnable proposals. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14454\u201314463","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"2428_CR65","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C. & Chen, H. (2020). Conditional convolutions for instance segmentation. In: European Conference on Computer Vision, Springer, pp 282\u2013298","DOI":"10.1007\/978-3-030-58452-8_17"},{"key":"2428_CR66","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, AN., Kaiser, \u0141. & Polosukhin, I. (2017). Attention is all you need. In: Advances in Neural Information Processing Systems, pp 5998\u20136008"},{"key":"2428_CR67","doi-asserted-by":"crossref","unstructured":"Wang, HC., Finn, C., Paull, L., Kaess, M., Rosenholtz, R., Teller, S. & Leonard, J. (2015). Bridging text spotting and slam with junction features. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems, IEEE, pp 3701\u20133708","DOI":"10.1109\/IROS.2015.7353895"},{"key":"2428_CR68","unstructured":"Wang, K., Babenko, B. & Belongie, S. (2011). End-to-end scene text recognition. In: 2011 International Conference on Computer Vision, IEEE, pp 1457\u20131464"},{"key":"2428_CR69","doi-asserted-by":"crossref","unstructured":"Wang, W., Liu, X., Ji, X., Xie, E., Liang, D., Yang, Z., Lu, T., Shen, C. & Luo, P. (2020). Ae textspotter: Learning visual and linguistic representation for ambiguous text spotting. In: European Conference on Computer Vision, Springer, pp 457\u2013473","DOI":"10.1007\/978-3-030-58568-6_27"},{"key":"2428_CR70","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xie, H., Fang, S., Wang, J., Zhu, S. & Zhang, Y. (2021). From two to one: A new scene text recognizer with visual language modeling network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 14194\u201314203","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"2428_CR71","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, DP., Song, K., Liang, D., Lu, T., Luo, P. & Shao, L. (2021). Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 568\u2013578","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"2428_CR72","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Liu, X., Liang, D., Zhibo, Y., Lu, T. & Shen, C. (2021). Pan++: Towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE Transactions on Pattern Analysis and Machine Intelligence","DOI":"10.1109\/TPAMI.2021.3077555"},{"key":"2428_CR73","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Song, X., Zang, Y., Wang, W., Lu, T., Yu, G. & Shen, C. (2019). Efficient and accurate arbitrary-shaped text detection with pixel aggregation network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 8440\u20138449","DOI":"10.1109\/ICCV.2019.00853"},{"key":"2428_CR74","doi-asserted-by":"crossref","unstructured":"Wang, P., Zhang, C., Qi, F., Liu, S., Zhang, X., Lyu, P., Han, J., Liu, J., Ding, E. & Shi, G. (2021). Pgnet: Real-time arbitrarily-shaped text spotting with point gathering network. arXiv preprint arXiv:2104.05458","DOI":"10.1609\/aaai.v35i4.16383"},{"key":"2428_CR75","doi-asserted-by":"publisher","first-page":"2738","DOI":"10.1609\/aaai.v35i4.16378","volume":"35","author":"J Wang","year":"2021","unstructured":"Wang, J., Liu, C., Jin, L., Tang, G., Zhang, J., Zhang, S., Wang, Q., Wu, Y., & Cai, M. (2021). Towards robust visual information extraction in real world: New dataset and novel solution. Proceedings of the AAAI Conference on Artificial Intelligence, 35, 2738\u20132745.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2428_CR76","doi-asserted-by":"publisher","first-page":"12160","DOI":"10.1609\/aaai.v34i07.6896","volume":"34","author":"H Wang","year":"2020","unstructured":"Wang, H., Lu, P., Zhang, H., Yang, M., Bai, X., Xu, Y., He, M., Wang, Y., & Liu, W. (2020). All you need is boundary: Toward arbitrary-shaped text spotting. Proceedings of the AAAI Conference on Artificial Intelligence, 34, 12160\u201312167.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"1\u20133","key":"2428_CR77","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/0169-7439(87)80084-9","volume":"2","author":"S Wold","year":"1987","unstructured":"Wold, S., Esbensen, K., & Geladi, P. (1987). Principal component analysis. Chemometrics and Intelligent Laboratory Systems, 2(1\u20133), 37\u201352.","journal-title":"Chemometrics and Intelligent Laboratory Systems"},{"issue":"4","key":"2428_CR78","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/s10032-006-0014-0","volume":"8","author":"C Wolf","year":"2006","unstructured":"Wolf, C., & Jolion, J. M. (2006). Object count\/area graphs for the evaluation of object detection and segmentation algorithms. International Journal of Document Analysis and Recognition, 8(4), 280\u2013296.","journal-title":"International Journal of Document Analysis and Recognition"},{"key":"2428_CR79","doi-asserted-by":"crossref","unstructured":"Xia, X., Ding, G. & Li, S. (2024). Lmtextspotter: Towards better scene text spotting with language modeling in transformer. In: International Conference on Document Analysis and Recognition, Springer, pp 76\u201392","DOI":"10.1007\/978-3-031-70549-6_5"},{"key":"2428_CR80","doi-asserted-by":"crossref","unstructured":"Xing, L., Tian, Z., Huang, W. & Scott, MR. (2019). Convolutional character networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 9126\u20139136","DOI":"10.1109\/ICCV.2019.00922"},{"key":"2428_CR81","unstructured":"Yang, J., Li, C., Zhang, P., Dai, X., Xiao, B., Yuan, L. & Gao, J. (2021). Focal self-attention for local-global interactions in vision transformers. arXiv preprint arXiv:2107.00641"},{"key":"2428_CR82","doi-asserted-by":"crossref","unstructured":"Ye, M., Zhang, J., Zhao, S., Liu, J., Du, B. & Tao, D. (2023). Dptext-detr: Towards better scene text detection with dynamic points in transformer. In: Proceedings of the AAAI Conference on Artificial Intelligence","DOI":"10.1609\/aaai.v37i3.25430"},{"key":"2428_CR83","doi-asserted-by":"crossref","unstructured":"Ye, M., Zhang, J., Zhao, S., Liu, J., Liu, T., Du, B. & Tao, D. (2023). Deepsolo: Let transformer decoder with explicit points solo for text spotting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 19348\u201319357","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"2428_CR84","doi-asserted-by":"crossref","unstructured":"Yu, D., Li, X., Zhang, C., Liu, T., Han, J., Liu, J. & Ding, E. (2020). Towards accurate scene text recognition with semantic reasoning networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12113\u201312122","DOI":"10.1109\/CVPR42600.2020.01213"},{"key":"2428_CR85","doi-asserted-by":"crossref","unstructured":"Yu, W., Liu, Y., Zhu, X., Cao, H., Sun, X. & Bai, X. (2024). Turning a clip model into a scene text spotter. IEEE Transactions on Pattern Analysis and Machine Intelligence","DOI":"10.1109\/TPAMI.2024.3379828"},{"key":"2428_CR86","doi-asserted-by":"crossref","unstructured":"Zhang, SX., Yang, C., Zhu, X., Zhou, H., Wang, H. & Yin, XC. (2024). Inverse-like antagonistic scene text spotting via reading-order estimation and dynamic sampling. IEEE Transactions on Image Processing","DOI":"10.1109\/TIP.2024.3352399"},{"key":"2428_CR87","unstructured":"Zhang, H., Li, F., Liu, S., Zhang, L., Su, H., Zhu, J., Ni, LM. & Shum, HY. (2023).. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. ICLR pp 1\u20139"},{"key":"2428_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, X., Su, Y., Tripathi, S. & Tu, Z. (2022). Text spotting transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9519\u20139528","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"2428_CR89","doi-asserted-by":"publisher","unstructured":"Zhang, C., Tao, Y., Du, K., Ding, W., Wang, B., Liu, J. & Wang, W. (2021). Character-level street view text spotting based on deep multi-segmentation network for smarter autonomous driving. IEEE Transactions on Artificial Intelligence pp 1\u20131, https:\/\/doi.org\/10.1109\/TAI.2021.3116216","DOI":"10.1109\/TAI.2021.3116216"},{"key":"2428_CR90","unstructured":"Zhang, S., Wang, X., Wang, J., Pang, J. & Chen, K. (2022). What are expected queries in end-to-end object detection? arXiv preprint arXiv:2206.01232"},{"key":"2428_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, P., Xu, Y., Cheng, Z., Pu, S., Lu, J., Qiao, L., Niu, Y. & Wu, F. (2020). Trie: End-to-end text reading and information extraction for document understanding. In: Proceedings of the 28th ACM International Conference on Multimedia, pp 1413\u20131422","DOI":"10.1145\/3394171.3413900"},{"key":"2428_CR92","doi-asserted-by":"crossref","unstructured":"Zhang, R., Zhou, Y., Jiang, Q., Song, Q., Li, N., Zhou, K., Wang, L., Wang, D., Liao, M., Yang, M., et\u00a0al. (2019). Icdar 2019 robust reading challenge on reading chinese text on signboard. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), IEEE, pp 1577\u20131581","DOI":"10.1109\/ICDAR.2019.00253"},{"key":"2428_CR93","unstructured":"Zhong, H., Tang, J., Wang, W., Yang, Z., Yao, C. & Lu, T. (2021). Arts: Eliminating inconsistency between text detection and recognition with auto-rectification text spotter. arXiv preprint arXiv:2110.10405"},{"key":"2428_CR94","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X. & Dai, J. (2021). Deformable detr: Deformable transformers for end-to-end object detection. ICLR pp 1\u20139"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02428-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02428-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02428-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T11:34:18Z","timestamp":1757158458000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02428-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,15]]},"references-count":94,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2428"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02428-0","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,15]]},"assertion":[{"value":"9 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}