{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T15:44:09Z","timestamp":1757778249303,"version":"3.40.3"},"publisher-location":"Cham","reference-count":91,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031727603"},{"type":"electronic","value":"9783031727610"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72761-0_10","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"165-183","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Platypus: A Generalized Specialist Model for\u00a0Reading Text in\u00a0Various Forms"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8617-1550","authenticated-orcid":false,"given":"Peng","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7704-3231","authenticated-orcid":false,"given":"Zhaohai","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3949-007X","authenticated-orcid":false,"given":"Jun","family":"Tang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8676-0811","authenticated-orcid":false,"given":"Humen","family":"Zhong","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-5053","authenticated-orcid":false,"given":"Fei","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2343-7750","authenticated-orcid":false,"given":"Zhibo","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6564-4796","authenticated-orcid":false,"given":"Cong","family":"Yao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"10_CR1","unstructured":"Gpt-4v(ision) system card (2023)"},{"key":"10_CR2","doi-asserted-by":"crossref","unstructured":"Aberdam, A., Litman, R., Tsiper, S., Anschel, O., Slossberg, R., Mazor, S., Manmatha, R., Perona, P.: Sequence-to-sequence contrastive learning for text recognition. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 15297\u201315307 (2020)","DOI":"10.1109\/CVPR46437.2021.01505"},{"key":"10_CR3","doi-asserted-by":"publisher","first-page":"58","DOI":"10.1016\/j.patrec.2012.09.023","volume":"35","author":"F Alvaro","year":"2014","unstructured":"Alvaro, F., S\u00e1nchez, J.A., Bened\u00ed, J.M.: Recognition of on-line handwritten mathematical expressions using 2d stochastic context-free grammars and hidden markov models. Pattern Recognit. Lett. 35, 58\u201367 (2014)","journal-title":"Pattern Recognit. Lett."},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Atienza, R.: Vision transformer for fast and efficient scene text recognition. In: IEEE International Conference on Document Analysis and Recognition (2021)","DOI":"10.1007\/978-3-030-86549-8_21"},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Baek, J., Kim, G., Lee, J., Park, S., Han, D., Yun, S., Oh, S.J., Lee, H.: What is wrong with scene text recognition model comparisons? dataset and model analysis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (October 2019)","DOI":"10.1109\/ICCV.2019.00481"},{"key":"10_CR6","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., Zhou, J.: Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond (2023)"},{"key":"10_CR7","doi-asserted-by":"crossref","unstructured":"Bautista, D., Atienza, R.: Scene text recognition with permuted autoregressive sequence models. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"10_CR8","doi-asserted-by":"crossref","unstructured":"Bhunia, A.K., Das, A., Bhunia, A.K., Kishore, P.S.R., Roy, P.P.: Handwriting recognition in low-resource scripts using adversarial learning. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 4762\u20134771 (2018)","DOI":"10.1109\/CVPR.2019.00490"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, C., Wang, P., Da, C., Zheng, Q., Yao, C.: Lister: Neighbor decoding for length-insensitive scene text recognition. 2023 IEEE\/CVF International Conference on Computer Vision (ICCV) pp. 19484\u201319494 (2023)","DOI":"10.1109\/ICCV51070.2023.01790"},{"key":"10_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, C., Wang, P., Da, C., Zheng, Q., Yao, C.: Lister: Neighbor decoding for length-insensitive scene text recognition. 2023 IEEE\/CVF International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01790"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Cheng, Z., Bai, F., Xu, Y., Zheng, G., Pu, S., Zhou, S.: Focusing attention: Towards accurate text recognition in natural images. 2017 IEEE International Conference on Computer Vision (ICCV) pp. 5086\u20135094 (2017)","DOI":"10.1109\/ICCV.2017.543"},{"key":"10_CR12","doi-asserted-by":"publisher","unstructured":"Ch\u2019ng, C.K., Chan, C.S.: Total-text: A comprehensive dataset for scene text detection and recognition. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR). vol.\u00a001, pp. 935\u2013942 (2017). https:\/\/doi.org\/10.1109\/ICDAR.2017.157","DOI":"10.1109\/ICDAR.2017.157"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Da, C., Wang, P., Yao, C.: Levenshtein ocr. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19815-1_19"},{"key":"10_CR14","unstructured":"Da, C., Wang, P., Yao, C.: Multi-granularity prediction with learnable fusion for scene text recognition. CoRR abs\/2307.13244 (2023), https:\/\/doi.org\/10.48550\/arXiv.2307.13244"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Fang, S., Xie, H., Wang, Y., Mao, Z., Zhang, Y.: Read like humans: Autonomous, bidirectional and iterative language modeling for scene text recognition. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 7094\u20137103 (2021)","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"10_CR16","unstructured":"Feng, H., Wang, Z., Tang, J., Lu, J., gang Zhou, W., Li, H., Huang, C.: Unidoc: A universal large multimodal model for simultaneous text detection, recognition, spotting and understanding. ArXiv abs\/2308.11592 (2023)"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Feng, W., He, W., Yin, F., Zhang, X.Y., Liu, C.L.: Textdragon: An end-to-end framework for arbitrary shaped text spotting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (October 2019)","DOI":"10.1109\/ICCV.2019.00917"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Garcia-Bordils, S., Mafla, A., Biten, A.F., Nuriel, O., Aberdam, A., Mazor, S., Litman, R., Karatzas, D.: Out-of-vocabulary challenge report. In: ECCV Workshops (2022)","DOI":"10.1007\/978-3-031-25069-9_24"},{"key":"10_CR19","doi-asserted-by":"crossref","unstructured":"Grosicki, E., Carr\u00e9, M., Brodin, J., Geoffrois, E.: Results of the RIMES evaluation campaign for handwritten mail processing. In: 10th International Conference on Document Analysis and Recognition, ICDAR 2009, Barcelona, Spain, 26-29 July 2009. pp. 941\u2013945. IEEE Computer Society (2009)","DOI":"10.1109\/ICDAR.2009.224"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Gupta, A., Vedaldi, A., Zisserman, A.: Synthetic data for text localisation in natural images. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) pp. 2315\u20132324 (2016)","DOI":"10.1109\/CVPR.2016.254"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"He, M., Liao, M., Yang, Z., Zhong, H., Tang, J., Cheng, W., Yao, C., Wang, Y., Bai, X.: Most: A multi-oriented scene text detector with localization refinement. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 8809\u20138818 (2021)","DOI":"10.1109\/CVPR46437.2021.00870"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Hu, W., Cai, X., Hou, J., Yi, S., Lin, Z.: Gtc: Guided training of ctc towards efficient and accurate scene text recognition. ArXiv abs\/2002.01276 (2020)","DOI":"10.1609\/aaai.v34i07.6735"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Huang, M., Liu, Y., Peng, Z., Liu, C., Lin, D., Zhu, S., Yuan, N.J., Ding, K., Jin, L.: Swintextspotter: Scene text spotting via better synergy between text detection and text recognition. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 4583\u20134593 (2022)","DOI":"10.1109\/CVPR52688.2022.00455"},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Huang, M., Zhang, J., Peng, D., Lu, H., Huang, C., Liu, Y., Bai, X., Jin, L.: Estextspotter: Towards better scene text spotting with explicit synergy in transformer. 2023 IEEE\/CVF International Conference on Computer Vision (ICCV) pp. 19438\u201319448 (2023)","DOI":"10.1109\/ICCV51070.2023.01786"},{"key":"10_CR25","unstructured":"Jaderberg, M., Simonyan, K., Vedaldi, A., Zisserman, A.: Synthetic data and artificial neural networks for natural scene text recognition. NIPS Deep Learning Workshop (2014)"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Wang, J., Peng, D., Liu, C., Jin, L.: Revisiting scene text recognition: A data perspective. In: Proceedings of the IEEE\/CVF international conference on computer vision (2023)","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Karatzas, D., Gomez-Bigorda, L., Nicolaou, A., Ghosh, S.K., Bagdanov, A.D., Iwamura, M., Matas, J., Neumann, L., Chandrasekhar, V.R., Lu, S., Shafait, F., Uchida, S., Valveny, E.: ICDAR 2015 competition on robust reading. In: ICDAR. pp. 1156\u20131160 (2015)","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"10_CR28","doi-asserted-by":"crossref","unstructured":"Karatzas, D., Shafait, F., Uchida, S., Iwamura, M., i\u00a0Bigorda, L.G., Mestre, S.R., Mas, J., Mota, D.F., Almaz\u00e1n, J., de\u00a0las Heras, L.: ICDAR 2013 robust reading competition. In: ICDAR. pp. 1484\u20131493 (2013)","DOI":"10.1109\/ICDAR.2013.221"},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Kil, T.H., Kim, S., Seo, S., Kim, Y., Kim, D.: Towards unified scene text spotting based on sequence generation. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 15223\u201315232 (2023)","DOI":"10.1109\/CVPR52729.2023.01461"},{"key":"10_CR30","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.C., Lo, W.Y., Doll\u00e1r, P., Girshick, R.B.: Segment anything. pp. 3992\u20134003 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Kleber, F., Fiel, S., Diem, M., Sablatnig, R.: Cvl-database: An off-line database for writer retrieval, writer identification and word spotting. In: ICDAR. pp. 560\u2013564 (2013)","DOI":"10.1109\/ICDAR.2013.117"},{"key":"10_CR32","unstructured":"Krylov, I., Nosov, S., Sovrasov, V.: Open images V5 text annotation and yet another mask text spotter. In: Balasubramanian, V.N., Tsang, I.W. (eds.) Asian Conference on Machine Learning. vol.\u00a0157, pp. 379\u2013389 (2021)"},{"key":"10_CR33","doi-asserted-by":"crossref","unstructured":"Le, A.D.: Recognizing handwritten mathematical expressions via paired dual loss attention network and printed mathematical expressions. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW) pp. 2413\u20132418 (2020)","DOI":"10.1109\/CVPRW50498.2020.00291"},{"key":"10_CR34","doi-asserted-by":"crossref","unstructured":"Lee, C.Y., Osindero, S.: Recursive recurrent nets with attention modeling for ocr in the wild. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) pp. 2231\u20132239 (2016)","DOI":"10.1109\/CVPR.2016.245"},{"key":"10_CR35","unstructured":"Li, M., Lv, T., Cui, L., Lu, Y., Flor\u00eancio, D.A.F., Zhang, C., Li, Z., Wei, F.: Trocr: Transformer-based optical character recognition with pre-trained models. In: AAAI Conference on Artificial Intelligence (2021)"},{"key":"10_CR36","unstructured":"Li, Z., Yang, B., Liu, Q., Ma, Z., Zhang, S., Yang, J., Sun, Y., Liu, Y., Bai, X.: Monkey: Image resolution and text label are important things for large multi-modal models. ArXiv abs\/2311.06607 (2023)"},{"issue":"2","key":"10_CR37","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1109\/tpami.2019.2937086","volume":"43","author":"M Liao","year":"2021","unstructured":"Liao, M., Lyu, P., He, M., Yao, C., Wu, W., Bai, X.: Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes. IEEE Trans. Pattern Anal. Mach. Intell. 43(2), 532\u2013548 (2021). https:\/\/doi.org\/10.1109\/tpami.2019.2937086","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"1","key":"10_CR38","doi-asserted-by":"publisher","first-page":"919","DOI":"10.1109\/TPAMI.2022.3155612","volume":"45","author":"M Liao","year":"2022","unstructured":"Liao, M., Zou, Z., Wan, Z., Yao, C., Bai, X.: Real-time scene text detection with differentiable binarization and adaptive scale fusion. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 919\u2013931 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R.B., He, K., Hariharan, B., Belongie, S.J.: Feature pyramid networks for object detection. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) pp. 936\u2013944 (2016)","DOI":"10.1109\/CVPR.2017.106"},{"key":"10_CR40","doi-asserted-by":"crossref","unstructured":"Liu, H., Wang, B., Bao, Z., Xue, M., Kang, S., Jiang, D., Liu, Y., Ren, B.: Perceiving stroke-semantic context: Hierarchical contrastive learning for robust scene text recognition. In: AAAI Conference on Artificial Intelligence (2022)","DOI":"10.1609\/aaai.v36i2.20062"},{"key":"10_CR41","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. ArXiv abs\/2310.03744 (2023)"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Liu, X., Liang, D., Yan, S., Chen, D., Qiao, Y., Yan, J.: Fots: Fast oriented text spotting with a unified network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (June 2018)","DOI":"10.1109\/CVPR.2018.00595"},{"key":"10_CR43","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, H., Shen, C., He, T., Jin, L., Wang, L.: Abcnet: Real-time scene text spotting with adaptive bezier-curve network. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (June 2020)","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"10_CR44","unstructured":"Liu, Y., Jin, L., Zhang, S., Zhang, S.: Detecting curve text in the wild: New dataset and new solution. ArXiv abs\/1712.02170 (2017)"},{"key":"10_CR45","unstructured":"Liu, Y., Li, Z., Li, H., Yu, W., Huang, M., Peng, D., Liu, M., Chen, M., Li, C., Jin, L., Bai, X.: On the hidden mystery of ocr in large multimodal models. ArXiv abs\/2305.07895 (2023)"},{"key":"10_CR46","first-page":"8048","volume":"44","author":"Y Liu","year":"2021","unstructured":"Liu, Y., Shen, C., Jin, L., He, T., Chen, P., Liu, C., Chen, H.: Abcnet v2: Adaptive bezier-curve network for real-time end-to-end text spotting. IEEE Trans. Pattern Anal. Mach. Intell. 44, 8048\u20138064 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR47","doi-asserted-by":"publisher","first-page":"15665","DOI":"10.1109\/TPAMI.2023.3312285","volume":"45","author":"Y Liu","year":"2023","unstructured":"Liu, Y., Zhang, J., Peng, D., Huang, M., Wang, X., Tang, J., Huang, C., Lin, D., Shen, C., Bai, X., Jin, L.: Spts v2: Single-point scene text spotting. IEEE Trans. Pattern Anal. Mach. Intell. 45, 15665\u201315679 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR48","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) pp. 9992\u201310002 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10_CR49","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1007\/s11263-020-01369-0","volume":"129","author":"S Long","year":"2018","unstructured":"Long, S., He, X., Yao, C.: Scene text detection and recognition: The deep learning era. Int. J. Comput. Vision 129, 161\u2013184 (2018)","journal-title":"Int. J. Comput. Vision"},{"key":"10_CR50","doi-asserted-by":"crossref","unstructured":"Long, S., Qin, S., Panteleev, D., Bissacco, A., Fujii, Y., Raptis, M.: Towards end-to-end unified scene text detection and layout analysis. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 1039\u20131049 (2022)","DOI":"10.1109\/CVPR52688.2022.00112"},{"key":"10_CR51","doi-asserted-by":"crossref","unstructured":"Long, S., Ruan, J., Zhang, W., He, X., Wu, W., Yao, C.: Textsnake: A flexible representation for detecting text of arbitrary shapes. In: Proceedings of the European Conference on Computer Vision (ECCV) (September 2018)","DOI":"10.1007\/978-3-030-01216-8_2"},{"key":"10_CR52","doi-asserted-by":"crossref","unstructured":"Luo, C., Zhu, Y., Jin, L., Wang, Y.: Learn to augment: Joint data augmentation and network optimization for text recognition. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 13743\u201313752 (2020)","DOI":"10.1109\/CVPR42600.2020.01376"},{"issue":"1","key":"10_CR53","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1007\/s100320200071","volume":"5","author":"U Marti","year":"2002","unstructured":"Marti, U., Bunke, H.: The iam-database: an english sentence database for offline handwriting recognition. Int. J. Document Anal. Recognit. 5(1), 39\u201346 (2002)","journal-title":"Int. J. Document Anal. Recognit."},{"key":"10_CR54","doi-asserted-by":"crossref","unstructured":"Mishra, A., Alahari, K., Jawahar, C.V.: Scene text recognition using higher order language priors. In: BMVC. pp. 1\u201311 (2012)","DOI":"10.5244\/C.26.127"},{"key":"10_CR55","doi-asserted-by":"publisher","unstructured":"Nayef, N., Yin, F., Bizid, I., Choi, H., Feng, Y., Karatzas, D., Luo, Z., Pal, U., Rigaud, C., Chazalon, J., Khlif, W., Luqman, M.M., Burie, J., Liu, C., Ogier, J.: Icdar2017 robust reading challenge on multi-lingual scene text detection and script identification - rrc-mlt. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR). vol.\u00a001, pp. 1454\u20131459 (2017). https:\/\/doi.org\/10.1109\/ICDAR.2017.237","DOI":"10.1109\/ICDAR.2017.237"},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Nuriel, O., Fogel, S., Litman, R.: Textadain: Paying attention to shortcut learning in text recognizers. In: European Conference on Computer Vision (2021)","DOI":"10.1007\/978-3-031-19815-1_25"},{"key":"10_CR57","doi-asserted-by":"crossref","unstructured":"Peng, D., Wang, X., Liu, Y., Zhang, J., Huang, M., Lai, S., Zhu, S., Li, J., Lin, D., Shen, C., Jin, L.: Spts: Single-point text spotting. Proceedings of the 30th ACM International Conference on Multimedia (2021)","DOI":"10.1145\/3503161.3547942"},{"key":"10_CR58","doi-asserted-by":"crossref","unstructured":"Phan, T.Q., Shivakumara, P., Tian, S., Tan, C.L.: Recognizing text with perspective distortion in natural scenes. In: ICCV. pp. 569\u2013576 (2013)","DOI":"10.1109\/ICCV.2013.76"},{"key":"10_CR59","doi-asserted-by":"crossref","unstructured":"Qiao, L., Chen, Y., Cheng, Z., Xu, Y., Niu, Y., Pu, S., Wu, F.: Mango: A mask attention guided one-stage scene text spotter. In: AAAI Conference on Artificial Intelligence (2020)","DOI":"10.1609\/aaai.v35i3.16348"},{"key":"10_CR60","doi-asserted-by":"crossref","unstructured":"Qin, S., Bissacco, A., Raptis, M., Fujii, Y., Xiao, Y.: Towards unconstrained end-to-end text spotting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (October 2019)","DOI":"10.1109\/ICCV.2019.00480"},{"issue":"18","key":"10_CR61","doi-asserted-by":"publisher","first-page":"8027","DOI":"10.1016\/j.eswa.2014.07.008","volume":"41","author":"A Risnumawan","year":"2014","unstructured":"Risnumawan, A., Shivakumara, P., Chan, C.S., Tan, C.L.: A robust arbitrary text detection system for natural scene images. Expert Syst. Appl. 41(18), 8027\u20138048 (2014)","journal-title":"Expert Syst. Appl."},{"key":"10_CR62","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., Schramowski, P., Kundurthy, S., Crowson, K., Schmidt, L., Kaczmarczyk, R., Jitsev, J.: Laion-5b: An open large-scale dataset for training next generation image-text models. ArXiv abs\/2210.08402 (2022)"},{"issue":"11","key":"10_CR63","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2017","unstructured":"Shi, B., Bai, X., Yao, C.: An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Trans. Pattern Anal. Mach. Intell. 39(11), 2298\u20132304 (2017). https:\/\/doi.org\/10.1109\/TPAMI.2016.2646371","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"9","key":"10_CR64","doi-asserted-by":"publisher","first-page":"2035","DOI":"10.1109\/TPAMI.2018.2848939","volume":"41","author":"B Shi","year":"2019","unstructured":"Shi, B., Yang, M., Wang, X., Lyu, P., Yao, C., Bai, X.: Aster: An attentional scene text recognizer with flexible rectification. IEEE Trans. Pattern Anal. Mach. Intell. 41(9), 2035\u20132048 (2019). https:\/\/doi.org\/10.1109\/TPAMI.2018.2848939","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR65","doi-asserted-by":"crossref","unstructured":"Shi, B., Bai, X., Belongie, S.J.: Detecting oriented text in natural images by linking segments. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) pp. 3482\u20133490 (2017)","DOI":"10.1109\/CVPR.2017.371"},{"key":"10_CR66","unstructured":"Shi, Y., Peng, D., Liao, W., Lin, Z., Chen, X., Liu, C., Zhang, Y., Jin, L.: Exploring ocr capabilities of gpt-4v(ision) : A quantitative and in-depth evaluation. ArXiv abs\/2310.16809 (2023)"},{"key":"10_CR67","doi-asserted-by":"crossref","unstructured":"Singh, A., Pang, G., Toh, M., Huang, J., Galuba, W., Hassner, T.: Textocr: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In: CVPR. pp. 8802\u20138812 (2021)","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"10_CR68","doi-asserted-by":"crossref","unstructured":"Tang, J., Yang, Z., Wang, Y., Zheng, Q., Xu, Y., Bai, X.: Seglink++: Detecting dense and arbitrary-shaped scene text by instance-aware component grouping. Pattern Recognit. 96 (2019)","DOI":"10.1016\/j.patcog.2019.06.020"},{"key":"10_CR69","unstructured":"Vaswani, A., Shazeer, N.M., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: Neural Information Processing Systems (2017)"},{"key":"10_CR70","unstructured":"Veit, A., Matera, T., Neumann, L., Matas, J., Belongie, S.J.: Coco-text: Dataset and benchmark for text detection and recognition in natural images. CoRR abs\/1601.07140 (2016)"},{"key":"10_CR71","unstructured":"Wang, K., Babenko, B., Belongie, S.J.: End-to-end scene text recognition. In: ICCV. pp. 1457\u20131464 (2011)"},{"key":"10_CR72","doi-asserted-by":"crossref","unstructured":"Wang, P., Da, C., Yao, C.: Multi-granularity prediction for scene text recognition. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19815-1_20"},{"key":"10_CR73","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhu, Y., Jin, L., Luo, C., Chen, X., Wu, Y., Wang, Q., Cai, M.: Decoupled attention network for text recognition. In: AAAI Conference on Artificial Intelligence (2019)","DOI":"10.1609\/aaai.v34i07.6903"},{"key":"10_CR74","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Hou, W., Lu, T., Yu, G., Shao, S.: Shape robust text detection with progressive scale expansion network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (June 2019)","DOI":"10.1109\/CVPR.2019.00956"},{"key":"10_CR75","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Song, X., Zang, Y., Wang, W., Lu, T., Yu, G., Shen, C.: Efficient and accurate arbitrary-shaped text detection with pixel aggregation network. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 8440\u20138449 (2019)","DOI":"10.1109\/ICCV.2019.00853"},{"key":"10_CR76","doi-asserted-by":"publisher","first-page":"2386","DOI":"10.1007\/s11263-020-01291-5","volume":"128","author":"JW Wu","year":"2020","unstructured":"Wu, J.W., Yin, F., Zhang, Y., Zhang, X.Y., Liu, C.L.: Handwritten mathematical expression recognition via paired adversarial learning. Int. J. Comput. Vision 128, 2386\u20132401 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"10_CR77","doi-asserted-by":"crossref","unstructured":"Xie, Y., Mouch\u00e8re, H., Liwicki, F.S., Rakesh, S., Saini, R., Nakagawa, M., Nguyen, C.T., Truong, T.N.: Icdar 2023 crohme: Competition on recognition of handwritten mathematical expressions. In: IEEE International Conference on Document Analysis and Recognition (2023)","DOI":"10.1007\/978-3-031-41679-8_33"},{"key":"10_CR78","doi-asserted-by":"crossref","unstructured":"Xing, L., Tian, Z., Huang, W., Scott, M.R.: Convolutional character networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (October 2019)","DOI":"10.1109\/ICCV.2019.00922"},{"key":"10_CR79","doi-asserted-by":"crossref","unstructured":"Yang, M., Liao, M., Lu, P., Wang, J., Zhu, S., Luo, H., Tian, Q., Bai, X.: Reading and writing: Discriminative and generative modeling for self-supervised text recognition. Proceedings of the 30th ACM International Conference on Multimedia (2022)","DOI":"10.1145\/3503161.3547784"},{"key":"10_CR80","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110244","volume":"149","author":"M Yang","year":"2024","unstructured":"Yang, M., Yang, B., Liao, M., Zhu, Y., Bai, X.: Class-aware mask-guided feature refinement for scene text recognition. Pattern Recognit. 149, 110244 (2024)","journal-title":"Pattern Recognit."},{"key":"10_CR81","doi-asserted-by":"crossref","unstructured":"Yao, C., Bai, X., Shi, B., Liu, W.: Strokelets: A learned multi-scale representation for scene text recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 4042\u20134049 (2014)","DOI":"10.1109\/CVPR.2014.515"},{"key":"10_CR82","unstructured":"Yao, C., Zhang, X., Bai, X., Liu, W., Ma, Y., Tu, Z.: Detecting texts of arbitrary orientations in natural images (2012)"},{"key":"10_CR83","unstructured":"Ye, J., Hu, A., Xu, H., Ye, Q., Yan, M., Dan, Y., Zhao, C., Xu, G., Li, C., Tian, J., Qi, Q., Zhang, J., Huang, F.: mplug-docowl: Modularized multimodal large language model for document understanding. ArXiv abs\/2307.02499 (2023)"},{"key":"10_CR84","doi-asserted-by":"crossref","unstructured":"Ye, M., Zhang, J., Zhao, S., Liu, J., Liu, T., Du, B., Tao, D.: Deepsolo: Let transformer decoder with explicit points solo for text spotting. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 19348\u201319357 (2022)","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"10_CR85","doi-asserted-by":"crossref","unstructured":"Yu, D., Li, X., Zhang, C., Liu, T., Han, J., Liu, J., Ding, E.: Towards accurate scene text recognition with semantic reasoning networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (June 2020)","DOI":"10.1109\/CVPR42600.2020.01213"},{"key":"10_CR86","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Liu, X., Dikubab, W., Liu, H., Ji, Z., Wu, Z., Bai, X.: Syntax-aware network for handwritten mathematical expression recognition. arXiv preprint arXiv:2203.01601 (2022)","DOI":"10.1109\/CVPR52688.2022.00451"},{"key":"10_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, X., Su, Y., Tripathi, S., Tu, Z.: Text spotting transformers. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) pp. 9509\u20139518 (2022)","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"10_CR88","unstructured":"Zhang, Y., Gueguen, L., Zharkov, I., Zhang, P., Seifert, K., Kadlec, B.: Uber-text: A large-scale dataset for optical character recognition from street-level imagery. In: SUNw: Scene Understanding Workshop-CVPR. vol.\u00a02017, pp.\u00a01\u20135 (2017)"},{"key":"10_CR89","doi-asserted-by":"crossref","unstructured":"Zhong, X., Tang, J., Jimeno-Yepes, A.: Publaynet: Largest dataset ever for document layout analysis. 2019 International Conference on Document Analysis and Recognition (ICDAR) pp. 1015\u20131022 (2019)","DOI":"10.1109\/ICDAR.2019.00166"},{"key":"10_CR90","doi-asserted-by":"crossref","unstructured":"Zhou, X., Yao, C., Wen, H., Wang, Y., Zhou, S., He, W., Liang, J.: East: An efficient and accurate scene text detector. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (July 2017)","DOI":"10.1109\/CVPR.2017.283"},{"key":"10_CR91","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1007\/s11704-015-4488-0","volume":"10","author":"Y Zhu","year":"2015","unstructured":"Zhu, Y., Yao, C., Bai, X.: Scene text detection and recognition: recent advances and future trends. Front. Comp. Sci. 10, 19\u201336 (2015)","journal-title":"Front. Comp. Sci."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72761-0_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:29:28Z","timestamp":1727594968000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72761-0_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031727603","9783031727610"],"references-count":91,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72761-0_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}