{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T17:38:14Z","timestamp":1758044294251,"version":"3.44.0"},"publisher-location":"Cham","reference-count":58,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032046130","type":"print"},{"value":"9783032046147","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T00:00:00Z","timestamp":1757721600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T00:00:00Z","timestamp":1757721600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04614-7_20","type":"book-chapter","created":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T12:23:10Z","timestamp":1757679790000},"page":"351-369","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["T-LLaVA: An Effective Saliency-Aware Slicing Strategy for\u00a0Text Recognition"],"prefix":"10.1007","author":[{"given":"Mengze","family":"Wei","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chun","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Min","family":"Liang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fang","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaobin","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xu-Cheng","family":"Yin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,13]]},"reference":[{"key":"20_CR1","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"20_CR2","unstructured":"Blecher, L., Cucurull, G., Scialom, T., Stojnic, R.: Nougat: neural optical understanding for academic documents. arXiv preprint arXiv:2308.13418 (2023)"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: ShareGPT4V: improving large multi-modal models with better captions. In: European Conference on Computer Vision, pp. 370\u2013387. Springer (2024)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Ch\u2019ng, C.K., Chan, C.S.: Total-text: a comprehensive dataset for scene text detection and recognition. In: International Conference on Document Analysis and Recognition, vol.\u00a01, pp. 935\u2013942. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.157"},{"key":"20_CR6","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Diem, M., et al.: ICFHR 2014 competition on handwritten digit string recognition in challenging datasets. In: International Conference on Frontiers in Handwriting Recognition, pp. 779\u2013784. IEEE (2014)","DOI":"10.1109\/ICFHR.2014.136"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Gao, C., et al.: ICDAR 2023 competition on recognition of multi-line handwritten mathematical expressions. In: International Conference on Document Analysis and Recognition, pp. 566\u2013576. Springer (2023)","DOI":"10.1007\/978-3-031-41679-8_34"},{"key":"20_CR9","first-page":"39","volume":"34","author":"J Gu","year":"2021","unstructured":"Gu, J., et al.: UniDOC: unified pretraining framework for document understanding. Adv. Neural. Inf. Process. Syst. 34, 39\u201350 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Guo, Z., et al.: LLaVA-UHD: an LMM perceiving any aspect ratio and high-resolution images. In: European Conference on Computer Vision, pp. 390\u2013406. Springer (2024)","DOI":"10.1007\/978-3-031-73010-8_23"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, A., Vedaldi, A., Zisserman, A.: Synthetic data for text localisation in natural images. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2315\u20132324 (2016)","DOI":"10.1109\/CVPR.2016.254"},{"key":"20_CR12","doi-asserted-by":"crossref","unstructured":"Hu, W., Xu, Y., Li, Y., Li, W., Chen, Z., Tu, Z.: BLIVA: a simple multimodal LLM for better handling of text-rich visual questions. In: AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 2256\u20132264 (2024)","DOI":"10.1609\/aaai.v38i3.27999"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Wang, J., Peng, D., Liu, C., Jin, L.: Revisiting scene text recognition: a data perspective. In: IEEE International Conference on Computer Vision, pp. 20543\u201320554 (2023)","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et\u00a0al.: ICDAR 2015 competition on robust reading. In: International Conference on Document Analysis and Recognition, pp. 1156\u20131160. IEEE (2015)","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et al.: ICDAR 2013 robust reading competition. In: International Conference on Document Analysis and Recognition, pp. 1484\u20131493. IEEE (2013)","DOI":"10.1109\/ICDAR.2013.221"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Li, B., et al.: When counting meets HMER: counting-aware network for handwritten mathematical expression recognition. In: European Conference on Computer Vision, pp. 197\u2013214. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_12"},{"key":"20_CR18","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Monkey: image resolution and text label are important things for large multi-modal models. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 26763\u201326773 (2024)","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Liang, M., Ma, J.W., Zhu, X., Qin, J., Yin, X.C.: LayoutFormer: hierarchical text detection towards scene text understanding. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 15665\u201315674 (2024)","DOI":"10.1109\/CVPR52733.2024.01483"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: European Conference on Computer Vision, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"20_CR22","unstructured":"Liu, C., et al.: Focus anywhere for fine-grained multi-page document understanding. arXiv preprint arXiv:2405.14295 (2024)"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Liu, C., et al.: NAMER: non-autoregressive modeling for handwritten mathematical expression recognition. In: European Conference on Computer Vision, pp. 273\u2013291. Springer (2024)","DOI":"10.1007\/978-3-031-72998-0_16"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"20_CR25","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1016\/j.patcog.2019.02.002","volume":"90","author":"Y Liu","year":"2019","unstructured":"Liu, Y., Jin, L., Zhang, S., Luo, C., Zhang, S.: Curved scene text detection via transverse and longitudinal sequence connection. Pattern Recogn. 90, 337\u2013345 (2019)","journal-title":"Pattern Recogn."},{"issue":"12","key":"20_CR26","doi-asserted-by":"publisher","first-page":"220102","DOI":"10.1007\/s11432-024-4235-6","volume":"67","author":"Y Liu","year":"2024","unstructured":"Liu, Y., et al.: OCRBench: on the hidden mystery of OCR in large multimodal models. Sci. China Inf. Sci. 67(12), 220102 (2024)","journal-title":"Sci. China Inf. Sci."},{"key":"20_CR27","unstructured":"Liu, Y., et al.: TextMonkey: an OCR-free large multimodal model for understanding document. arXiv preprint arXiv:2403.04473 (2024)"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Long, S., Qin, S., Panteleev, D., Bissacco, A., Fujii, Y., Raptis, M.: Towards end-to-end unified scene text detection and layout analysis. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1049\u20131059 (2022)","DOI":"10.1109\/CVPR52688.2022.00112"},{"key":"20_CR29","unstructured":"Luo, G., Zhou, Y., Zhang, Y., Zheng, X., Sun, X., Ji, R.: Feast your eyes: mixture-of-resolution adaptation for multimodal large language models. arXiv preprint arXiv:2403.03003 (2024)"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Mahdavi, M., Zanibbi, R., Mouchere, H., Viard-Gaudin, C., Garain, U.: ICDAR 2019 CROHME+ TFD: competition on recognition of handwritten mathematical expressions and typeset formula detection. In: International Conference on Document Analysis and Recognition, pp. 1533\u20131538. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00247"},{"key":"20_CR31","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1007\/s100320200071","volume":"5","author":"UV Marti","year":"2002","unstructured":"Marti, U.V., Bunke, H.: The IAM-database: an English sentence database for offline handwriting recognition. Int. J. Doc. Anal. Recogn. 5, 39\u201346 (2002)","journal-title":"Int. J. Doc. Anal. Recogn."},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for VQA on document images. In: IEEE Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Mishra, A., Alahari, K., Jawahar, C.: Scene text recognition using higher order language priors. In: British Machine Vision Conference, BMVC. BMVA (2012)","DOI":"10.5244\/C.26.127"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Mouchere, H., Viard-Gaudin, C., Zanibbi, R., Garain, U.: ICFHR 2014 competition on recognition of on-line handwritten mathematical expressions. In: International Conference on Frontiers in Handwriting Recognition, pp. 791\u2013796. IEEE (2014)","DOI":"10.1109\/ICFHR.2014.138"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Mouch\u00e8re, H., Viard-Gaudin, C., Zanibbi, R., Garain, U.: ICFHR2016 CROHME: competition on recognition of online handwritten mathematical expressions. In: International Conference on Frontiers in Handwriting Recognition, pp. 607\u2013612. IEEE (2016)","DOI":"10.1109\/ICFHR.2016.0116"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Phan, T.Q., Shivakumara, P., Tian, S., Tan, C.L.: Recognizing text with perspective distortion in natural scenes. In: IEEE International Conference on Computer Vision, pp. 569\u2013576 (2013)","DOI":"10.1109\/ICCV.2013.76"},{"issue":"18","key":"20_CR37","doi-asserted-by":"publisher","first-page":"8027","DOI":"10.1016\/j.eswa.2014.07.008","volume":"41","author":"A Risnumawan","year":"2014","unstructured":"Risnumawan, A., Shivakumara, P., Chan, C.S., Tan, C.L.: A robust arbitrary text detection system for natural scene images. Exp. Syst. Appl. 41(18), 8027\u20138048 (2014)","journal-title":"Exp. Syst. Appl."},{"issue":"9","key":"20_CR38","doi-asserted-by":"publisher","first-page":"2853","DOI":"10.1016\/j.patcog.2014.03.023","volume":"47","author":"C Shi","year":"2014","unstructured":"Shi, C., Wang, C., Xiao, B., Gao, S., Hu, J.: End-to-end scene text recognition using tree-structured models. Pattern Recogn. 47(9), 2853\u20132866 (2014)","journal-title":"Pattern Recogn."},{"key":"20_CR39","unstructured":"Shi, Y., et al.: Exploring OCR capabilities of GPT-4v(ision): a quantitative and in-depth evaluation. arXiv preprint arXiv:2310.16809 (2023)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Singh, A., Pang, G., Toh, M., Huang, J., Galuba, W., Hassner, T.: TextOCR: towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 8802\u20138812 (2021)","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"20_CR41","unstructured":"Wang, B., et al.: UniMERNet: a universal network for real-world mathematical expression recognition. arXiv preprint arXiv:2404.15254 (2024)"},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Wang, P., et al.: Platypus: a generalized specialist model for reading text in various forms. In: European Conference on Computer Vision, pp. 165\u2013183. Springer (2024)","DOI":"10.1007\/978-3-031-72761-0_10"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xie, H., Fang, S., Wang, J., Zhu, S., Zhang, Y.: From two to one: a new scene text recognizer with visual language modeling network. In: IEEE International Conference on Computer Vision, pp. 14194\u201314203 (2021)","DOI":"10.1109\/ICCV48922.2021.01393"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Wei, H., et al.: Vary: scaling up the vision vocabulary for large vision-language model. In: European Conference on Computer Vision, pp. 408\u2013424. Springer (2024)","DOI":"10.1007\/978-3-031-73235-5_23"},{"key":"20_CR45","unstructured":"Wei, H., et\u00a0al.: General OCR theory: towards OCR-2.0 via a unified end-to-end model. arXiv preprint arXiv:2409.01704 (2024)"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Yang, W., Li, Z., Peng, D., Jin, L., He, M., Yao, C.: Read ten lines at one glance: line-aware semi-autoregressive transformer for multi-line handwritten mathematical expression recognition. In: ACM International Conference on Multimedia (2023)","DOI":"10.1145\/3581783.3612499"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Xie, X., Fu, L., Zhang, Z., Wang, Z., Bai, X.: Toward understanding wordArt: corner-guided transformer for scene text recognition. In: European Conference on Computer Vision, pp. 303\u2013321. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_18"},{"key":"20_CR48","doi-asserted-by":"crossref","unstructured":"Ye, J., et\u00a0al.: UReader: universal OCR-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"20_CR49","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 13040\u201313051 (2024)","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"20_CR50","unstructured":"Yu, Y.Q., Liao, M., Wu, J., Liao, Y., Zheng, X., Zeng, W.: TextHawk: exploring efficient fine-grained perception of multimodal large language models. arXiv preprint arXiv:2404.09204 (2024)"},{"key":"20_CR51","doi-asserted-by":"crossref","unstructured":"Yuan, Y., et al.: Syntax-aware network for handwritten mathematical expression recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 4553\u20134562 (2022)","DOI":"10.1109\/CVPR52688.2022.00451"},{"key":"20_CR52","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training. In: IEEE International Conference on Computer Vision, pp. 11975\u201311986 (2023)","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"20_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, J., Du, J., Dai, L.: Multi-scale attention with dense encoder for handwritten mathematical expression recognition. In: International Conference on Pattern Recognition, pp. 2245\u20132250. IEEE (2018)","DOI":"10.1109\/ICPR.2018.8546031"},{"key":"20_CR54","doi-asserted-by":"publisher","first-page":"825","DOI":"10.1109\/TIP.2024.3352399","volume":"33","author":"SX Zhang","year":"2024","unstructured":"Zhang, S.X., Yang, C., Zhu, X., Zhou, H., Wang, H., Yin, X.C.: Inverse-like antagonistic scene text spotting via reading-order estimation and dynamic sampling. IEEE Trans. Image Process. 33, 825\u2013839 (2024)","journal-title":"IEEE Trans. Image Process."},{"issue":"3","key":"20_CR55","first-page":"2736","volume":"45","author":"SX Zhang","year":"2022","unstructured":"Zhang, S.X., Zhu, X., Chen, L., Hou, J.B., Yin, X.C.: Arbitrary shape text detection via segmentation with probability maps. IEEE Trans. Pattern Anal. Mach. Intell. 45(3), 2736\u20132750 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"20_CR56","unstructured":"Zhang, Y., et al.: LLaVAR: enhanced visual instruction tuning for text-rich image understanding. arXiv preprint arXiv:2306.17107 (2023)"},{"key":"20_CR57","doi-asserted-by":"crossref","unstructured":"Zhao, W., Gao, L.: CoMER: modeling coverage for transformer-based handwritten mathematical expression recognition. In: European conference on Computer Vision, pp. 392\u2013408. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_23"},{"key":"20_CR58","doi-asserted-by":"crossref","unstructured":"Zhao, W., Gao, L., Yan, Z., Peng, S., Du, L., Zhang, Z.: Handwritten mathematical expression recognition with bidirectionally trained transformer. In: Document Analysis and Recognition, Part II, pp. 570\u2013584. Springer (2021)","DOI":"10.1007\/978-3-030-86331-9_37"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04614-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T12:23:28Z","timestamp":1757679808000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04614-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,13]]},"ISBN":["9783032046130","9783032046147"],"references-count":58,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04614-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,13]]},"assertion":[{"value":"13 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}