{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T06:11:36Z","timestamp":1758089496345,"version":"3.44.0"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032046239"},{"type":"electronic","value":"9783032046246"}],"license":[{"start":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T00:00:00Z","timestamp":1758067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T00:00:00Z","timestamp":1758067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04624-6_34","type":"book-chapter","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T05:33:53Z","timestamp":1758000833000},"page":"577-594","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Adapting Vision-Language Models for\u00a0Hindi OCR"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8452-9043","authenticated-orcid":false,"given":"Shaon","family":"Bhattacharyya","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3300-0266","authenticated-orcid":false,"given":"Souvik","family":"Ghosh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9585-5664","authenticated-orcid":false,"given":"Prantik","family":"Deb","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4808-8860","authenticated-orcid":false,"given":"Ajoy","family":"Mondal","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6767-7057","authenticated-orcid":false,"given":"C. V.","family":"Jawahar","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,17]]},"reference":[{"key":"34_CR1","unstructured":"Bharat Scene Text Dataset (2024). https:\/\/github.com\/Bhashini-IITJ\/BharatSceneTextDataset"},{"key":"34_CR2","unstructured":"Abdin, M., et al.: Phi-3 technical report: a highly capable language model locally on your phone. arXiv abs\/2404.14219 (2024)"},{"key":"34_CR3","doi-asserted-by":"crossref","unstructured":"Adak, C., Chaudhuri, B.B., Blumenstein, M.: Offline cursive Bengali word recognition using CNNs with a recurrent model. In: ICFHR, pp. 429\u2013434 (2016)","DOI":"10.1109\/ICFHR.2016.0086"},{"key":"34_CR4","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/BF02703311","volume":"27","author":"T Ashwin","year":"2002","unstructured":"Ashwin, T., Sastry, P.: A font and size-independent OCR system for printed Kannada documents using support vector machines. Sadhana 27, 35\u201358 (2002)","journal-title":"Sadhana"},{"key":"34_CR5","doi-asserted-by":"crossref","unstructured":"Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: CVPR, pp. 9357\u20139366 (2019)","DOI":"10.1109\/CVPR.2019.00959"},{"key":"34_CR6","unstructured":"Bai, J., et al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"34_CR7","unstructured":"Bazzi, et al.: OCR of degraded documents using HMM-based techniques. In: Symposium on Document Image Understanding Technology, p.\u00a0149 (1999)"},{"key":"34_CR8","doi-asserted-by":"crossref","unstructured":"Brunelli, R.: Template Matching Techniques in Computer Vision: Theory and Practice. Wiley (2009)","DOI":"10.1002\/9780470744055"},{"key":"34_CR9","doi-asserted-by":"crossref","unstructured":"Chaudhuri, B., Pal, U.: An OCR system to read two Indian language scripts: Bangla and Devnagari (Hindi). In: ICFHR, pp. 1011\u20131015 (1997)","DOI":"10.1109\/ICDAR.1997.620662"},{"key":"34_CR10","unstructured":"Chen, S., et al.: Ocean-OCR: towards general OCR application via a vision-language model. arXiv (2025)"},{"key":"34_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: On scaling up a multilingual vision and language model. In: CVPR, pp. 14432\u201314444 (2024)","DOI":"10.1109\/CVPR52733.2024.01368"},{"key":"34_CR12","doi-asserted-by":"publisher","first-page":"508","DOI":"10.1109\/TPAMI.2022.3144899","volume":"45","author":"D Coquenet","year":"2020","unstructured":"Coquenet, D., Chatelain, C., Paquet, T.: End-to-end handwritten paragraph text recognition using a vertical attention network. IEEE Trans. PAMI 45, 508\u2013524 (2020)","journal-title":"IEEE Trans. PAMI"},{"key":"34_CR13","doi-asserted-by":"crossref","unstructured":"Da, C., Wang, P., Yao, C.: Levenshtein OCR. In: ECCV, pp. 322\u2013338 (2022)","DOI":"10.1007\/978-3-031-19815-1_19"},{"key":"34_CR14","doi-asserted-by":"crossref","unstructured":"Dabre, R., et al.: Indicbart: a pre-trained model for indic natural language generation. arXiv preprint arXiv:2109.02903 (2021)","DOI":"10.18653\/v1\/2022.findings-acl.145"},{"key":"34_CR15","doi-asserted-by":"crossref","unstructured":"Deng, D., Liu, H., Li, X., Cai, D.: Pixellink: detecting scene text via instance segmentation. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12269"},{"key":"34_CR16","unstructured":"Diaz, D.H., Qin, S., Ingle, R.R., Fujii, Y., Bissacco, A.: Rethinking text line recognition models. arXiv (2021)"},{"key":"34_CR17","doi-asserted-by":"crossref","unstructured":"Feng, S., Manmatha, R.: A hierarchical, hmm-based automatic evaluation of OCR accuracy for a digital library of books. In: ACM\/IEEE-CS Joint Conference on Digital Libraries, pp. 109\u2013118 (2006)","DOI":"10.1145\/1141753.1141776"},{"key":"34_CR18","doi-asserted-by":"crossref","unstructured":"Garain, U., Mioulet, L., Chaudhuri, B.B., Chatelain, C., Paquet, T.: Unconstrained Bengali handwriting recognition with recurrent models. In: ICDAR, pp. 1056\u20131060 (2015)","DOI":"10.1109\/ICDAR.2015.7333923"},{"key":"34_CR19","doi-asserted-by":"crossref","unstructured":"Gongidi, S., Jawahar, C.V.: IIIT-Indic-HW-words: a dataset for indic handwritten text recognition. In: ICDAR, pp. 444\u2013459 (2021)","DOI":"10.1007\/978-3-030-86337-1_30"},{"key":"34_CR20","doi-asserted-by":"publisher","first-page":"6594","DOI":"10.1109\/TPAMI.2021.3092688","volume":"44","author":"S Grieggs","year":"2019","unstructured":"Grieggs, S., et al.: Measuring human perception to improve handwritten document transcription. IEEE Trans. PAMI 44, 6594\u20136601 (2019)","journal-title":"IEEE Trans. PAMI"},{"key":"34_CR21","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. ICLR 1(2), 3 (2022)"},{"key":"34_CR22","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: LayoutLMv3: pre-training for document AI with unified text and image masking. In: ACM MM, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"34_CR23","unstructured":"JaidedAI: Easyocr. https:\/\/github.com\/JaidedAI\/EasyOCR"},{"issue":"1","key":"34_CR24","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1109\/34.824819","volume":"22","author":"A Jain","year":"2000","unstructured":"Jain, A., Duin, R., Mao, J.: Statistical pattern recognition: a review. IEEE Trans. PAMI 22(1), 4\u201337 (2000)","journal-title":"IEEE Trans. PAMI"},{"key":"34_CR25","doi-asserted-by":"crossref","unstructured":"Kakwani, D., et al.: Indicnlpsuite: monolingual corpora, evaluation benchmarks and pre-trained multilingual language models for Indian languages. In: EMNLP, pp. 4948\u20134961 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.445"},{"key":"34_CR26","doi-asserted-by":"crossref","unstructured":"Khan, M.S.U.R., et al.: Indicllmsuite: a blueprint for creating pre-training and fine-tuning datasets for Indian languages. arXiv preprint arXiv:2403.06350 (2024)","DOI":"10.18653\/v1\/2024.acl-long.843"},{"key":"34_CR27","doi-asserted-by":"crossref","unstructured":"Khan, S., et al.: Chitrarth: bridging vision and language for a billion people. In: ICASSP, pp.\u00a01\u20135 (2025)","DOI":"10.1109\/ICASSP49660.2025.10888601"},{"key":"34_CR28","unstructured":"Khanam, R., Hussain, M.: Yolov11: an overview of the key architectural enhancements. arXiv preprint arXiv:2410.17725 (2024)"},{"key":"34_CR29","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: ECCV, pp. 498\u2013517 (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"34_CR30","doi-asserted-by":"crossref","unstructured":"Kuang, Z., et al.: MMOCR: a comprehensive toolbox for text detection, recognition and understanding. In: ACM MM, pp. 3791\u20133794 (2021)","DOI":"10.1145\/3474085.3478328"},{"key":"34_CR31","unstructured":"Kumar, R., et al.: Pretraining data and tokenizer for indic LLM. arXiv preprint arXiv:2407.12481 (2024)"},{"key":"34_CR32","doi-asserted-by":"crossref","unstructured":"Lalitha, E., Mondal, A., Jawahar, C.: Enhancing accuracy in indic handwritten text recognition. In: CVIP (2024)","DOI":"10.1007\/978-3-031-93688-3_17"},{"key":"34_CR33","unstructured":"Lee, K., et al.: Pix2struct: screenshot parsing as pretraining for visual language understanding. In: ICML, pp. 18893\u201318912 (2023)"},{"key":"34_CR34","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: TrOCR: transformer-based optical character recognition with pre-trained models. In: AAAI, pp. 13094\u201313102 (2023)","DOI":"10.1609\/aaai.v37i11.26538"},{"key":"34_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y.: HTR-VT: handwritten text recognition with vision transformer. Pattern Recognit. 158, 110967\u2013110978 (2025)","DOI":"10.1016\/j.patcog.2024.110967"},{"key":"34_CR36","doi-asserted-by":"crossref","unstructured":"Lin, J., et al.: Transferring general multimodal pretrained models to text recognition. arXiv preprint arXiv:2212.09297 (2022)","DOI":"10.18653\/v1\/2023.findings-acl.37"},{"key":"34_CR37","doi-asserted-by":"crossref","unstructured":"Long, S., Ruan, J., Zhang, W., He, X., Wu, W., Yao, C.: Textsnake: a flexible representation for detecting text of arbitrary shapes. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01216-8_2"},{"key":"34_CR38","doi-asserted-by":"crossref","unstructured":"Lunia, H., Mondal, A., Jawahar, C.: Indicstr12: a dataset for indic scene text recognition. In: ICDARW, pp. 233\u2013250 (2023)","DOI":"10.1007\/978-3-031-41498-5_17"},{"key":"34_CR39","doi-asserted-by":"crossref","unstructured":"Mathew, M., Mondal, A., Jawahar, C.: Towards deployable OCR models for indic languages. In: ICPR, pp. 167\u2013182 (2025)","DOI":"10.1007\/978-3-031-78495-8_11"},{"key":"34_CR40","unstructured":"Mindee: doctr: Document text recognition (2021). https:\/\/github.com\/mindee\/doctr"},{"key":"34_CR41","unstructured":"Niyogi, M., Bhattacharya, A.: Paramanu: a family of novel efficient generative foundation language models for Indian languages. arXiv preprint arXiv:2401.18034 (2024)"},{"issue":"9","key":"34_CR42","doi-asserted-by":"publisher","first-page":"1887","DOI":"10.1016\/j.patcog.2004.02.003","volume":"37","author":"U Pal","year":"2004","unstructured":"Pal, U., Chaudhuri, B.: Indian script character recognition: a survey. Pattern Recogn. 37(9), 1887\u20131899 (2004)","journal-title":"Pattern Recogn."},{"key":"34_CR43","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S.K., Girshick, R.B., Farhadi, A.: You only look once: unified, real-time object detection. In: CVPR, pp. 779\u2013788 (2015)","DOI":"10.1109\/CVPR.2016.91"},{"key":"34_CR44","doi-asserted-by":"crossref","unstructured":"Rothacker, L., Fink, G.A., Banerjee, P., Bhattacharya, U., Chaudhuri, B.: Bag-of-features HMMs for segmentation-free Bangla word spotting. In: International Workshop on Multilingual OCR, pp.\u00a01\u20135 (2013)","DOI":"10.1145\/2505377.2505384"},{"key":"34_CR45","doi-asserted-by":"crossref","unstructured":"Sabir, E., Rawls, S., Natarajan, P.: Implicit language model in LSTM for OCR. In: ICDAR, pp. 27\u201331 (2017)","DOI":"10.1109\/ICDAR.2017.361"},{"key":"34_CR46","doi-asserted-by":"crossref","unstructured":"Shen, Z., Zhang, R., Dell, M., Lee, B.C.G., Carlson, J., Li, W.: Layoutparser: a unified toolkit for deep learning based document image analysis. In: ICDAR, pp. 131\u2013146 (2021)","DOI":"10.1007\/978-3-030-86549-8_9"},{"key":"34_CR47","doi-asserted-by":"crossref","unstructured":"Singh, H., Gupta, N., Bharadwaj, S., Tewari, D., Talukdar, P.: Indicgenbench: a multilingual benchmark to evaluate generation capabilities of LLMs on indic languages. arXiv preprint arXiv:2404.16816 (2024)","DOI":"10.18653\/v1\/2024.acl-long.595"},{"key":"34_CR48","doi-asserted-by":"crossref","unstructured":"Smith, R.: An overview of the tesseract OCR engine. In: ICDAR, pp. 629\u2013633 (2007)","DOI":"10.1109\/ICDAR.2007.4376991"},{"key":"34_CR49","doi-asserted-by":"crossref","unstructured":"Wang, P., Da, C., Yao, C.: Multi-granularity prediction for scene text recognition. In: European Conference on Computer Vision, pp. 339\u2013355. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_20"},{"key":"34_CR50","unstructured":"Wei, H., et al.: General OCR theory: towards OCR-2.0 via a unified end-to-end model. arXiv abs\/2409.01704 (2024)"},{"key":"34_CR51","doi-asserted-by":"crossref","unstructured":"Wick, C., Z\u00f6llner, J., Gr\u00fcning, T.: Transformer for handwritten text recognition using bidirectional post-decoding. In: ICDAR, pp. 112\u2013126 (2021)","DOI":"10.1007\/978-3-030-86334-0_8"},{"key":"34_CR52","unstructured":"Wu, J., Peng, Y., Zhang, S., Qi, W., Zhang, J.: Masked vision-language transformers for scene text recognition. arXiv preprint arXiv:2211.04785 (2022)"},{"key":"34_CR53","doi-asserted-by":"crossref","unstructured":"Xu, Y., et\u00a0al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: ACL, pp. 2579\u20132591 (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"34_CR54","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: ACM SIGKDD, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"34_CR55","doi-asserted-by":"crossref","unstructured":"Xue, C., Zhang, W., Hao, Y., Lu, S., Torr, P.H., Bai, S.: Language matters: a weakly supervised vision-language pre-training approach for scene text detection and spotting. In: ECCV, pp. 284\u2013302 (2022)","DOI":"10.1007\/978-3-031-19815-1_17"},{"key":"34_CR56","doi-asserted-by":"publisher","first-page":"5625","DOI":"10.1109\/TPAMI.2024.3369699","volume":"46","author":"J Zhang","year":"2023","unstructured":"Zhang, J., Huang, J., Jin, S., Lu, S.: Vision-language models for vision tasks: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 46, 5625\u20135644 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"34_CR57","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv abs\/2205.01068 (2022)"},{"key":"34_CR58","doi-asserted-by":"publisher","first-page":"6893","DOI":"10.1109\/TIP.2024.3512354","volume":"33","author":"S Zhao","year":"2024","unstructured":"Zhao, S., Quan, R., Zhu, L., Yang, Y.: Clip4str: a simple baseline for scene text recognition with pre-trained vision-language model. IEEE Trans. Image Process. 33, 6893\u20136904 (2024)","journal-title":"IEEE Trans. Image Process."},{"issue":"22","key":"34_CR59","doi-asserted-by":"publisher","first-page":"7371","DOI":"10.3390\/s24227371","volume":"24","author":"X Zhao","year":"2024","unstructured":"Zhao, X., Xu, M., Silamu, W., Li, Y.: Clip-llama: a new approach for scene text recognition with a pre-trained vision-language model and a pre-trained language model. Sensors 24(22), 7371 (2024)","journal-title":"Sensors"},{"key":"34_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, X., et al.: East: an efficient and accurate scene text detector. In: CVPR, pp. 5551\u20135560 (2017)","DOI":"10.1109\/CVPR.2017.283"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04624-6_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T05:34:08Z","timestamp":1758000848000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04624-6_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,17]]},"ISBN":["9783032046239","9783032046246"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04624-6_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,9,17]]},"assertion":[{"value":"17 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}