{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T19:13:46Z","timestamp":1778699626160,"version":"3.51.4"},"reference-count":115,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s11432-024-4235-6","type":"journal-article","created":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T22:21:27Z","timestamp":1734128487000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":107,"title":["OCRBench: on the hidden mystery of OCR in large multimodal models"],"prefix":"10.1007","volume":"67","author":[{"given":"Yuliang","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingxin","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Biao","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenwen","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunyuan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xu-Cheng","family":"Yin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng-Lin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,11]]},"reference":[{"key":"4235_CR1","volume-title":"ChatGPT","author":"OpenAI","year":"2023","unstructured":"OpenAI. ChatGPT. 2023. https:\/\/openai.com\/blog\/chatgpt"},{"key":"4235_CR2","unstructured":"Achiam J, Adler S, Agarwal S, et al. GPT-4 technical report. 2023. ArXiv:2303.08774"},{"key":"4235_CR3","unstructured":"Touvron H, Lavril T, Izacard G, et al. Llama: open and efficient foundation language models. 2023. ArXiv:2302.13971"},{"key":"4235_CR4","volume-title":"Stanford Alpaca: an instruction-following LLaMA model","author":"R Taori","year":"2023","unstructured":"Taori R, Gulrajani I, Zhang T Y, et al. Stanford Alpaca: an instruction-following LLaMA model. 2023. https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"4235_CR5","volume-title":"Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"Vicuna","year":"2023","unstructured":"Vicuna. Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. 2023. https:\/\/vicuna.lmsys.org."},{"key":"4235_CR6","unstructured":"Peng B L, Li C Y, He P C, et al. Instruction tuning with GPT-4. 2023. ArXiv:2304.03277"},{"key":"4235_CR7","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1561\/0600000105","volume":"14","author":"Z Gan","year":"2022","unstructured":"Gan Z, Li L J, Li C Y, et al. Vision-language pre-training: basics, recent advances, and future trends. FNT Comput Graph Vision, 2022, 14: 163\u2013352","journal-title":"FNT Comput Graph Vision"},{"key":"4235_CR8","unstructured":"Radford A, Kim J W, Hallacy C, et al. Learning transferable visual models from natural language supervision. 2021. ArXiv:2103.00020"},{"key":"4235_CR9","unstructured":"Yuan L, Chen D D, Chen Y L, et al. Florence: a new foundation model for computer vision. 2021. ArXiv:2111.11432"},{"key":"4235_CR10","unstructured":"Jia C, Yang Y F, Xia Y, et al. Scaling up visual and vision-language representation learning with noisy text supervision. 2021. ArXiv:2102.05918"},{"key":"4235_CR11","first-page":"9287","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"C Y Li","year":"2022","unstructured":"Li C Y, Liu H T, Li L N, et al. ELEVATER: a benchmark and toolkit for evaluating language-augmented visual models. In: Proceedings of the Advances in Neural Information Processing Systems, 2022. 9287\u20139301"},{"key":"4235_CR12","unstructured":"Driess D, Xia F, Sajjadi S M, et al. PaLM-E: an embodied multimodal language model. 2023. ArXiv:2303.03378"},{"key":"4235_CR13","first-page":"23716","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"J B Alayrac","year":"2022","unstructured":"Alayrac J B, Donahue J, Luc P, et al. Flamingo: a visual language model for few-shot learning. In: Proceedings of the Advances in Neural Information Processing Systems, 2022. 23716\u201323736"},{"key":"4235_CR14","unstructured":"Wang J F, Yang Z Y, Hu X W, et al. GIT: a generative image-to-text transformer for vision and language. 2022. ArXiv:2205.14100"},{"key":"4235_CR15","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"H T Liu","year":"2024","unstructured":"Liu H T, Li C Y, Wu Q Y, et al. Visual instruction tuning. In: Proceedings of the Advances in Neural Information Processing Systems, 2024"},{"key":"4235_CR16","unstructured":"Google Gemini Team. Gemini: a family of highly capable multimodal models. 2023. ArXiv:2312.11805"},{"key":"4235_CR17","volume-title":"GPT-4V(vision) system card","author":"OpenAI","year":"2023","unstructured":"OpenAI. GPT-4V(vision) system card. 2023. https:\/\/openai.com\/index\/gpt-4v-system-card"},{"key":"4235_CR18","first-page":"19730","volume-title":"Proceedings of the International Conference on Machine Learning","author":"J N Li","year":"2023","unstructured":"Li J N, Li D X, Savarese S, et al. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the International Conference on Machine Learning, 2023. 19730\u201319742"},{"key":"4235_CR19","unstructured":"Awadalla A, Gao I, Gardner J, et al. OpenFlamingo: an open-source framework for training large autoregressive vision-language models. 2023. ArXiv:2308.01390"},{"key":"4235_CR20","first-page":"26296","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"H T Liu","year":"2024","unstructured":"Liu H T, Li C Y, Li Y H, et al. Improved baselines with visual instruction tuning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2024. 26296\u201326306"},{"key":"4235_CR21","unstructured":"Zhu D Y, Chen J, Shen X Q, et al. MiniGPT-4: enhancing vision-language understanding with advanced large language models. 2023. ArXiv:2304.10592"},{"key":"4235_CR22","unstructured":"Ye Q H, Xu H Y, Ye J, et al. mPLUG-Owl: modularization empowers large language models with multimodality. 2023. ArXiv:2304.14178"},{"key":"4235_CR23","first-page":"13040","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Q H Ye","year":"2024","unstructured":"Ye Q H, Xu H Y, Ye J B, et al. mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2024. 13040\u201313051"},{"key":"4235_CR24","unstructured":"Zhang Y Z, Zhang R Y, Gu J B, et al. LLaVAR: enhanced visual instruction tuning for text-rich image understanding. 2023. ArXiv:2306.17107"},{"key":"4235_CR25","first-page":"2256","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"W B Hu","year":"2024","unstructured":"Hu W B, Xu Y F, Li Y, et al. BLIVA: a simple multimodal llm for better handling of text-rich visual questions. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2024. 2256\u20132264"},{"key":"4235_CR26","unstructured":"Chen J, Zhu D Y, Shen X Q, et al. MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. 2023. ArXiv:2310.09478"},{"key":"4235_CR27","unstructured":"Feng H, Wang Z J, Tang J Q, et al. UniDoc: a universal large multimodal model for simultaneous text detection, recognition, spotting and understanding. 2023. ArXiv:2308.11592"},{"key":"4235_CR28","doi-asserted-by":"crossref","unstructured":"Feng H, Liu Q, Liu H, et al. DocPedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding. 2023. ArXiv:2311.11810","DOI":"10.1007\/s11432-024-4250-y"},{"key":"4235_CR29","first-page":"26763","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Z Li","year":"2024","unstructured":"Li Z, Yang B, Liu Q, et al. Monkey: image resolution and text label are important things for large multi-modal models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2024. 26763\u201326773"},{"key":"4235_CR30","unstructured":"Liu Y L, Yang B, Liu Q, et al. TextMonkey: an OCR-free large multimodal model for understanding document. 2024. ArXiv:2403.04473"},{"key":"4235_CR31","doi-asserted-by":"publisher","first-page":"11198","DOI":"10.1145\/3664647.3685520","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia","author":"H D Duan","year":"2024","unstructured":"Duan H D, Yang J M, Qiao Y X, et al. VLMEvalKit: an open-source toolkit for evaluating large multi-modality models. In: Proceedings of the 32nd ACM International Conference on Multimedia, 2024. 11198\u201311201"},{"key":"4235_CR32","unstructured":"Zhang K C, Li B, Zhang P Y, et al. LMMs-eval: reality check on the evaluation of large multimodal models. 2024. ArXiv:2407.12772"},{"key":"4235_CR33","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"F X Liu","year":"2023","unstructured":"Liu F X, Lin K, Li L J, et al. Mitigating hallucination in large multi-modal models via robust instruction tuning. In: Proceedings of the 12th International Conference on Learning Representations, 2023"},{"key":"4235_CR34","first-page":"216","volume-title":"Proceedings of the European Conference on Computer Vision","author":"Y Liu","year":"2025","unstructured":"Liu Y, Duan H D, Zhang Y H, et al. MMBench: is your multi-modal model an all-around player? In: Proceedings of the European Conference on Computer Vision, 2025. 216\u2013233"},{"key":"4235_CR35","unstructured":"Fu C Y, Chen P X, Shen Y H, et al. MME: a comprehensive evaluation benchmark for multimodal large language models. 2024. ArXiv:2306.13394"},{"key":"4235_CR36","first-page":"4291","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"A F Biten","year":"2019","unstructured":"Biten A F, Tito R P, Mafla A, et al. Scene text visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, 2019. 4291\u20134301"},{"key":"4235_CR37","first-page":"2687","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Mishra","year":"2012","unstructured":"Mishra A, Alahari K, Jawahar C V. Top-down and bottom-up cues for scene text recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2012. 2687\u20132694"},{"key":"4235_CR38","doi-asserted-by":"publisher","first-page":"2853","DOI":"10.1016\/j.patcog.2014.03.023","volume":"47","author":"C Z Shi","year":"2014","unstructured":"Shi C Z, Wang C H, Xiao B H, et al. End-to-end scene text recognition using tree-structured models. Pattern Recogn, 2014, 47: 2853\u20132866","journal-title":"Pattern Recogn"},{"key":"4235_CR39","first-page":"1484","volume-title":"Proceedings of the 12th International Conference on Document Analysis and Recognition","author":"D Karatzas","year":"2013","unstructured":"Karatzas D, Shafait F, Uchida S, et al. ICDAR 2013 robust reading competition. In: Proceedings of the 12th International Conference on Document Analysis and Recognition, 2013. 1484\u20131493"},{"key":"4235_CR40","first-page":"1156","volume-title":"Proceedings of the 13th International Conference on Document Analysis and Recognition","author":"D Karatzas","year":"2015","unstructured":"Karatzas D, Gomez-Bigorda L, Nicolaou A, et al. ICDAR 2015 competition on robust reading. In: Proceedings of the 13th International Conference on Document Analysis and Recognition, 2015. 1156\u20131160"},{"key":"4235_CR41","first-page":"569","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"T Q Phan","year":"2013","unstructured":"Phan T Q, Shivakumara P, Tian S X, et al. Recognizing text with perspective distortion in natural scenes. In: Proceedings of the IEEE International Conference on Computer Vision, 2013. 569\u2013576"},{"key":"4235_CR42","doi-asserted-by":"publisher","first-page":"8027","DOI":"10.1016\/j.eswa.2014.07.008","volume":"41","author":"A Risnumawan","year":"2014","unstructured":"Risnumawan A, Shivakumara P, Chan C S, et al. A robust arbitrary text detection system for natural scene images. Expert Syst Appl, 2014, 41: 8027\u20138048","journal-title":"Expert Syst Appl"},{"key":"4235_CR43","unstructured":"Veit A, Matera T, Neumann L, et al. COCO-Text: dataset and benchmark for text detection and recognition in natural images. 2016. ArXiv:1601.07140"},{"key":"4235_CR44","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1016\/j.patcog.2019.02.002","volume":"90","author":"Y L Liu","year":"2019","unstructured":"Liu Y L, Jin L W, Zhang S T, et al. Curved scene text detection via transverse and longitudinal sequence connection. Pattern Recogn, 2019, 90: 337\u2013345","journal-title":"Pattern Recogn"},{"key":"4235_CR45","first-page":"935","volume-title":"Proceedings of the 14th IAPR International Conference on Document Analysis and Recognition","author":"C-K Chng","year":"2017","unstructured":"Chng C-K, Chan C S. Total-Text: a comprehensive dataset for scene text detection and recognition. In: Proceedings of the 14th IAPR International Conference on Document Analysis and Recognition, 2017. 935\u2013942"},{"key":"4235_CR46","first-page":"14194","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Y X Wang","year":"2021","unstructured":"Wang Y X, Xie H T, Fang S C, et al. From two to one: a new scene text recognizer with visual language modeling network. In: Proceedings of the IEEE International Conference on Computer Vision, 2021. 14194\u201314203"},{"key":"4235_CR47","first-page":"303","volume-title":"Proceedings of the European conference on computer vision","author":"X D Xie","year":"2022","unstructured":"Xie X D, Fu L, Zhang Z F, et al. Toward understanding WordArt: corner-guided transformer for scene text recognition. In: Proceedings of the European conference on computer vision, 2022. 303\u2013321"},{"key":"4235_CR48","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1007\/s100320200071","volume":"5","author":"U V Marti","year":"2002","unstructured":"Marti U V, Bunke H. The IAM-database: an English sentence database for offline handwriting recognition. Int J Document Anal Recogn, 2002, 5: 39\u201346","journal-title":"Int J Document Anal Recogn"},{"key":"4235_CR49","first-page":"1577","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"R Zhang","year":"2019","unstructured":"Zhang R, Zhou Y S, Jiang Q Y, et al. ICDAR 2019 robust reading challenge on reading Chinese text on signboard. In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 1577\u20131581"},{"key":"4235_CR50","first-page":"779","volume-title":"Proceedings of the 14th International Conference on Frontiers in Handwriting Recognition","author":"M Diem","year":"2014","unstructured":"Diem M, Fiel S, Kleber F, et al. ICFHR 2014 competition on handwritten digit string recognition in challenging datasets (HDSRC 2014). In: Proceedings of the 14th International Conference on Frontiers in Handwriting Recognition, 2014. 779\u2013784"},{"key":"4235_CR51","first-page":"1563","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"A F Biten","year":"2019","unstructured":"Biten A F, Tito R, Mafia A, et al. ICDAR 2019 competition on scene text visual question answering. In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 1563\u20131570"},{"key":"4235_CR52","first-page":"8317","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Singh","year":"2019","unstructured":"Singh A, Natarajan V, Shah M, et al. Towards VQA models that can read. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019. 8317\u20138326"},{"key":"4235_CR53","first-page":"947","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"A Mishra","year":"2019","unstructured":"Mishra A, Shekhar S, Singh A K, et al. OCR-VQA: visual question answering by reading text in images. In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 947\u2013952"},{"key":"4235_CR54","first-page":"10126","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"X Y Wang","year":"2020","unstructured":"Wang X Y, Liu Y L, Shen C H, et al. On the general value of evidence, and bilingual scene-text visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2020. 10126\u201310135"},{"key":"4235_CR55","first-page":"2200","volume-title":"Proceedings of the IEEE Winter Conference on Applications of Computer Vision","author":"M Mathew","year":"2021","unstructured":"Mathew M, Karatzas D, Jawahar C V. DocVQA: a dataset for VQA on document images. In: Proceedings of the IEEE Winter Conference on Applications of Computer Vision, 2021. 2200\u20132209"},{"key":"4235_CR56","first-page":"1697","volume-title":"Proceedings of the IEEE Winter Conference on Applications of Computer Vision","author":"M Mathew","year":"2022","unstructured":"Mathew M, Bagal V, Tito R, et al. InfographicVQA. In: Proceedings of the IEEE Winter Conference on Applications of Computer Vision, 2022. 1697\u20131706"},{"key":"4235_CR57","doi-asserted-by":"crossref","unstructured":"Masry A, Long D X, Tan J Q, et al. ChartQA: a benchmark for question answering about charts with visual and logical reasoning. 2022. ArXiv:2203.10244","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"4235_CR58","first-page":"1516","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"Z Huang","year":"2019","unstructured":"Huang Z, Chen K, He J H, et al. ICDAR2019 competition on scanned receipt OCR and information extraction. In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 1516\u20131520"},{"key":"4235_CR59","first-page":"1","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition Workshops","author":"G Jaume","year":"2019","unstructured":"Jaume G, Ekenel H K, Thiran J-P. FUNSD: a dataset for form understanding in noisy scanned documents. In: Proceedings of the International Conference on Document Analysis and Recognition Workshops, 2019. 1\u20136"},{"key":"4235_CR60","first-page":"36","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"J F Kuang","year":"2023","unstructured":"Kuang J F, Hua W, Liang D K, et al. Visual information extraction in the wild: practical dataset and end-to-end solution. In: Proceedings of the International Conference on Document Analysis and Recognition, 2023. 36\u201353"},{"key":"4235_CR61","first-page":"178","volume-title":"Proceedings of the European Conference on Computer Vision","author":"D Bautista","year":"2022","unstructured":"Bautista D, Atienza R. Scene text recognition with permuted autoregressive sequence models. In: Proceedings of the European Conference on Computer Vision, 2022. 178\u2013196"},{"key":"4235_CR62","doi-asserted-by":"publisher","first-page":"507","DOI":"10.1007\/978-3-031-06555-2_34","volume-title":"Proceedings of the International Workshop on Document Analysis Systems","author":"D Kass","year":"2022","unstructured":"Kass D, Vats E. AttentionHTR: handwritten text recognition based on attention encoder-decoder networks. In: Proceedings of the International Workshop on Document Analysis Systems, 2022. 507\u2013522"},{"key":"4235_CR63","first-page":"4715","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"J Baek","year":"2019","unstructured":"Baek J, Kim G, Lee J, et al. What is wrong with scene text recognition model comparisons? Dataset and model analysis. In: Proceedings of the IEEE International Conference on Computer Vision, 2019. 4715\u20134723"},{"key":"4235_CR64","doi-asserted-by":"publisher","first-page":"404","DOI":"10.1007\/978-3-031-21648-0_28","volume-title":"Proceedings of the International Conference on Frontiers in Handwriting Recognition","author":"M-M Yu","year":"2022","unstructured":"Yu M-M, Zhang H, Yin F, et al. An efficient prototype-based model for handwritten text recognition with multi-loss fusion. In: Proceedings of the International Conference on Frontiers in Handwriting Recognition, 2022. 404\u2013418"},{"key":"4235_CR65","unstructured":"Qiao Y X, Chen H, Wang J, et al. Winner Team Mia at TextVQA challenge 2021: vision-and-language representation learning with pre-trained sequence-to-sequence model. 2021. ArXiv:2106.15332"},{"key":"4235_CR66","first-page":"1","volume-title":"Proceedings of the IEEE International Conference on Multimedia and Expo","author":"Z Q Fang","year":"2022","unstructured":"Fang Z Q, Li L, Xie Z W. Cross-modal attention networks with modality disentanglement for scene-text VQA. In: Proceedings of the IEEE International Conference on Multimedia and Expo, 2022. 1\u20136"},{"key":"4235_CR67","first-page":"3744","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"Q M Peng","year":"2022","unstructured":"Peng Q M, Pan Y X, Wang W J, et al. ERNIE-Layout: layout knowledge enhanced pre-training for visually-rich document understanding. In: Proceedings of Findings of the Association for Computational Linguistics, 2022. 3744\u20133756"},{"key":"4235_CR68","first-page":"693","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing","author":"K Aggarwal","year":"2023","unstructured":"Aggarwal K, Khandelwal A, Tanmay K, et al. DUBLIN: visual document understanding by language-image network. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, 2023. 693\u2013706"},{"key":"4235_CR69","first-page":"10381","volume-title":"Proceedings of the Findings of the Association for Computational Linguistics","author":"F Y Liu","year":"2023","unstructured":"Liu F Y, Eisenschlos J M, Piccinno F, et al. DePlot: one-shot visual language reasoning by plot-to-table translation. In: Proceedings of the Findings of the Association for Computational Linguistics, 2023. 10381\u201310399"},{"key":"4235_CR70","doi-asserted-by":"publisher","first-page":"1912","DOI":"10.1145\/3474085.3475345","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"Y L Li","year":"2021","unstructured":"Li Y L, Qian Y X, Yu Y C, et al. StrucTexT: structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, 2021. 1912\u20131920"},{"key":"4235_CR71","first-page":"197","volume-title":"Proceedings of the European Conference on Computer Vision","author":"B H Li","year":"2022","unstructured":"Li B H, Yuan Y, Liang D K, et al. When counting meets HMER: counting-aware network for handwritten mathematical expression recognition. In: Proceedings of the European Conference on Computer Vision, 2022. 197\u2013214"},{"key":"4235_CR72","first-page":"19485","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"J B He","year":"2023","unstructured":"He J B, Wang L, Hu Y P, et al. ICL-D3IE: in-context learning with diverse demonstrations updating for document information extraction. In: Proceedings of the IEEE International Conference on Computer Vision, 2023. 19485\u201319494"},{"key":"4235_CR73","volume-title":"Proceedings of the Document Intelligence Workshop at Neural Information Processing Systems","author":"S Park","year":"2019","unstructured":"Park S, Shin S, Lee B, et al. CORD: a consolidated receipt dataset for post-ocr parsing. In: Proceedings of the Document Intelligence Workshop at Neural Information Processing Systems, 2019"},{"key":"4235_CR74","unstructured":"Li B, Fang G X, Yang Y, et al. Evaluating ChatGPT\u2019s information extraction capabilities: an assessment of performance, explainability, calibration, and faithfulness. 2023. ArXiv:2304.11633"},{"key":"4235_CR75","unstructured":"Singhal K, Tu T, Gottweis J, et al. Towards expert-level medical question answering with large language models. 2023. ArXiv:2305.09617"},{"key":"4235_CR76","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"C Y Li","year":"2024","unstructured":"Li C Y, Wong C, Zhang S, et al. LLaVA-Med: training a large language-and-vision assistant for biomedicine in one day. In: Proceedings of the Advances in Neural Information Processing Systems, 2024"},{"key":"4235_CR77","volume-title":"MiniCPM-V 2.6","author":"OpenBMB","year":"2024","unstructured":"OpenBMB. MiniCPM-V 2.6. 2024. https:\/\/huggingface.co\/openbmb\/MiniCPM-V-2_6"},{"key":"4235_CR78","volume-title":"Proceedings of the 38th Annual Conference on Neural Information Processing Systems","author":"S Tong","year":"2024","unstructured":"Tong S, Brown E, Wu P, et al. Cambrian-1: a fully open, vision-centric exploration of multimodal LLMs. In: Proceedings of the 38th Annual Conference on Neural Information Processing Systems, 2024"},{"key":"4235_CR79","doi-asserted-by":"crossref","unstructured":"Chen Z, Wang W Y, Tian H, et al. How far are we to GPT-4V? Closing the gap to commercial multimodal models with open-source suites. 2024. ArXiv:2404.16821","DOI":"10.1007\/s11432-024-4231-5"},{"key":"4235_CR80","unstructured":"Beyer L, Steiner A, Pinto A S, et al. PaliGemma: a versatile 3B VLM for transfer. 2024. ArXiv:2407.07726"},{"key":"4235_CR81","volume-title":"Congrong","author":"CloudWalk","year":"2024","unstructured":"CloudWalk. Congrong. 2024. https:\/\/www.cloudwalk.com"},{"key":"4235_CR82","unstructured":"Wang W H, Lv Q, Yu W M, et al. CogVLM: visual expert for pretrained language models. 2023. ArXiv:2311.03079"},{"key":"4235_CR83","volume-title":"MiniCPM-V-2","author":"OpenBMB","year":"2024","unstructured":"OpenBMB. MiniCPM-V-2. 2024. https:\/\/huggingface.co\/openbmb\/MiniCPM-V-2"},{"key":"4235_CR84","unstructured":"Huang M X, Liu Y L, Liang D K, et al. Mini-Monkey: alleviate the sawtooth effect by multi-scale adaptive cropping. 2024. ArXiv:2408.02034"},{"key":"4235_CR85","volume-title":"Claude3.5-sonnet","author":"Anthropic","year":"2024","unstructured":"Anthropic. Claude3.5-sonnet. 2024. https:\/\/docs.anthropic.com\/en\/docs\/build-with-claude\/vision"},{"key":"4235_CR86","volume-title":"LLaVa-NeXT: improved reasoning, OCR, and world knowledge","author":"H T Liu","year":"2024","unstructured":"Liu H T, Li C Y, Li Y H, et al. LLaVa-NeXT: improved reasoning, OCR, and world knowledge. 2024. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next"},{"key":"4235_CR87","volume-title":"GPT-4o-mini-20240718","author":"OpenAI","year":"2024","unstructured":"OpenAI. GPT-4o-mini-20240718. 2024. https:\/\/openai.com\/index\/gpt-4o-mini-advancing-cost-efficient-intelligence"},{"key":"4235_CR88","unstructured":"Dong X Y, Zhang P, Zang Y H, et al. InternLM-XComposer2: mastering free-form text-image composition and comprehension in vision-language large model. 2024. ArXiv:2401.16420"},{"key":"4235_CR89","volume-title":"Rekaflash","author":"Reka AI","year":"2024","unstructured":"Reka AI. Rekaflash. 2024. https:\/\/www.reka.ai"},{"key":"4235_CR90","volume-title":"Gemini models","author":"Google","year":"2024","unstructured":"Google. Gemini models. 2024. https:\/\/deepmind.google\/technologies\/gemini"},{"key":"4235_CR91","volume-title":"XVERSE-V","author":"XVERSE","year":"2024","unstructured":"XVERSE. XVERSE-V. 2024. https:\/\/github.com\/xverse-ai\/XVERSE-V-13B"},{"key":"4235_CR92","unstructured":"Lu S Y, Li Y, Chen Q-G, et al. Ovis: structural embedding alignment for multimodal large language model. 2024. ArXiv:2405.20797"},{"key":"4235_CR93","unstructured":"Bai J Z, Bai S, Tan S S, et al. Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. 2023. ArXiv:2308.12966"},{"key":"4235_CR94","volume-title":"MiniCPM-Llama3-V2.5","author":"OpenBMB","year":"2024","unstructured":"OpenBMB. MiniCPM-Llama3-V2.5. 2024. https:\/\/huggingface.co\/openbmb\/MiniCPM-Llama3-V-2_5"},{"key":"4235_CR95","first-page":"14398","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Q Sun","year":"2024","unstructured":"Sun Q, Cui Y F, Zhang X S, et al. Generative multimodal models are in-context learners. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2024. 14398\u201314409"},{"key":"4235_CR96","unstructured":"Lu H Y, Liu W, Zhang B, et al. DeepSeek-VL: towards real-world vision-language understanding. 2024. ArXiv:2403.05525"},{"key":"4235_CR97","volume-title":"Claude3","author":"Anthropic","year":"2024","unstructured":"Anthropic. Claude3. 2024. https:\/\/docs.anthropic.com\/en\/docs\/build-with-claude\/vision"},{"key":"4235_CR98","volume-title":"OmniLMM-12B","author":"OpenBMB","year":"2024","unstructured":"OpenBMB. OmniLMM-12B. 2024. https:\/\/huggingface.co\/openbmb\/OmniLMM-12B"},{"key":"4235_CR99","volume-title":"TransCore-M","author":"PCI Research","year":"2024","unstructured":"PCI Research. TransCore-M. 2024. https:\/\/github.com\/PCIResearch\/TransCore-M"},{"key":"4235_CR100","unstructured":"Zhang P, Dong X Y, Zang Y H, et al. InternLM-XComposer-2.5: a versatile large vision language model supporting long-contextual input and output. 2024. ArXiv:2407.03320"},{"key":"4235_CR101","volume-title":"XTuner: a toolkit for efficiently fine-tuning LLM","author":"XTuner Contributors","year":"2023","unstructured":"XTuner Contributors. XTuner: a toolkit for efficiently fine-tuning LLM. 2023. https:\/\/github.com\/InternLM\/xtuner"},{"key":"4235_CR102","volume-title":"Proceedings of the European Conference on Computer Vision","author":"L Chen","year":"2024","unstructured":"Chen L, Li J S, Dong X Y, et al. ShareGPT4V: improving large multi-modal models with better captions. In: Proceedings of the European Conference on Computer Vision, 2024"},{"key":"4235_CR103","volume-title":"360VL","author":"QiHoo360","year":"2024","unstructured":"QiHoo360. 360VL. 2024. https:\/\/github.com\/360CVGroup\/360VL"},{"key":"4235_CR104","unstructured":"Dong X Y, Zhang P, Zang Y H, et al. InternLM-XComposer2-4KHD: a pioneering large vision-language model handling resolutions from 336 pixels to 4K HD. 2024. ArXiv:2404.06512"},{"key":"4235_CR105","volume-title":"MiniCPM-V","author":"OpenBMB","year":"2024","unstructured":"OpenBMB. MiniCPM-V. 2024. https:\/\/huggingface.co\/openbmb\/MiniCPM-V"},{"key":"4235_CR106","volume-title":"Yi-VL-34B","author":"AI 01","year":"2024","unstructured":"AI 01. Yi-VL-34B. 2024. https:\/\/huggingface.co\/01-ai\/Yi-VL-34B"},{"key":"4235_CR107","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"H Laurencon","year":"2024","unstructured":"Laurencon H, Saulnier L, Tronchon L, et al. OBELICS: an open web-scale filtered dataset of interleaved image-text documents. In: Proceedings of the Advances in Neural Information Processing Systems, 2024"},{"key":"4235_CR108","volume-title":"MMAlaya","author":"DataCanvas Ltd","year":"2024","unstructured":"DataCanvas Ltd. MMAlaya. 2024. https:\/\/github.com\/DataCanvasIO\/MMAlaya"},{"key":"4235_CR109","volume-title":"Phi-3-vision","author":"Microsoft","year":"2024","unstructured":"Microsoft. Phi-3-vision. 2024. https:\/\/huggingface.co\/microsoft\/Phi-3-vision-128k-instruct"},{"key":"4235_CR110","first-page":"320","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics","author":"Z X Du","year":"2022","unstructured":"Du Z X, Qian Y J, Liu X, et al. GLM: general language model pretraining with autoregressive blank infilling. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, 2022. 320\u2013335"},{"key":"4235_CR111","volume-title":"OpenFlamingo v2: new models and enhanced training setup","author":"A Awadalla","year":"2023","unstructured":"Awadalla A, Gao I. OpenFlamingo v2: new models and enhanced training setup. https:\/\/laion.ai\/blog\/open-flamingo-v2, 2023"},{"key":"4235_CR112","unstructured":"Laurencon H, Tronchon L, Cord M, et al. What matters when building vision-language models? 2024. ArXiv:2405.02246"},{"key":"4235_CR113","unstructured":"Su Y X, Lan T, Li H Y, et al. PandaGPT: one model to instruction-follow them all. 2023. ArXiv:2305.16355"},{"key":"4235_CR114","volume-title":"Step-1v","author":"StepFun","year":"2024","unstructured":"StepFun. Step-1v. https:\/\/platform.stepfun.com, 2024"},{"key":"4235_CR115","unstructured":"Team Chameleon. Chameleon: mixed-modal early-fusion foundation models. 2024. ArXiv:2405.09818"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4235-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4235-6","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4235-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T22:02:46Z","timestamp":1768860166000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4235-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":115,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["4235"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4235-6","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"12 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 November 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 December 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"220102"}}