{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:50:52Z","timestamp":1778082652545,"version":"3.51.4"},"publisher-location":"Singapore","reference-count":51,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819785100","type":"print"},{"value":"9789819785117","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8511-7_20","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T05:08:40Z","timestamp":1730524120000},"page":"276-289","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Enhancing Visual Information Extraction with Large Language Models Through Layout-Aware Instruction Tuning"],"prefix":"10.1007","author":[{"given":"Teng","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiapeng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"20_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al.: Gpt-4 technical report (2023). arXiv:2303.08774"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: Docformer: end-to-end transformer for document understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 993\u20131003 (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Tang, P., Dong, Q., Sankaran, N., Zhou, Y., Manmatha, R.: Docformerv2: local features for document understanding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 709\u2013718 (2024)","DOI":"10.1609\/aaai.v38i2.27828"},{"key":"20_CR4","unstructured":"Bai, J., Bai, S., Chu, Y., Cui, Z., Dang, K., Deng, X., Fan, Y., Ge, W., Han, Y., Huang, F., et\u00a0al.: Qwen technical report (2023). arXiv:2309.16609"},{"key":"20_CR5","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., Zhou, J.: Qwen-vl: a frontier large vision-language model with versatile abilities (2023). arXiv:2308.12966"},{"key":"20_CR6","unstructured":"Denk, T.I., Reisswig, C.: Bertgrid: contextualized embedding for 2d document representation and understanding. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"20_CR7","unstructured":"Dong, Q., Li, L., Dai, D., Zheng, C., Wu, Z., Chang, B., Sun, X., Xu, J., Sui, Z.: A survey on in-context learning (2022). arXiv:2301.00234"},{"key":"20_CR8","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al.: An image is worth 16 $$\\times $$ 16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"20_CR9","unstructured":"Feng, H., Liu, Q., Liu, H., Zhou, W., Li, H., Huang, C.: Docpedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding (2023). arXiv:2311.11810"},{"key":"20_CR10","unstructured":"Feng, H., Wang, Z., Tang, J., Lu, J., Zhou, W., Li, H., Huang, C.: Unidoc: a universal large multimodal model for simultaneous text detection, recognition, spotting and understanding (2023). arXiv:2308.11592"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"He, J., Wang, L., Hu, Y., Liu, N., Liu, H., Xu, X., Shen, H.T.: Icl-d3ie: in-context learning with diverse demonstrations updating for document information extraction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19485\u201319494 (2023)","DOI":"10.1109\/ICCV51070.2023.01785"},{"key":"20_CR12","unstructured":"Hu, A., Xu, H., Ye, J., Yan, M., Zhang, L., Zhang, B., Li, C., Zhang, J., Jin, Q., Huang, F., et\u00a0al.: mplug-docowl 1.5: unified structure learning for ocr-free document understanding (2024). arXiv:2403.12895"},{"key":"20_CR13","unstructured":"Hu, E.J., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., Chen, W., et\u00a0al.: Lora: low-rank adaptation of large language models. In: International Conference on Learning Representations (2021)"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: Layoutlmv3: pre-training for document ai with unified text and image masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Huang, Z., Chen, K., He, J., Bai, X., Karatzas, D., Lu, S., Jawahar, C.: Icdar2019 competition on scanned receipt ocr and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.P.: Funsd: a dataset for form understanding in noisy scanned documents. In: 2019 International Conference on Document Analysis and Recognition Workshops (ICDARW), vol.\u00a02, pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Katti, A.R., Reisswig, C., Guder, C., Brarda, S., Bickel, S., H\u00f6hne, J., Faddoul, J.B.: Chargrid: towards understanding 2d documents. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 4459\u20134469 (2018)","DOI":"10.18653\/v1\/D18-1476"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Kim, G., Hong, T., Yim, M., Nam, J., Park, J., Yim, J., Hwang, W., Yun, S., Han, D., Park, S.: Ocr-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Lee, C.Y., Li, C.L., Dozat, T., Perot, V., Su, G., Hua, N., Ainslie, J., Wang, R., Fujii, Y., Pfister, T.: Formnet: structural encoding beyond sequential modeling in form document information extraction. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 3735\u20133754 (2022)","DOI":"10.18653\/v1\/2022.acl-long.260"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Lee, C.Y., Li, C.L., Zhang, H., Dozat, T., Perot, V., Su, G., Zhang, X., Sohn, K., Glushnev, N., Wang, R., et\u00a0al.: Formnetv2: multimodal graph contrastive learning for form document information extraction. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 9011\u20139026 (2023)","DOI":"10.18653\/v1\/2023.acl-long.501"},{"key":"20_CR21","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Qian, Y., Yu, Y., Qin, X., Zhang, C., Liu, Y., Yao, K., Han, J., Liu, J., Ding, E.: Structext: structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Li, Z., Yang, B., Liu, Q., Ma, Z., Zhang, S., Yang, J., Sun, Y., Liu, Y., Bai, X.: Monkey: image resolution and text label are important things for large multi-modal models (2023). arXiv:2311.06607","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Liao, H., RoyChowdhury, A., Li, W., Bansal, A., Zhang, Y., Tu, Z., Satzoda, R.K., Manmatha, R., Mahadevan, V.: Doctr: document transformer for structured information extraction in documents. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19584\u201319594 (2023)","DOI":"10.1109\/ICCV51070.2023.01794"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Lin, W., Gao, Q., Sun, L., Zhong, Z., Hu, K., Ren, Q., Huo, Q.: Vibertgrid: a jointly trained multi-modal 2d document representation for key information extraction from documents. In: Document Analysis and Recognition\u2014ICDAR 2021: 16th International Conference, Lausanne, Switzerland, Proceedings, Part I 16. pp. 548\u2013563. Springer (2021)","DOI":"10.1007\/978-3-030-86549-8_35"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"20_CR27","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Liu, X., Gao, F., Zhang, Q., Zhao, H.: Graph convolution for multimodal information extraction from visually rich documents. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: human Language Technologies, Volume 2 (Industry Papers), pp. 32\u201339 (2019)","DOI":"10.18653\/v1\/N19-2005"},{"key":"20_CR29","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"20_CR30","unstructured":"Park, S., Shin, S., Lee, B., Lee, J., Surh, J., Seo, M., Lee, H.: Cord: a consolidated receipt dataset for post-ocr parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"20_CR31","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., He, Y.: Zero: Memory optimizations toward training trillion parameter models. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201316. IEEE (2020)","DOI":"10.1109\/SC41405.2020.00024"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Tang, G., Xie, L., Jin, L., Wang, J., Chen, J., Xu, Z., Wang, Q., Wu, Y., Li, H.: Matchvie: exploiting match relevancy between entities for visual information extraction (2021). arXiv:2106.12940","DOI":"10.24963\/ijcai.2021\/144"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Tang, Z., Yang, Z., Wang, G., Fang, Y., Liu, Y., Zhu, C., Zeng, M., Zhang, C., Bansal, M.: Unifying vision, text, and layout for universal document processing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19254\u201319264 (2023)","DOI":"10.1109\/CVPR52729.2023.01845"},{"key":"20_CR35","unstructured":"Team, I.: Internlm: A multilingual language model with progressively enhanced capabilities (2023)"},{"key":"20_CR36","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al.: Llama: Open and efficient foundation language models (2023). arXiv:2302.13971"},{"key":"20_CR37","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S., et\u00a0al.: Llama 2: Open foundation and fine-tuned chat models (2023). arXiv:2307.09288"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Tu, Y., Guo, Y., Chen, H., Tang, J.: Layoutmask: enhance text-layout interaction in multi-modal pre-training for document understanding. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 15200\u201315212 (2023)","DOI":"10.18653\/v1\/2023.acl-long.847"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Wang, D., Raman, N., Sibue, M., Ma, Z., Babkin, P., Kaur, S., Pei, Y., Nourbakhsh, A., Liu, X.: Docllm: A layout-aware generative language model for multimodal document understanding (2023). arXiv:2401.00908","DOI":"10.18653\/v1\/2024.acl-long.463"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Wang, J., Jin, L., Ding, K.: Lilt: a simple yet effective language-independent layout transformer for structured document understanding. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 7747\u20137757 (2022)","DOI":"10.18653\/v1\/2022.acl-long.534"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Wei, H., Kong, L., Chen, J., Zhao, L., Ge, Z., Yang, J., Sun, J., Han, C., Zhang, X.: Vary: Scaling up the vision vocabulary for large vision-language models (2023). arXiv:2312.06109","DOI":"10.1007\/978-3-031-73235-5_23"},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et\u00a0al.: Layoutlmv2: multi-modal pre-training for visually-rich document understanding. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 2579\u20132591 (2021)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"20_CR44","unstructured":"Yang, A., Xiao, B., Wang, B., Zhang, B., Bian, C., Yin, C., Lv, C., Pan, D., Wang, D., Yan, D., et\u00a0al.: Baichuan 2: open large-scale language models (2023). arXiv:2309.10305"},{"key":"20_CR45","unstructured":"Ye, J., Hu, A., Xu, H., Ye, Q., Yan, M., Dan, Y., Zhao, C., Xu, G., Li, C., Tian, J., et\u00a0al.: mplug-docowl: modularized multimodal large language model for document understanding (2023). arXiv:2307.02499"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Ye, J., Hu, A., Xu, H., Ye, Q., Yan, M., Xu, G., Li, C., Tian, J., Qian, Q., Zhang, J., et\u00a0al.: Ureader: universal ocr-free visually-situated language understanding with multimodal large language model. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 2841\u20132858 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Yu, W., Lu, N., Qi, X., Gong, P., Xiao, R.: Pick: processing key information extraction from documents using improved graph learning-convolutional networks. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 4363\u20134370. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412927"},{"key":"20_CR48","unstructured":"Yu, Y., Li, Y., Zhang, C., Zhang, X., Guo, Z., Qin, X., Yao, K., Han, J., Ding, E., Wang, J.: Structextv2: masked visual-textual prediction for document image pre-training. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"20_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, P., Xu, Y., Cheng, Z., Pu, S., Lu, J., Qiao, L., Niu, Y., Wu, F.: Trie: end-to-end text reading and information extraction for document understanding. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1413\u20131422 (2020)","DOI":"10.1145\/3394171.3413900"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhang, X., Yu, Y.: Chatglm-6b fine-tuning for cultural and creative products advertising words. In: 2023 International Conference on Culture-Oriented Science and Technology (CoST), pp. 291\u2013295. IEEE (2023)","DOI":"10.1109\/CoST60524.2023.00066"},{"key":"20_CR51","unstructured":"Zhang, Y., Zhang, R., Gu, J., Zhou, Y., Lipka, N., Yang, D., Sun, T.: Llavar: enhanced visual instruction tuning for text-rich image understanding (2023). arXiv:2306.17107"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8511-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T05:10:16Z","timestamp":1730524216000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8511-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9789819785100","9789819785117"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8511-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}