{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T05:28:50Z","timestamp":1771046930124,"version":"3.50.1"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031705458","type":"print"},{"value":"9783031705465","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70546-5_9","type":"book-chapter","created":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T05:02:47Z","timestamp":1725944567000},"page":"142-159","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["LAPDoc: Layout-Aware Prompting for\u00a0Documents"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-4345-6888","authenticated-orcid":false,"given":"Marcel","family":"Lamott","sequence":"first","affiliation":[]},{"given":"Yves-Noel","family":"Weweler","sequence":"additional","affiliation":[]},{"given":"Adrian","family":"Ulges","sequence":"additional","affiliation":[]},{"given":"Faisal","family":"Shafait","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0984-5918","authenticated-orcid":false,"given":"Dirk","family":"Krechel","sequence":"additional","affiliation":[]},{"given":"Darko","family":"Obradovic","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,11]]},"reference":[{"key":"9_CR1","unstructured":"Borchmann, \u0141., et al.: DUE: end-to-end document understanding benchmark. In: NeurIPS Datasets and Benchmarks (2021). https:\/\/api.semanticscholar.org\/CorpusID:244906279"},{"key":"9_CR2","unstructured":"Li, M., et al.: TableBank: table benchmark for image-based table detection and recognition. In: Proceedings of the Twelfth Language Resources and Evaluation Conference, pp. 1918\u20131925 (2020)"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: DocBank: a benchmark dataset for document layout analysis. arXiv preprint arXiv:2006.01038 (2020)","DOI":"10.18653\/v1\/2020.coling-main.82"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: XFUND: a benchmark dataset for multilingual visually rich form understanding. In: Findings of the Association for Computational Linguistics: ACL 2022, Dublin, Ireland, pp. 3214\u20133224. Association for Computational Linguistics, May 2022. https:\/\/aclanthology.org\/2022.findings-acl.253","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Cao, H., et al.: GMN: generative multi-modal network for practical document information extraction. arXiv preprint arXiv:2207.04713 (2022)","DOI":"10.18653\/v1\/2022.naacl-main.276"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Tang, Z., et al.: Unifying vision, text, and layout for universal document processing. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19254\u201319264 (2023)","DOI":"10.1109\/CVPR52729.2023.01845"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Luo, C., et al.: GeoLayoutLM: geometric pre-training for visual information extraction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7092\u20137101 (2023)","DOI":"10.1109\/CVPR52729.2023.00685"},{"key":"9_CR8","unstructured":"Wang, D., et al.: DocLLM: a layout-aware generative language model for multimodal document understanding. arXiv preprint arXiv:2401.00908 (2023)"},{"key":"9_CR9","doi-asserted-by":"publisher","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022: 17th European Conference, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part XXVIII, Tel Aviv, Israel, pp. 498\u2013517. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_29","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"9_CR10","unstructured":"Lv, T., et al.: Kosmos-2.5: a multimodal literate model. arXiv preprint arXiv:2309.11419 (2023)"},{"key":"9_CR11","unstructured":"Liu, Y.-H., et al.: Summary of ChatGPT\/GPT-4 research and perspective towards the future of large LanguageModels. arXiv: abs\/2304.01852 (2023). https:\/\/api.semanticscholar.org\/CorpusID:263893278"},{"key":"9_CR12","unstructured":"Wei, J., et al.: Emergent abilities of large language models. Trans. Mach. Learn. Res. (2022). https:\/\/openreview.net\/forum?id=yzkSU5zdwD"},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: SOLAR 10.7B: scaling large language models with simple yet effective depth up-scaling. arXiv: 2312.15166 [cs.CL] (2023)","DOI":"10.18653\/v1\/2024.naacl-industry.3"},{"key":"9_CR14","unstructured":"Vaswani, A., et al.: Attention is all you need. CoRR abs\/1706.03762 (2017). arXiv:1706.03762, http:\/\/arxiv.org\/abs\/1706.03762"},{"key":"9_CR15","unstructured":"OpenAI. GPT-4 Technical Report. arXiv: abs\/2303.08774 (2023). https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"9_CR16","unstructured":"Touvron, H., et al.: Llama 2: Open Foundation and Fine-Tuned Chat Models, July 2023"},{"key":"9_CR17","unstructured":"Zhang, S., et al.: Instruction tuning for large language models: a survey. arXiv:2308.10792 [cs.CL] (2023)"},{"key":"9_CR18","unstructured":"Feng, H., et al.: UniDoc: a universal large multimodal model for simultaneous text detection, recognition, spotting and understanding. arXiv preprint arXiv:2308.11592 (2023)"},{"key":"9_CR19","doi-asserted-by":"publisher","unstructured":"Xu, Y., et al.: LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, KDD 2020. Virtual Event, pp. 1192\u20131200. Association for Computing Machinery, CA, USA (2020). https:\/\/doi.org\/10.1145\/3394486.3403172","DOI":"10.1145\/3394486.3403172"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: Zong, C., et al. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), August 2021, pp. 2579\u20132591. Association for Computational Linguistics (2021). 
https:\/\/aclanthology.org\/2021.acl-long.201","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"9_CR21","doi-asserted-by":"publisher","unstructured":"Huang, Y., et al.: LayoutLMv3: pre-training for document AI with unified text and image masking. In: Proceedings of the 30th ACM International Conference on Multimedia, MM 2022, Lisboa, Portugal, pp. 4083\u20134091. Association for Computing Machinery (2022). https:\/\/doi.org\/10.1145\/3503161.3548112","DOI":"10.1145\/3503161.3548112"},{"key":"9_CR22","unstructured":"Devlin, J., et al.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Minneapolis, Minnesota, June 2019, pp. 4171\u20134186. Association for Computational Linguistics (2019). https:\/\/aclanthology.org\/N19-1423"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., et al.: DocFormer: end-to-end transformer for document understanding. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 973\u2013983 (2021). https:\/\/api.semanticscholar.org\/CorpusID:235592814","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Tang, Z., et al.: Unifying vision, text, and layout for universal document processing. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19254\u201319264 (2022). https:\/\/api.semanticscholar.org\/CorpusID:254275326","DOI":"10.1109\/CVPR52729.2023.01845"},{"key":"9_CR25","doi-asserted-by":"publisher","unstructured":"Davis, B., et al.: End-to-end document recognition and understanding with Dessurt. In: Karlinsky, L., Michaeli, T., Nishino, K. (eds.) Computer Vision - ECCV 2022 Workshops, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part IV, pp. 280\u2013296. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-25069-9_19","DOI":"10.1007\/978-3-031-25069-9_19"},{"key":"9_CR26","unstructured":"Wang, W., et al.: Layout and task aware instruction prompt for zeroshot document image question answering. arXiv: 2306.00526 [cs.CL] (2023)"},{"key":"9_CR27","unstructured":"OpenAI Docs Prompt Engineering (2024). https:\/\/platform.openai.com\/docs\/guides\/prompt-engineering\/six-strategies-for-getting-betterresults. Visited 27 Jan 2024"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: WebSRC: a dataset for web-based structural reading comprehension. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. Online and Punta Cana, Dominican Republic, November 2021, pp. 4173\u20134185. Association for Computational Linguistics (2021). https:\/\/aclanthology.org\/2021.emnlp-main.343","DOI":"10.18653\/v1\/2021.emnlp-main.343"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: ICDAR2019 competition on scanned receipt OCR and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520 (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"9_CR30","unstructured":"OpenAI Docs JSON Mode (2024). https:\/\/platform.openai.com\/docs\/guides\/text-generation\/json-mode. 
Visited 27 Jan 2024"},{"key":"9_CR31","unstructured":"https:\/\/x-lance.github.io\/WebSRC\/"},{"key":"9_CR32","doi-asserted-by":"publisher","unstructured":"Powalski, R., et al.: Going Full-TILT Boogie on document understanding with text-image-layout transformer. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) Document Analysis and Recognition- ICDAR 2021: 16th International Conference, Lausanne, Switzerland, 5\u201310 September 2021, Proceedings, Part II 16. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86331-9_47","DOI":"10.1007\/978-3-030-86331-9_47"},{"key":"9_CR33","unstructured":"Wu, S., et al.: DocPrompt: large-scale continue pretrain for zero-shot and few-shot document question answering. arXiv preprint arXiv:2308.10959 (2023)"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Li, J., et al.: MarkupLM: pre-training of text and markup language for visually-rich document understanding. arXiv preprint arXiv:2110.08518 (2021)","DOI":"10.18653\/v1\/2022.acl-long.420"},{"key":"9_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: StrucTexT: structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"9_CR36","unstructured":"Jiang, A.Q., et al.: Mistral 7B. arXiv: 2310.06825 [cs.CL] (2023)"},{"key":"9_CR37","unstructured":"https:\/\/openai.com\/blog\/chatgpt-can-now-see-hear-and-speak"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70546-5_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T05:05:02Z","timestamp":1725944702000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70546-5_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705458","9783031705465"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70546-5_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"11 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 
2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2024.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}