{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T09:08:45Z","timestamp":1768295325781,"version":"3.49.0"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031781186","type":"print"},{"value":"9783031781193","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78119-3_7","type":"book-chapter","created":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T02:00:00Z","timestamp":1733277600000},"page":"91-106","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Perception-Enhanced Generative Transformer for\u00a0Key Information Extraction from\u00a0Documents"],"prefix":"10.1007","author":[{"given":"Runbo","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Jun Jie","family":"Ou Yang","sequence":"additional","affiliation":[]},{"given":"Chen","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Xugong","family":"Qin","sequence":"additional","affiliation":[]},{"given":"Gangyan","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Xiaoxu","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,5]]},"reference":[{"key":"7_CR1","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: DocFormer: end-to-end transformer for document understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 993\u20131003 (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"7_CR2","unstructured":"Bao, H., et\u00a0al.: UniLMv2: Pseudo-masked language models for unified language model pre-training. In: International Conference on Machine Learning, pp. 642\u2013652. PMLR (2020)"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Cao, P., Wang, Y., Zhang, Q., Meng, Z.: GenKIE: robust generative multimodal document key information extraction. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 14702\u201314713 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.979"},{"key":"7_CR4","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I, pp. 213\u2013229. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"7_CR5","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal Image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX, pp. 104\u2013120. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"7_CR6","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\,\\times \\,$$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"7_CR7","unstructured":"Feng, H., Liu, Q., Liu, H., Zhou, W., Li, H., Huang, C.: DocPedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding. arXiv preprint arXiv:2311.11810 (2023)"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Feng, H., Wang, Y., Zhou, W., Deng, J., Li, H.: DocTr: document image transformer for geometric unwarping and illumination correction. In: ACM Multimedia, pp. 273\u2013281 (2021)","DOI":"10.1145\/3474085.3475388"},{"key":"7_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1007\/978-3-030-86549-8_34","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"\u0141 Garncarek","year":"2021","unstructured":"Garncarek, \u0141, et al.: LAMBERT: layout-aware language modeling for information extraction. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12821, pp. 532\u2013547. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86549-8_34"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"He, J., et al.: ICL-D3IE: in-context learning with diverse demonstrations updating for document information extraction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19485\u201319494 (2023)","DOI":"10.1109\/ICCV51070.2023.01785"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: BROS: a pre-trained language model focusing on text and layout for better key information extraction from documents. In: AAAI, vol.\u00a036, pp. 10767\u201310775 (2022)","DOI":"10.1609\/aaai.v36i10.21322"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: LayoutLMv3: pre-training for document AI with unified text and image masking. In: ACM Multimedia, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: ICDAR2019 competition on scanned receipt OCR and information extraction. In: ICDAR, pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.P.: FUNSD: a dataset for form understanding in noisy scanned documents. In: ICDARW, vol.\u00a02, pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"7_CR16","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"7_CR17","doi-asserted-by":"publisher","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_29","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"7_CR18","doi-asserted-by":"publisher","unstructured":"Kuang, J., et al.: Visual information extraction in the wild: practical dataset and end-to-end solution. In: ICDAR, pp. 36\u201353. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-41731-3_3","DOI":"10.1007\/978-3-031-41731-3_3"},{"key":"7_CR19","unstructured":"Lample, G., Conneau, A.: Cross-lingual language model pretraining. arXiv preprint arXiv:1901.07291 (2019)"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. In: AAAI, vol.\u00a034, pp. 11336\u201311344 (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"issue":"1","key":"7_CR21","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TKDE.2020.2981314","volume":"34","author":"J Li","year":"2020","unstructured":"Li, J., Sun, A., Han, J., Li, C.: A survey on deep learning for named entity recognition. IEEE Trans. Knowl. Data Eng. 34(1), 50\u201370 (2020)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Li, P., et al.: SelfDoc: self-supervised document representation learning. In: CVPR, pp. 5652\u20135660 (2021)","DOI":"10.1109\/CVPR46437.2021.00560"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: StrucTexT: structured text understanding with multi-modal transformers. In: ACM Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"7_CR24","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"7_CR25","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS, vol.\u00a032 (2019)"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Materzy\u0144ska, J., Torralba, A., Bau, D.: Disentangling visual and written concepts in clip. In: CVPR, pp. 16410\u201316419 (2022)","DOI":"10.1109\/CVPR52688.2022.01592"},{"key":"7_CR27","unstructured":"Park, S., et al.: CORD: a consolidated receipt dataset for post-OCR parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Qiao, Z., Qin, X., Zhou, Y., Yang, F., Wang, W.: Gaussian constrained attention network for scene text recognition. In: ICPR, pp. 3328\u20133335. IEEE (2020)","DOI":"10.1109\/ICPR48806.2021.9412806"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Qin, X., et al.: Towards robust real-time scene text detection: from semantic to instance representation learning. In: ACM Multimedia, pp. 2025\u20132034 (2023)","DOI":"10.1145\/3581783.3611801"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Qin, X., et al.: Mask is all you need: rethinking mask R-CNN for dense and arbitrary-shaped scene text detection. In: ACM Multimedia, pp. 414\u2013423 (2021)","DOI":"10.1145\/3474085.3475178"},{"key":"7_CR31","doi-asserted-by":"crossref","unstructured":"Qin, X., Zhou, Y., Guo, Y., Wu, D., Wang, W.: Fc$$ ^{\\text{2}}$$rn: a fully convolutional corner refinement network for accurate multi-oriented scene text detection. In: ICASSP, pp. 4350\u20134354. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9413821"},{"key":"7_CR32","doi-asserted-by":"crossref","unstructured":"Qin, X., Zhou, Y., Yang, D., Wang, W.: Curved text detection in natural scene images with semi-and weakly-supervised learning. In: ICDAR, pp. 559\u2013564. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00095"},{"key":"7_CR33","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"7_CR34","doi-asserted-by":"publisher","first-page":"564","DOI":"10.1007\/978-3-030-86549-8_36","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021: 16th International Conference, Lausanne, Switzerland, September 5\u201310, 2021, Proceedings, Part I","author":"T Stanis\u0142awek","year":"2021","unstructured":"Stanis\u0142awek, T., et al.: Kleister: key information extraction datasets involving long documents with complex layouts. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) Document Analysis and Recognition \u2013 ICDAR 2021: 16th International Conference, Lausanne, Switzerland, September 5\u201310, 2021, Proceedings, Part I, pp. 564\u2013579. Springer International Publishing, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86549-8_36"},{"key":"7_CR35","doi-asserted-by":"publisher","DOI":"10.1108\/AJIM-05-2023-0180","author":"S Sun","year":"2023","unstructured":"Sun, S., Deng, J., Qin, X.: Unearthing historical insights: semantic organization and application of historical newspapers from a fine-grained knowledge element perspective. ASLIB J. Inf. Manage. (2023). https:\/\/doi.org\/10.1108\/AJIM-05-2023-0180","journal-title":"ASLIB J. Inf. Manage."},{"key":"7_CR36","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: EMNLP-IJCNLP, pp. 5100\u20135111 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"7_CR37","doi-asserted-by":"crossref","unstructured":"Tong, X., Dai, P., Qin, X., Wang, R., Ren, W.: Granularity-aware single-point scene text spotting with sequential recurrence self-attention. IEEE Trans. Circuits Syst. Video Technol. (2024)","DOI":"10.1109\/TCSVT.2024.3431993"},{"key":"7_CR38","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"7_CR39","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"7_CR40","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. In: NeurIPS (2024)"},{"key":"7_CR41","doi-asserted-by":"crossref","unstructured":"Xu, Y., et\u00a0al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. ACL Assoc. Comput. Linguist. (2021)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"7_CR42","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: ACM SIGKDD, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"7_CR43","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: TAP: text-aware pre-training for Text-VQA and text-caption. In: CVPR, pp. 8751\u20138761 (2021)","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"7_CR44","unstructured":"Yu, Y., et al.: StrucTexTv2: masked visual-textual prediction for document image pre-training. In: International Conference on Learning Representations (2023)"},{"key":"7_CR45","doi-asserted-by":"publisher","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from CLIP. In: European Conference on Computer Vision, pp. 696\u2013712. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_40","DOI":"10.1007\/978-3-031-19815-1_40"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78119-3_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T02:03:57Z","timestamp":1733277837000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78119-3_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,5]]},"ISBN":["9783031781186","9783031781193"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78119-3_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,5]]},"assertion":[{"value":"5 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}