{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T21:12:17Z","timestamp":1742937137453,"version":"3.40.3"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031705328"},{"type":"electronic","value":"9783031705335"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70533-5_12","type":"book-chapter","created":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:02:25Z","timestamp":1725685345000},"page":"191-207","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Light-Weight Multi-modality Feature Fusion Network for\u00a0Visually-Rich Document Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6847-0075","authenticated-orcid":false,"given":"Jeff","family":"Yang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8980-283X","authenticated-orcid":false,"given":"Huynh Vu","family":"The","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7524-951X","authenticated-orcid":false,"given":"Hai Luu","family":"Tuan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,8]]},"reference":[{"key":"12_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1007\/978-3-642-00382-0_32","volume-title":"Computational Linguistics and Intelligent Text Processing","author":"S Adali","year":"2009","unstructured":"Adali, S., Sonmez, A.C., Gokturk, M.: An integrated architecture for processing business documents in Turkish. In: Gelbukh, A. (ed.) CICLing 2009. LNCS, vol. 5449, pp. 394\u2013405. Springer, Heidelberg (2009). https:\/\/doi.org\/10.1007\/978-3-642-00382-0_32"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: Docformer: end-to-end transformer for document understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 993\u20131003 (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"12_CR3","unstructured":"Bao, H., et\u00a0al.: Unilmv2: pseudo-masked language models for unified language model pre-training. In: International Conference on Machine Learning, pp. 642\u2013652. PMLR (2020)"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Bela\u00efd, Y., Bela\u00efd, A.: Morphological tagging approach in document analysis of invoices. In: Proceedings of the 17th International Conference on Pattern Recognition, 2004. ICPR 2004, vol.\u00a01, pp. 469\u2013472. IEEE (2004)","DOI":"10.1109\/ICPR.2004.1334166"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Carbonell, M., Riba, P., Villegas, M., Forn\u00e9s, A., Llad\u00f3s, J.: Named entity recognition and relation extraction with graph neural networks in semi structured documents. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 9622\u20139627. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412669"},{"key":"12_CR6","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H., Wei, Y.: Deformable convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 764\u2013773 (2017)","DOI":"10.1109\/ICCV.2017.89"},{"key":"12_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"433","DOI":"10.1007\/3-540-45869-7_47","volume-title":"Document Analysis Systems V","author":"AR Dengel","year":"2002","unstructured":"Dengel, A.R., Klein, B.: smartFIX: a requirements-driven system for document analysis and understanding. In: Lopresti, D., Hu, J., Kashi, R. (eds.) DAS 2002. LNCS, vol. 2423, pp. 433\u2013444. Springer, Heidelberg (2002). https:\/\/doi.org\/10.1007\/3-540-45869-7_47"},{"key":"12_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Esser, D., Schuster, D., Muthmann, K., Berger, M., Schill, A.: Automatic indexing of scanned documents: a layout-based approach. In: Document recognition and retrieval XIX, vol.\u00a08297, pp. 118\u2013125. SPIE (2012)","DOI":"10.1117\/12.908542"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Gori, M., Monfardini, G., Scarselli, F.: A new model for learning in graph domains. In: Proceedings. 2005 IEEE International Joint Conference on Neural Networks, vol.\u00a02, pp. 729\u2013734 (2005)","DOI":"10.1109\/IJCNN.2005.1555942"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Gui, T., Zou, Y., Zhang, Q., Peng, M., Fu, J., Wei, Z., Huang, X.J.: A lexicon-based graph neural network for chinese ner. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). pp. 1040\u20131050 (2019)","DOI":"10.18653\/v1\/D19-1096"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"12_CR14","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: Bros: a pre-trained language model for understanding texts in document (2020)"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: Layoutlmv3: pre-training for document ai with unified text and image masking. arXiv preprint arXiv:2204.08387 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Huang, Z., Chen, K., He, J., Bai, X., Karatzas, D., Lu, S., Jawahar, C.: Icdar2019 competition on scanned receipt ocr and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Hwang, W., Yim, J., Park, S., Yang, S., Seo, M.: Spatial dependency parsing for semi-structured document information extraction. arXiv preprint arXiv:2005.00642 (2020)","DOI":"10.18653\/v1\/2021.findings-acl.28"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.P.: Funsd: a dataset for form understanding in noisy scanned documents. In: 2019 International Conference on Document Analysis and Recognition Workshops (ICDARW), vol.\u00a02, pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E., Chen, X.: In defense of grid features for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10267\u201310276 (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Katti, A.R., Reisswig, C., Guder, C., Brarda, S., Bickel, S., H\u00f6hne, J., Faddoul, J.B.: Chargrid: Towards understanding 2d documents. arXiv preprint arXiv:1809.08799 (2018)","DOI":"10.18653\/v1\/D18-1476"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: a survey. ACM Computing Surveys (CSUR) (2021)","DOI":"10.1145\/3505244"},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-540-28640-0_43","volume-title":"Document Analysis Systems VI","author":"B Klein","year":"2004","unstructured":"Klein, B., Agne, S., Dengel, A.: Results of a Study on invoice-reading systems in Germany. In: Marinai, S., Dengel, A.R. (eds.) DAS 2004. LNCS, vol. 3163, pp. 451\u2013462. Springer, Heidelberg (2004). https:\/\/doi.org\/10.1007\/978-3-540-28640-0_43"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Lample, G., Ballesteros, M., Subramanian, S., Kawakami, K., Dyer, C.: Neural architectures for named entity recognition. arXiv preprint arXiv:1603.01360 (2016)","DOI":"10.18653\/v1\/N16-1030"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., Qian, Y., Yu, Y., Qin, X., Zhang, C., Liu, Y., Yao, K., Han, J., Liu, J., Ding, E.: Structext: Structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia. pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Liao, M., Wan, Z., Yao, C., Chen, K., Bai, X.: Real-time scene text detection with differentiable binarization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11474\u201311481 (2020)","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Liu, X., Gao, F., Zhang, Q., Zhao, H.: Graph convolution for multimodal information extraction from visually rich documents. arXiv preprint arXiv:1903.11279 (2019)","DOI":"10.18653\/v1\/N19-2005"},{"key":"12_CR27","doi-asserted-by":"crossref","unstructured":"Majumder, B., Potti, N., Tata, S., Wendt, J.B., Zhao, Q., Najork, M.: Representation learning for information extraction from form-like documents (2020)","DOI":"10.18653\/v1\/2020.acl-main.580"},{"key":"12_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"353","DOI":"10.1007\/978-3-030-86159-9_25","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021 Workshops","author":"T-AD Nguyen","year":"2021","unstructured":"Nguyen, T.-A.D., Vu, H.M., Son, N.H., Nguyen, M.-T.: A span extraction approach for information extraction on visually-rich documents. In: Barney Smith, E.H., Pal, U. (eds.) ICDAR 2021. LNCS, vol. 12917, pp. 353\u2013363. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86159-9_25"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Palm, R.B., Winther, O., Laws, F.: Cloudscan-a configuration-free invoice analysis system using recurrent neural networks. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 406\u2013413. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.74"},{"key":"12_CR30","unstructured":"Park, S., Shin, S., Lee, B., Lee, J., Surh, J., Seo, M., Lee, H.: Cord: a consolidated receipt dataset for post-ocr parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"12_CR31","unstructured":"Qian, Y., Santus, E., Jin, Z., Guo, J., Barzilay, R.: Graphie: a graph-based framework for information extraction. arXiv preprint arXiv:1810.13083 (2018)"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-bert: sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)","DOI":"10.18653\/v1\/D19-1410"},{"key":"12_CR33","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Sage, C., Aussem, A., Elghazel, H., Eglin, V., Espinas, J.: Recurrent neural network approach for table field extraction in business documents. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1308\u20131313. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00211"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Tang, G., et al.: Matchvie: exploiting match relevancy between entities for visual information extraction. arXiv preprint arXiv:2106.12940 (2021)","DOI":"10.24963\/ijcai.2021\/144"},{"key":"12_CR36","unstructured":"Veli\u010dkovi\u0107, P., Cucurull, G., Casanova, A., Romero, A., Lio, P., Bengio, Y.: Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Towards robust visual information extraction in real world: new dataset and novel solution. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 2738\u20132745 (2021)","DOI":"10.1609\/aaai.v35i4.16378"},{"key":"12_CR38","doi-asserted-by":"crossref","unstructured":"Wang, Z., Shang, J.: Towards few-shot entity recognition in document images: a label-aware sequence-to-sequence framework. arXiv preprint arXiv:2204.05819 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.329"},{"key":"12_CR39","doi-asserted-by":"crossref","unstructured":"Wei, M., He, Y., Zhang, Q.: Robust layout-aware IE for visually rich documents with pre-trained language models. In: Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 2367\u20132376 (2020)","DOI":"10.1145\/3397271.3401442"},{"key":"12_CR40","doi-asserted-by":"crossref","unstructured":"Xu, Y., et\u00a0al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"12_CR42","unstructured":"Xu, Y., et al.: Layoutxlm: multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836 (2021)"},{"key":"12_CR43","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Xfund: a benchmark dataset for multilingual visually rich form understanding. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 3214\u20133224 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Yu, W., Lu, N., Qi, X., Gong, P., Xiao, R.: Pick: processing key information extraction from documents using improved graph learning-convolutional networks. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 4363\u20134370. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412927"},{"key":"12_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: Trie: end-to-end text reading and information extraction for document understanding. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1413\u20131422 (2020)","DOI":"10.1145\/3394171.3413900"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70533-5_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:05:22Z","timestamp":1725685522000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70533-5_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705328","9783031705335"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70533-5_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"8 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2024.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}