{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T22:04:54Z","timestamp":1777932294472,"version":"3.51.4"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030865481","type":"print"},{"value":"9783030865498","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86549-8_35","type":"book-chapter","created":{"date-parts":[[2021,9,4]],"date-time":"2021-09-04T02:05:57Z","timestamp":1630721157000},"page":"548-563","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":46,"title":["ViBERTgrid: A Jointly Trained Multi-modal 2D Document Representation for Key Information Extraction from Documents"],"prefix":"10.1007","author":[{"given":"Weihong","family":"Lin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qifang","family":"Gao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhuoyao","family":"Zhong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qin","family":"Ren","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Huo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,9,2]]},"reference":[{"key":"35_CR1","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: ICDAR 2019 competition on scanned receipt ocr and information extraction. In: ICDAR, pp. 1516\u20131520 (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"35_CR2","doi-asserted-by":"crossref","unstructured":"Lample, G., Ballesteros, M., Subramanian, S., Kawakami, K., Dyer, C.: Neural architectures for named entity recognition. In: NAACL, pp. 260\u2013270 (2016)","DOI":"10.18653\/v1\/N16-1030"},{"key":"35_CR3","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1162\/tacl_a_00104","volume":"4","author":"JP Chiu","year":"2016","unstructured":"Chiu, J.P., Nichols, E.: Named entity recognition with bidirectional LSTM-CNNs. TACL 4, 357\u2013370 (2016)","journal-title":"TACL"},{"key":"35_CR4","doi-asserted-by":"crossref","unstructured":"Ma, X., Hovy, E.: End-to-end sequence labeling via Bi-directional LSTM-CNNs-CRF. In: ACL, pp. 1064\u20131074 (2016)","DOI":"10.18653\/v1\/P16-1101"},{"key":"35_CR5","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp. 4171\u20134186 (2019)"},{"key":"35_CR6","unstructured":"Liu, Y., et al.: RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"35_CR7","doi-asserted-by":"crossref","unstructured":"Palm, R.B., Winther, O., Laws, F.: CloudScan - a configuration-free invoice analysis system using recurrent neural networks. In: ICDAR, pp. 406\u2013413 (2017)","DOI":"10.1109\/ICDAR.2017.74"},{"key":"35_CR8","doi-asserted-by":"crossref","unstructured":"Sage, C., Aussem, A., Elghazel, H., Eglin, V., Espinas, J.: Recurrent neural network approach for table field extraction in business documents. In: ICDAR, pp. 1308\u20131313 (2019)","DOI":"10.1109\/ICDAR.2019.00211"},{"key":"35_CR9","unstructured":"Hwang, W., et al.: Post-OCR parsing: building simple and robust parser via BIO tagging. In: Workshop on Document Intelligence at NeurIPS (2019)"},{"key":"35_CR10","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: SIGKDD, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"35_CR11","unstructured":"Garncarek, \u0141., et al.: LAMBERT: layout-aware (language) modeling for information extraction. arXiv preprint arXiv:2002.08087 (2020)"},{"key":"35_CR12","unstructured":"Pramanik, S., Mujumdar, S., Patel, H.: Towards a multi-modal, multi-task learning based pre-training framework for document representation learning. arXiv preprint arXiv:2009.14457 (2020)"},{"key":"35_CR13","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"35_CR14","unstructured":"Yu, W., Lu, N., Qi, X., Gong, P., Xiao, R.: PICK: processing key information extraction from documents using improved graph learning-convolutional networks. In: ICPR (2020)"},{"key":"35_CR15","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 6000\u20136010 (2017)"},{"key":"35_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"144","DOI":"10.1007\/978-3-030-21074-8_12","volume-title":"Computer Vision \u2013 ACCV 2018 Workshops","author":"D Lohani","year":"2019","unstructured":"Lohani, D., Bela\u00efd, A., Bela\u00efd, Y.: An invoice reading system using a graph convolutional network. In: Carneiro, G., You, S. (eds.) ACCV 2018. LNCS, vol. 11367, pp. 144\u2013158. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-21074-8_12"},{"key":"35_CR17","doi-asserted-by":"crossref","unstructured":"Qian, Y., Santus, E., Jin, Z., Guo, J., Barzilay, R.: GraphIE: a graph-based framework for information extraction. In: NAACL, pp. 751\u2013761 (2019)","DOI":"10.18653\/v1\/N19-1082"},{"key":"35_CR18","doi-asserted-by":"crossref","unstructured":"Liu, X., Gao, F., Zhang, Q., Zhao, H.: Graph convolution for multimodal information extraction from visually rich documents. In: NAACL, pp. 32\u201339 (2019)","DOI":"10.18653\/v1\/N19-2005"},{"key":"35_CR19","doi-asserted-by":"crossref","unstructured":"Wei, M., He, Y., Zhang, Q.: Robust layout-aware IE for visually rich documents with pre-trained language models. In: ACM SIGIR, pp. 2367\u20132376 (2020)","DOI":"10.1145\/3397271.3401442"},{"key":"35_CR20","doi-asserted-by":"crossref","unstructured":"Luo, C., Wang, Y., Zheng, Q., Li, L., Gao, F., Zhang, S.: Merge and recognize: a geometry and 2D context aware graph model for named entity recognition from visual documents. In: TextGraphs Workshop at COLING, pp. 24\u201334 (2020)","DOI":"10.18653\/v1\/2020.textgraphs-1.3"},{"key":"35_CR21","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: TRIE: end-to-end text reading and information extraction for document understanding. In: ACM Multimedia, pp. 1413\u20131422 (2020)","DOI":"10.1145\/3394171.3413900"},{"key":"35_CR22","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Towards robust visual information extraction in real world: new dataset and novel solution. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i4.16378"},{"key":"35_CR23","doi-asserted-by":"crossref","unstructured":"Katti, A.R., et al.: Chargrid: towards understanding 2D documents. In: EMNLP, pp. 4459\u20134469 (2018)","DOI":"10.18653\/v1\/D18-1476"},{"key":"35_CR24","unstructured":"Dang, T.N., Thanh, D.N.: End-to-End information extraction by character-level embedding and multi-stage attentional U-Net. In: BMVC (2019)"},{"key":"35_CR25","unstructured":"Zhao, X., Niu, E., Wu, Z., Wang, X.: CUTIE: learning to understand documents with convolutional universal text information extractor. arXiv preprint arXiv:1903.12363 (2019)"},{"key":"35_CR26","unstructured":"Denk, T.I., Reisswig, C.: BERTgrid: contextualized embedding for 2D document representation and understanding. In: Document Intelligence Workshop at NeurIPS (2019)"},{"key":"35_CR27","doi-asserted-by":"crossref","unstructured":"Kerroumi, M., Sayem, O., Shabou, A.: VisualWordGrid: information extraction from scanned documents using a multimodal approach. arXiv preprint arXiv:2010.02358 (2020)","DOI":"10.1007\/978-3-030-86159-9_28"},{"key":"35_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"433","DOI":"10.1007\/3-540-45869-7_47","volume-title":"Document Analysis Systems V","author":"AR Dengel","year":"2002","unstructured":"Dengel, A.R., Klein, B.: SmartFIX: a requirements-driven system for document analysis and understanding. In: Lopresti, D., Hu, J., Kashi, R. (eds.) DAS 2002. LNCS, vol. 2423, pp. 433\u2013444. Springer, Heidelberg (2002). https:\/\/doi.org\/10.1007\/3-540-45869-7_47"},{"key":"35_CR29","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1007\/s10032-002-0084-6","volume":"6","author":"F Cesarini","year":"2003","unstructured":"Cesarini, F., Francesconi, E., Gori, M., Soda, G.: Analysis and understanding of multi-class invoices. IJDAR 6, 102\u2013114 (2003)","journal-title":"IJDAR"},{"key":"35_CR30","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10032-010-0137-1","volume":"14","author":"E Medvet","year":"2011","unstructured":"Medvet, E., Bartoli, A., Davanzo, G.: A probabilistic approach to printed document understanding. IJDAR 14, 335\u2013347 (2011)","journal-title":"IJDAR"},{"key":"35_CR31","doi-asserted-by":"crossref","unstructured":"Esser, D., Schuster, D., Muthmann, K., Berger, M., Schill, A.: Automatic indexing of scanned documents - a layout-based approach. In: DRR (2012)","DOI":"10.1117\/12.908542"},{"key":"35_CR32","doi-asserted-by":"crossref","unstructured":"Schuster, D., et al.: Intellix -- End-User trained information extraction for document archiving. In: ICDAR, pp. 101\u2013105 (2013)","DOI":"10.1109\/ICDAR.2013.28"},{"key":"35_CR33","doi-asserted-by":"crossref","unstructured":"Rusinol, M., Benkhelfallah, T., Poulain d'Andecy, V.: Field extraction from administrative documents by incremental structural templates. In: ICDAR, pp. 1100\u20131104 (2013)","DOI":"10.1109\/ICDAR.2013.223"},{"key":"35_CR34","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1016\/j.patrec.2020.05.001","volume":"136","author":"M Carbonell","year":"2020","unstructured":"Carbonell, M., Forn\u00e9s, A., Villegas, M., Llad\u00f3s, J.: A neural model for text localization, transcription and named entity recognition in full pages. Pattern Recogn. Lett. 136, 219\u2013227 (2020)","journal-title":"Pattern Recogn. Lett."},{"key":"35_CR35","doi-asserted-by":"crossref","unstructured":"Majumder, B.P., Potti, N., Tata, S., Wendt, J.B., Zhao, Q., Najork, M.: Representation learning for information extraction from form-like documents. In: ACL, pp. 6495\u20136504 (2020)","DOI":"10.18653\/v1\/2020.acl-main.580"},{"key":"35_CR36","doi-asserted-by":"crossref","unstructured":"Hwang, W., Yim, J., Park, S., Yang, S., Seo, M.: Spatial dependency parsing for semi-structured document information extraction. arXiv preprint arXiv:2005.00642 (2020)","DOI":"10.18653\/v1\/2021.findings-acl.28"},{"key":"35_CR37","unstructured":"Teakgyu Hong, D.K.M.J., Hwang, W., Nam, D., Park, S.: BROS: a pre-trained language model for understanding texts in document. In: Submitted to ICLR (2021)"},{"key":"35_CR38","doi-asserted-by":"crossref","unstructured":"Palm, R.B., Laws, F., Winther, O.: Attend, copy, parse - End-to-end information extraction from documents. In: ICDAR, pp. 329\u2013336 (2019)","DOI":"10.1109\/ICDAR.2019.00060"},{"key":"35_CR39","doi-asserted-by":"crossref","unstructured":"Guo, H., Qin, X., Liu, J., Han, J., Liu, J., Ding, E.: EATEN: entity-aware attention for single shot visual text extraction. In: ICDAR, pp. 254\u2013259 (2019)","DOI":"10.1109\/ICDAR.2019.00049"},{"key":"35_CR40","doi-asserted-by":"crossref","unstructured":"Sage, C., Aussem, A., Eglin, V., Elghazel, H., Espinas, J.: End-to-End extraction of structured information from business documents with pointer-generator networks. In: SPNLP Workshop at EMNLP, pp. 43\u201352 (2020)","DOI":"10.18653\/v1\/2020.spnlp-1.6"},{"key":"35_CR41","doi-asserted-by":"crossref","unstructured":"Yang, X., Yumer, E., Asente, P., Kraley, M., Kifer, D., Giles, C.L.: Learning to extract semantic structure from documents using multimodal fully convolutional neural network. In: CVPR, pp. 4342\u20134351 (2017)","DOI":"10.1109\/CVPR.2017.462"},{"key":"35_CR42","doi-asserted-by":"crossref","unstructured":"Barman, R., Ehrmann, M., Clematide, S., Oliveira, S.A., Kaplan, F.: Combining visual and textual features for semantic segmentation of historical newspapers. J. Data Min. Digital Humanit. HistoInformatics, jdmdh:7097 (2021)","DOI":"10.46298\/jdmdh.6107"},{"key":"35_CR43","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: CVPR, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"35_CR44","doi-asserted-by":"crossref","unstructured":"He, T., Zhang, Z., Zhang, H., Zhang, Z., Xie, J., Li, M.: Bag of tricks for image classification with convolutional neural networks. In: CVPR, pp. 558\u2013567 (2019)","DOI":"10.1109\/CVPR.2019.00065"},{"key":"35_CR45","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Jian, S.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"35_CR46","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: ICCV, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"35_CR47","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"35_CR48","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"35_CR49","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Gupta, A., Girshick, R.: Training region-based object detectors with online hard example mining. In: CVPR, pp. 761\u2013769 (2016)","DOI":"10.1109\/CVPR.2016.89"},{"key":"35_CR50","unstructured":"Turc, I., Chang, M.-W., Lee, K., Toutanova, K.: Well-read students learn better: on the importance of pre-training compact models. arXiv preprint arXiv:1908.08962 (2019)"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2021"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86549-8_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T22:04:35Z","timestamp":1756937075000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86549-8_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030865481","9783030865498"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86549-8_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"2 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lausanne","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iapr.org\/icdar2021","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"340","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"182","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.9","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.9","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Additionally, 13 competition reports are included.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}