{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T15:54:29Z","timestamp":1774540469677,"version":"3.50.1"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T00:00:00Z","timestamp":1743033600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T00:00:00Z","timestamp":1743033600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61936003"],"award-info":[{"award-number":["61936003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61936003"],"award-info":[{"award-number":["61936003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61936003"],"award-info":[{"award-number":["61936003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["IJDAR"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s10032-025-00518-w","type":"journal-article","created":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T10:45:46Z","timestamp":1743331546000},"page":"669-680","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Bi-VLDoc: bidirectional vision-language modeling for visually-rich document understanding"],"prefix":"10.1007","volume":"28","author":[{"given":"Chuwei","family":"Luo","sequence":"first","affiliation":[]},{"given":"Guozhi","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Cong","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Chenliang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Luo","family":"Si","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,27]]},"reference":[{"key":"518_CR1","doi-asserted-by":"crossref","unstructured":"Li, P., Gu, J., Kuen, J.: SelfDoc: self-supervised document representation learning. In: CVPR, pp. 5652\u20135660 (2021)","DOI":"10.1109\/CVPR46437.2021.00560"},{"key":"518_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U.: DocFormer: end-to-end transformer for document understanding. In: ICCV, pp. 4171\u20134186 (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"518_CR3","doi-asserted-by":"crossref","unstructured":"Xu, Y., Xu, Y., Lv, T., Cui, L.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: ACL (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"518_CR4","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: Layoutlmv3: pre-training for document ai with unified text and image masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"518_CR5","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S.: LayoutLM: pre-training of text and layout for document image understanding. In: KDD, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"518_CR6","unstructured":"Li, C., Bi, B., Yan, M.: StructuralLM: structural pre-training for form understanding. In: ACL (2021)"},{"key":"518_CR7","doi-asserted-by":"crossref","unstructured":"Qian, Y., Santus, E., Jin, Z.: GraphIE: a graph-based framework for information extraction. In: NAACL-HLT (1), pp. 751\u2013761 (2019)","DOI":"10.18653\/v1\/N19-1082"},{"key":"518_CR8","doi-asserted-by":"crossref","unstructured":"Liu, X., Gao, F., Zhang, Q.: Graph convolution for multimodal information extraction from visually rich documents. In: NAACL-HLT (1), pp. 32\u201339 (2019)","DOI":"10.18653\/v1\/N19-2005"},{"key":"518_CR9","doi-asserted-by":"crossref","unstructured":"Luo, C., Wang, Y., Zheng, Q.: Merge and recognize: a geometry and 2d context aware graph model for named entity recognition from visual documents. In: TextGraphs, pp. 24\u201334 (2020)","DOI":"10.18653\/v1\/2020.textgraphs-1.3"},{"key":"518_CR10","doi-asserted-by":"crossref","unstructured":"Li, L., Gao, F., Bu, J., Wang, Y.: An end-to-end ocr text re-organization sequence learning for rich-text detail image comprehension. In: ECCV, pp. 85\u2013100 (2020)","DOI":"10.1007\/978-3-030-58595-2_6"},{"key":"518_CR11","doi-asserted-by":"crossref","unstructured":"Yu, W., Lu, N., Qi, X., Gong, P., Xiao, R.: Pick: processing key information extraction from documents using improved graph learning-convolutional networks. In: ICPR (2021)","DOI":"10.1109\/ICPR48806.2021.9412927"},{"key":"518_CR12","doi-asserted-by":"crossref","unstructured":"Tang, G., Xie, L., Jin, L., Wang: MatchVIE: exploiting match relevancy between entities for visual information extraction. In: IJCAI (2021)","DOI":"10.24963\/ijcai.2021\/144"},{"key":"518_CR13","doi-asserted-by":"crossref","unstructured":"Yang, X., Yumer, E., Asente, P.: Learning to extract semantic structure from documents using multimodal fully convolutional neural networks. In: CVPR, pp. 5315\u20135324 (2017)","DOI":"10.1109\/CVPR.2017.462"},{"key":"518_CR14","doi-asserted-by":"crossref","unstructured":"Katti, A.R., Reisswig, C., Guder, C., Brarda, S.: Chargrid: towards understanding 2d documents. In: EMNLP, pp. 4459\u20134469 (2018)","DOI":"10.18653\/v1\/D18-1476"},{"key":"518_CR15","unstructured":"Denk, T.I., Reisswig, C.: BERTgrid: contextualized embedding for 2d document representation and understanding. In: NIPS (2019)"},{"key":"518_CR16","unstructured":"Devlin, J., Chang, M.-W., Lee, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (1), pp. 4171\u20134186 (2019)"},{"key":"518_CR17","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J.: Roberta: a robustly optimized bert pretraining approach. In: arXiv (2019)"},{"key":"518_CR18","unstructured":"Bao, H., Dong, L., Wei, F.: Unilmv2: pseudo-masked language models for unified language model pre-training. In: ICML, pp. 642\u2013652 (2020)"},{"key":"518_CR19","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS (2019)"},{"key":"518_CR20","unstructured":"Su, W., Zhu, X., Cao, Y., Li, B., Lu, L., Wei, F., Dai, J.: VL-BERT: pre-training of generic visual-linguistic representations. In: ICLR (2020)"},{"key":"518_CR21","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., El Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: UNITER: learning universal image-text representations. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"518_CR22","doi-asserted-by":"crossref","unstructured":"Li, W., Gao, C., Niu, G., Xiao, X., Liu, H., Liu, J., Wu, H., Wang, H.: UNIMO: towards unified-modal understanding and generation via cross-modal contrastive learning. In: ACL (2021)","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"518_CR23","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F., : Oscar: object-semantics aligned pre-training for vision-language tasks. In: ECCV, pp. 121\u2013137 (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"518_CR24","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q.V., Sung, Y., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"518_CR25","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"518_CR26","doi-asserted-by":"crossref","unstructured":"Peng, Q., Pan, Y., Wang, W., Luo, B., Zhang, Z., Huang, Z., Hu, T., Yin, W., Chen, Y., Zhang, Y., et al.: ERNIE-Layout: layout knowledge enhanced pre-training for visually-rich document understanding. EMNLP 2022 findings (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.274"},{"key":"518_CR27","doi-asserted-by":"crossref","unstructured":"Wang, W., Huang, Z., Luo, B., Chen, Q., Peng, Q., Pan, Y., Yin, W., Feng, S., Sun, Y., Yu, D., : ERNIE-mmLayout: multi-grained multimodal transformer for document understanding. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548406"},{"key":"518_CR28","first-page":"39","volume":"34","author":"J Gu","year":"2021","unstructured":"Gu, J., Kuen, J., Morariu, V.I., Zhao, H., Jain, R., Barmpalios, N., Nenkova, A., Sun, T.: Unidoc: unified pretraining framework for document understanding. Adv. Neural. Inf. Process. Syst. 34, 39\u201350 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"518_CR29","doi-asserted-by":"crossref","unstructured":"Jain, R., Wigington, C.: Multimodal document image classification. In: ICDAR, pp. 71\u201377 (2019)","DOI":"10.1109\/ICDAR.2019.00021"},{"key":"518_CR30","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Wang, P., Liu, W.: Distance-iou loss: faster and better learning for bounding box regression. In: AAAI, pp. 12993\u201313000 (2020)","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"518_CR31","doi-asserted-by":"crossref","unstructured":"Lewis, D., Agam, G., Argamon, S.: Building a test collection for complex document information processing. In: ACM SIGIR, pp. 665\u2013666 (2006)","DOI":"10.1145\/1148170.1148307"},{"key":"518_CR32","first-page":"1","volume":"2","author":"G Jaume","year":"2019","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.-P.: Funsd: a dataset for form understanding in noisy scanned documents. ICDARW 2, 1\u20136 (2019)","journal-title":"ICDARW"},{"key":"518_CR33","unstructured":"Park, S., Shin, S., Lee, B., Lee, J.: CORD: a consolidated receipt dataset for post-ocr parsing. In: NIPS (2019)"},{"key":"518_CR34","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: ICDAR, pp. 991\u2013995 (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"518_CR35","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for vqa on document images. In: WACV, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"518_CR36","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W.: BROS: a pre-trained language model for understanding texts in document. In arXiv (2020)"},{"key":"518_CR37","first-page":"883","volume":"1","author":"MZ Afzal","year":"2017","unstructured":"Afzal, M.Z., K\u00f6lsch, A.: Ahmed: cutting the error by half: Investigation of very deep cnn and advanced training strategies for document image classification. ICDAR 1, 883\u2013888 (2017)","journal-title":"ICDAR"},{"key":"518_CR38","doi-asserted-by":"crossref","unstructured":"Das, A., Roy, S., Bhattacharya, U., Parui, S.K.: Document image classification with intra-domain transfer learning and stacked generalization of deep convolutional neural networks. In: ICPR, pp. 3180\u20133185 (2018). IEEE","DOI":"10.1109\/ICPR.2018.8545630"},{"key":"518_CR39","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V.: Inception-v4, inception-resnet and the impact of residual connections on learning. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"518_CR40","doi-asserted-by":"crossref","unstructured":"Sarkhel, R., Nandi, A.: Deterministic routing between layout abstractions for multi-scale classification of visually rich documents. In: IJCAI (2019)","DOI":"10.24963\/ijcai.2019\/466"},{"key":"518_CR41","unstructured":"Dauphinee, T., Patel, N., Rashidi, M.: Modular multimodal architecture for document classification. In arXiv (2019)"},{"key":"518_CR42","doi-asserted-by":"crossref","unstructured":"Powalski, R., Borchmann, \u0141., Jurkiewicz, D., Dwojak, T., Pietruszka, M., Pa\u0142ka, G.: Going full-tilt boogie on document understanding with text-image-layout transformer. In: Document Analysis and Recognition\u2013ICDAR 2021: 16th International Conference, Lausanne, Switzerland, September 5\u201310, 2021, Proceedings, Part II 16, pp. 732\u2013747 (2021). Springer","DOI":"10.1007\/978-3-030-86331-9_47"},{"key":"518_CR43","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: ICCV, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"518_CR44","doi-asserted-by":"crossref","unstructured":"Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: ICDAR, pp. 1015\u20131022 (2019)","DOI":"10.1109\/ICDAR.2019.00166"},{"key":"518_CR45","doi-asserted-by":"crossref","unstructured":"\u0160imsa, \u0160., \u0160ulc, M., U\u0159i\u010d\u00e1\u0159, M., Patel, Y., Hamdi, A., Koci\u00e1n, M., Skalick\u1ef3, M., Matas, J., Doucet, A., Coustaty, M., Karatzas, D.: DocILE Benchmark for Document Information Localization and Extraction (2023). arXiv:2302.05658","DOI":"10.1007\/978-3-031-41679-8_9"}],"container-title":["International Journal on Document Analysis and Recognition (IJDAR)"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-025-00518-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10032-025-00518-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-025-00518-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T06:13:19Z","timestamp":1764310399000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10032-025-00518-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,27]]},"references-count":45,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["518"],"URL":"https:\/\/doi.org\/10.1007\/s10032-025-00518-w","relation":{},"ISSN":["1433-2833","1433-2825"],"issn-type":[{"value":"1433-2833","type":"print"},{"value":"1433-2825","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,27]]},"assertion":[{"value":"1 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 February 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}