{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T18:06:33Z","timestamp":1763661993630,"version":"3.44.0"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030865481"},{"type":"electronic","value":"9783030865498"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86549-8_8","type":"book-chapter","created":{"date-parts":[[2021,9,4]],"date-time":"2021-09-04T02:05:57Z","timestamp":1630721157000},"page":"115-130","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":52,"title":["VSR: A Unified Framework for Document Layout Analysis Combining Vision, Semantics and Relations"],"prefix":"10.1007","author":[{"given":"Peng","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Can","family":"Li","sequence":"additional","affiliation":[]},{"given":"Liang","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Zhanzhan","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Shiliang","family":"Pu","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Niu","sequence":"additional","affiliation":[]},{"given":"Fei","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,2]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Aggarwal, M., Sarkar, M., Gupta, H., Krishnamurthy, B.: Multi-modal association based grouping for form structure extraction. In: WACV, pp. 2064\u20132073 (2020)","DOI":"10.1109\/WACV45572.2020.9093376"},{"issue":"2","key":"8_CR2","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltrusaitis","year":"2019","unstructured":"Baltrusaitis, T., Ahuja, C., Morency, L.: Multimodal machine learning: a survey and taxonomy. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 423\u2013443 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"8_CR3","doi-asserted-by":"crossref","unstructured":"Barman, R., Ehrmann, M., Clematide, S., Oliveira, S.A., Kaplan, F.: Combining visual and textual features for semantic segmentation of historical newspapers. CoRR https:\/\/arxiv.org\/abs\/2002.06144 (2020)","DOI":"10.46298\/jdmdh.6107"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"BinMakhashen, G.M., Mahmoud, S.A.: Document layout analysis: a comprehensive survey. ACM Comput. Surv. 52(6), 109:1\u2013109:36 (2020)","DOI":"10.1145\/3355610"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Chen, K., Seuret, M., Liwicki, M., Hennebert, J., Ingold, R.: Page segmentation of historical document images with convolutional autoencoders. In: ICDAR, pp. 
1011\u20131015 (2015)","DOI":"10.1109\/ICDAR.2015.7333914"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Conway, A.: Page grammars and page parsing. A syntactic approach to document layout recognition. In: ICDAR, pp. 761\u2013764 (1993)","DOI":"10.1109\/ICDAR.1993.395626"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Corbelli, A., Baraldi, L., Grana, C., Cucchiara, R.: Historical document digitization through layout analysis and deep content classification. In: ICPR, pp. 4077\u20134082 (2016)","DOI":"10.1109\/ICPR.2016.7900272"},{"key":"8_CR8","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: CVPR, pp. 1933\u20131941 (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"8_CR10","doi-asserted-by":"crossref","unstructured":"Gatos, B., Louloudis, G., Stamatopoulos, N.: Segmentation of historical handwritten documents into text zones and text lines. In: ICFHR, pp. 464\u2013469 (2014)","DOI":"10.1109\/ICFHR.2014.84"},{"issue":"11","key":"8_CR11","doi-asserted-by":"publisher","first-page":"3171","DOI":"10.1109\/TCYB.2017.2761775","volume":"48","author":"J Han","year":"2018","unstructured":"Han, J., Chen, H., Liu, N., Yan, C., Li, X.: CNNs-based RGB-D saliency detection via cross-view transfer and multiview fusion. IEEE Trans. Cybern. 48(11), 3171\u20133183 (2018)","journal-title":"IEEE Trans. Cybern."},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"He, D., Cohen, S., Price, B.L., Kifer, D., Giles, C.L.: Multi-scale multi-task FCN for semantic page segmentation and table detection. In: ICDAR, pp. 254\u2013261 (2017)","DOI":"10.1109\/ICDAR.2017.50"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.B.: Mask R-CNN. In: ICCV, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"8_CR15","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Hu, H., Gu, J., Zhang, Z., Dai, J., Wei, Y.: Relation networks for object detection. In: CVPR, pp. 3588\u20133597 (2018)","DOI":"10.1109\/CVPR.2018.00378"},{"issue":"7","key":"8_CR17","doi-asserted-by":"publisher","first-page":"737","DOI":"10.1109\/34.221173","volume":"15","author":"MS Krishnamoorthy","year":"1993","unstructured":"Krishnamoorthy, M.S., Nagy, G., Seth, S.C., Viswanathan, M.: Syntactic segmentation and labeling of digitized pages from technical journals. IEEE Trans. Pattern Anal. Mach. Intell. 15(7), 737\u2013747 (1993)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Lee, J., Hayashi, H., Ohyama, W., Uchida, S.: Page segmentation using a convolutional neural network with trainable co-occurrence features. In: ICDAR, pp. 
1023\u20131028 (2019)","DOI":"10.1109\/ICDAR.2019.00167"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Cross-domain document object detection: benchmark suite and method. In: CVPR, pp. 12912\u201312921 (2020)","DOI":"10.1109\/CVPR42600.2020.01293"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: Docbank: a benchmark dataset for document layout analysis. In: COLING, pp. 949\u2013960 (2020)","DOI":"10.18653\/v1\/2020.coling-main.82"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, F., Xue, T., Liu, L., Ogier, J., Liu, C.: Instance aware document image segmentation using label pyramid networks and deep watershed transformation. In: ICDAR, pp. 514\u2013519 (2019)","DOI":"10.1109\/ICDAR.2019.00088"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: Feature pyramid networks for object detection. In: CVPR, pp. 936\u2013944 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Lin, T., RoyChowdhury, A., Maji, S.: Bilinear CNN models for fine-grained visual recognition. In: ICCV, pp. 1449\u20131457 (2015)","DOI":"10.1109\/ICCV.2015.170"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Liu, X., Gao, F., Zhang, Q., Zhao, H.: Graph convolution for multimodal information extraction from visually rich documents. In: NAACL-HLT, pp. 32\u201339 (2019)","DOI":"10.18653\/v1\/N19-2005"},{"key":"8_CR25","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized BERT pretraining approach. CoRR https:\/\/arxiv.org\/abs\/1907.11692 (2019)"},{"key":"8_CR26","unstructured":"Ren, S., He, K., Girshick, R.B., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NeurIPS, pp. 91\u201399 (2015)"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Shilman, M., Liang, P., Viola, P.A.: Learning non-generative grammatical models for document analysis. In: ICCV, pp. 962\u2013969 (2005)","DOI":"10.1109\/ICCV.2005.140"},{"key":"8_CR28","unstructured":"Shinyama, Y.: Pdfminer: python pdf parser and analyzer. Retrieved on 11 (2015)"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Siegel, N., Lourie, N., Power, R., Ammar, W.: Extracting scientific figures with distantly supervised neural networks. In: JCDL, pp. 223\u2013232 (2018)","DOI":"10.1145\/3197026.3197040"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Smith, R.: An overview of the tesseract OCR engine. In: ICDAR, pp. 629\u2013633 (2007)","DOI":"10.1109\/ICDAR.2007.4376991"},{"key":"8_CR31","doi-asserted-by":"crossref","unstructured":"Soto, C., Yoo, S.: Visual detection with context for document layout analysis. In: EMNLP-IJCNLP, pp. 3462\u20133468 (2019)","DOI":"10.18653\/v1\/D19-1348"},{"key":"8_CR32","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 5998\u20136008 (2017)"},{"key":"8_CR33","unstructured":"Velickovic, P., Cucurull, G., Casanova, A., Romero, A., Li\u00f2, P., Bengio, Y.: Graph attention networks. In: ICLR (2018)"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Vo, Q.N., Lee, G.: Dense prediction for text line segmentation in handwritten document images. In: ICIP, pp. 3264\u20133268 (2016)","DOI":"10.1109\/ICIP.2016.7532963"},{"key":"8_CR35","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R.B., Gupta, A., He, K.: Non-local neural networks. In: CVPR, pp. 
7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Wick, C., Puppe, F.: Fully convolutional neural networks for page segmentation of historical document images. In: DAS, pp. 287\u2013292 (2018)","DOI":"10.1109\/DAS.2018.39"},{"key":"8_CR37","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R.B., Doll\u00e1r P., He, K.: Aggregated residual transformations for deep neural networks. In: CVPR, pp. 5987\u20135995 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: pre-training of text and layout for document image understanding. In: KDD, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"Yang, X., Yumer, E., Asente, P., Kraley, M., Kifer, D., Giles, C.L.: Learning to extract semantic structure from documents using multimodal fully convolutional neural networks. In: CVPR, pp. 4342\u20134351 (2017)","DOI":"10.1109\/CVPR.2017.462"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Yu, W., Lu, N., Qi, X., Gong, P., Xiao, R.: PICK: processing key information extraction from documents using improved graph learning-convolutional networks. In: ICPR, pp. 4363\u20134370 (2020)","DOI":"10.1109\/ICPR48806.2021.9412927"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Zagoris, K., Pratikakis, I., Gatos, B.: Segmentation-based historical handwritten word spotting using document-specific local features. In: ICFHR, pp. 9\u201314 (2014)","DOI":"10.1109\/ICFHR.2014.10"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: TRIE: end-to-end text reading and information extraction for document understanding. In: MM, pp. 1413\u20131422 (2020)","DOI":"10.1145\/3394171.3413900"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Zhong, X., Tang, J., Jimeno-Yepes, A.: Publaynet: largest dataset ever for document layout analysis. In: ICDAR, pp. 
1015\u20131022 (2019)","DOI":"10.1109\/ICDAR.2019.00166"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2021"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86549-8_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T22:04:24Z","timestamp":1756937064000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86549-8_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030865481","9783030865498"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86549-8_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"2 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lausanne","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iapr.org\/icdar2021","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"340","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"182","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers 
Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.9","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.9","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Additionally, 13 competition reports are included.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}