{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T09:55:38Z","timestamp":1742982938451,"version":"3.40.3"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031705328"},{"type":"electronic","value":"9783031705335"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70533-5_15","type":"book-chapter","created":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:02:25Z","timestamp":1725685345000},"page":"244-261","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["One-Shot Transformer-Based Framework for\u00a0Visually-Rich Document Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8980-283X","authenticated-orcid":false,"given":"Huynh Vu","family":"The","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8867-9899","authenticated-orcid":false,"given":"Van Pham","family":"Hoai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6847-0075","authenticated-orcid":false,"given":"Jeff","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,8]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: Docformer: end-to-end transformer for document understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 993\u20131003 (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"15_CR2","first-page":"12612","volume":"35","author":"R Cao","year":"2021","unstructured":"Cao, R., Luo, P.: Extracting zero-shot structured information from form-like documents: pretraining with keys and triggers. Proc. AAAI Conf. Artif. Intell. 35, 12612\u201312620 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Carbonell, M., Riba, P., Villegas, M., Forn\u00e9s, A., Llad\u00f3s, J.: Named entity recognition and relation extraction with graph neural networks in semi structured documents. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 9622\u20139627. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412669"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Cheng, M., Qiu, M., Shi, X., Huang, J., Lin, W.: One-shot text field labeling using attention and belief propagation for structure information extraction. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 340\u2013348 (2020)","DOI":"10.1145\/3394171.3413511"},{"key":"15_CR5","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"d\u2019Andecy, V.P., Hartmann, E., Rusinol, M.: Field extraction by hybrid incremental and a-priori structural templates. In: 2018 13th IAPR International Workshop on Document Analysis Systems (DAS), pp. 251\u2013256. IEEE (2018)","DOI":"10.1109\/DAS.2018.29"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Deckert, F., Seidler, B., Ebbecke, M., Gillmann, M.: Table content understanding in smartfix. In: 2011 International Conference on Document Analysis and Recognition, pp. 488\u2013492. IEEE (2011)","DOI":"10.1109\/ICDAR.2011.104"},{"key":"15_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Gui, T., et al.: A lexicon-based graph neural network for Chinese NER. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 1040\u20131050 (2019)","DOI":"10.18653\/v1\/D19-1096"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Jaume, G., Hazim Kemal\u00a0Ekenel, J.P.T.: Funsd: a dataset for form understanding in noisy scanned documents. In: Accepted to ICDAR-OST (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: International Conference on Document Analysis and Recognition (ICDAR) (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"He, J., et al.: Icl-d3ie: in-context learning with diverse demonstrations updating for document information extraction. arXiv preprint arXiv:2303.05063 (2023)","DOI":"10.1109\/ICCV51070.2023.01785"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"15_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1007\/978-3-319-24261-3_7","volume-title":"Similarity-Based Pattern Recognition","author":"E Hoffer","year":"2015","unstructured":"Hoffer, E., Ailon, N.: Deep metric learning using triplet network. In: Feragen, A., Pelillo, M., Loog, M. (eds.) SIMBAD 2015. LNCS, vol. 9370, pp. 84\u201392. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24261-3_7"},{"key":"15_CR15","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: Bros: a pre-trained language model for understanding texts in document (2020)"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: Layoutlmv3: pre-training for document AI with unified text and image masking. arXiv preprint arXiv:2204.08387 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: Icdar2019 competition on scanned receipt OCR and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Hwang, W., Yim, J., Park, S., Yang, S., Seo, M.: Spatial dependency parsing for semi-structured document information extraction. arXiv preprint arXiv:2005.00642 (2020)","DOI":"10.18653\/v1\/2021.findings-acl.28"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Katti, A.R., et al.: Chargrid: towards understanding 2d documents. arXiv preprint arXiv:1809.08799 (2018)","DOI":"10.18653\/v1\/D18-1476"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: a survey. ACM Comput. Surv. 54(10s), 1\u201341 (2022)","DOI":"10.1145\/3505244"},{"key":"15_CR21","unstructured":"Lafferty, J., McCallum, A., Pereira, F.C.: Conditional random fields: probabilistic models for segmenting and labeling sequence data (2001)"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Lample, G., Ballesteros, M., Subramanian, S., Kawakami, K., Dyer, C.: Neural architectures for named entity recognition. arXiv preprint arXiv:1603.01360 (2016)","DOI":"10.18653\/v1\/N16-1030"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Structext: structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Liu, X., Gao, F., Zhang, Q., Zhao, H.: Graph convolution for multimodal information extraction from visually rich documents. arXiv preprint arXiv:1903.11279 (2019)","DOI":"10.18653\/v1\/N19-2005"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Palm, R.B., Winther, O., Laws, F.: Cloudscan-a configuration-free invoice analysis system using recurrent neural networks. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 406\u2013413. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.74"},{"key":"15_CR26","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Rusinol, M., Benkhelfallah, T., Poulain\u00a0dAndecy, V.: Field extraction from administrative documents by incremental structural templates. In: 2013 12th International Conference on Document Analysis and Recognition, pp. 1100\u20131104. IEEE (2013)","DOI":"10.1109\/ICDAR.2013.223"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Sage, C., Aussem, A., Elghazel, H., Eglin, V., Espinas, J.: Recurrent neural network approach for table field extraction in business documents. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1308\u20131313. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00211"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Schuster, D., et al.: Intellix\u2013end-user trained information extraction for document archiving. In: 2013 12th International Conference on Document Analysis and Recognition, pp. 101\u2013105. IEEE (2013)","DOI":"10.1109\/ICDAR.2013.28"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Simon, M., Rodner, E., Denzler, J.: Fine-grained classification of identity document types with only one example. In: 2015 14th IAPR International Conference on Machine Vision Applications (MVA), pp. 126\u2013129. IEEE (2015)","DOI":"10.1109\/MVA.2015.7153149"},{"key":"15_CR31","unstructured":"Sunder, V., Srinivasan, A., Vig, L., Shroff, G., Rahul, R.: One-shot information extraction from document images using neuro-deductive program synthesis. arXiv preprint arXiv:1906.02427 (2019)"},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Tang, G., et al.: Matchvie: exploiting match relevancy between entities for visual information extraction. arXiv preprint arXiv:2106.12940 (2021)","DOI":"10.24963\/ijcai.2021\/144"},{"key":"15_CR33","unstructured":"Veli\u010dkovi\u0107, P., Cucurull, G., Casanova, A., Romero, A., Lio, P., Bengio, Y.: Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)"},{"key":"15_CR34","first-page":"2738","volume":"35","author":"J Wang","year":"2021","unstructured":"Wang, J., et al.: Towards robust visual information extraction in real world: new dataset and novel solution. Proc. AAAI Conf. Artif. Intell. 35, 2738\u20132745 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Wei, M., He, Y., Zhang, Q.: Robust layout-aware IE for visually rich documents with pre-trained language models. In: Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 2367\u20132376 (2020)","DOI":"10.1145\/3397271.3401442"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Layoutlmv2: multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"15_CR38","unstructured":"Yao, M., Liu, Z., Wang, L., Li, H., Zhuang, L.: One-shot key information extraction from document with deep partial graph matching. arXiv preprint arXiv:2109.13967 (2021)"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: TRIE: end-to-end text reading and information extraction for document understanding. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1413\u20131422 (2020)","DOI":"10.1145\/3394171.3413900"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70533-5_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T05:06:16Z","timestamp":1725685576000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70533-5_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705328","9783031705335"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70533-5_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"8 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2024.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}