{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:36:38Z","timestamp":1773246998406,"version":"3.50.1"},"publisher-location":"Cham","reference-count":56,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729942","type":"print"},{"value":"9783031729959","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72995-9_27","type":"book-chapter","created":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T19:17:10Z","timestamp":1732389430000},"page":"474-491","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Textual Grounding for\u00a0Open-Vocabulary Visual Information Extraction in\u00a0Layout-Diversified Documents"],"prefix":"10.1007","author":[{"given":"Mengjun","family":"Cheng","sequence":"first","affiliation":[]},{"given":"Chengquan","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6747-0646","authenticated-orcid":false,"given":"Chang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yuke","family":"Li","sequence":"additional","affiliation":[]},{"given":"Bohan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Kun","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Xiawu","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,24]]},"reference":[{"key":"27_CR1","unstructured":"Alayrac, J., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, NeurIPS 2022 (2022)"},{"key":"27_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: DocFormer: end-to-end transformer for document understanding. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2021, pp. 973\u2013983. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"27_CR3","unstructured":"Bai, J., et al.: Qwen technical report. CoRR abs\/2309.16609 (2023)"},{"key":"27_CR4","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. CoRR abs\/2308.12966 (2023)"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Cao, R., Luo, P.: Extracting zero-shot structured information from form-like documents: pretraining with keys and triggers. In: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI 2021, pp. 12612\u201312620. AAAI Press (2021)","DOI":"10.1609\/aaai.v35i14.17494"},{"key":"27_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, H., et al.: $$\\text{M}^{6}$$Doc: a large-scale multi-format, multi-type, multi-layout, multi-language, multi-annotation category dataset for modern document layout analysis. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, pp. 15138\u201315147. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.01453"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: TransVG: end-to-end visual grounding with transformers. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2021, pp. 1749\u20131759. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"issue":"11","key":"27_CR8","doi-asserted-by":"publisher","first-page":"13636","DOI":"10.1109\/TPAMI.2023.3296823","volume":"45","author":"J Deng","year":"2023","unstructured":"Deng, J., et al.: TransVG++: end-to-end visual grounding with language conditioned vision transformer. IEEE Trans. Pattern Anal. Mach. Intell. 45(11), 13636\u201313652 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"27_CR9","unstructured":"Gentile, A.L., Zhang, Z., Ciravegna, F.: Web scale information extraction with LODIE. In: 2013 AAAI Fall Symposia Series. AAAI Press (2013)"},{"key":"27_CR10","unstructured":"Gu, X., Lin, T., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. In: The Tenth International Conference on Learning Representations, ICLR 2022. OpenReview.net (2022)"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: LayoutLMv3: pre-training for document AI with unified text and image masking. In: Magalh\u00e3es, J., et al. (eds.) Proceedings of the 30th ACM International Conference on Multimedia, pp. 4083\u20134091. ACM (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: ICDAR2019 competition on scanned receipt OCR and information extraction. In: International Conference on Document Analysis and Recognition, ICDAR 2019, pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.: FUNSD: a dataset for form understanding in noisy scanned documents. In: 2nd International Workshop on Open Services and Tools for Document Analysis, OST@ICDAR 2019, pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"27_CR14","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) International Conference on Machine Learning, ICML 2023, vol.\u00a0202, pp. 19730\u201319742. PMLR (2023)"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Li, L.H., et al.: Grounded language-image pre-training. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, pp. 10955\u201310965. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: StrucTexT: structured text understanding with multi-modal transformers. In: Shen, H.T., et al. (eds.) Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920. ACM (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Liao, H., et al.: Doctr: document transformer for structured information extraction in documents. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, pp. 19527\u201319537. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.01794"},{"issue":"1","key":"27_CR18","doi-asserted-by":"publisher","first-page":"919","DOI":"10.1109\/TPAMI.2022.3155612","volume":"45","author":"M Liao","year":"2023","unstructured":"Liao, M., Zou, Z., Wan, Z., Yao, C., Bai, X.: Real-time scene text detection with differentiable binarization and adaptive scale fusion. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 919\u2013931 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Lin, B.Y., Sheng, Y., Vo, N., Tata, S.: FreeDOM: a transferable neural architecture for structured information extraction on web documents. In: Gupta, R., Liu, Y., Tang, J., Prakash, B.A. (eds.) Proceedings of the 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 1092\u20131102. ACM (2020)","DOI":"10.1145\/3394486.3403153"},{"key":"27_CR20","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, NeurIPS 2023 (2023)"},{"key":"27_CR21","unstructured":"Liu, Y., et al.: On the hidden mystery of OCR in large multimodal models. CoRR abs\/2305.07895 (2023)"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Luo, C., Cheng, C., Zheng, Q., Yao, C.: GeoLayoutLM: geometric pre-training for visual information extraction. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, pp. 7092\u20137101. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.00685"},{"key":"27_CR23","unstructured":"Lv, T., et al.: Kosmos-2.5: a multimodal literate model. CoRR abs\/2309.11419 (2023)"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Majumder, B.P., Potti, N., Tata, S., Wendt, J.B., Zhao, Q., Najork, M.: Representation learning for information extraction from form-like documents. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J.R. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, ACL 2020, pp. 6495\u20136504. Association for Computational Linguistics (2020)","DOI":"10.18653\/v1\/2020.acl-main.580"},{"issue":"4","key":"27_CR25","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10032-010-0137-1","volume":"14","author":"E Medvet","year":"2011","unstructured":"Medvet, E., Bartoli, A., Davanzo, G.: A probabilistic approach to printed document understanding. Int. J. Doc. Anal. Recognit. (IJDAR) 14(4), 335\u2013347 (2011)","journal-title":"Int. J. Doc. Anal. Recognit. (IJDAR)"},{"key":"27_CR26","unstructured":"OpenAI: ChatGPT (Mar 14 version) [Large language model] (2023). https:\/\/chat.openai.com\/chat"},{"key":"27_CR27","unstructured":"OpenAI: GPT-4 technical report. CoRR abs\/2303.08774 (2023)"},{"key":"27_CR28","unstructured":"Park, S., et al.: CORD: a consolidated receipt dataset for post-OCR parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"27_CR29","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. In: The Eleventh International Conference on Learning Representations, ICLR 2024. OpenReview.net (2024)"},{"key":"27_CR30","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning, ICML 2021. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021)"},{"key":"27_CR31","unstructured":"Shi, Y., et al.: Exploring OCR capabilities of GPT-4V(ision): a quantitative and in-depth evaluation. CoRR abs\/2310.16809 (2023)"},{"key":"27_CR32","unstructured":"Soboroff, I.: Complex document information processing (CDIP) dataset. National Institute of Standards and Technology (2022)"},{"key":"27_CR33","unstructured":"Sun, Y., et al.: ERNIE 3.0: large-scale knowledge enhanced pre-training for language understanding and generation. CoRR abs\/2107.02137 (2021)"},{"key":"27_CR34","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. CoRR abs\/2302.13971 (2023)"},{"key":"27_CR35","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. CoRR abs\/2307.09288 (2023)"},{"key":"27_CR36","unstructured":"Wang, D., et al.: DocLLM: a layout-aware generative language model for multimodal document understanding. CoRR abs\/2401.00908 (2024)"},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Towards robust visual information extraction in real world: new dataset and novel solution. In: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI 2021, pp. 2738\u20132745. AAAI Press (2021)","DOI":"10.1609\/aaai.v35i4.16378"},{"key":"27_CR38","unstructured":"Wang, W., Li, Y., Ou, Y., Zhang, Y.: Layout and task aware instruction prompt for zero-shot document image question answering. CoRR abs\/2306.00526 (2023)"},{"key":"27_CR39","doi-asserted-by":"publisher","first-page":"4334","DOI":"10.1109\/TMM.2023.3321501","volume":"26","author":"L Xiao","year":"2024","unstructured":"Xiao, L., Yang, X., Peng, F., Yan, M., Wang, Y., Xu, C.: CLIP-VG: self-paced curriculum adapting of CLIP for visual grounding. IEEE Trans. Multimedia 26, 4334\u20134347 (2024)","journal-title":"IEEE Trans. Multimedia"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, ACL\/IJCNLP 2021, (Volume 1: Long Papers), pp. 2579\u20132591. Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Gupta, R., Liu, Y., Tang, J., Prakash, B.A. (eds.) Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery Data Mining, pp. 1192\u20131200. ACM (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: XFUND: a benchmark dataset for multilingual visually rich form understanding. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Findings of the Association for Computational Linguistics: ACL 2022, pp. 3214\u20133224. Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"27_CR43","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Unified contrastive learning in image-text-label space. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, pp. 19141\u201319151. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"27_CR44","doi-asserted-by":"crossref","unstructured":"Yang, Q., Hu, Y., Cao, R., Li, H., Luo, P.: Zero-shot key information extraction from mixed-style tables: pre-training on wikipedia. In: Bailey, J., Miettinen, P., Koh, Y.S., Tao, D., Wu, X. (eds.) IEEE International Conference on Data Mining, ICDM 2021, pp. 1451\u20131456. IEEE (2021)","DOI":"10.1109\/ICDM51629.2021.00187"},{"key":"27_CR45","doi-asserted-by":"crossref","unstructured":"Ye, J., et al.: Ureader: universal OCR-free visually-situated language understanding with multimodal large language model. In: Bouamor, H., Pino, J., Bali, K. (eds.) Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 2841\u20132858. Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"27_CR46","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality. CoRR abs\/2304.14178 (2023)"},{"key":"27_CR47","doi-asserted-by":"crossref","unstructured":"Yu, W., Lu, N., Qi, X., Gong, P., Xiao, R.: PICK: processing key information extraction from documents using improved graph learning-convolutional networks. In: 25th International Conference on Pattern Recognition, ICPR 2020, pp. 4363\u20134370. IEEE (2020)","DOI":"10.1109\/ICPR48806.2021.9412927"},{"key":"27_CR48","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"536","DOI":"10.1007\/978-3-031-41679-8_32","volume-title":"ICDAR 2023","author":"W Yu","year":"2023","unstructured":"Yu, W., et al.: ICDAR 2023 competition on structured text extraction from visually-rich document images. In: Fink, G.A., Jain, R., Kise, K., Zanibbi, R. (eds.) ICDAR 2023. LNCS, vol. 14188, pp. 536\u2013552. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-41679-8_32"},{"key":"27_CR49","unstructured":"Yu, Y., et al.: StrucTexTv2: masked visual-textual prediction for document image pre-training. In: The Eleventh International Conference on Learning Representations, ICLR 2023. OpenReview.net (2023)"},{"key":"27_CR50","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-031-20077-9_7","volume-title":"ECCV 2022","author":"Y Zang","year":"2022","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-vocabulary DETR with conditional matching. In: Avidan, S., Brostow, G.J., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 106\u2013122. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_7"},{"key":"27_CR51","unstructured":"Zeng, A., et al.: GLM-130B: an open bilingual pre-trained model. In: The Eleventh International Conference on Learning Representations, ICLR 2023. OpenReview.net (2023)"},{"key":"27_CR52","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. CoRR abs\/2205.01068 (2022)"},{"key":"27_CR53","unstructured":"Zhang, Y., et al.: LLaVAR: enhanced visual instruction tuning for text-rich image understanding. CoRR abs\/2306.17107 (2023)"},{"key":"27_CR54","unstructured":"Zhao, X., Wu, Z., Wang, X.: CUTIE: learning to understand documents with convolutional universal text information extractor. CoRR abs\/1903.12363 (2019)"},{"key":"27_CR55","doi-asserted-by":"crossref","unstructured":"Zhong, X., Tang, J., Jimeno-Yepes, A.: PubLayNet: largest dataset ever for document layout analysis. In: International Conference on Document Analysis and Recognition, ICDAR 2019, pp. 1015\u20131022. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00166"},{"key":"27_CR56","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: The Eleventh International Conference on Learning Representations, ICLR 2024. OpenReview.net (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72995-9_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T20:07:22Z","timestamp":1732392442000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72995-9_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,24]]},"ISBN":["9783031729942","9783031729959"],"references-count":56,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72995-9_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,24]]},"assertion":[{"value":"24 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}