{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T04:36:59Z","timestamp":1758083819222,"version":"3.44.0"},"publisher-location":"Cham","reference-count":56,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032046260"},{"type":"electronic","value":"9783032046277"}],"license":[{"start":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T00:00:00Z","timestamp":1757980800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T00:00:00Z","timestamp":1757980800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04627-7_23","type":"book-chapter","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T02:07:25Z","timestamp":1757988445000},"page":"398-413","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["SelectVision: Adaptive Vision Resolution Selection for\u00a0Visual Document Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1835-9271","authenticated-orcid":false,"given":"Zhongjiang","family":"He","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9514-7310","authenticated-orcid":false,"given":"Ye","family":"Yuan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7926-2158","authenticated-orcid":false,"given":"An","family":"Zhao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4379-2971","authenticated-orcid":false,"given":"Han","family":"Fang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7917-1628","authenticated-orcid":false,"given":"Hao","family":"Sun","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4726-093X","authenticated-orcid":false,"given":"Kongming","family":"Liang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2950-2488","authenticated-orcid":false,"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,16]]},"reference":[{"key":"23_CR1","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, vol. 35, pp. 23716\u201323736 (2022)"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Tang, P., Dong, Q., Sankaran, N., Zhou, Y., Manmatha, R.: Docformerv2: local features for document understanding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 709\u2013718 (2024)","DOI":"10.1609\/aaai.v38i2.27828"},{"key":"23_CR3","unstructured":"Bai, J., et al.: Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"23_CR4","unstructured":"Bai, S., et\u00a0al.: Qwen2. 5-VL technical report. 
arXiv preprint arXiv:2502.13923 (2025)"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Cha, J., Kang, W., Mun, J., Roh, B.: Honeybee: locality-enhanced projector for multimodal LLM. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13817\u201313827 (2024)","DOI":"10.1109\/CVPR52733.2024.01311"},{"key":"23_CR6","unstructured":"Chen, W., et al.: Tabfact: a large-scale dataset for table-based fact verification. arXiv preprint arXiv:1909.02164 (2019)"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"23_CR8","unstructured":"Dong, X., et\u00a0al.: Internlm-xcomposer2: mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420 (2024)"},{"key":"23_CR9","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16 \\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"issue":"12","key":"23_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11432-024-4250-y","volume":"67","author":"H Feng","year":"2024","unstructured":"Feng, H., et al.: Docpedia: Unleashing the power of large multimodal model in the frequency domain for versatile document understanding. Sci. China Inf. Sci. 67(12), 1\u201314 (2024)","journal-title":"Sci. China Inf. Sci."},{"key":"23_CR11","unstructured":"Feng, H., et al.: Unidoc: a universal large multimodal model for simultaneous text detection, recognition, spotting and understanding. arXiv preprint arXiv:2308.11592 (2023)"},{"key":"23_CR12","unstructured":"GLM, Team, et\u00a0al.: ChatGLM: a family of large language models from GLM-130b to GLM-4 all tools. arXiv preprint arXiv:2406.12793 (2024)"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Hu, A., et\u00a0al.: mplug-docowl 1.5: unified structure learning for OCR-free document understanding. arXiv preprint arXiv:2403.12895 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.175"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Hu, A., et al.: mplug-docowl2: high-resolution compressing for OCR-free multi-page document understanding. arXiv preprint arXiv:2409.03420 (2024)","DOI":"10.18653\/v1\/2025.acl-long.291"},{"key":"23_CR15","unstructured":"Huang, M., Liu, Y., Liang, D., Jin, L., Bai, X.: Mini-monkey: alleviating the semantic sawtooth effect for lightweight MLLMs via complementary image pyramid. arXiv preprint arXiv:2408.02034 (2024)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"23_CR17","unstructured":"Lee, K., et al.: Pix2struct: screenshot parsing as pretraining for visual language understanding. In: International Conference on Machine Learning, pp. 18893\u201318912. PMLR (2023)"},{"key":"23_CR18","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. 
PMLR (2023)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Enhancing visual document understanding with contrastive learning in large visual-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15546\u201315555 (2024)","DOI":"10.1109\/CVPR52733.2024.01472"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Structext: structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Liu, C., et al.: HRVDA: high-resolution visual document assistant. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15534\u201315545 (2024)","DOI":"10.1109\/CVPR52733.2024.01471"},{"key":"23_CR22","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36, pp. 34892\u201334916 (2023)"},{"key":"23_CR23","unstructured":"Liu, Y., et al.: Textmonkey: an OCR-free large multimodal model for understanding document. arXiv preprint arXiv:2403.04473 (2024)"},{"key":"23_CR24","unstructured":"Lu, H., et al.: Uniadapter: unified parameter-efficient transfer learning for cross-modal modeling. arXiv preprint arXiv:2302.06605 (2023)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Lu, J., et\u00a0al.: A bounding box is worth one token: interleaving layout and text in a large language model for document understanding. arXiv preprint arXiv:2407.01976 (2024)","DOI":"10.18653\/v1\/2025.findings-acl.379"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Luo, C., Shen, Y., Zhu, Z., Zheng, Q., Yu, Z., Yao, C.: LayoutLLM: layout instruction tuning with large language models for document understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15630\u201315640 (2024)","DOI":"10.1109\/CVPR52733.2024.01480"},{"key":"23_CR27","unstructured":"Luo, G., Zhou, Y., Ren, T., Chen, S., Sun, X., Ji, R.: Cheap and quick: efficient vision-language instruction tuning for large language models. In: Advances in Neural Information Processing Systems, vol. 36, pp. 29615\u201329627 (2023)"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Masry, A., Long, D.X., Tan, J.Q., Joty, S., Hoque, E.: Chartqa: a benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Mathew, M., Bagal, V., Tito, R., Karatzas, D., Valveny, E., Jawahar, C.: Infographicvqa. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1697\u20131706 (2022)","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"23_CR30","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: Docvqa: a dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Methani, N., Ganguly, P., Khapra, M.M., Kumar, P.: Plotqa: reasoning over scientific plots. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 
1527\u20131536 (2020)","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Pasupat, P., Liang, P.: Compositional semantic parsing on semi-structured tables. arXiv preprint arXiv:1508.00305 (2015)","DOI":"10.3115\/v1\/P15-1142"},{"key":"23_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"742","DOI":"10.1007\/978-3-030-58536-5_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"O Sidorov","year":"2020","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 742\u2013758. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_44"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"23_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"564","DOI":"10.1007\/978-3-030-86549-8_36","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"T Stanis\u0142awek","year":"2021","unstructured":"Stanis\u0142awek, T., et al.: Kleister: key information extraction datasets involving long documents with complex layouts. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12821, pp. 564\u2013579. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86549-8_36"},{"key":"23_CR36","unstructured":"Svetlichnaya, S.: Deepform: understand structured documents at scale (2020)"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Tanaka, R., Nishida, K., Yoshida, S.: VisualMRC: machine reading comprehension on document images. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 13878\u201313888 (2021)","DOI":"10.1609\/aaai.v35i15.17635"},{"key":"23_CR38","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"23_CR39","unstructured":"Wang, D., et al.: DocLLM: a layout-aware generative language model for multimodal document understanding. arXiv preprint arXiv:2401.00908 (2023)"},{"key":"23_CR40","unstructured":"Wang, P., et\u00a0al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"23_CR41","unstructured":"Wang, W., et al.: CogVLM: visual expert for pretrained language models. In: Advances in Neural Information Processing Systems, vol. 37, pp. 121475\u2013121499 (2025)"},{"key":"23_CR42","unstructured":"Wang, W., et\u00a0al.: Enhancing the reasoning ability of multimodal large language models via mixed preference optimization. arXiv preprint arXiv:2411.10442 (2024)"},{"key":"23_CR43","unstructured":"Wang, Y., Zhou, W., Feng, H., Zhou, K., Li, H.: Towards improving document understanding: an exploration on text-grounding via MLLMs. arXiv preprint arXiv:2311.13194 (2023)"},{"key":"23_CR44","doi-asserted-by":"crossref","unstructured":"Wei, H., et al.: Vary: scaling up the vision vocabulary for large vision-language model. In: European Conference on Computer Vision, pp. 408\u2013424. 
Springer, Cham (2024)","DOI":"10.1007\/978-3-031-73235-5_23"},{"key":"23_CR45","unstructured":"Ye, J., et\u00a0al.: mplug-docowl: modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)"},{"key":"23_CR46","doi-asserted-by":"crossref","unstructured":"Ye, J., et\u00a0al.: Ureader: universal OCR-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"23_CR47","unstructured":"Yu, Y.Q., Liao, M., Wu, J., Liao, Y., Zheng, X., Zeng, W.: Texthawk: exploring efficient fine-grained perception of multimodal large language models. arXiv preprint arXiv:2404.09204 (2024)"},{"key":"23_CR48","unstructured":"Yu, Y.Q., Liao, M., Zhang, J., Wu, J.: Texthawk2: a large vision-language model excels in bilingual ocr and grounding with 16x fewer tokens. arXiv preprint arXiv:2410.05261 (2024)"},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11975\u201311986 (2023)","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"23_CR50","unstructured":"Zhang, J., Chen, B., Cheng, H., Guo, F., Ding, K., Jin, L.: Docaligner: annotating real-world photographic document images by simply taking pictures. arXiv preprint arXiv:2306.05749 (2023)"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Zhang, J., Yang, W., Lai, S., Xie, Z., Jin, L.: Dockylin: a large multimodal model for visual document understanding with efficient visual slimming. arXiv preprint arXiv:2406.19101 (2024)","DOI":"10.1609\/aaai.v39i9.33076"},{"key":"23_CR52","unstructured":"Zhang, P., et\u00a0al.: Internlm-xcomposer: a vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)"},{"key":"23_CR53","unstructured":"Zhang, R., et al.: Llama-adapter: efficient fine-tuning of large language models with zero-initialized attention. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"23_CR54","unstructured":"Zhang, Y., et al.: Llavar: enhanced visual instruction tuning for text-rich image understanding. arXiv preprint arXiv:2306.17107 (2023)"},{"key":"23_CR55","unstructured":"Zhang, Y.F., et al.: Beyond LLaVA-HD: diving into high-resolution large multimodal models. arXiv preprint arXiv:2406.08487 (2024)"},{"key":"23_CR56","unstructured":"Zhang, Y., et al.: Llava-next: a strong zero-shot video understanding model (2024). 
https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04627-7_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T02:07:41Z","timestamp":1757988461000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04627-7_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,16]]},"ISBN":["9783032046260","9783032046277"],"references-count":56,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04627-7_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,9,16]]},"assertion":[{"value":"16 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
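The block above is the raw JSON returned by the Crossref REST API for this chapter's DOI (a "work" message wrapped in "status"/"message-type"/"message"). As a minimal sketch of how such a record is obtained and read, the snippet below fetches the same record from the public Crossref "works" endpoint and prints a few fields; the endpoint and field names follow the standard Crossref API, and nothing here is specific to this record beyond the DOI.

# Minimal sketch (assumption: network access to api.crossref.org is available):
# fetch this Crossref "work" record and read a few fields from its "message" object.
import json
import urllib.request

DOI = "10.1007/978-3-032-04627-7_23"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

# Crossref wraps the metadata in "message"; "status" should be "ok".
work = record["message"]
print(work["title"][0])                   # chapter title
print(work["DOI"], work["type"])          # DOI and work type ("book-chapter")
print(work["container-title"])            # series / proceedings titles
print(len(work.get("reference", [])))     # number of deposited references

# Authors are a list of {"given", "family", "ORCID", ...} dictionaries.
for a in work.get("author", []):
    print(a.get("given", ""), a.get("family", ""), a.get("ORCID", ""))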