{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:47:10Z","timestamp":1777657630460,"version":"3.51.4"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730092","type":"print"},{"value":"9783031730108","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73010-8_23","type":"book-chapter","created":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T13:11:33Z","timestamp":1731157893000},"page":"390-406","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["LLaVA-UHD: An LMM Perceiving Any Aspect Ratio and High-Resolution Images"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8492-2130","authenticated-orcid":false,"given":"Zonghao","family":"Guo","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2812-9206","authenticated-orcid":false,"given":"Ruyi","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8276-3620","authenticated-orcid":false,"given":"Yuan","family":"Yao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3031-5212","authenticated-orcid":false,"given":"Junbo","family":"Cui","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6428-1456","authenticated-orcid":false,"given":"Zanlin","family":"Ni","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4276-3133","authenticated-orcid":false,"given":"Chunjiang","family":"Ge","sequence":"additional","affiliation":[]},{"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7709-2543","authenticated-orcid":false,"given":"Zhiyuan","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7251-0988","authenticated-orcid":false,"given":"Gao","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,10]]},"reference":[{"key":"23_CR1","unstructured":"Introducing ChatGPT (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"23_CR2","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"23_CR3","unstructured":"Alayrac, J., et al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: IEEE ICCV, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"23_CR5","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. 
arXiv preprint arXiv:2308.12966 (2023)"},{"key":"23_CR6","unstructured":"Bavishi, R., et al.: Introducing our multimodal models (2023). adept.ai\/blog\/fuyu-8b"},{"key":"23_CR7","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"23_CR8","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"23_CR9","unstructured":"Chiang, W.L., et\u00a0al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/vicuna.lmsys.org. Accessed 14 Apr 2023"},{"key":"23_CR10","unstructured":"Chung, H.W., et al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"23_CR11","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"23_CR12","unstructured":"Dong, X., et al.: InternLM-XComposer2-4KHD: a pioneering large vision-language model handling resolutions from 336 pixels to 4k HD. arXiv preprint arXiv:2404.06512 (2024)"},{"key":"23_CR13","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Gurari, D., et al.: VizWiz grand challenge: answering visual questions from blind people. In: IEEE CVPR, pp. 3608\u20133617 (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: IEEE ICCV, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Hong, W., et\u00a0al.: CogAgent: a visual language model for GUI agents. arXiv preprint arXiv:2312.08914 (2023)","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: IEEE CVPR, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Krishna, R., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. IJCV 123, 32\u201373 (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"23_CR19","unstructured":"Li, B., Zhang, P., Yang, J., Zhang, Y., Pu, F., Liu, Z.: OtterHD: a high-resolution multi-modality model. arXiv preprint arXiv:2311.04219 (2023)"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: Seed-bench: benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"23_CR21","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. 
arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Monkey: image resolution and text label are important things for large multi-modal models. arXiv preprint arXiv:2311.06607 (2023)","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Lin, Z., et al.: SPHINX: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575 (2023)","DOI":"10.1007\/978-3-031-73033-7_3"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"23_CR26","unstructured":"Liu, H., et al.: LLaVA-NeXT: improved reasoning, OCR, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"23_CR27","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS, vol. 36 (2024)"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"23_CR29","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. Adv. Neural. Inf. Process. Syst. 35, 2507\u20132521 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR30","unstructured":"Luo, G., Zhou, Y., Zhang, Y., Zheng, X., Sun, X., Ji, R.: Feast your eyes: mixture-of-resolution adaptation for multimodal large language models. arXiv preprint arXiv:2403.03003 (2024)"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: OCR-VQA: visual question answering by reading text in images. In: IEEE ICDAR, pp. 947\u2013952 (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"23_CR32","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"23_CR33","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"23_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"742","DOI":"10.1007\/978-3-030-58536-5_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"O Sidorov","year":"2020","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 742\u2013758. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_44"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: IEEE CVPR, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"23_CR36","doi-asserted-by":"crossref","unstructured":"Sun, Z., et\u00a0al.: Aligning large multimodal models with factually augmented RLHF. arXiv preprint arXiv:2309.14525 (2023)","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"23_CR37","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971 (2023)"},{"key":"23_CR38","unstructured":"Yang, Z., et al.: The dawn of LMMs: preliminary explorations with GPT-4V (ision). arXiv preprint arXiv:2309.17421, vol. 9, no. 1, p. 1 (2023)"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Ye, J., et\u00a0al.: UReader: universal OCR-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.187"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. arXiv preprint arXiv:2311.04257 (2023)","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"23_CR41","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"23_CR42","unstructured":"Zhang, Y., et al.: Beyond LLaVA-HD: diving into high-resolution large multimodal models. arXiv preprint arXiv:2406.08487 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73010-8_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T14:06:59Z","timestamp":1731161219000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73010-8_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,10]]},"ISBN":["9783031730092","9783031730108"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73010-8_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,10]]},"assertion":[{"value":"10 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}