{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:25Z","timestamp":1777865305199,"version":"3.51.4"},"reference-count":71,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["IIS-2338418"],"award-info":[{"award-number":["IIS-2338418"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02128","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"22922-22933","source":"Crossref","is-referenced-by-count":0,"title":["HRScene: How Far are VLMs from Effective High-Resolution Image Understanding?"],"prefix":"10.1109","author":[{"given":"Yusen","family":"Zhang","sequence":"first","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenliang","family":"Zheng","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aashrith","family":"Madasu","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Shi","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ryo","family":"Kamoi","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhuoyang","family":"Zou","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sarkar Snigdha Sarathi","family":"Das","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vipul","family":"Gupta","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoxin","family":"Lu","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ranran Haoran","family":"Zhang","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Avitej","family":"Iyer","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Renze","family":"Lou","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenpeng","family":"Yin","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Penn State University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"issue":"7418","key":"ref1","first-page":"61","article-title":"Brigham & Women\u2019s Hospital & Harvard Medical School Chin Lynda 911 Park Peter J. 12 Kucherlapati Raju 13, Genome data analysis: Baylor College of Medicine Creighton Chad J. 2223 Donehower Lawrence A. 2223 24 25, Institute for Systems Biology Reynolds Sheila 31 Kreisberg Richard B. 31 Bernard Brady 31 Bressler Ryan 31 Erkkila Timo 32 Lin Jake 31 Thorsson Vesteinn 31 Zhang Wei 33 Shmulevich Ilya 31, et al. Comprehensive molecular portraits of human breast tumours","volume":"490","year":"2012","journal-title":"Nature"},{"key":"ref2","volume-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024"},{"key":"ref3","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref4","year":"2024","journal-title":"AI Anthropic. The claude 3 model family: Opus, sonnet, haiku. Claude-3 Model Card"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2019.05.010"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2024.110928"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3645107"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3645107"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00646"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref12","article-title":"Molmo and pixmo: Open weights and open data for state-of-the-art multimodal models","author":"Deitke","year":"2024","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-024-03944-3"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-024-03944-3"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1348"},{"key":"ref16","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024","journal-title":"arXiv preprint"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02284"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00997"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73010-8_23"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.175"},{"key":"ref23","article-title":"mplug-docow12: High-resolution compressing for ocrfree multi-page document understanding","author":"Hu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680790"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446520"},{"key":"ref26","article-title":"Mini-monkey: Alleviating the semantic sawtooth effect for lightweight MLLMs via complementary image pyramid","volume-title":"International Conference on Learning Representations (ICLR)","author":"Huang"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00141"},{"key":"ref28","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.329"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_33"},{"key":"ref31","article-title":"Cosmoclip: Generalizing large vision-language models for astronomical imaging","author":"Imam","year":"2024","journal-title":"arXiv preprint"},{"key":"ref32","article-title":"Mmad: The first-ever comprehensive benchmark for multimodal large language models in industrial anomaly detection","author":"Jiang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1038\/sdata.2018.251"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1038\/sdata.2018.251"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2024.06.002"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_28"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"ref40","article-title":"The artbench dataset: Benchmarking generative models with artworks","author":"Liao","year":"2022","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00638"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref44","article-title":"Deepseek-vl: towards real-world visionlanguage understanding","author":"Lu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref45","article-title":"Feast your eyes: Mixture-of-resolution adaptation for multimodal large language models","volume-title":"International Conference on Learning Representations (ICLR)","author":"Luo"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73397-0_18"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-021-02951-w"},{"key":"ref49","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.352"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3035969"},{"key":"ref52","article-title":"Milebench: Benchmarking mllms in long context","author":"Song","year":"2024","journal-title":"arXiv preprint"},{"key":"ref53","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint"},{"key":"ref54","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2019.02.012"},{"key":"ref56","article-title":"Muirbench: A comprehensive bench-mark for robust multi-image understanding","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref57","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref58","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref59","article-title":"Divide, conquer and combine: A training-free framework for high-resolution image perception in multimodal large language models","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32852"},{"key":"ref61","first-page":"20540","article-title":"Needle in a multimodal haystack","volume":"37","author":"Wang","year":"2025","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00333"},{"key":"ref63","article-title":"Aerial photo imagery from fall waterfowl surveys, izembek lagoon, alaska, 20172019","author":"Weiser","year":"2022","journal-title":"U.S. Geological Survey data release"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01243"},{"key":"ref65","article-title":"Visual haystacks: A vision-centric needle-in-ahaystack benchmark","author":"Wu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref66","article-title":"Deepseek-vl2: Mixture-ofexperts vision-language models for advanced multimodal understanding","author":"Wu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21037\/atm.2020.03.132"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.70"},{"key":"ref71","article-title":"Mme-realworld: Could your multimodal 11 m challenge high-resolution real-world scenarios that are difficult for humans?","author":"Zhang","year":"2024","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444378.pdf?arnumber=11444378","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:11:44Z","timestamp":1777529504000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444378\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":71,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02128","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}