{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:21:05Z","timestamp":1778084465365,"version":"3.51.4"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62436007"],"award-info":[{"award-number":["62436007"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02277","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"1-10","source":"Crossref","is-referenced-by-count":1,"title":["Iris: Breaking GUI Complexity with Adaptive Focus and Self-Refining"],"prefix":"10.1109","author":[{"given":"Zhiqi","family":"Ge","sequence":"first","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Juncheng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinglei","family":"Pang","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minghe","family":"Gao","sequence":"additional","affiliation":[{"name":"Zhejiang 
University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaihang","family":"Pan","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wang","family":"Lin","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Fei","sequence":"additional","affiliation":[{"name":"National University of Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenqiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siliang","family":"Tang","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1, 2","volume-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"ref2","first-page":"7, 8","volume-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"ref3","first-page":"7","article-title":"The claude 3 model family: Opus, sonnet, haiku","volume":"1","author":"Anthropic","year":"2024","journal-title":"Claude-3 Model Card"},{"key":"ref4","first-page":"2, 6, 7, 8","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023"},{"key":"ref5","first-page":"7","volume-title":"Fuyu-8b: A multimodal architecture for ai agents","author":"Bavishi","year":"2023"},{"key":"ref6","first-page":"7","volume-title":"Paligemma: A versatile 3b vlm for transfer","author":"Beyer","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.1986.4767851"},{"key":"ref8","first-page":"7","volume-title":"Minigpt-v2: 
large language model as a unified interface for vision-language multi-task learning","author":"Chen","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.505"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"ref11","first-page":"1, 3, 7, 8","article-title":"Mind2web: Towards a generalist agent for the web","volume":"36","author":"Deng","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","first-page":"3","volume-title":"Internlm-xcomposer2-4khd: A pioneering large vision-language model handling resolutions from 336 pixels to 4k hd","author":"Dong","year":"2024"},{"key":"ref13","first-page":"3, 7","volume-title":"Navigating the digital world as humans do: Universal visual grounding for gui agents","author":"Gou","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.70675\/802872a7z8092z4556za1fdz2c7af45f0e72"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"ref16","first-page":"7","volume-title":"Minicpm: Unveiling the potential of small language models with scalable training strategies","author":"Hu","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0529"},{"key":"ref18","first-page":"2","volume-title":"Fine-tuning multimodal llms to follow zero-shot demonstrative instructions","author":"Li","year":"2023"},{"key":"ref19","first-page":"3","volume-title":"Ferret-ui 2: Mastering universal user interface understanding across platforms","author":"Li","year":"2024"},{"key":"ref20","first-page":"3","volume-title":"Sphinx-x: Scaling data and parameters for a family of multi-modal large language models","author":"Liu","year":"2024"},{"key":"ref21","first-page":"2, 6","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in neural information processing 
systems"},{"key":"ref22","first-page":"1","volume-title":"Visualwebbench: How far have multimodal llms evolved in web page understanding and grounding?","author":"Liu","year":"2024"},{"key":"ref23","first-page":"1","volume-title":"Gui odyssey: A comprehensive dataset for cross-app gui navigation on mobile devices","author":"Lu","year":"2024"},{"key":"ref24","first-page":"1, 3, 7, 8","article-title":"Androidinthewild: A large-scale dataset for android device control","volume":"36","author":"Rawles","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.449"},{"key":"ref26","first-page":"1, 2, 7","volume-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"ref27","volume-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref28","first-page":"1","volume-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref29","first-page":"1","volume-title":"Mobile-agent: Autonomous multi-modal mobile device agent with visual perception","author":"Wang","year":"2024"},{"key":"ref30","first-page":"3","volume-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581158"},{"key":"ref32","first-page":"1","volume-title":"Os-copilot: Towards generalist computer agents with self-improvement","author":"Wu","year":"2024"},{"key":"ref33","first-page":"3, 8","volume-title":"Llava-uhd: an lmm perceiving any aspect ratio and high-resolution images","author":"Xu","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73039-9_14"},{"key":"ref35","first-page":"1","volume-title":"Ufo: A ui-focused agent for windows os 
interaction","author":"Zhang","year":"2024"},{"key":"ref36","first-page":"8","volume-title":"You only look at screens: Multimodal chain-of-action agents","author":"Zhang","year":"2023"},{"key":"ref37","first-page":"3, 6, 7","volume-title":"Agentstudio: A toolkit for building general virtual agents","author":"Zheng","year":"2024"},{"key":"ref38","first-page":"1","volume-title":"Webarena: A realistic web environment for building autonomous agents","author":"Zhou","year":"2023"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444272.pdf?arnumber=11444272","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:09:32Z","timestamp":1777612172000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444272\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02277","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}