{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T04:19:53Z","timestamp":1781929193770,"version":"3.54.5"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02239","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"24159-24169","source":"Crossref","is-referenced-by-count":1,"title":["Vision-Language Models Can't See the Obvious"],"prefix":"10.1109","author":[{"given":"Yasser","family":"Dahou","sequence":"first","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,UAE"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ngoc Dung","family":"Huynh","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,UAE"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Phuc H.","family":"Le-Khac","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,UAE"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wamiq Reyaz","family":"Para","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,UAE"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ankit","family":"Singh","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,UAE"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sanath","family":"Narayan","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute,Abu Dhabi,UAE"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","volume-title":"arXiv preprint","author":"Abdin","year":"2024"},{"key":"ref2","article-title":"Gpt-4 technical report","volume-title":"arXiv preprint","author":"Achiam","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref5","volume-title":"Claude 3.5 sonnet","year":"2024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.visres.2012.04.005"},{"key":"ref7","article-title":"Paligemma: A versatile 3 b vlm for transfer","volume-title":"arXiv preprint","author":"Beyer","year":"2024"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref9","article-title":"$\\pi \\_0$: A vision-language-action flow model for general robot control","volume-title":"arXiv preprint","author":"Black","year":"2024"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01264"},{"key":"ref11","doi-asserted-by":"crossref","DOI":"10.52202\/079017-0850","article-title":"Are we on the right way for evaluating large vision-language models?","volume-title":"arXiv preprint","author":"Chen","year":"2024"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref13","article-title":"Nvlm: Open frontier-class multimodal 11 ms","volume-title":"arXiv preprint","author":"Dai","year":"2024"},{"key":"ref14","article-title":"Molmo and pixmo: Open weights and open data for state-of-the-art multimodal models","volume-title":"arXiv preprint","author":"Deitke","year":"2024"},{"key":"ref15","article-title":"The llama 3 herd of models","volume-title":"arXiv preprint","author":"Dubey","year":"2024"},{"key":"ref16","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","volume-title":"arXiv preprint","author":"Fu","year":"2023"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"ref20","volume-title":"Minicpm: Unveiling the potential of small language models with scalable training strategies","author":"Hu","year":"2024"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref23","article-title":"Openvla: An open-source vision-languageaction model","volume-title":"arXiv preprint","author":"Kim","year":"2024"},{"key":"ref24","article-title":"Do saliency models detect odd-oneout targets? new datasets and evaluations","volume-title":"arXiv preprint","author":"Kotseruba","year":"2020"},{"key":"ref25","article-title":"Building and better understanding vision-language models: insights and future directions","volume-title":"arXiv preprint","author":"Lauren\u00e7on","year":"2024"},{"key":"ref26","article-title":"What matters when building visionlanguage models?","volume-title":"arXiv preprint","author":"Lauren\u00e7on","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01263"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02520"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref31","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref32","article-title":"Mmbench: Is your multi-modal model an all-around player?","volume-title":"arXiv preprint","author":"Liu","year":"2023"},{"key":"ref33","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"The 36th Conference on Neural Information Processing Systems (NeurIPS)","author":"Lu","year":"2022"},{"key":"ref34","article-title":"Mathvista: Evaluating math reasoning in visual contexts with gpt4v, bard, and other large multimodal models","volume-title":"arXiv e-prints","author":"Lu","year":"2023"},{"key":"ref35","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2022.findings-acl.177","article-title":"Chartqa: A benchmark for question answering about charts with visual and logical reasoning","volume-title":"arXiv preprint","author":"Masry","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"ref38","volume-title":"Llama 3.2: Revolutionizing edge ai and vision with open, customizable models","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref40","volume-title":"Mind children: The future of robot and human intelligence","author":"Moravec","year":"1988"},{"key":"ref41","article-title":"Screenagent: A vision language model-driven computer control agent","volume-title":"arXiv preprint","author":"Niu","year":"2024"},{"key":"ref42","volume-title":"Gpt-4o","year":"2024"},{"key":"ref43","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref46","volume-title":"Internvl2: Better than the best - expanding performance boundaries of open-source multimodal models with the progressive scaling strategy","year":"2024"},{"key":"ref47","article-title":"Cambrian-1: A fully open, vision-centric exploration of multimodal llms","volume-title":"arXiv preprint","author":"Tong","year":"2024"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00914"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0285(80)90005-5"},{"key":"ref50","volume-title":"Measuring multimodal mathematical reasoning with math-vision dataset","author":"Wang","year":"2024"},{"key":"ref51","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1038\/s41562-017-0058"},{"key":"ref53","volume-title":"Grok-1.5v","year":"2024"},{"key":"ref54","article-title":"Qwen2 technical report","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1140-0"},{"key":"ref60","article-title":"Minigpt-4: Enhancing visionlanguage understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446134.pdf?arnumber=11446134","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:18:02Z","timestamp":1777529882000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446134\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02239","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}