{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:39Z","timestamp":1777865319403,"version":"3.51.4"},"reference-count":70,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation (NSF)","doi-asserted-by":"publisher","award":["IIS-2338252,IIS-2207052,IS-2302730"],"award-info":[{"award-number":["IIS-2338252,IIS-2207052,IS-2302730"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00219","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"2270-2280","source":"Crossref","is-referenced-by-count":0,"title":["VSP: Diagnosing the Dual Challenges of Perception and Reasoning in Spatial Planning Tasks for MLLMS"],"prefix":"10.1109","author":[{"given":"Qiucheng","family":"Wu","sequence":"first","affiliation":[{"name":"UC,Santa Barbara"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Handong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"Saxon","sequence":"additional","affiliation":[{"name":"UC,Santa Barbara"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Trung","family":"Bui","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"William Yang","family":"Wang","sequence":"additional","affiliation":[{"name":"UC,Santa Barbara"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yang","family":"Zhang","sequence":"additional","affiliation":[{"name":"MIT-IBM Watson AI Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiyu","family":"Chang","sequence":"additional","affiliation":[{"name":"UC,Santa Barbara"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref2","volume-title":"Pddll the planning domain definition language","author":"Aeronautiques","year":"1998"},{"key":"ref3","article-title":"Can large language models be good path planners? a benchmark and investigation on spatial-temporal reasoning","author":"Aghzal","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref4","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref5","article-title":"Anthropic AI","year":"2024","journal-title":"The claude 3 model family: Opus, sonnet, haiku. Claude-3 Model Card"},{"key":"ref6","article-title":"Mistral AI","year":"2024","journal-title":"Announcing pixtral 12b"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref9","volume-title":"Claude 3.7 sonnet and claude code","year":"2025"},{"key":"ref10","article-title":"Openflamingo: An opensource framework for training large autoregressive visionlanguage models","author":"Awadalla","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref11","article-title":"Qwen2. 5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref12","article-title":"Deepseek 11m: Scaling opensource language models with longtermism","author":"Bi","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref13","article-title":"Openai gym","author":"Brockman","year":"2016","journal-title":"arXiv preprint arXiv"},{"key":"ref14","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446658"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/iros58592.2024.10802562"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4293"},{"key":"ref18","volume-title":"Introducing gemini 2.0: our new ai model for the agentic era","year":"2024"},{"key":"ref19","article-title":"Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model","author":"Dong","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0802"},{"key":"ref21","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref22","article-title":"Mllm-bench, evaluating multimodal 11 ms using gpt-4v","author":"Ge","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref23","article-title":"Are language models puzzle prodigies? algorithmic puzzles unveil serious challenges in multimodal reasoning","author":"Ghosal","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref24","article-title":"Blocksworld revisited: Learning and reasoning to generate event-sequences from image pairs","author":"Gokhale","year":"2019","journal-title":"arXiv preprint arXiv"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3459"},{"key":"ref26","article-title":"Deep learning for real-time atari game play using offline monte-carlo tree search planning","volume":"27","author":"Guo","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.507"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i18.29991"},{"key":"ref29","article-title":"Look before you leap: Unveiling the power of gpt4v in robotic vision-language planning","author":"Hu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref30","article-title":"Mixtral of experts","author":"Jiang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref31","article-title":"Copal: Corrective planning of robot actions with large language models","author":"Joublin","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1111\/nyas.15125"},{"key":"ref33","article-title":"Llms can\u2019t plan, but can help planning in 11 m -modulo frameworks","author":"Kambhampati","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref34","article-title":"Seed-bench: Benchmarking multimodal 11 ms with generative comprehension","author":"Li","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref35","article-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models","author":"Lin","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref36","article-title":"Hallusionbench: You see what you think? or you think what you see? an image-context reasoning benchmark challenging for gpt4v (ision), llava-1.5, and other multi-modality models","author":"Liu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref37","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref38","article-title":"Visualagentbench: Towards large multimodal models as visual foundation agents","author":"Liu","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref39","first-page":"arXiv-2310","article-title":"Mathvista: Evaluating math reasoning in visual contexts with gpt-4v, bard, and other large multimodal models","author":"Lu","year":"2023","journal-title":"arXiv e-prints"},{"key":"ref40","article-title":"Dolphins: Multimodal language model for driving","author":"Ma","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2019"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"ref44","article-title":"Mmrel: A relation understanding dataset and benchmark in the mllm era","author":"Nie","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref45","article-title":"Gemini vs gpt-4v: A preliminary comparison and combination of vision-language models through qualitative cases","author":"Qi","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref46","article-title":"Tptu: Task planning and tool usage of large language model-based ai agents","author":"Ruan","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10611369"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"ref49","article-title":"Hugginggpt: Solving ai tasks with chatgpt and its friends in hugging face","volume":"36","author":"Shen","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0377"},{"key":"ref51","article-title":"Alfworld: Aligning text and embodied environments for interactive learning","author":"Shridhar","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref52","article-title":"On the self-verification limitations of large language models on reasoning and planning tasks","author":"Stechly","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-200-7.50073-8"},{"key":"ref54","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref55","article-title":"Drivevlm: The convergence of autonomous driving and large vision-language models","author":"Tian","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref56","article-title":"Eyes wide shut? exploring the visual shortcomings of multimodal 11 ms","author":"Tong","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref57","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref58","article-title":"Large language models still can\u2019t plan (a benchmark for 11 ms on planning and reasoning about change)","author":"Valmeekam","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref59","first-page":"75993","article-title":"On the planning abilities of large language models - a critical investigation. In Advances in Neural Information Processing Systems","author":"Valmeekam","year":"2023","journal-title":"Curran Associates, Inc."},{"key":"ref60","article-title":"Measuring multimodal mathematical reasoning with math-vision dataset","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1800"},{"key":"ref62","article-title":"Smartplay: A benchmark for 11 ms as intelligent agents","author":"Wu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref63","article-title":"Travelplanner: A benchmark for real-world planning with language agents","author":"Xie","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref64","article-title":"Exploring diverse in-context configurations for image captioning","volume":"36","author":"Yang","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"1","key":"ref65","first-page":"1","article-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision)","volume":"9","author":"Yang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref66","article-title":"React: Synergizing reasoning and acting in language models","author":"Yao","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0517"},{"key":"ref68","article-title":"Mmt-bench: A comprehensive multimodal benchmark for evaluating large vision-language models towards multitask agi","author":"Ying","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref69","article-title":"Mm -vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1072"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0228"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445898.pdf?arnumber=11445898","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:16:07Z","timestamp":1777529767000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445898\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":70,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00219","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}