{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,29]],"date-time":"2026-07-29T02:03:13Z","timestamp":1785290593384,"version":"3.55.0"},"reference-count":43,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iros60139.2025.11246684","type":"proceedings-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:54:45Z","timestamp":1764269685000},"page":"2392-2399","source":"Crossref","is-referenced-by-count":12,"title":["WMNav: Integrating Vision-Language Models into World Models for Object Goal Navigation"],"prefix":"10.1109","author":[{"given":"Dujun","family":"Nie","sequence":"first","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Automation"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xianda","family":"Guo","sequence":"additional","affiliation":[{"name":"Wuhan University,School of Computer Science"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yiqun","family":"Duan","sequence":"additional","affiliation":[{"name":"University of Technology Sydney,School of Computer Science"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ruijun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Automation"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Long","family":"Chen","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Automation"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Objectnav revisited: On evaluation of embodied agents navigating to objects","author":"Batra","year":"2020"},{"key":"ref2","article-title":"ZSON: Zero-shot object-goal navigation using multimodal goal embeddings","volume-title":"Advances in Neural Information Processing Systems","author":"Majumdar","year":"2022"},{"key":"ref3","article-title":"Zero-shot object searching using large-scale object relationship prior","author":"Chen","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02219"},{"key":"ref5","article-title":"Esc: Exploration with soft commonsense constraints for zero-shot object navigation","author":"Zhou","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.075"},{"key":"ref7","article-title":"Voronav: Voronoi-based zero-shot object navigation with large language model","author":"Wu","year":"2024"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.24"},{"key":"ref9","article-title":"Topv-nav: Unlocking the top-view spatial reasoning potential of mllm for zero-shot object navigation","author":"Zhong","year":"2024"},{"key":"ref10","article-title":"Offline visual representation learning for embodied navigation","volume-title":"Workshop on Reincarnating Reinforcement Learning at ICLR 2023","author":"Yadav"},{"key":"ref11","article-title":"Ovrl-v2: A simple state-of-art baseline for imagenav and objectnav","author":"Yadav","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610499"},{"key":"ref13","article-title":"End-to-end navigation with vision language models: Transforming spatial reasoning into question-answering","author":"Goetting","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610712"},{"key":"ref15","first-page":"19 730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li"},{"key":"ref16","article-title":"Cross from left to right brain: Adaptive text dreamer for vision-and-language navigation","author":"Zhang","year":"2025"},{"key":"ref17","article-title":"Is your llm secretly a world model of the internet? model-based planning for web agents","author":"Gu","year":"2024"},{"key":"ref18","article-title":"Habitat-matterport 3d dataset (hm3d): 1000 large-scale 3d environments for embodied ai","author":"Ramakrishnan","year":"2021"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00081"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01509"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01441"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00511"},{"key":"ref23","article-title":"Learning active camera for multi-object navigation","volume-title":"Advances in Neural Information Processing Systems","author":"Chen","year":"2022"},{"key":"ref24","article-title":"FILM: Following instructions in language with modular methods","volume-title":"International Conference on Learning Representations","author":"Min"},{"key":"ref25","article-title":"Jarvis: A neuro-symbolic commonsense reasoning framework for conversational embodied agents","author":"Zheng","year":"2022"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01652"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342512"},{"key":"ref28","article-title":"Navigation with large language models: Semantic guesswork as a heuristic for planning","volume-title":"7th Annual Conference on Robot Learning","author":"Shah"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610499"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref31","article-title":"Surds: Benchmarking spatial understanding and reasoning in driving scenarios with vision language models","author":"Guo","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/122344.122377"},{"key":"ref33","article-title":"World models","author":"Ha","year":"2018"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1561\/2200000086"},{"key":"ref35","article-title":"Cognitive map for language models: Optimal planning via verbally representing the world model","author":"Kim","year":"2024"},{"key":"ref36","article-title":"Web agents with world models: Learning and leveraging environment dynamics in web navigation","author":"Chae","year":"2024"},{"key":"ref37","article-title":"Moma-kitchen: A 100k+ benchmark for affordance-grounded last-mile navigation in mobile manipulation","author":"Zhang","year":"2025"},{"key":"ref38","article-title":"Habitat-matterport 3d dataset (HM3d): 1000 large-scale 3d environments for embodied AI","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)","author":"Ramakrishnan"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73254-6_10"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01553"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref43","article-title":"Qwen2 technical report","year":"2024"}],"event":{"name":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","location":"Hangzhou, China","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11245651\/11245652\/11246684.pdf?arnumber=11246684","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T06:12:53Z","timestamp":1765519973000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11246684\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/iros60139.2025.11246684","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}