{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:17:59Z","timestamp":1777889879752,"version":"3.51.4"},"reference-count":66,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Plan of China","doi-asserted-by":"publisher","award":["2023YFC3310700"],"award-info":[{"award-number":["2023YFC3310700"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62036012,U21B2044,62236008,62472422,U2333215"],"award-info":[{"award-number":["62036012,U21B2044,62236008,62472422,U2333215"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005090","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","award":["4242051"],"award-info":[{"award-number":["4242051"]}],"id":[{"id":"10.13039\/501100005090","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00525","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"5536-5546","source":"Crossref","is-referenced-by-count":0,"title":["NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments"],"prefix":"10.1109","author":[{"given":"Xuan","family":"Yao","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences (CASIA),State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS)"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junyu","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences (CASIA),State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS)"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences (CASIA),State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS)"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Bevbert: Topo-metric map pre-training for language-guided navigation","author":"An","year":"2022","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"1st place solutions for rxr-habitat vision-and-language navigation competition","author":"An","year":"2022","journal-title":"arXiv preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3386695"},{"key":"ref4","article-title":"On evaluation of embodied navigation agents","author":"Anderson","year":"2018","journal-title":"arXiv preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref6","first-page":"671","article-title":"Sim-to-real transfer for vision-and-language navigation","volume-title":"In CoRL","author":"Anderson","year":"2021"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611071"},{"key":"ref8","first-page":"1384","article-title":"The robotslang benchmark: Dialog-guided robot localization and navigation","volume-title":"In CoRL","author":"Banerjee","year":"2021"},{"key":"ref9","article-title":"Navigation world models","author":"Bar","year":"2024","journal-title":"ArXiv"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00081"},{"key":"ref11","article-title":"Affordances-oriented planning using foundation models for continuous vision-language navigation","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2764"},{"key":"ref13","article-title":"A2nav: Action-aware zero-shot robot navigation by exploiting vision-and-language ability of foundation models","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00478"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3311447"},{"key":"ref16","first-page":"14902","article-title":"Fast-slow testtime adaptation for online vision-and-language navigation","volume-title":"In ICML","author":"Gao","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3546312"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01502"},{"key":"ref19","article-title":"Recurrent world models facilitate policy evolution","volume-title":"In NeurIPS","author":"Ha","year":"2018"},{"key":"ref20","first-page":"2555","article-title":"Learning latent dynamics for planning from pixels","volume-title":"In ICML","author":"Hafner","year":"2019"},{"key":"ref21","article-title":"Dream to control: Learning behaviors by latent imagination","volume-title":"In ICLR","author":"Hafner","year":"2020"},{"key":"ref22","article-title":"Mastering diverse domains through world models","author":"Hafner","year":"2023","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.507"},{"key":"ref24","article-title":"Frequency-enhanced data augmentation for vision-and-language navigation","volume-title":"In NeurIPS","author":"He","year":"2024"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01500"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00284"},{"key":"ref27","first-page":"20703","article-title":"Model-based imitation learning for urban driving","volume-title":"In NeurIPS","author":"Hu","year":"2022"},{"key":"ref28","article-title":"General evaluation for instruction conditioned navigation using dynamic time warping","author":"Ilharco","year":"2019","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956561"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01041"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581220"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01447"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_34"},{"key":"ref34","first-page":"104","article-title":"Beyond the nav-graph: Vision-andlanguage navigation in continuous environments","volume-title":"In ECCV","author":"Krantz","year":"2020"},{"key":"ref35","first-page":"15162","article-title":"Waypoint models for instructionguided navigation in continuous environments","volume-title":"In ICCV","author":"Krantz","year":"2021"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"ref37","article-title":"Memorymaze: Scenario driven benchmark and visual language navigation model for guiding blind people","author":"Kuribayashi","year":"2024","journal-title":"arXiv preprint"},{"issue":"1","key":"ref38","first-page":"1","article-title":"A path towards autonomous machine intelligence version 0.9. 2, 2022-06-27","volume":"62","author":"LeCun","year":"2022","journal-title":"Open Review"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01040"},{"key":"ref40","article-title":"Panogen: Text-conditioned panoramic environment generation for vision-and-language navigation","volume-title":"In NeurIPS","author":"Li","year":"2024"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01470"},{"key":"ref43","article-title":"Embodiedgpt: Vision-language pre-training via embodied chain of thought","volume-title":"In NeurIPS","author":"Mu","year":"2024"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981405"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00667"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10174-9"},{"key":"ref47","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume-title":"In NeurIPS","author":"Paszke","year":"2019"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02161"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01000"},{"issue":"7","key":"ref50","first-page":"8524","article-title":"Hop+: History-enhanced and orderaware pre-training for vision-and-language navigation","volume":"45","author":"Qiao","year":"2023","journal-title":"IEEE TPAMI"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.328"},{"key":"ref52","first-page":"627","article-title":"A reduction of imitation learning and structured prediction to noregret online learning","volume-title":"In AISTATS","author":"Ross","year":"2011"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3594806.3596532"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.3390\/s23218953"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00998"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01868"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01397"},{"key":"ref59","first-page":"15625","article-title":"Gridmm: Grid memory map for vision-andlanguage navigation","volume-title":"In ICCV","author":"Wang","year":"2023"},{"key":"ref60","article-title":"Sim-to-real transfer via 3d feature fields for vision-and-language navigation","volume-title":"In CoRL","author":"Wang","year":"2024"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01305"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1176\/appi.books.9781585622665.33114"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-023-09217-1"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01536"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.079"},{"key":"ref66","article-title":"Storm: Efficient stochastic transformer based world models for reinforcement learning","volume-title":"In NeurIPS","author":"Zhang","year":"2024"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444991.pdf?arnumber=11444991","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:23:56Z","timestamp":1777613036000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444991\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00525","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}