{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T16:37:05Z","timestamp":1780418225423,"version":"3.54.1"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726668","type":"print"},{"value":"9783031726675","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72667-5_15","type":"book-chapter","created":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:11:48Z","timestamp":1727554308000},"page":"260-278","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":42,"title":["NavGPT-2: Unleashing Navigational Reasoning Capability for\u00a0Large Vision-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0279-9277","authenticated-orcid":false,"given":"Gengze","family":"Zhou","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5068-1508","authenticated-orcid":false,"given":"Yicong","family":"Hong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9502-050X","authenticated-orcid":false,"given":"Zun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2605-5504","authenticated-orcid":false,"given":"Xin Eric","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3631-256X","authenticated-orcid":false,"given":"Qi","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,29]]},"reference":[{"key":"15_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"An, D., Qi, Y., Huang, Y., Wu, Q., Wang, L., Tan, T.: Neighbor-view enhanced model for vision and language navigation. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 5101\u20135109 (2021)","DOI":"10.1145\/3474085.3475282"},{"key":"15_CR3","unstructured":"An, D., et al.: BEVBert: topo-metric map pre-training for language-guided navigation. arXiv preprint arXiv:2212.04385 (2022)"},{"key":"15_CR4","unstructured":"Anderson, P., et\u00a0al.: On evaluation of embodied navigation agents. arXiv preprint arXiv:1807.06757 (2018)"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3674\u20133683 (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"15_CR6","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3D: learning from RGB-D data in indoor environments. In: 2017 International Conference on 3D Vision (3DV), pp. 667\u2013676. IEEE (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"15_CR8","unstructured":"Chen, J., Lin, B., Xu, R., Chai, Z., Liang, X., Wong, K.Y.K.: MapGPT: map-guided prompting for unified vision-and-language navigation. arXiv preprint arXiv:2401.07314 (2024)"},{"key":"15_CR9","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Chen, K., Chen, J.K., Chuang, J., V\u00e1zquez, M., Savarese, S.: Topological planning with transformers for vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11276\u201311286 (2021)","DOI":"10.1109\/CVPR46437.2021.01112"},{"key":"15_CR11","first-page":"5834","volume":"34","author":"S Chen","year":"2021","unstructured":"Chen, S., Guhur, P.L., Schmid, C., Laptev, I.: History aware multimodal transformer for vision-and-language navigation. Adv. Neural. Inf. Process. Syst. 34, 5834\u20135847 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Think global, act local: dual-scale graph transformer for vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16537\u201316547 (2022)","DOI":"10.1109\/CVPR52688.2022.01604"},{"key":"15_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal Image-TExt Representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"15_CR14","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality, March 2023. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"15_CR15","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"15_CR16","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"15_CR17","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Dou, Z.Y., Peng, N.: FOAM: a follower-aware speaker model for vision-and-language navigation. arXiv preprint arXiv:2206.04294 (2022)","DOI":"10.18653\/v1\/2022.naacl-main.322"},{"key":"15_CR19","unstructured":"Driess, D., et\u00a0al.: PaLM-E: an embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: EVA: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"15_CR21","unstructured":"Fried, D., et al.: Speaker-follower models for vision-and-language navigation. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Guhur, P.L., Tapaswi, M., Chen, S., Laptev, I., Schmid, C.: Airbert: in-domain pretraining for vision-and-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1634\u20131643 (2021)","DOI":"10.1109\/ICCV48922.2021.00166"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Hao, W., Li, C., Li, X., Carin, L., Gao, J.: Towards learning a generic agent for vision-and-language navigation via pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13137\u201313146 (2020)","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Hong, Y., Wu, Q., Qi, Y., Rodriguez-Opazo, C., Gould, S.: A recurrent vision-and-language BERT for navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1643\u20131653, June 2021","DOI":"10.1109\/CVPR46437.2021.00169"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Huang, H., et al.: Transferable representation learning in vision-and-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7404\u20137413 (2019)","DOI":"10.1109\/ICCV.2019.00750"},{"key":"15_CR26","unstructured":"Ilharco, G., Jain, V., Ku, A., Ie, E., Baldridge, J.: General evaluation for instruction conditioned navigation using dynamic time warping. arXiv preprint arXiv:1907.05446 (2019)"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Kamath, A., et al.: A new path: scaling vision-and-language navigation with synthetic instructions and imitation learning. arXiv preprint arXiv:2210.03112 (2022)","DOI":"10.1109\/CVPR52729.2023.01041"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Ke, L., et al.: Tactical rewind: self-correction via backtracking in vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6741\u20136749 (2019)","DOI":"10.1109\/CVPR.2019.00690"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Ku, A., Anderson, P., Patel, R., Ie, E., Baldridge, J.: Room-across-room: multilingual vision-and-language navigation with dense spatiotemporal grounding. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 4392\u20134412 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"15_CR30","unstructured":"Li, J., Bansal, M.: PanoGen: text-conditioned panoramic environment generation for vision-and-language navigation. arXiv preprint arXiv:2305.19195 (2023)"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Li, J., Tan, H., Bansal, M.: EnvEdit: environment editing for vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15407\u201315417 (2022)","DOI":"10.1109\/CVPR52688.2022.01497"},{"key":"15_CR32","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"15_CR33","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"15_CR34","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, Z., Yang, J., Wang, Y., Jiang, S.: KERM: knowledge enhanced reasoning for vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2583\u20132592 (2023)","DOI":"10.1109\/CVPR52729.2023.00254"},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Robust navigation with language pretraining and stochastic sampling. arXiv preprint arXiv:1909.02244 (2019)","DOI":"10.18653\/v1\/D19-1159"},{"key":"15_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"15_CR37","unstructured":"Lin, B., et al.: NavCoT: boosting LLM-based vision-and-language navigation via learning disentangled reasoning. arXiv preprint arXiv:2403.07376 (2024)"},{"key":"15_CR38","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, Y., Chen, Z., Liang, X., Liu, J., Liang, X.: Adapt: vision-language navigation with modality-aligned action prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15396\u201315406 (2022)","DOI":"10.1109\/CVPR52688.2022.01496"},{"key":"15_CR39","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"15_CR40","doi-asserted-by":"crossref","unstructured":"Liu, R., Wang, X., Wang, W., Yang, Y.: Bird\u2019s-eye-view scene graph for vision-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10968\u201310980 (2023)","DOI":"10.1109\/ICCV51070.2023.01007"},{"key":"15_CR41","unstructured":"Long, Y., Cai, W., Wang, H., Zhan, G., Dong, H.: InstructNav: zero-shot system for generic instruction navigation in unexplored environment. arXiv preprint arXiv:2406.04882 (2024)"},{"key":"15_CR42","doi-asserted-by":"crossref","unstructured":"Long, Y., Li, X., Cai, W., Dong, H.: Discuss before moving: visual language navigation via multi-expert discussions. arXiv preprint arXiv:2309.11382 (2023)","DOI":"10.1109\/ICRA57147.2024.10611565"},{"key":"15_CR43","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"15_CR44","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"15_CR45","unstructured":"Ma, C.Y., et al.: Self-monitoring navigation agent via auxiliary progress estimation. arXiv preprint arXiv:1901.03035 (2019)"},{"key":"15_CR46","doi-asserted-by":"crossref","unstructured":"Ma, C.Y., Wu, Z., AlRegib, G., Xiong, C., Kira, Z.: The regretful agent: heuristic-aided navigation through progress estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6732\u20136740 (2019)","DOI":"10.1109\/CVPR.2019.00689"},{"key":"15_CR47","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1007\/978-3-030-58539-6_16","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Majumdar","year":"2020","unstructured":"Majumdar, A., Shrivastava, A., Lee, S., Anderson, P., Parikh, D., Batra, D.: Improving vision-and-language navigation with image-text pairs from the web. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 259\u2013274. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_16"},{"key":"15_CR48","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Pan, B., et al.: LangNav: language as a perceptual representation for navigation. arXiv preprint arXiv:2310.07889 (2023)","DOI":"10.18653\/v1\/2024.findings-naacl.60"},{"key":"15_CR50","first-page":"5296","volume":"33","author":"A Parvaneh","year":"2020","unstructured":"Parvaneh, A., Abbasnejad, E., Teney, D., Shi, J.Q., van den Hengel, A.: Counterfactual vision-and-language navigation: unravelling the unseen. Adv. Neural. Inf. Process. Syst. 33, 5296\u20135307 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR51","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"15_CR52","doi-asserted-by":"crossref","unstructured":"Qi, Y., et al.: REVERIE: remote embodied visual referring expression in real indoor environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9982\u20139991 (2020)","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"15_CR53","doi-asserted-by":"crossref","unstructured":"Qiao, Y., Qi, Y., Hong, Y., Yu, Z., Wang, P., Wu, Q.: HOP+: history-enhanced and order-aware pre-training for vision-and-language navigation. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3234243"},{"key":"15_CR54","doi-asserted-by":"crossref","unstructured":"Qiao, Y., Qi, Y., Yu, Z., Liu, J., Wu, Q.: March in chat: interactive prompting for remote embodied referring expression. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15758\u201315767 (2023)","DOI":"10.1109\/ICCV51070.2023.01444"},{"issue":"8","key":"15_CR55","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"15_CR56","unstructured":"Ramakrishnan, S.K., et\u00a0al.: Habitat-Matterport 3D dataset (HM3D): 1000 large-scale 3D environments for embodied AI. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) (2021)"},{"key":"15_CR57","unstructured":"Ross, S., Gordon, G., Bagnell, D.: A reduction of imitation learning and structured prediction to no-regret online learning. In: Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics, pp. 627\u2013635. JMLR Workshop and Conference Proceedings (2011)"},{"key":"15_CR58","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)"},{"key":"15_CR59","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5100\u20135111 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"15_CR60","doi-asserted-by":"crossref","unstructured":"Tan, H., Yu, L., Bansal, M.: Learning to navigate unseen environments: back translation with environmental dropout. In: Proceedings of NAACL-HLT, pp. 2610\u20132621 (2019)","DOI":"10.18653\/v1\/N19-1268"},{"key":"15_CR61","unstructured":"Thomason, J., Murray, M., Cakmak, M., Zettlemoyer, L.: Vision-and-dialog navigation. In: Conference on Robot Learning, pp. 394\u2013406 (2020)"},{"key":"15_CR62","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"15_CR63","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"15_CR64","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"15_CR65","doi-asserted-by":"crossref","unstructured":"Wang, H., Liang, W., Shen, J., Van\u00a0Gool, L., Wang, W.: Counterfactual cycle-consistent learning for instruction following and generation in vision-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 15471\u201315481 (2022)","DOI":"10.1109\/CVPR52688.2022.01503"},{"key":"15_CR66","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1007\/978-3-030-58542-6_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"H Wang","year":"2020","unstructured":"Wang, H., Wang, W., Shu, T., Liang, W., Shen, J.: Active visual information gathering for vision-language navigation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12367, pp. 307\u2013322. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58542-6_19"},{"key":"15_CR67","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"15_CR68","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6629\u20136638 (2019)","DOI":"10.1109\/CVPR.2019.00679"},{"key":"15_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/978-3-030-01270-0_3","volume-title":"Computer Vision \u2013 ECCV 2018","author":"X Wang","year":"2018","unstructured":"Wang, X., Xiong, W., Wang, H., Wang, W.Y.: Look before you leap: bridging model-free and model-based reinforcement learning for planned-ahead vision-and-language navigation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11220, pp. 38\u201355. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01270-0_3"},{"key":"15_CR70","doi-asserted-by":"crossref","unstructured":"Wang, Z., Li, X., Yang, J., Liu, Y., Jiang, S.: GridMM: grid memory map for vision-and-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15625\u201315636 (2023)","DOI":"10.1109\/ICCV51070.2023.01432"},{"key":"15_CR71","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Scaling data generation in vision-and-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12009\u201312020 (2023)","DOI":"10.1109\/ICCV51070.2023.01103"},{"key":"15_CR72","doi-asserted-by":"crossref","unstructured":"Xia, F., Zamir, A.R., He, Z., Sax, A., Malik, J., Savarese, S.: Gibson Env: real-world perception for embodied agents. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 9068\u20139079 (2018)","DOI":"10.1109\/CVPR.2018.00945"},{"key":"15_CR73","unstructured":"Zhan, Z., Yu, L., Yu, S., Tan, G.: MC-GPT: empowering vision-and-language navigation with memory map and reasoning chains. arXiv preprint arXiv:2405.10620 (2024)"},{"key":"15_CR74","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: NaVid: video-based VLM plans the next step for vision-and-language navigation. arXiv preprint arXiv:2402.15852 (2024)","DOI":"10.15607\/RSS.2024.XX.079"},{"key":"15_CR75","doi-asserted-by":"crossref","unstructured":"Zhao, Y., et al.: Target-driven structured transformer planner for vision-language navigation. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4194\u20134203 (2022)","DOI":"10.1145\/3503161.3548281"},{"key":"15_CR76","doi-asserted-by":"crossref","unstructured":"Zheng, D., Huang, S., Zhao, L., Zhong, Y., Wang, L.: Towards learning a generalist model for embodied navigation. arXiv preprint arXiv:2312.02010 (2023)","DOI":"10.1109\/CVPR52733.2024.01293"},{"key":"15_CR77","doi-asserted-by":"crossref","unstructured":"Zhou, G., Hong, Y., Wu, Q.: NavGPT: explicit reasoning in vision-and-language navigation with large language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 7641\u20137649 (2024)","DOI":"10.1609\/aaai.v38i7.28597"},{"key":"15_CR78","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"15_CR79","doi-asserted-by":"crossref","unstructured":"Zhu, F., Zhu, Y., Chang, X., Liang, X.: Vision-language navigation with self-supervised auxiliary reasoning tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10012\u201310022 (2020)","DOI":"10.1109\/CVPR42600.2020.01003"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72667-5_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:09:09Z","timestamp":1732828149000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72667-5_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,29]]},"ISBN":["9783031726668","9783031726675"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72667-5_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,29]]},"assertion":[{"value":"29 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}