{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:37:52Z","timestamp":1777657072840,"version":"3.51.4"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726514","type":"print"},{"value":"9783031726521","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72652-1_27","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"459-476","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["LLM as\u00a0Copilot for\u00a0Coarse-Grained Vision-and-Language Navigation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5606-0702","authenticated-orcid":false,"given":"Yanyuan","family":"Qiao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8225-4673","authenticated-orcid":false,"given":"Qianyi","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8160-1796","authenticated-orcid":false,"given":"Jiajun","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0903-9131","authenticated-orcid":false,"given":"Jing","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3631-256X","authenticated-orcid":false,"given":"Qi","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"27_CR1","unstructured":"An, D., et al.: BEVBert: multimodal map pre-training for language-guided navigation. In: ICCV, pp. 2737\u20132748 (2023)"},{"key":"27_CR2","unstructured":"Anderson, P., et al.: On evaluation of embodied navigation agents. CoRR abs\/1807.06757 (2018)"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: CVPR, pp. 3674\u20133683 (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Chen, H., Suhr, A., Misra, D., Snavely, N., Artzi, Y.: TOUCHDOWN: natural language navigation and spatial reasoning in visual street environments. In: CVPR, pp. 12538\u201312547 (2019)","DOI":"10.1109\/CVPR.2019.01282"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Chen, S., Guhur, P., Schmid, C., Laptev, I.: History aware multimodal transformer for vision-and-language navigation. In: NeurIPS, pp. 5834\u20135847 (2021)","DOI":"10.1109\/ICCV48922.2021.00166"},{"key":"27_CR6","doi-asserted-by":"publisher","unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Learning from unlabeled 3D environments for vision-and-language navigation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13699, pp. 638\u2013655. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19842-7_37","DOI":"10.1007\/978-3-031-19842-7_37"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Think global, act local: dual-scale graph transformer for vision-and-language navigation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01604"},{"key":"27_CR8","doi-asserted-by":"crossref","unstructured":"Fan, Y., Chen, W., Jiang, T., Zhou, C., Zhang, Y., Wang, X.E.: Aerial vision-and-dialog navigation. In: Findings of the Association for Computational Linguistics (ACL 2023) (2023)","DOI":"10.18653\/v1\/2023.findings-acl.190"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Feng, W., Fu, T.J., Lu, Y., Wang, W.Y.: ULN: towards underspecified vision-and-language navigation. In: EMNLP, pp. 6394\u20136412 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.429"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Gao, C., et al.: Adaptive zone-aware hierarchical planner for vision-language navigation. In: CVPR, pp. 14911\u201314920 (2023)","DOI":"10.1109\/CVPR52729.2023.01432"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Guhur, P., Tapaswi, M., Chen, S., Laptev, I., Schmid, C.: Airbert: in-domain pretraining for vision-and-language navigation. In: ICCV, pp. 1634\u20131643 (2021)","DOI":"10.1109\/ICCV48922.2021.00166"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Hong, Y., Opazo, C.R., Wu, Q., Gould, S.: Sub-instruction aware vision-and-language navigation. In: Webber, B., Cohn, T., He, Y., Liu, Y. (eds.) EMNLP, pp. 3360\u20133376 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.271"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Hong, Y., Wu, Q., Qi, Y., Opazo, C.R., Gould, S.: : A recurrent vision-and-language BERT for navigation. In: CVPR, pp. 1643\u20131653 (2021)","DOI":"10.1109\/CVPR46437.2021.00169"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"Jain, V., Magalh\u00e3es, G., Ku, A., Vaswani, A., Ie, E., Baldridge, J.: Stay on the path: instruction fidelity in vision-and-language navigation, pp. 1862\u20131872 (2019)","DOI":"10.18653\/v1\/P19-1181"},{"key":"27_CR15","unstructured":"Kaufman, L., Rousseeuw, P.J.: Finding Groups in Data: An Introduction to Cluster Analysis. Wiley (2009)"},{"key":"27_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58604-1_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Krantz","year":"2020","unstructured":"Krantz, J., Wijmans, E., Majumdar, A., Batra, D., Lee, S.: Beyond the nav-graph: vision-and-language navigation in continuous environments. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_7"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Ku, A., Anderson, P., Patel, R., Ie, E., Baldridge, J.: Room-across-room: multilingual vision-and-language navigation with dense spatiotemporal grounding. In: EMNLP, pp. 4392\u20134412 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"27_CR18","doi-asserted-by":"crossref","unstructured":"Li, J., Tan, H., Bansal, M.: EnvEdit: environment editing for vision-and-language navigation. In: CVPR, pp. 15386\u201315396 (2022)","DOI":"10.1109\/CVPR52688.2022.01497"},{"key":"27_CR19","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022). https:\/\/api.semanticscholar.org\/CorpusID:246411402"},{"key":"27_CR20","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, Z., Yang, J., Wang, Y., Jiang, S.: KERM: knowledge enhanced reasoning for vision-and-language navigation. In: CVPR, pp. 2583\u20132592 (2023)","DOI":"10.1109\/CVPR52729.2023.00254"},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., Yuan, L.: Video-LLaVA: learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Lin, K., Chen, P., Huang, D., Li, T.H., Tan, M., Gan, C.: Learning vision-and-language navigation from Youtube videos. In: ICCV, pp. 8317\u20138326 (2023)","DOI":"10.1109\/ICCV51070.2023.00764"},{"key":"27_CR23","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Liu, R., Wang, X., Wang, W., Yang, Y.: Bird\u2019s-eye-view scene graph for vision-language navigation. In: ICCV, pp. 10968\u201310980 (2023)","DOI":"10.1109\/ICCV51070.2023.01007"},{"key":"27_CR25","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, H., Qi, Y., Wang, P., Zhang, Y., Wu, Q.: AerialVLN: vision-and-language navigation for UAVs. In: ICCV, October 2023, pp. 15384\u201315394 (2023)","DOI":"10.1109\/ICCV51070.2023.01411"},{"key":"27_CR26","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR. OpenReview.net (2019)"},{"key":"27_CR27","doi-asserted-by":"crossref","unstructured":"Nguyen, K., III, H.D.: Help, Anna! visual navigation with natural multimodal assistance via retrospective curiosity-encouraging imitation learning. In: Inui, K., Jiang, J., Ng, V., Wan, X. (eds.) EMNLP, pp. 684\u2013695. Association for Computational Linguistics (2019)","DOI":"10.18653\/v1\/D19-1063"},{"key":"27_CR28","doi-asserted-by":"crossref","unstructured":"Qi, Y., Pan, Z., Hong, Y., Yang, M., van\u00a0den Hengel, A., Wu, Q.: The road to know-where: an object-and-room informed sequential BERT for indoor vision-language navigation. In: ICCV, pp. 1655\u20131664 (2021)","DOI":"10.1109\/ICCV48922.2021.00168"},{"key":"27_CR29","doi-asserted-by":"crossref","unstructured":"Qi, Y., et al.: REVERIE: remote embodied visual referring expression in real indoor environments. In: CVPR, pp. 9979\u20139988 (2020)","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"27_CR30","first-page":"8524","volume":"45","author":"Y Qiao","year":"2023","unstructured":"Qiao, Y., Qi, Y., Hong, Y., Yu, Z., Wang, P., Wu, Q.: HOP+: history-enhanced and order-aware pre-training for vision-and-language navigation. IEEE TPAMI 45, 8524\u20138537 (2023)","journal-title":"IEEE TPAMI"},{"key":"27_CR31","doi-asserted-by":"crossref","unstructured":"Qiao, Y., Qi, Y., Yu, Z., Liu, J., Wu, Q.: March in chat: interactive prompting for remote embodied referring expression. In: ICCV, pp. 15758\u201315767 (2023)","DOI":"10.1109\/ICCV51070.2023.01444"},{"key":"27_CR32","doi-asserted-by":"crossref","unstructured":"Qiao, Y., Yu, Z., Wu, Q.: VLN-PETL: parameter-efficient transfer learning for vision-and-language navigation. In: ICCV, pp. 15443\u201315452 (2023)","DOI":"10.1109\/ICCV51070.2023.01416"},{"key":"27_CR33","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-BERT: sentence embeddings using siamese BERT-networks. In: Inui, K., Jiang, J., Ng, V., Wan, X. (eds.) EMNLP-IJCNLP, pp. 3980\u20133990 (2019)","DOI":"10.18653\/v1\/D19-1410"},{"key":"27_CR34","unstructured":"Roman, H.R., Bisk, Y., Thomason, J., Celikyilmaz, A., Gao, J.: RMM: a recursive mental model for dialog navigation. arXiv preprint arXiv:2005.00728 (2020)"},{"key":"27_CR35","doi-asserted-by":"crossref","unstructured":"Schumann, R., Zhu, W., Feng, W., Fu, T.J., Riezler, S., Wang, W.Y.: VELMA: verbalization embodiment of LLM agents for vision and language navigation in street view. In: AAAI (2023)","DOI":"10.1609\/aaai.v38i17.29858"},{"key":"27_CR36","doi-asserted-by":"crossref","unstructured":"Song, C.H., Wu, J., Washington, C., Sadler, B.M., Chao, W.L., Su, Y.: LLM-planner: few-shot grounded planning for embodied agents with large language models. arXiv preprint arXiv:2212.04088 (2022)","DOI":"10.1109\/ICCV51070.2023.00280"},{"key":"27_CR37","unstructured":"Thomason, J., Murray, M., Cakmak, M., Zettlemoyer, L.: Vision-and-dialog navigation. In: CoRL, pp. 394\u2013406 (2019)"},{"key":"27_CR38","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv abs\/2307.09288 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259950998"},{"key":"27_CR39","doi-asserted-by":"crossref","unstructured":"Wang, X., Wang, W., Shao, J., Yang, Y.: LANA: a language-capable navigator for instruction following and generation. In: CVPR, pp. 19048\u201319058 (2023)","DOI":"10.1109\/CVPR52729.2023.01826"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation. In: CVPR, pp. 6629\u20136638 (2019)","DOI":"10.1109\/CVPR.2019.00679"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Wang, Z., Li, X., Yang, J., Liu, Y., Jiang, S.: GridMM: grid memory map for vision-and-language navigation. In: ICCV, pp. 15625\u201315636 (2023)","DOI":"10.1109\/ICCV51070.2023.01432"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Scaling data generation in vision-and-language navigation. In: ICCV, pp. 12009\u201312020 (2023)","DOI":"10.1109\/ICCV51070.2023.01103"},{"key":"27_CR43","unstructured":"Zheng, L., et\u00a0al.: Judging LLM-as-a-judge with MT-bench and Chatbot arena. arXiv preprint arXiv:2306.05685 (2023)"},{"key":"27_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, F., Liang, X., Zhu, Y., Yu, Q., Chang, X., Liang, X.: SOON: scenario oriented object navigation with graph-based exploration. In: CVPR, pp. 12689\u201312699 (2021)","DOI":"10.1109\/CVPR46437.2021.01250"},{"key":"27_CR45","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Self-motivated communication agent for real-world vision-dialog navigation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00162"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72652-1_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T12:01:46Z","timestamp":1732968106000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72652-1_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031726514","9783031726521"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72652-1_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}