{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T15:42:25Z","timestamp":1775144545857,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681150","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"856-865","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Narrowing the Gap between Vision and Action in Navigation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2153-6536","authenticated-orcid":false,"given":"Yue","family":"Zhang","sequence":"first","affiliation":[{"name":"Michigan State University, East Lansing, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4606-1824","authenticated-orcid":false,"given":"Parisa","family":"Kordjamshidi","sequence":"additional","affiliation":[{"name":"Michigan State University, East Lansing, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2737--2748","author":"An Dong","year":"2023","unstructured":"Dong An, Yuankai Qi, Yangguang Li, Yan Huang, Liang Wang, Tieniu Tan, and Jing Shao. 2023. Bevbert: Multimodal map pre-training for language-guided navigation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2737--2748."},{"key":"e_1_3_2_2_2_1","volume-title":"ETPNav: Evolving Topological Planning for Vision-Language Navigation in Continuous Environments. arXiv preprint arXiv:2304.03047","author":"An Dong","year":"2023","unstructured":"Dong An, Hanqing Wang, Wenguan Wang, Zun Wang, Yan Huang, Keji He, and Liang Wang. 2023. ETPNav: Evolving Topological Planning for Vision-Language Navigation in Continuous Environments. arXiv preprint arXiv:2304.03047 (2023)."},{"key":"e_1_3_2_2_3_1","volume-title":"Alexey Dosovitskiy, Saurabh Gupta, Vladlen Koltun, Jana Kosecka, Jitendra Malik, Roozbeh Mottaghi, Manolis Savva, et al.","author":"Anderson Peter","year":"2018","unstructured":"Peter Anderson, Angel Chang, Devendra Singh Chaplot, Alexey Dosovitskiy, Saurabh Gupta, Vladlen Koltun, Jana Kosecka, Jitendra Malik, Roozbeh Mottaghi, Manolis Savva, et al. 2018. On evaluation of embodied navigation agents. arXiv preprint arXiv:1807.06757 (2018)."},{"key":"e_1_3_2_2_4_1","volume-title":"Conference on Robot Learning. PMLR, 671--681","author":"Anderson Peter","year":"2021","unstructured":"Peter Anderson, Ayush Shrivastava, Joanne Truong, Arjun Majumdar, Devi Parikh, Dhruv Batra, and Stefan Lee. 2021. Sim-to-real transfer for vision-and-language navigation. In Conference on Robot Learning. PMLR, 671--681."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00081"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01282"},{"key":"e_1_3_2_2_8_1","first-page":"38149","article-title":"Weakly-supervised multi-granularity map learning for vision-and-language navigation","volume":"35","author":"Chen Peihao","year":"2022","unstructured":"Peihao Chen, Dongyu Ji, Kunyang Lin, Runhao Zeng, Thomas Li, Mingkui Tan, and Chuang Gan. 2022. Weakly-supervised multi-granularity map learning for vision-and-language navigation. Advances in Neural Information Processing Systems 35 (2022), 38149--38161.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_9_1","volume-title":"History aware multimodal transformer for vision-and-language navigation. Advances in neural information processing systems 34","author":"Chen Shizhe","year":"2021","unstructured":"Shizhe Chen, Pierre-Louis Guhur, Cordelia Schmid, and Ivan Laptev. 2021. History aware multimodal transformer for vision-and-language navigation. Advances in neural information processing systems 34 (2021), 5834--5847."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01604"},{"key":"e_1_3_2_2_11_1","volume-title":"Louis- Philippe Morency","author":"Fried Daniel","year":"2018","unstructured":"Daniel Fried, Ronghang Hu, Volkan Cirik, Anna Rohrbach, Jacob Andreas, Louis- Philippe Morency, Taylor Berg-Kirkpatrick, Kate Saenko, Dan Klein, and Trevor Darrell. 2018. Speaker-follower models for vision-and-language navigation. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01502"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01500"},{"key":"e_1_3_2_2_16_1","volume-title":"A recurrent vision-and-language bert for navigation. arXiv preprint arXiv:2011.13922","author":"Hong Yicong","year":"2020","unstructured":"Yicong Hong, Qi Wu, Yuankai Qi, Cristian Rodriguez-Opazo, and Stephen Gould. 2020. A recurrent vision-and-language bert for navigation. arXiv preprint arXiv:2011.13922 (2020)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00284"},{"key":"e_1_3_2_2_18_1","volume-title":"General evaluation for instruction conditioned navigation using dynamic time warping. arXiv preprint arXiv:1907.05446","author":"Ilharco Gabriel","year":"2019","unstructured":"Gabriel Ilharco, Vihan Jain, Alexander Ku, Eugene Ie, and Jason Baldridge. 2019. General evaluation for instruction conditioned navigation using dynamic time warping. arXiv preprint arXiv:1907.05446 (2019)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1181"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01488"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_34"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_7"},{"key":"e_1_3_2_2_23_1","volume-title":"Beyond the Nav-Graph: Vision and Language Navigation in Continuous Environments. In European Conference on Computer Vision (ECCV).","author":"Krantz Jacob","year":"2020","unstructured":"Jacob Krantz, Erik Wijmans, Arjun Majundar, Dhruv Batra, and Stefan Lee. 2020. Beyond the Nav-Graph: Vision and Language Navigation in Continuous Environments. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_2_24_1","volume-title":"Room-Across-Room: Multilingual Vision-and-Language Navigation with Dense Spatiotemporal Grounding. In Conference on Empirical Methods for Natural Language Processing (EMNLP).","author":"Ku Alexander","year":"2020","unstructured":"Alexander Ku, Peter Anderson, Roma Patel, Eugene Ie, and Jason Baldridge. 2020. Room-Across-Room: Multilingual Vision-and-Language Navigation with Dense Spatiotemporal Grounding. In Conference on Empirical Methods for Natural Language Processing (EMNLP)."},{"key":"e_1_3_2_2_25_1","volume-title":"Panogen: Text-conditioned panoramic environment generation for vision-and-language navigation. Advances in Neural Information Processing Systems 36","author":"Li Jialu","year":"2024","unstructured":"Jialu Li and Mohit Bansal. 2024. Panogen: Text-conditioned panoramic environment generation for vision-and-language navigation. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_2_26_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01497"},{"key":"e_1_3_2_2_28_1","volume-title":"Self-monitoring navigation agent via auxiliary progress estimation. arXiv preprint arXiv:1901.03035","author":"Ma Chih-Yao","year":"2019","unstructured":"Chih-Yao Ma, Jiasen Lu, Zuxuan Wu, Ghassan AlRegib, Zsolt Kira, Richard Socher, and Caiming Xiong. 2019. Self-monitoring navigation agent via auxiliary progress estimation. arXiv preprint arXiv:1901.03035 (2019)."},{"key":"e_1_3_2_2_29_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_30_1","volume-title":"Tsung-Yen Yang, Ruslan Partsey, Ruta Desai, Alexander William Clegg, Michal Hlavac, So Yeon Min, et al.","author":"Puig Xavier","year":"2023","unstructured":"Xavier Puig, Eric Undersander, Andrew Szot, Mikael Dallaire Cote, Tsung-Yen Yang, Ruslan Partsey, Ruta Desai, Alexander William Clegg, Michal Hlavac, So Yeon Min, et al. 2023. Habitat 3.0: A co-habitat for humans, avatars and robots. arXiv preprint arXiv:2310.13724 (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9982--9991","author":"Qi Yuankai","unstructured":"Yuankai Qi, Qi Wu, Peter Anderson, Xin Wang, William Yang Wang, Chunhua Shen, and Anton van den Hengel. 2020. Reverie: Remote embodied visual referring expression in real indoor environments. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9982--9991."},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 15418--15427","author":"Qiao Yanyuan","year":"2022","unstructured":"Yanyuan Qiao, Yuankai Qi, Yicong Hong, Zheng Yu, Peng Wang, and Qi Wu. 2022. Hop: history-and-order aware pre-training for vision-and-language navigation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 15418--15427."},{"key":"e_1_3_2_2_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_2_35_1","volume-title":"Language-aligned waypoint (law) supervision for vision-and-language navigation in continuous environments. arXiv preprint arXiv:2109.15207","author":"Raychaudhuri Sonia","year":"2021","unstructured":"Sonia Raychaudhuri, Saim Wani, Shivansh Patel, Unnat Jain, and Angel X Chang. 2021. Language-aligned waypoint (law) supervision for vision-and-language navigation in continuous environments. arXiv preprint arXiv:2109.15207 (2021)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Olga Russakovsky Jia Deng Hao Su Jonathan Krause Sanjeev Satheesh Sean Ma Zhiheng Huang Andrej Karpathy Aditya Khosla Michael Bernstein et al. 2015. Imagenet large scale visual recognition challenge. International journal of computer vision 115 (2015) 211--252.","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"e_1_3_2_2_39_1","unstructured":"Andrew Szot Alex Clegg Eric Undersander Erik Wijmans Yili Zhao John Turner Noah Maestre Mustafa Mukadam Devendra Chaplot Oleksandr Maksymets Aaron Gokaslan Vladimir Vondrus Sameer Dharur Franziska Meier Wojciech Galuba Angel Chang Zsolt Kira Vladlen Koltun Jitendra Malik Manolis Savva and Dhruv Batra. 2021. Habitat 2.0: Training Home Assistants to Rearrange their Habitat. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_2_40_1","volume-title":"Learning to navigate unseen environments: Back translation with environmental dropout. arXiv preprint arXiv:1904.04195","author":"Tan Hao","year":"2019","unstructured":"Hao Tan, Licheng Yu, and Mohit Bansal. 2019. Learning to navigate unseen environments: Back translation with environmental dropout. arXiv preprint arXiv:1904.04195 (2019)."},{"key":"e_1_3_2_2_41_1","volume-title":"Conference on Robot Learning. PMLR, 394--406","author":"Thomason Jesse","year":"2020","unstructured":"Jesse Thomason, Michael Murray, Maya Cakmak, and Luke Zettlemoyer. 2020. Vision-and-dialog navigation. In Conference on Robot Learning. PMLR, 394--406."},{"key":"e_1_3_2_2_42_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00835"},{"key":"e_1_3_2_2_44_1","volume-title":"Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191","author":"Wang Yi","year":"2022","unstructured":"Yi Wang, Kunchang Li, Yizhuo Li, Yinan He, Bingkun Huang, Zhiyu Zhao, Hongjie Zhang, Jilan Xu, Yi Liu, Zun Wang, et al. 2022. Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01103"},{"key":"e_1_3_2_2_46_1","volume-title":"Dd-ppo: Learning near-perfect pointgoal navigators from 2.5 billion frames. arXiv preprint arXiv:1911.00357","author":"Wijmans Erik","year":"2019","unstructured":"Erik Wijmans, Abhishek Kadian, Ari Morcos, Stefan Lee, Irfan Essa, Devi Parikh, Manolis Savva, and Dhruv Batra. 2019. Dd-ppo: Learning near-perfect pointgoal navigators from 2.5 billion frames. arXiv preprint arXiv:1911.00357 (2019)."},{"key":"e_1_3_2_2_47_1","volume-title":"Building generalizable agents with a realistic and rich 3d environment. arXiv preprint arXiv:1801.02209","author":"Wu Yi","year":"2018","unstructured":"Yi Wu, Yuxin Wu, Georgia Gkioxari, and Yuandong Tian. 2018. Building generalizable agents with a realistic and rich 3d environment. arXiv preprint arXiv:1801.02209 (2018)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00945"},{"key":"e_1_3_2_2_49_1","volume-title":"Common Sense Reasoning for Deep Fake Detection. arXiv preprint arXiv:2402.00126","author":"Zhang Yue","year":"2024","unstructured":"Yue Zhang, Ben Colman, Ali Shahriyari, and Gaurav Bharaj. 2024. Common Sense Reasoning for Deep Fake Detection. arXiv preprint arXiv:2402.00126 (2024)."},{"key":"e_1_3_2_2_50_1","volume-title":"Towards navigation by reasoning over spatial configurations. arXiv preprint arXiv:2105.06839","author":"Zhang Yue","year":"2021","unstructured":"Yue Zhang, Quan Guo, and Parisa Kordjamshidi. 2021. Towards navigation by reasoning over spatial configurations. arXiv preprint arXiv:2105.06839 (2021)."},{"key":"e_1_3_2_2_51_1","volume-title":"NavHint: Vision and language navigation agent with a hint generator. arXiv preprint arXiv:2402.02559","author":"Zhang Yue","year":"2024","unstructured":"Yue Zhang, Quan Guo, and Parisa Kordjamshidi. 2024. NavHint: Vision and language navigation agent with a hint generator. arXiv preprint arXiv:2402.02559 (2024)."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-srw.24"},{"key":"e_1_3_2_2_53_1","volume-title":"Lovis: Learning orientation and visual signals for vision and language navigation. arXiv preprint arXiv:2209.12723","author":"Zhang Yue","year":"2022","unstructured":"Yue Zhang and Parisa Kordjamshidi. 2022. Lovis: Learning orientation and visual signals for vision and language navigation. arXiv preprint arXiv:2209.12723 (2022)."},{"key":"e_1_3_2_2_54_1","volume-title":"Vln-trans: Translator for the vision and language navigation agent. arXiv preprint arXiv:2302.09230","author":"Zhang Yue","year":"2023","unstructured":"Yue Zhang and Parisa Kordjamshidi. 2023. Vln-trans: Translator for the vision and language navigation agent. arXiv preprint arXiv:2302.09230 (2023)."},{"key":"e_1_3_2_2_55_1","volume-title":"Vision-and-Language Navigation Today and Tomorrow: A Survey in the Era of Foundation Models. arXiv preprint arXiv:2407.07035","author":"Zhang Yue","year":"2024","unstructured":"Yue Zhang, Ziqiao Ma, Jialu Li, Yanyuan Qiao, Zun Wang, Joyce Chai, Qi Wu, Mohit Bansal, and Parisa Kordjamshidi. 2024. Vision-and-Language Navigation Today and Tomorrow: A Survey in the Era of Foundation Models. arXiv preprint arXiv:2407.07035 (2024)."},{"key":"e_1_3_2_2_56_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01250"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01003"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681150","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681150","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681150"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3681150","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681150","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}