{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:00Z","timestamp":1765339620191,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"name":"National Key Research Plan","award":["2021YFB2900100"],"award-info":[{"award-number":["2021YFB2900100"]}]},{"name":"NSFC","award":["No. 62372265, No. 62302254, and No. 62402276"],"award-info":[{"award-number":["No. 62372265, No. 62302254, and No. 62402276"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754887","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"7444-7452","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OpenMap: Instruction Grounding via Open-Vocabulary Visual-Language Mapping"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7527-4669","authenticated-orcid":false,"given":"Danyang","family":"Li","sequence":"first","affiliation":[{"name":"School of Software, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1232-1190","authenticated-orcid":false,"given":"Zenghui","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Central South University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4733-8975","authenticated-orcid":false,"given":"Guangpeng","family":"Qi","sequence":"additional","affiliation":[{"name":"Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4192-5568","authenticated-orcid":false,"given":"Songtao","family":"Pang","sequence":"additional","affiliation":[{"name":"Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8571-3605","authenticated-orcid":false,"given":"Guangyong","family":"Shang","sequence":"additional","affiliation":[{"name":"Inspur Yunzhou Industrial Internet Co., Ltd, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5791-1890","authenticated-orcid":false,"given":"Qiang","family":"Ma","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4048-2684","authenticated-orcid":false,"given":"Zheng","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Software, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Austin Stone, and Daniel Kappler","author":"Chen Boyuan","year":"2023","unstructured":"Boyuan Chen, Fei Xia, Brian Ichter, Kanishka Rao, Keerthana Gopalakrishnan, Michael S Ryoo, Austin Stone, and Daniel Kappler. 2023b. Open-vocabulary queryable scene representations for real world planning. In ICRA."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Runnan Chen Youquan Liu Lingdong Kong Xinge Zhu Yuexin Ma Yikang Li Yuenan Hou Yu Qiao and Wenping Wang. 2023a. Clip2scene: Towards label-efficient 3d scene understanding by clip. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"e_1_3_2_1_4_1","volume-title":"Hongyuan Zhu, and Cheston Tan.","author":"Duan Jiafei","year":"2022","unstructured":"Jiafei Duan, Samson Yu, Hui Li Tan, Hongyuan Zhu, and Cheston Tan. 2022. A survey of embodied ai: From simulators to research tasks. IEEE Transactions on Emerging Topics in Computational Intelligence (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Sheng Fan Rui Liu Wenguan Wang and Yi Yang. 2024. Navigation instruction generation with bev perception and large language models. In ECCV.","DOI":"10.1007\/978-3-031-72670-5_21"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"e_1_3_2_1_8_1","volume-title":"Piotr Dollar, and Ross Girshick","author":"Gupta Agrim","year":"2019","unstructured":"Agrim Gupta, Piotr Dollar, and Ross Girshick. 2019. Lvis: A dataset for large vocabulary instance segmentation. In CVPR."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"e_1_3_2_1_10_1","volume-title":"Ivlmap: Instance-aware visual language grounding for consumer robot navigation. arXiv preprint arXiv:2403.19336","author":"Huang Jiacui","year":"2024","unstructured":"Jiacui Huang, Hongtao Zhang, Mingbo Zhao, and Zhou Wu. 2024b. Ivlmap: Instance-aware visual language grounding for consumer robot navigation. arXiv preprint arXiv:2403.19336 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"e_1_3_2_1_12_1","volume-title":"European Conference on Computer Vision.","author":"Huang Zhening","year":"2024","unstructured":"Zhening Huang, Xiaoyang Wu, Xi Chen, Hengshuang Zhao, Lei Zhu, and Joan Lasenby. 2024a. Openins3d: Snap and lookup for 3d open-vocabulary instance segmentation. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_13_1","volume-title":"International conference on machine learning.","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning."},{"key":"e_1_3_2_1_14_1","volume-title":"UK","author":"Krantz Jacob","year":"2020","unstructured":"Jacob Krantz, Erik Wijmans, Arjun Majumdar, Dhruv Batra, and Stefan Lee. 2020. Beyond the nav-graph: Vision-and-language navigation in continuous environments. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXVIII 16."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00342"},{"key":"e_1_3_2_1_16_1","volume-title":"Leovr: Motion-inspired visual-lidar fusion for environment depth estimation","author":"Li Danyang","year":"2023","unstructured":"Danyang Li, Jingao Xu, Zheng Yang, Qiang Ma, Li Zhang, and Pengpeng Chen. 2023. Leovr: Motion-inspired visual-lidar fusion for environment depth estimation. IEEE Transactions on Mobile Computing (2023)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538918"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM52122.2024.10621278"},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Feng Liang Bichen Wu Xiaoliang Dai Kunpeng Li Yinan Zhao Hang Zhang Peizhao Zhang Peter Vajda and Diana Marculescu. 2023. Open-vocabulary semantic segmentation with mask-adapted clip. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01007"},{"key":"e_1_3_2_1_22_1","volume-title":"InstructNav: Zero-shot System for Generic Instruction Navigation in Unexplored Environment. In 8th Annual Conference on Robot Learning.","author":"Long Yuxing","year":"2024","unstructured":"Yuxing Long, Wenzhe Cai, Hongcheng Wang, Guanqi Zhan, and Hao Dong. 2024. InstructNav: Zero-shot System for Generic Instruction Navigation in Unexplored Environment. In 8th Annual Conference on Robot Learning."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Qi Lu Jason Kuen Shen Tiancheng Gu Jiuxiang Guo Weidong Jia Jiaya Lin Zhe and Yang Ming-Hsuan. 2023b. High-Quality Entity Segmentation. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00374"},{"key":"e_1_3_2_1_24_1","volume-title":"Conference on Robot Learning.","author":"Lu Shiyang","year":"2023","unstructured":"Shiyang Lu, Haonan Chang, Eric Pu Jing, Abdeslam Boularias, and Kostas Bekris. 2023a. Ovir-3d: Open-vocabulary 3d instance retrieval without training on 3d data. In Conference on Robot Learning."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00385"},{"key":"e_1_3_2_1_26_1","volume-title":"Visual language navigation: A survey and open challenges. Artificial Intelligence Review","author":"Park Sang-Min","year":"2023","unstructured":"Sang-Min Park and Young-Gab Kim. 2023. Visual language navigation: A survey and open challenges. Artificial Intelligence Review (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Songyou Peng Kyle Genova Chiyu Jiang Andrea Tagliasacchi Marc Pollefeys Thomas Funkhouser et al. 2023. Openscene: 3d scene understanding with open vocabularies. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Lu Qi Jason Kuen Tiancheng Shen Jiuxiang Gu Wenbo Li Weidong Guo Jiaya Jia Zhe Lin and Ming-Hsuan Yang. 2023. High Quality Entity Segmentation. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00374"},{"key":"e_1_3_2_1_29_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"David Rozenberszki Or Litany and Angela Dai. 2022. Language-grounded indoor 3d semantic segmentation in the wild. In ECCV.","DOI":"10.1007\/978-3-031-19827-4_8"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Jonas Schult Francis Engelmann Alexander Hermans Or Litany Siyu Tang and Bastian Leibe. 2023. Mask3d: Mask transformer for 3d semantic instance segmentation. In ICRA.","DOI":"10.1109\/ICRA48891.2023.10160590"},{"key":"e_1_3_2_1_32_1","volume-title":"Alfred: A benchmark for interpreting grounded instructions for everyday tasks","author":"Shridhar Mohit","year":"2020","unstructured":"Mohit Shridhar, Jesse Thomason, Daniel Gordon, Yonatan Bisk, Winson Han, Roozbeh Mottaghi, Luke Zettlemoyer, and Dieter Fox. 2020. Alfred: A benchmark for interpreting grounded instructions for everyday tasks. In IEEE\/CVF CVPR."},{"key":"e_1_3_2_1_33_1","volume-title":"A survey of object goal navigation","author":"Sun Jingwen","year":"2024","unstructured":"Jingwen Sun, Jing Wu, Ze Ji, and Yu-Kun Lai. 2024. A survey of object goal navigation. IEEE Transactions on Automation Science and Engineering (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Openmask3d: Open-vocabulary 3d instance segmentation. arXiv preprint arXiv:2306.13631","author":"Takmaz Ay\u00e7a","year":"2023","unstructured":"Ay\u00e7a Takmaz, Elisabetta Fedele, Robert W Sumner, Marc Pollefeys, Federico Tombari, and Francis Engelmann. 2023. Openmask3d: Open-vocabulary 3d instance segmentation. arXiv preprint arXiv:2306.13631 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-44480-7_21"},{"key":"e_1_3_2_1_36_1","volume-title":"Xiaoqi Li, Mingdong Wu, and Hao Dong.","author":"Wang Hongcheng","year":"2023","unstructured":"Hongcheng Wang, Andy Guan Hong Chen, Xiaoqi Li, Mingdong Wu, and Hao Dong. 2023a. Find what you want: Learning demand-conditioned object attribute space for demand-driven navigation. Advances in Neural Information Processing Systems (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01432"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.077"},{"key":"e_1_3_2_1_39_1","unstructured":"Jianzong Wu Xiangtai Li Shilin Xu Haobo Yuan Henghui Ding Yibo Yang Xia Li Jiangning Zhang Yunhai Tong Xudong Jiang et al. 2024b. Towards open vocabulary learning: A survey. IEEE TPAMI (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Vision-language navigation: a survey and taxonomy. Neural Computing and Applications","author":"Wu Wansen","year":"2024","unstructured":"Wansen Wu, Tao Chang, Xinmeng Li, Quanjun Yin, and Yue Hu. 2024a. Vision-language navigation: a survey and taxonomy. Neural Computing and Applications (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155438"},{"key":"e_1_3_2_1_42_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Xu Jingao","year":"2022","unstructured":"Jingao Xu, Hao Cao, Zheng Yang, Longfei Shangguan, Jialin Zhang, Xiaowu He, and Yunhao Liu. 2022. {SwarmMap}: Scaling up real-time collaborative visual {SLAM} at the edge. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00477"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02671"},{"key":"e_1_3_2_1_45_1","volume-title":"European Conference on Computer Vision.","author":"Yuan Haobo","year":"2024","unstructured":"Haobo Yuan, Xiangtai Li, Chong Zhou, Yining Li, Kai Chen, and Chen Change Loy. 2024a. Open-vocabulary SAM: Segment and recognize twenty-thousand classes interactively. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Haobo Yuan Xiangtai Li Chong Zhou Yining Li Kai Chen and Chen Change Loy. 2024b. Open-Vocabulary SAM: Segment and Recognize Twenty-thousand Classes Interactively. In ECCV.","DOI":"10.1007\/978-3-031-72775-7_24"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_48_1","volume-title":"A survey on open-vocabulary detection and segmentation: Past, present, and future","author":"Zhu Chaoyang","year":"2024","unstructured":"Chaoyang Zhu and Long Chen. 2024. A survey on open-vocabulary detection and segmentation: Past, present, and future. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754887","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:03:25Z","timestamp":1765339405000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754887"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3754887","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754887","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}