{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:23:04Z","timestamp":1778257384262,"version":"3.51.4"},"reference-count":45,"publisher":"Informa UK Limited","issue":"19-20","funder":[{"name":"iHUB-Data, IIIT Hyderabad"}],"content-domain":{"domain":["www.tandfonline.com"],"crossmark-restriction":true},"short-container-title":["Advanced Robotics"],"published-print":{"date-parts":[[2024,10,17]]},"DOI":"10.1080\/01691864.2024.2395926","type":"journal-article","created":{"date-parts":[[2024,8,29]],"date-time":"2024-08-29T11:41:28Z","timestamp":1724931688000},"page":"1378-1391","update-policy":"https:\/\/doi.org\/10.1080\/tandf_crossmark_01","source":"Crossref","is-referenced-by-count":3,"title":["Open-set 3D semantic instance maps for vision language navigation \u2013 O3D-SIM"],"prefix":"10.1080","volume":"38","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6422-1749","authenticated-orcid":false,"given":"Laksh","family":"Nanwani","sequence":"first","affiliation":[{"name":"Robotics Research Center, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4108-4342","authenticated-orcid":false,"given":"Kumaraditya","family":"Gupta","sequence":"additional","affiliation":[{"name":"Robotics Research Center, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3007-8652","authenticated-orcid":false,"given":"Aditya","family":"Mathur","sequence":"additional","affiliation":[{"name":"Robotics Research Center, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6767-8612","authenticated-orcid":false,"given":"Swayam","family":"Agrawal","sequence":"additional","affiliation":[{"name":"Robotics Research Center, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1908-5521","authenticated-orcid":false,"given":"A. H. Abdul","family":"Hafez","sequence":"additional","affiliation":[{"name":"Faculty of Engineering, Hasan Kalyoncu University, Gaziantep, Turkey"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7846-7901","authenticated-orcid":false,"given":"K. Madhava","family":"Krishna","sequence":"additional","affiliation":[{"name":"Robotics Research Center, International Institute of Information Technology, Hyderabad, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"301","published-online":{"date-parts":[[2024,8,29]]},"reference":[{"key":"e_1_3_3_2_1","doi-asserted-by":"crossref","unstructured":"Huang C Mees O Zeng A et\u00a0al. Visual language maps for robot navigation. In: Proceedings of the IEEE International Conference on Robotics and Automation (ICRA) London UK. 2023.","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"e_1_3_3_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10174-9"},{"key":"e_1_3_3_4_1","doi-asserted-by":"crossref","unstructured":"Gu Q Kuwajerwala A Morin S et\u00a0al. Conceptgraphs: open-vocabulary 3D scene graphs for perception and planning. 2023. arXiv.","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"e_1_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2023.3248510"},{"key":"e_1_3_3_6_1","doi-asserted-by":"crossref","unstructured":"Chen B Xia F Ichter B et\u00a0al. Open-vocabulary queryable scene representations for real world planning. 2022. arXiv preprint arXiv:2209.09874.","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"e_1_3_3_7_1","doi-asserted-by":"crossref","unstructured":"Nanwani L Agarwal A Jain K et\u00a0al. Instance-level semantic maps for vision language navigation. In: 2023 32nd IEEE International Conference on Robot and Human Interactive Communication (RO-MAN). IEEE; 2023 Aug.","DOI":"10.1109\/RO-MAN57019.2023.10309534"},{"key":"e_1_3_3_8_1","doi-asserted-by":"crossref","unstructured":"Cheng B Misra I Schwing AG et\u00a0al. Masked-attention mask transformer for universal image segmentation. 2021. arXiv.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_3_9_1","doi-asserted-by":"crossref","unstructured":"He K Gkioxari G Dollar P et al. Mask R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) Venice Italy; 2017 Oct.","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_3_10_1","unstructured":"Radford A Wook Kim J Hallacy C et\u00a0al. Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning. PMLR; 2021. p.\u00a08748\u20138763."},{"key":"e_1_3_3_11_1","article-title":"DINOv2: learning robust visual features without supervision","author":"Oquab M","year":"2024","unstructured":"Oquab M, Darcet T, Moutakanni T, et\u00a0al. DINOv2: learning robust visual features without supervision. Trans Mach Learn Res. 2024.","journal-title":"Trans Mach Learn Res"},{"key":"e_1_3_3_12_1","doi-asserted-by":"crossref","unstructured":"Chang M Gervet T Khanna M et\u00a0al. Goat: go to any thing 2023. arXiv. Available from: https:\/\/arxiv.org\/abs\/2311.06430","DOI":"10.15607\/RSS.2024.XX.073"},{"key":"e_1_3_3_13_1","doi-asserted-by":"crossref","unstructured":"Anderson P Wu Q Teney D et al. Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Salt Lake City UT USA; 2018 Jun.","DOI":"10.1109\/CVPR.2018.00387"},{"key":"e_1_3_3_14_1","unstructured":"Vaswani A Shazeer N Parmar N et\u00a0al. Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems NIPS'17 Red Hook NY USA. Curran Associates Inc; 2017. p.\u00a06000\u20136010."},{"key":"e_1_3_3_15_1","doi-asserted-by":"crossref","unstructured":"Hao W Li C Li X et al. Towards learning a generic agent for vision-and-language navigation via pre-training. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Seattle WA USA; 2020. p. 13134\u201313143.","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"e_1_3_3_16_1","doi-asserted-by":"crossref","unstructured":"Qiao Y Qi Y Hong Y et\u00a0al. Hop: history-and-order aware pretraining for vision-and-language navigation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Los Alamitos CA USA. IEEE Computer Society; 2022 Jun. p.\u00a015397\u201315406.","DOI":"10.1109\/CVPR52688.2022.01498"},{"key":"e_1_3_3_17_1","doi-asserted-by":"crossref","unstructured":"Hwang M Jeong J Kim M et al. Meta-explore: exploratory hierarchical vision-and-language navigation using scene object spectrum grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Vancouver Canada; 2023.","DOI":"10.1109\/CVPR52729.2023.00646"},{"key":"e_1_3_3_18_1","doi-asserted-by":"crossref","unstructured":"Gireesh N Agrawal A Datta A et al. Sequence-agnostic multi-object navigation. In: 2023 IEEE International Conference on Robotics and Automation (ICRA) ExCeL London; 2023. p. 9573\u20139579.","DOI":"10.1109\/ICRA48891.2023.10160259"},{"key":"e_1_3_3_19_1","doi-asserted-by":"crossref","unstructured":"Qi Y Pan Z Hong Y et\u00a0al. The road to know-where: an object-and-room informed sequential bert for indoor vision-language navigation. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) Virtually; 2021. p.\u00a01635\u20131644.","DOI":"10.1109\/ICCV48922.2021.00168"},{"key":"e_1_3_3_20_1","doi-asserted-by":"crossref","unstructured":"Huo J Sun Q Jiang B et al. Geovln: learning geometry-enhanced visual representation with slot attention for vision-and-language navigation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Vancouver Canada; 2023. p. 23212\u201323221.","DOI":"10.1109\/CVPR52729.2023.02223"},{"key":"e_1_3_3_21_1","doi-asserted-by":"crossref","unstructured":"Hong Y Wu Q Qi Y et al. Vln\u0153bert: a recurrent vision-and-language bert for navigation. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Seattle WA USA; 2020. p. 1643\u20131653.","DOI":"10.1109\/CVPR46437.2021.00169"},{"key":"e_1_3_3_22_1","unstructured":"Devlin J Chang M-W Lee K et\u00a0al. Bert: pre-training of deep bidirectional transformers for language understanding. 2018. arXiv preprint arXiv:1810.04805."},{"key":"e_1_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.066"},{"key":"e_1_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016"},{"key":"e_1_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3146502"},{"key":"e_1_3_3_26_1","unstructured":"Miao Y Armeni I Pollefeys M et\u00a0al. Volumetric semantically consistent 3D panoptic mapping. 2023. arXiv preprint arXiv:2309.14737."},{"key":"e_1_3_3_27_1","doi-asserted-by":"crossref","unstructured":"Kirillov A Mintun E Ravi N et\u00a0al. Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision; Oct; 2023. p. 4015\u20134026.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_28_1","doi-asserted-by":"crossref","unstructured":"He K Gkioxari G Doll\u00e1r P et al. Mask R-CNN. In: 2017 IEEE International Conference on Computer Vision (ICCV) Venice Italy; 2017. p. 2980\u20132988.","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_3_29_1","doi-asserted-by":"crossref","unstructured":"Kirillov A He K Girshick R et\u00a0al. Panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition; Jun; 2019.","DOI":"10.1109\/CVPR.2019.00963"},{"key":"e_1_3_3_30_1","unstructured":"Takmaz A Fedele E Sumner RW et al. Openmask3D: open-vocabulary 3D instance segmentation. In: Advances in neural information processing systems (NeurIPS) New Orleans LA USA; 2023."},{"key":"e_1_3_3_31_1","doi-asserted-by":"crossref","unstructured":"Liu S Zeng Z Ren T et\u00a0al. Grounding dino: marrying dino with grounded pre-training for open-set object detection. 2023. arXiv.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"e_1_3_3_32_1","doi-asserted-by":"crossref","unstructured":"Huang J Artemov A Chen Y et\u00a0al. SSR-2D: semantic 3D scene reconstruction from 2D images. 2023. arXiv preprint arXiv:2302.03640.","DOI":"10.1109\/TPAMI.2024.3410032"},{"key":"e_1_3_3_33_1","unstructured":"Picard Q Chevobbe S Darouich M et\u00a0al. A survey on real-time 3D scene reconstruction with SLAM methods in embedded systems. 2023. arXiv. Available from: https:\/\/arxiv.org\/abs\/2309.05349"},{"key":"e_1_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1002\/rob.2019.36.issue-2"},{"key":"e_1_3_3_36_1","doi-asserted-by":"crossref","unstructured":"K\u00fcmmerle R Grisetti G Strasdat H et al. G2O: a general framework for graph optimization. In: 2011 IEEE International Conference on Robotics and Automation Shanghai China; 2011. p. 3607\u20133613.","DOI":"10.1109\/ICRA.2011.5979949"},{"key":"e_1_3_3_37_1","unstructured":"Ren T Liu S Zeng A et\u00a0al. Grounded SAM: assembling open-world models for diverse visual tasks. 2024. arXiv."},{"key":"e_1_3_3_38_1","doi-asserted-by":"crossref","unstructured":"Zhang Y Huang X Ma J et\u00a0al. Recognize anything: a strong image tagging model. 2023. arXiv.","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"e_1_3_3_39_1","unstructured":"Ester M Kriegel H-P Sander J et\u00a0al. A density-based algorithm for discovering clusters in large spatial databases with noise. In: kdd Vol.\u00a096. 1996. p.\u00a0226\u2013231."},{"key":"e_1_3_3_40_1","unstructured":"OpenAI. Chatgpt. Available from https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_3_41_1","unstructured":"Touvron H Lavril T Izacard G et\u00a0al. LLAMA: open and efficient foundation language models. 2023. arXiv preprint arXiv:2302.13971."},{"key":"e_1_3_3_42_1","doi-asserted-by":"crossref","unstructured":"Chang A Dai A Funkhouser T et\u00a0al. Matterport3D: learning from RGB-D data in indoor environments. 2017. arXiv.","DOI":"10.1109\/3DV.2017.00081"},{"key":"e_1_3_3_43_1","doi-asserted-by":"crossref","unstructured":"Savva M Kadian A Maksymets O et al. Habitat: a platform for embodied ai research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Seoul Korea; 2019. p. 9339\u20139347.","DOI":"10.1109\/ICCV.2019.00943"},{"key":"e_1_3_3_44_1","doi-asserted-by":"crossref","unstructured":"Schumann R Riezler S. Analyzing generalization of vision and language navigation to unseen outdoor areas. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) Dublin Ireland. Association for Computational Linguistics; 2022 May. p.\u00a07519\u20137532.","DOI":"10.18653\/v1\/2022.acl-long.518"},{"key":"e_1_3_3_45_1","doi-asserted-by":"crossref","unstructured":"Jain K Chhangani V Tiwari A et al. Ground then navigate: language-guided navigation in dynamic scenes. In: 2023 IEEE International Conference on Robotics and Automation (ICRA) ExCeL London; 2023. p. 4113\u20134120.","DOI":"10.1109\/ICRA48891.2023.10160614"},{"key":"e_1_3_3_46_1","unstructured":"Makoviychuk V Wawrzyniak L Guo Y et\u00a0al. Isaac Gym: high performance GPU-based physics simulation for robot learning. 2021. arXiv preprint."}],"container-title":["Advanced Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.tandfonline.com\/doi\/pdf\/10.1080\/01691864.2024.2395926","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T12:36:19Z","timestamp":1732710979000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.tandfonline.com\/doi\/full\/10.1080\/01691864.2024.2395926"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,29]]},"references-count":45,"journal-issue":{"issue":"19-20","published-print":{"date-parts":[[2024,10,17]]}},"alternative-id":["10.1080\/01691864.2024.2395926"],"URL":"https:\/\/doi.org\/10.1080\/01691864.2024.2395926","relation":{},"ISSN":["0169-1864","1568-5535"],"issn-type":[{"value":"0169-1864","type":"print"},{"value":"1568-5535","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,29]]},"assertion":[{"value":"The publishing and review policy for this title is described in its Aims & Scope.","order":1,"name":"peerreview_statement","label":"Peer Review Statement"},{"value":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","URL":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=tadr20","order":2,"name":"aims_and_scope_url","label":"Aim & Scope"},{"value":"2024-03-18","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-05-28","order":1,"name":"revised","label":"Revised","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-07-29","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-08-29","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}