{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T21:10:15Z","timestamp":1774473015194,"version":"3.50.1"},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128436","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"10922-10928","source":"Crossref","is-referenced-by-count":4,"title":["ZSORN: Language-Driven Object-Centric Zero-Shot Object Retrieval and Navigation"],"prefix":"10.1109","author":[{"given":"Tianrui","family":"Guan","sequence":"first","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yurou","family":"Yang","sequence":"additional","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Harry","family":"Cheng","sequence":"additional","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Muyuan","family":"Lin","sequence":"additional","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Richard","family":"Kim","sequence":"additional","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rajasimman","family":"Madhivanan","sequence":"additional","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Arnie","family":"Sen","sequence":"additional","affiliation":[{"name":"Amazon Lab126,Sunnyvale,CA,USA,94089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dinesh","family":"Manocha","sequence":"additional","affiliation":[{"name":"University of Maryland,Department of Computer Science,College Park,MD,USA,20742"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Object goal navigation using goal-oriented semantic exploration","author":"Chaplot","year":"2020","journal-title":"In Neural Information Processing Systems (NeurIPS)"},{"key":"ref2","article-title":"ZSON: Zero-shot object-goal navigation using multimodal goal embeddings","volume-title":"Advances in Neural Information Processing Systems","author":"Majumdar","year":"2022"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161289"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02219"},{"key":"ref5","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01652"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref11","first-page":"17287","article-title":"Visual clues: Bridging vision and language foundations for image paragraph captioning","volume":"35","author":"Xie","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","article-title":"Aligning large multimodal model with robust instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","article-title":"Layoutgpt: Compositional visual planning and generation with large language models","author":"Feng","year":"2023","journal-title":"arXiv preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01363"},{"key":"ref15","volume-title":"On the safety concerns of deploying llms\/vlms in robotics: Highlighting the risks and vulnerabilities","author":"Wu","year":"2024"},{"key":"ref16","article-title":"Physically grounded vision-language models for robotic manipulation","author":"Gao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref17","volume":"abs\/2303.03480","author":"Dorbala","year":"2023","journal-title":"Can an embodied agent find your \u201ccat-shaped mug\u201d? llm-based zero-shot object navigation"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"ref19","volume":"abs\/2303.07798","author":"Yadav","year":"2023","journal-title":"Ovrl-v2: A simple state-of-art baseline for imagenav and objectnav"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1002\/nav.3800020109"},{"key":"ref21","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-10602-1_48","article-title":"Microsoft coco: Common objects in context","volume-title":"European Conference on Computer Vision","author":"Lin","year":"2014"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref23","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref24","first-page":"11 525","article-title":"Object-centric learning with slot attention","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Locatello","year":"2020"},{"key":"ref25","first-page":"1724","article-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation","volume-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"Cho"},{"key":"ref26","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International Conference on Machine Learning","author":"Jia","year":"2021"},{"key":"ref27","volume":"abs\/1810.04805","author":"Devlin","year":"2019","journal-title":"Bert: Pretraining of deep bidirectional transformers for language understanding"},{"key":"ref28","doi-asserted-by":"crossref","first-page":"1686","DOI":"10.1109\/TRO.2023.3248510","article-title":"A survey on active simultaneous localization and mapping: State of the art and new frontiers","volume":"39","author":"Placed","year":"2022","journal-title":"IEEE Transactions on Robotics"},{"key":"ref29","article-title":"Blip: Bootstrapping language-image pretraining for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"ICML"},{"key":"ref30","volume-title":"Plar: Prompt learning for action recognition","author":"Wang","year":"2023"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128436.pdf?arnumber=11128436","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:15:54Z","timestamp":1756880154000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128436\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128436","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}