{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:03:29Z","timestamp":1777655009166,"version":"3.51.4"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732539","type":"print"},{"value":"9783031732546","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T00:00:00Z","timestamp":1732752000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T00:00:00Z","timestamp":1732752000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73254-6_10","type":"book-chapter","created":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T07:21:44Z","timestamp":1732692104000},"page":"161-178","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Prioritized Semantic Learning for\u00a0Zero-Shot Instance Navigation"],"prefix":"10.1007","author":[{"given":"Xinyu","family":"Sun","sequence":"first","affiliation":[]},{"given":"Lizhao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Hongyan","family":"Zhi","sequence":"additional","affiliation":[]},{"given":"Ronghe","family":"Qiu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2219-5569","authenticated-orcid":false,"given":"Junwei","family":"Liang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,28]]},"reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Al-Halah, Z., Ramakrishnan, S.K., Grauman, K.: Zero experience required: plug & play modular transfer learning for semantic visual navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17010\u201317020 (2022)","DOI":"10.1109\/CVPR52688.2022.01652"},{"key":"10_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3674\u20133683 (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"10_CR3","unstructured":"Batra, D., et al.: Rearrangement: a challenge for embodied AI. CoRR abs\/2011.01975 (2020)"},{"key":"10_CR4","unstructured":"Batra, D., et al.: ObjectNav revisited: on evaluation of embodied agents navigating to objects. arXiv preprint arXiv:2006.13171 (2020)"},{"key":"10_CR5","unstructured":"Cai, W., et al.: Bridging zero-shot object navigation and foundation models through pixel-guided navigation skill. CoRR abs\/2309.10309 (2023)"},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3D: learning from RGB-D data in indoor environments. arXiv preprint arXiv:1709.06158 (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"10_CR7","unstructured":"Chen, P., et al.: $$A^2$$ NAV: action-aware zero-shot robot navigation by exploiting vision-and-language ability of foundation models. arXiv preprint arXiv:2308.07997 (2023)"},{"key":"10_CR8","unstructured":"Deitke, M., et al.: Procthor: large-scale embodied AI using procedural generation. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Proceedings of the International Conference on Neural Information Processing Systems (2022)"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Du, Y., Gan, C., Isola, P.: Curious representation learning for embodied intelligence. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.01024"},{"key":"10_CR10","unstructured":"Gadre, S.Y., Wortsman, M., Ilharco, G., Schmidt, L., Song, S.: CLIP on wheels: zero-shot object navigation as object localization and exploration. CoRR abs\/2203.10421 (2022)"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Gan, C., et al.: Finding fallen objects via asynchronous audio-visual integration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10523\u201310533 (2022)","DOI":"10.1109\/CVPR52688.2022.01027"},{"key":"10_CR12","unstructured":"Gan, C., et\u00a0al.: Threedworld: a platform for interactive multi-modal physical simulation. arXiv preprint arXiv:2007.04954 (2020)"},{"key":"10_CR13","unstructured":"Gao, P., et\u00a0al.: Llama-adapter v2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"10_CR14","unstructured":"Hahn, M., Chaplot, D.S., Tulsiani, S., Mukadam, M., Rehg, J.M., Gupta, A.: No RL, no simulation: learning to navigate without navigating. In: Proceedings of the International Conference on Neural Information Processing Systems (2021)"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"10_CR16","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"10_CR17","unstructured":"Kim, N., Kwon, O., Yoo, H., Choi, Y., Park, J., Oh, S.: Topological semantic graph memory for image-goal navigation. In: Conference on Robot Learning, pp. 393\u2013402. PMLR (2023)"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Krantz, J., et al.: Navigating to objects specified by images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10916\u201310925 (2023)","DOI":"10.1109\/ICCV51070.2023.01002"},{"key":"10_CR19","doi-asserted-by":"crossref","unstructured":"Krantz, J., Gokaslan, A., Batra, D., Lee, S., Maksymets, O.: Waypoint models for instruction-guided navigation in continuous environments. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15162\u201315171 (2021)","DOI":"10.1109\/ICCV48922.2021.01488"},{"key":"10_CR20","unstructured":"Krantz, J., Lee, S., Malik, J., Batra, D., Chaplot, D.S.: Instance-specific image goal navigation: training embodied agents to find object instances. arXiv preprint arXiv:2211.15876 (2022)"},{"key":"10_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58604-1_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Krantz","year":"2020","unstructured":"Krantz, J., Wijmans, E., Majumdar, A., Batra, D., Lee, S.: Beyond the Nav-graph: vision-and-language navigation in continuous environments. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_7"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Ku, A., Anderson, P., Patel, R., Ie, E., Baldridge, J.: Room-across-room: multilingual vision-and-language navigation with dense spatiotemporal grounding. In: Conference on Empirical Methods for Natural Language Processing (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Kwon, O., Kim, N., Choi, Y., Yoo, H., Park, J., Oh, S.: Visual graph memory with unsupervised representation for visual navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15890\u201315899 (2021)","DOI":"10.1109\/ICCV48922.2021.01559"},{"key":"10_CR24","unstructured":"Li, C., et al.: iGibson 2.0: object-centric simulation for robot learning of everyday household tasks. In: Conference on Robot Learning, vol.\u00a0164, pp. 455\u2013465 (2021). arXiv preprint arXiv:2108.03272 (2021)"},{"key":"10_CR25","unstructured":"Li, C., et al.: BEHAVIOR-1K: a benchmark for embodied AI with 1, 000 everyday activities and realistic simulation. In: Conference on Robot Learning, vol.\u00a0205, pp. 80\u201393 (2022)"},{"key":"10_CR26","unstructured":"Li, C., et\u00a0al.: Behavior-1k: a benchmark for embodied ai with 1,000 everyday activities and realistic simulation. In: Conference on Robot Learning, pp. 80\u201393. PMLR (2023)"},{"key":"10_CR27","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"10_CR28","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. In: Proceedings of the International Conference on Neural Information Processing Systems, vol. 34, pp. 9694\u20139705 (2021)"},{"key":"10_CR29","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"10_CR30","unstructured":"Li, W., Zhu, L., Wen, L., Yang, Y.: Decap: decoding clip latents for zero-shot captioning via text-only training. arXiv preprint arXiv:2303.03032 (2023)"},{"key":"10_CR31","unstructured":"Liang, V.W., Zhang, Y., Kwon, Y., Yeung, S., Zou, J.Y.: Mind the gap: understanding the modality gap in multi-modal contrastive representation learning. In: Proceedings of the International Conference on Neural Information Processing Systems, vol.\u00a035, pp. 17612\u201317625 (2022)"},{"key":"10_CR32","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"10_CR33","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"10_CR34","unstructured":"Ma, S., et al.: A simple knowledge distillation framework for open-world object detection. arXiv preprint arXiv:2312.08653 (2023)"},{"key":"10_CR35","unstructured":"Majumdar, A., Aggarwal, G., Devnani, B., Hoffman, J., Batra, D.: ZSON: zero-shot object-goal navigation using multimodal goal embeddings. In: Proceedings of the International Conference on Neural Information Processing Systems (2022)"},{"key":"10_CR36","unstructured":"Majumdar, A., Xia, F., Batra, D., Guibas, L., et\u00a0al.: Findthis: language-driven object disambiguation in indoor environments. In: Conference on Robot Learning (2023)"},{"key":"10_CR37","doi-asserted-by":"crossref","unstructured":"Mezghani, L., et al.: Memory-augmented reinforcement learning for image-goal navigation. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems (2022)","DOI":"10.1109\/IROS47612.2022.9981090"},{"key":"10_CR38","unstructured":"OpenAI: GPT-4 technical report. CoRR abs\/2303.08774 (2023)"},{"key":"10_CR39","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the International Conference on Machine Learning, vol.\u00a0139, pp. 8748\u20138763 (2021)"},{"key":"10_CR40","unstructured":"Ramakrishnan, S.K., et\u00a0al.: Habitat-matterport 3D dataset (HM3D): 1000 large-scale 3D environments for embodied AI. arXiv preprint arXiv:2109.08238 (2021)"},{"key":"10_CR41","doi-asserted-by":"crossref","unstructured":"Ramrakhya, R., Batra, D., Wijmans, E., Das, A.: PIRLNav: pretraining with imitation and RL finetuning for objectnav. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17896\u201317906 (2023)","DOI":"10.1109\/CVPR52729.2023.01716"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Savva, M., et al.: Habitat: a platform for embodied AI research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00943"},{"key":"10_CR43","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"10_CR44","unstructured":"Srivastava, S., et al.: BEHAVIOR: benchmark for everyday household activities in virtual, interactive, and ecological environments. In: Conference on Robot Learning, vol.\u00a0164, pp. 477\u2013490 (2021)"},{"key":"10_CR45","unstructured":"Srivastava, S., et\u00a0al.: Behavior: benchmark for everyday household activities in virtual, interactive, and ecological environments. In: Conference on Robot Learning, pp. 477\u2013490. PMLR (2022)"},{"key":"10_CR46","unstructured":"Szot, A., et al.: Habitat 2.0: training home assistants to rearrange their habitat. In: Proceedings of the International Conference on Neural Information Processing Systems (2021)"},{"key":"10_CR47","unstructured":"Thomason, J., Murray, M., Cakmak, M., Zettlemoyer, L.: Vision-and-dialog navigation. In: Conference on Robot Learning, pp. 394\u2013406. PMLR (2020)"},{"key":"10_CR48","unstructured":"Udandarao, V., Burg, M.F., Albanie, S., Bethge, M.: Visual data-type understanding does not emerge from scaling vision-language models. In: Proceedings of the International Conference on Learning Representations (2023)"},{"key":"10_CR49","unstructured":"Wang, W., et\u00a0al.: CogVLM: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"10_CR50","doi-asserted-by":"crossref","unstructured":"Weihs, L., Deitke, M., Kembhavi, A., Mottaghi, R.: Visual room rearrangement. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5922\u20135931 (2021)","DOI":"10.1109\/CVPR46437.2021.00586"},{"key":"10_CR51","unstructured":"Wijmans, E., et al.: DD-PPO: learning near-perfect pointgoal navigators from 2.5 billion frames. In: Proceedings of the International Conference on Learning Representations (2020)"},{"key":"10_CR52","unstructured":"Wu, Y., Wu, Y., Gkioxari, G., Tian, Y.: Building generalizable agents with a realistic and rich 3D environment. arXiv preprint arXiv:1801.02209 (2018)"},{"key":"10_CR53","unstructured":"Xinyu, S., Peihao, C., Jugang, F., Thomas, H.L., Jian, C., Mingkui, T.: FGPrompt: fine-grained goal prompting for image-goal navigation. In: Proceedings of the International Conference on Neural Information Processing Systems (2023)"},{"key":"10_CR54","unstructured":"Yadav, K., et al.: OVRL-V2: a simple state-of-art baseline for imagenav and objectnav. arXiv preprint arXiv:2303.07798 (2023)"},{"key":"10_CR55","unstructured":"Yadav, K., et al.: Offline visual representation learning for embodied navigation. CoRR abs\/2204.13226 (2022)"},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Yadav, K., et\u00a0al.: Habitat-matterport 3D semantics dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4927\u20134936 (2023)","DOI":"10.1109\/CVPR52729.2023.00477"},{"key":"10_CR57","unstructured":"Yamauchi, B.: A frontier-based approach for autonomous exploration. In: Proceedings 1997 IEEE International Symposium on Computational Intelligence in Robotics and Automation, CIRA 1997, pp. 146\u2013151. IEEE (1997)"},{"key":"10_CR58","unstructured":"Yenamandra, S., et\u00a0al.: Homerobot: open-vocabulary mobile manipulation. arXiv preprint arXiv:2306.11565 (2023)"},{"key":"10_CR59","doi-asserted-by":"crossref","unstructured":"Yu, B., Kasaei, H., Cao, M.: L3MVN: leveraging large language models for visual target navigation. arXiv preprint arXiv:2304.05501 (2023)","DOI":"10.1109\/IROS55552.2023.10342512"},{"key":"10_CR60","unstructured":"Zhang, R., et al.: Llama-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"10_CR61","unstructured":"Zhou, K., et al.: ESC: exploration with soft commonsense constraints for zero-shot object navigation. In: Proceedings of the International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 42829\u201342842. PMLR (2023)"},{"key":"10_CR62","doi-asserted-by":"crossref","unstructured":"Zhu, F., Liang, X., Zhu, Y., Yu, Q., Chang, X., Liang, X.: Soon: scenario oriented object navigation with graph-based exploration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12689\u201312699 (2021)","DOI":"10.1109\/CVPR46437.2021.01250"},{"key":"10_CR63","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Target-driven visual navigation in indoor scenes using deep reinforcement learning. In: IEEE International Conference on Robotics and Automation (2017)","DOI":"10.1109\/ICRA.2017.7989381"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73254-6_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T08:05:49Z","timestamp":1732694749000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73254-6_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,28]]},"ISBN":["9783031732539","9783031732546"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73254-6_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,28]]},"assertion":[{"value":"28 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}