{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T17:29:58Z","timestamp":1743096598924,"version":"3.40.3"},"publisher-location":"Cham","reference-count":56,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726484"},{"type":"electronic","value":"9783031726491"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72649-1_7","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"108-125","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["DISCO: Embodied Navigation and\u00a0Interaction via\u00a0Differentiable Scene Semantics and\u00a0Dual-Level Control"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1847-9913","authenticated-orcid":false,"given":"Xinyu","family":"Xu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2671-1236","authenticated-orcid":false,"given":"Shengcheng","family":"Luo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2447-7917","authenticated-orcid":false,"given":"Yanchao","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0478-0692","authenticated-orcid":false,"given":"Yong-Lu","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1533-8576","authenticated-orcid":false,"given":"Cewu","family":"Lu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"7_CR1","unstructured":"Kadian*, A., et al.: Are we making real progress in simulated environments? measuring the sim2real gap in embodied visual navigation. arXiv:1912.06321 (2019)"},{"key":"7_CR2","unstructured":"Ahn, M., et al.: Do as i can, not as i say: grounding language in robotic affordances (2022)"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"7_CR4","unstructured":"Batra, D., et al.: ObjectNav revisited: on evaluation of embodied agents navigating to objects. arXiv:2006.13171 (2020)"},{"key":"7_CR5","unstructured":"Blukis, V., Paxton, C., Fox, D., Garg, A., Artzi, Y.: A persistent spatial semantic representation for high-level natural language instruction execution. In: Conference on Robot Learning, pp. 706\u2013717. PMLR (2022)"},{"key":"7_CR6","unstructured":"Brohan, A., et al.: Rt-2: vision-language-action models transfer web knowledge to robotic control (2023)"},{"key":"7_CR7","unstructured":"Brohan, A., et al.: Rt-1: robotics transformer for real-world control at scale (2023)"},{"key":"7_CR8","unstructured":"Chaplot, D.S., Gandhi, D., Gupta, S., Gupta, A., Salakhutdinov, R.: Learning to explore using active neural slam. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"7_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1007\/978-3-030-58539-6_2","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Chen","year":"2020","unstructured":"Chen, C., et al.: SoundSpaces: audio-visual navigation in\u00a03D\u00a0environments. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 17\u201336. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_2"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Chen, H., Suhr, A., Misra, D., Snavely, N., Artzi, Y.: Touchdown: natural language navigation and spatial reasoning in visual street environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12538\u201312547 (2019)","DOI":"10.1109\/CVPR.2019.01282"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Das, A., Datta, S., Gkioxari, G., Lee, S., Parikh, D., Batra, D.: Embodied question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00008"},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Deng, S., Xu, X., Wu, C., Chen, K., Jia, K.: 3d affordancenet: a benchmark for visual object affordance understanding (2021)","DOI":"10.1109\/CVPR46437.2021.00182"},{"key":"7_CR13","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"7_CR14","unstructured":"Driess, D., et al.: Palm-e: an embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Ehsani, K., et\u00a0al.: Imitating shortest paths in simulation enables effective navigation and manipulation in the real world. arXiv preprint arXiv:2312.02976 (2023)","DOI":"10.1109\/CVPR52733.2024.01537"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Gadre, S., Ehsani, K., Song, S., Mottaghi, R.: Continuous scene representations for embodied AI. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01443"},{"issue":"4","key":"7_CR17","doi-asserted-by":"publisher","first-page":"10049","DOI":"10.1109\/lra.2022.3193254","volume":"7","author":"X Gao","year":"2022","unstructured":"Gao, X., Gao, Q., Gong, R., Lin, K., Thattai, G., Sukhatme, G.S.: Dialfred: dialogue-enabled agents for embodied instruction following. IEEE Rob. Autom. Lett. 7(4), 10049\u201310056 (2022). https:\/\/doi.org\/10.1109\/lra.2022.3193254","journal-title":"IEEE Rob. Autom. Lett."},{"issue":"3","key":"7_CR18","doi-asserted-by":"publisher","first-page":"227","DOI":"10.2307\/1574154","volume":"11","author":"JJ Gibson","year":"1978","unstructured":"Gibson, J.J.: The ecological approach to the visual perception of pictures. Leonardo 11(3), 227\u2013235 (1978)","journal-title":"Leonardo"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Gordon, D., Kembhavi, A., Rastegari, M., Redmon, J., Fox, D., Farhadi, A.: IQA: visual question answering in interactive environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4089\u20134098 (2018)","DOI":"10.1109\/CVPR.2018.00430"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"7_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"7_CR22","unstructured":"Hong, Y., et al.: 3d-llm: injecting the 3d world into large language models. arXiv (2023)"},{"key":"7_CR23","unstructured":"Huang, W., Wang, C., Zhang, R., Li, Y., Wu, J., Fei-Fei, L.: Voxposer: composable 3d value maps for robotic manipulation with language models (2023)"},{"key":"7_CR24","doi-asserted-by":"publisher","unstructured":"Inoue, Y., Ohashi, H.: Prompter: utilizing large language model prompting for a data efficient embodied instruction following (2022). https:\/\/doi.org\/10.48550\/ARXIV.2211.03267. https:\/\/arxiv.org\/abs\/2211.03267","DOI":"10.48550\/ARXIV.2211.03267"},{"key":"7_CR25","unstructured":"Kim, B., Bhambri, S., Singh, K.P.: Agent with the big picture: perceiving surroundings for interactive instruction following (2021)"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Kim, B., Kim, J., Kim, Y., Min, C., Choi, J.: Context-aware planning and environment-aware memory for instruction following embodied agents. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10936\u201310946 (2023)","DOI":"10.1109\/ICCV51070.2023.01004"},{"key":"7_CR27","unstructured":"Kolve, E., et al.: Ai2-thor: an interactive 3D environment for visual AI. arXiv preprint arXiv:1712.05474 (2017)"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Ku, A., Anderson, P., Patel, R., Ie, E., Baldridge, J.: Room-across-room: multilingual vision-and-language navigation with dense spatiotemporal grounding. In: Conference on Empirical Methods for Natural Language Processing (EMNLP) (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.356"},{"key":"7_CR29","unstructured":"Li, C., et\u00a0al.: igibson 2.0: object-centric simulation for robot learning of everyday household tasks. arXiv preprint arXiv:2108.03272 (2021)"},{"key":"7_CR30","unstructured":"Li, X., et al.: Imagemanip: image-based robotic manipulation with affordance-guided next view selection (2023)"},{"key":"7_CR31","doi-asserted-by":"crossref","unstructured":"Li, Y.L., et al.: Beyond object recognition: a new benchmark towards object concept learning (2023)","DOI":"10.1109\/ICCV51070.2023.01833"},{"key":"7_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"7_CR33","unstructured":"Liu, X., Palacios, H., Muise, C.: A planning based neural-symbolic approach for embodied instruction following. In: CVPR Embodied AI Workshop (2022)"},{"key":"7_CR34","doi-asserted-by":"crossref","unstructured":"Long, Y., Li, X., Cai, W., Dong, H.: Discuss before moving: visual language navigation via multi-expert discussions (2023)","DOI":"10.1109\/ICRA57147.2024.10611565"},{"key":"7_CR35","unstructured":"Loshchilov, I., Hutter, F.: SGDR: stochastic gradient descent with warm restarts (2017)"},{"key":"7_CR36","unstructured":"Min, S.Y., Chaplot, D.S., Ravikumar, P., Bisk, Y., Salakhutdinov, R.: Film: following instructions in language with modular methods. arXiv preprint arXiv:2110.07342 (2021)"},{"issue":"3","key":"7_CR37","doi-asserted-by":"publisher","first-page":"6870","DOI":"10.1109\/LRA.2022.3178804","volume":"7","author":"M Murray","year":"2022","unstructured":"Murray, M., Cakmak, M.: Following natural language instructions for household tasks with landmark guided search and reinforced pose adjustment. IEEE Rob. Autom. Lett. 7(3), 6870\u20136877 (2022)","journal-title":"IEEE Rob. Autom. Lett."},{"key":"7_CR38","unstructured":"Nagarajan, T., Grauman, K.: Learning affordance landscapes for interaction exploration in 3D environments (2020)"},{"key":"7_CR39","doi-asserted-by":"crossref","unstructured":"Nguyen, V.Q., Suganuma, M., Okatani, T.: Look wide and interpret twice: improving performance on interactive instruction-following tasks. arXiv preprint arXiv:2106.00596 (2021)","DOI":"10.24963\/ijcai.2021\/128"},{"key":"7_CR40","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback (2022)"},{"key":"7_CR41","doi-asserted-by":"crossref","unstructured":"Pashevich, A., Schmid, C., Sun, C.: Episodic transformer for vision-and-language navigation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01564"},{"key":"7_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"7_CR43","doi-asserted-by":"crossref","unstructured":"Savva, M., et al.: Habitat: a platform for embodied AI research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00943"},{"key":"7_CR44","doi-asserted-by":"crossref","unstructured":"Shridhar, M., et al.: Alfred: a benchmark for interpreting grounded instructions for everyday tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10740\u201310749 (2020)","DOI":"10.1109\/CVPR42600.2020.01075"},{"key":"7_CR45","unstructured":"Shridhar, M., Yuan, X., C\u00f4t\u00e9, M.A., Bisk, Y., Trischler, A., Hausknecht, M.: ALFWorld: aligning text and embodied environments for interactive learning. In: Proceedings of the International Conference on Learning Representations (ICLR) (2021). https:\/\/arxiv.org\/abs\/2010.03768"},{"key":"7_CR46","unstructured":"Singh, K.P., Bhambri, S., Kim, B., Mottaghi, R., Choi, J.: Factorizing perception and policy for interactive instruction following. arXiv preprint arXiv:2012.03208 (2020)"},{"key":"7_CR47","doi-asserted-by":"crossref","unstructured":"Song, C.H., Kil, J., Pan, T.Y., Sadler, B.M., Chao, W.L., Su, Y.: One step at a time: long-horizon vision-and-language navigation with milestones. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15482\u201315491 (2022)","DOI":"10.1109\/CVPR52688.2022.01504"},{"key":"7_CR48","unstructured":"Srivastava, S., et al.: Behavior: benchmark for everyday household activities in virtual, interactive, and ecological environments. In: Conference in Robot Learning (CoRL) (2021)"},{"key":"7_CR49","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: AdaAfford: learning to adapt manipulation affordance for 3d articulated objects via few-shot interactions (2022)","DOI":"10.1007\/978-3-031-19818-2_6"},{"key":"7_CR50","doi-asserted-by":"crossref","unstructured":"Weihs, L., Deitke, M., Kembhavi, A., Mottaghi, R.: Visual room rearrangement. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00586"},{"key":"7_CR51","unstructured":"Wu, Z., Wang, Z., Xu, X., Lu, J., Yan, H.: Embodied task planning with large language models (2023)"},{"key":"7_CR52","unstructured":"Xu, C., Chen, Y., Wang, H., Zhu, S.C., Zhu, Y., Huang, S.: Partafford: part-level affordance discovery from 3d objects (2022)"},{"key":"7_CR53","unstructured":"Yenamandra, S., et al.: Homerobot: open vocabulary mobile manipulation (2023)"},{"key":"7_CR54","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Chai, J.: Hierarchical task learning from language instructions with unified transformers and self-monitoring. In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, pp. 4202\u20134213. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.findings-acl.368. https:\/\/aclanthology.org\/2021.findings-acl.368","DOI":"10.18653\/v1\/2021.findings-acl.368"},{"key":"7_CR55","doi-asserted-by":"crossref","unstructured":"Zhou, G., Hong, Y., Wu, Q.: Navgpt: explicit reasoning in vision-and-language navigation with large language models (2023)","DOI":"10.1609\/aaai.v38i7.28597"},{"key":"7_CR56","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Visual semantic planning using deep successor representations. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 483\u2013492 (2017)","DOI":"10.1109\/ICCV.2017.60"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72649-1_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:04:53Z","timestamp":1727593493000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72649-1_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726484","9783031726491"],"references-count":56,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72649-1_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}