{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,30]],"date-time":"2026-03-30T12:09:58Z","timestamp":1774872598039,"version":"3.50.1"},"publisher-location":"Cham","reference-count":110,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726729","type":"print"},{"value":"9783031726736","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72673-6_16","type":"book-chapter","created":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T16:03:50Z","timestamp":1729526630000},"page":"289-310","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":35,"title":["SceneVerse: Scaling 3D Vision-Language Learning for\u00a0Grounded Scene Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4968-3290","authenticated-orcid":false,"given":"Baoxiong","family":"Jia","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8176-0241","authenticated-orcid":false,"given":"Yixin","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5729-0255","authenticated-orcid":false,"given":"Huangyue","family":"Yu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7001-3569","authenticated-orcid":false,"given":"Yan","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7737-4287","authenticated-orcid":false,"given":"Xuesong","family":"Niu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4006-1740","authenticated-orcid":false,"given":"Tengyu","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1185-5365","authenticated-orcid":false,"given":"Qing","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1524-7148","authenticated-orcid":false,"given":"Siyuan","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,22]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Achlioptas, P., Abdelreheem, A., Xia, F., Elhoseiny, M., Guibas, L.: Referit3D: neural listeners for fine-grained 3D object identification in real-world scenes. In: Proceedings of European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"16_CR2","unstructured":"Agia, C., et al.: Taskography: evaluating robot task planning over large 3D scene graphs. In: Proceedings of Conference on Robot Learning (CoRL) (2022)"},{"key":"16_CR3","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Armeni, I., et al.: 3D scene graph: a structure for unified semantics, 3D space, and camera. In: Proceedings of International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00576"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Azuma, D., Miyanishi, T., Kurita, S., Kawanabe, M.: ScanQA: 3D question answering for spatial scene understanding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"16_CR6","unstructured":"Bakr, E., Alsaedy, Y., Elhoseiny, M.: Look around and refer: 2D synthetic semantics knowledge distillation for 3D visual grounding. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"issue":"4","key":"16_CR7","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1017\/S0140525X99002149","volume":"22","author":"LW Barsalou","year":"1999","unstructured":"Barsalou, L.W.: Perceptual symbol systems. Behav. Brain Sci. 22(4), 577\u2013660 (1999)","journal-title":"Behav. Brain Sci."},{"key":"16_CR8","doi-asserted-by":"publisher","first-page":"617","DOI":"10.1146\/annurev.psych.59.103006.093639","volume":"59","author":"LW Barsalou","year":"2008","unstructured":"Barsalou, L.W.: Grounded cognition. Annu. Rev. Psychol. 59, 617\u2013645 (2008)","journal-title":"Annu. Rev. Psychol."},{"key":"16_CR9","unstructured":"Baruch, G., et\u00a0al.: ArKitScenes: a diverse real-world dataset for 3D indoor scene understanding using mobile RGB-D data. In: Proceedings of Advances in Neural Information Processing Systems Datasets and Benchmarks (NeurIPS Datasets and Benchmarks Track) (2021)"},{"key":"16_CR10","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models(2021). arXiv preprint arXiv:2108.07258"},{"key":"16_CR11","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Cai, D., Zhao, L., Zhang, J., Sheng, L., Xu, D.: 3DJCG: a unified framework for joint dense captioning and visual grounding on 3D point clouds. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3D: learning from RGB-D data in indoor environments. In: Proceedings of International Conference on 3D Vision (3DV) (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"16_CR14","unstructured":"Chang, A.X., et\u00a0al.: ShapeNet: an information-rich 3D model repository (2015). arXiv preprint arXiv:1512.03012"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Chen, D.Z., Chang, A.X., Nie\u00dfner, M.: ScanRefer: 3D object localization in RGB-D scans using natural language. In: Proceedings of European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Chen, D.Z., Wu, Q., Nie\u00dfner, M., Chang, A.X.: D3Net: a speaker-listener architecture for semi-supervised dense captioning and visual grounding in RGB-D scans. In: Proceedings of European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19824-3_29"},{"key":"16_CR18","unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Language conditioned spatial relation reasoning for 3D object grounding. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhu, H., Chen, X., Lei, Y., Yu, G., Chen, T.: End-to-end 3D dense captioning with vote2Cap-DETR. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01070"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Chen, Y., Huang, S., Yuan, T., Qi, S., Zhu, Y., Zhu, S.C.: Holistic++ scene understanding: single-view 3D holistic scene parsing and human pose estimation with human-object interaction and physical commonsense. In: Proceedings of International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00874"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: YouRefIt: embodied reference understanding with language and gesture. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00142"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Chen, Z., Gholami, A., Nie\u00dfner, M., Chang, A.X.: Scan2Cap: context-aware dense captioning in RGB-D scans. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Collins, J., et\u00a0al.: ABO: dataset and benchmarks for real-world 3D object understanding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.02045"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"16_CR25","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023). arXiv preprint arXiv:2305.06500"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Deitke, M., et\u00a0al.: Objaverse-XL: a universe of 10M+ 3D objects. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Deitke, M., et al.: Objaverse: a universe of annotated 3D objects. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"16_CR28","unstructured":"Deitke, M., et al.: ProcTHOR: large-scale embodied AI using procedural generation. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR29","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics (NAACL) (2018)"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Ding, R., Yang, J., Xue, C., Zhang, W., Bai, S., Qi, X.: PLA: language-driven open-vocabulary 3D scene understanding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Ding, Z., Han, X., Niethammer, M.: VoteNet: a deep learning label fusion method for multi-atlas segmentation. In: Proceedings of International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI) (2019)","DOI":"10.1007\/978-3-030-32248-9_23"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Feng, M., et al.: Free-form description guided 3D visual graph network for object grounding in point cloud. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: Proceedings of European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Gong, R., et al.: ARNOLD: a benchmark for language-grounded task learning with continuous states in realistic 3D scenes. In: Proceedings of International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01873"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Gu, Q., et\u00a0al.: ConceptGraphs: open-vocabulary 3D scene graphs for perception and planning (2023). arXiv preprint arXiv:2309.16650","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"16_CR36","unstructured":"Ha, H., Song, S.: Semantic Abstraction: open-world 3D scene understanding from 2D vision-language models. In: Proceedings of Conference on Robot Learning (CoRL) (2022)"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"He, D., et al.: Transrefer3d: entity-and-relation aware transformer for fine-grained 3D visual grounding. In: Proceedings of ACM International Conference on Multimedia (MM) (2021)","DOI":"10.1145\/3474085.3475397"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"He, D., et al.: TransRefer3D: entity-and-relation aware transformer for fine-grained 3D visual grounding. In: Proceedings of ACM International Conference on Multimedia (MM) (2021)","DOI":"10.1145\/3474085.3475397"},{"key":"16_CR39","doi-asserted-by":"crossref","unstructured":"Hong, Y., Wu, Q., Qi, Y., Rodriguez-Opazo, C., Gould, S.: VLN BERT: a recurrent vision-and-language BERT for navigation. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00169"},{"key":"16_CR40","doi-asserted-by":"crossref","unstructured":"Hong, Y., Lin, C., Du, Y., Chen, Z., Tenenbaum, J.B., Gan, C.: 3D concept learning and reasoning from multi-view images. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00888"},{"key":"16_CR41","unstructured":"Huang, J., et al.: An embodied generalist agent in 3D world (2023). arXiv preprint arXiv:2311.12871"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Huang, P.H., Lee, H.H., Chen, H.T., Liu, T.L.: Text-guided graph neural networks for referring 3D instance segmentation. In: Proceedings of AAAI Conference on Artificial Intelligence (AAAI) (2021)","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"16_CR43","doi-asserted-by":"crossref","unstructured":"Huang, S., Chen, Y., Jia, J., Wang, L.: Multi-view transformer for 3D visual grounding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: Diffusion-based generation, optimization, and planning in 3D scenes. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Jain, A., Gkanatsios, N., Mediratta, I., Fragkiadaki, K.: Bottom up top down detection transformers for language grounding in images and point clouds. In: Proceedings of European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"16_CR46","doi-asserted-by":"crossref","unstructured":"Jiang, L., Zhao, H., Shi, S., Liu, S., Fu, C.W., Jia, J.: PointGroup: dual-set point grouping for 3D instance segmentation. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"16_CR47","doi-asserted-by":"crossref","unstructured":"Jiang, N., et al.: Full-body articulated human-object interaction. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/ICCV51070.2023.00859"},{"key":"16_CR48","doi-asserted-by":"crossref","unstructured":"Jiang, N., et al.: Scaling up dynamic human-scene interaction modeling. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"16_CR49","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models (2020). arXiv preprint arXiv:2001.08361"},{"key":"16_CR50","doi-asserted-by":"crossref","unstructured":"Khanna, M., et al.: Habitat synthetic scenes dataset (HSSD-200): an analysis of 3D scene scale and realism tradeoffs for object goal navigation. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01550"},{"key":"16_CR51","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment Anything. In: Proceedings of International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"issue":"1","key":"16_CR52","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"key":"16_CR53","doi-asserted-by":"publisher","DOI":"10.1017\/S0140525X16001837","volume":"40","author":"BM Lake","year":"2017","unstructured":"Lake, B.M., Ullman, T.D., Tenenbaum, J.B., Gershman, S.J.: Building machines that learn and think like people. Behav. brain sci. 40, e253 (2017)","journal-title":"Behav. brain sci."},{"key":"16_CR54","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: Proceedings of International Conference on Learning Representations (ICLR) (2022)"},{"key":"16_CR55","unstructured":"Li, C., et\u00a0al.: BEHAVIOR-1K: a benchmark for embodied AI with 1,000 everyday activities and realistic simulation. In: Proceedings of Conference on Robot Learning (CoRL) (2023)"},{"key":"16_CR56","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of International Conference on Machine Learning (ICML) (2023)"},{"key":"16_CR57","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Proceedings of International Conference on Machine Learning (ICML) (2022)"},{"key":"16_CR58","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"16_CR59","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"16_CR60","unstructured":"Liu, M., et al.: OpenShape: Scaling up 3D shape representation towards open-world understanding. arXiv preprint arXiv:2305.10764 (2023)"},{"key":"16_CR61","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: Zero-shot one image to 3D object. In: Proceedings of International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"16_CR62","doi-asserted-by":"crossref","unstructured":"Luo, J., et al.: 3D-SPS: single-stage 3D visual grounding via referred point progressive selection. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"16_CR63","unstructured":"Luo, T., Rockwell, C., Lee, H., Johnson, J.: Scalable 3D captioning with pretrained models. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"16_CR64","unstructured":"Ma, C.Y., Lu, J., Wu, Z., AlRegib, G., Kira, Z., Socher, R., Xiong, C.: Self-monitoring navigation agent via auxiliary progress estimation. In: Proceedings of International Conference on Learning Representations (ICLR) (2019)"},{"key":"16_CR65","unstructured":"Ma, X., et al.: SQA3D: situated question answering in 3D scenes. In: Proceedings of International Conference on Learning Representations (ICLR) (2023)"},{"key":"16_CR66","unstructured":"Mao, Y., Zhang, Y., Jiang, H., Chang, A., Savva, M.: MultiScan: scalable RGBD scanning for 3D environments with articulated objects. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR67","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3D object detection. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"16_CR68","doi-asserted-by":"crossref","unstructured":"Mittal, M., et\u00a0al.: Orbit: a unified simulation framework for interactive robot learning environments. Robotics and Automation Letters (RA-L) (2023)","DOI":"10.1109\/LRA.2023.3270034"},{"key":"16_CR69","doi-asserted-by":"crossref","unstructured":"Mo, K., et al.: PartNet: a large-scale benchmark for fine-grained and hierarchical part-level 3D object understanding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00100"},{"key":"16_CR70","unstructured":"Mu, T., et al.: ManiSkill: generalizable manipulation skill benchmark with large-scale demonstrations. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2021)"},{"key":"16_CR71","unstructured":"OpenAI: Introducing ChatGPT (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"16_CR72","unstructured":"OpenAI: GPT-4 technical report (2023). arXiv preprint arXiv:2303.08774"},{"key":"16_CR73","doi-asserted-by":"crossref","unstructured":"Pashevich, A., Schmid, C., Sun, C.: Episodic transformer for vision-and-language navigation. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01564"},{"key":"16_CR74","doi-asserted-by":"crossref","unstructured":"Peng, S., et\u00a0al.: OpenScene: 3D scene understanding with open vocabularies. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"16_CR75","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proceedings of International Conference on Machine Learning (ICML) (2021)"},{"key":"16_CR76","unstructured":"Ramakrishnan, S.K., et\u00a0al.: Habitat-Matterport 3D dataset (HM3D): 1000 large-scale 3D environments for embodied AI. In: Proceedings of Advances in Neural Information Processing Systems Datasets and Benchmarks (NeurIPS Datasets and Benchmarks Track) (2021)"},{"key":"16_CR77","unstructured":"Rana, K., Haviland, J., Garg, S., Abou-Chakra, J., Reid, I., Suenderhauf, N.: SayPlan: grounding large language models using 3D scene graphs for scalable robot task planning. In: Proceedings of Conference on Robot Learning (CoRL) (2023)"},{"key":"16_CR78","doi-asserted-by":"crossref","unstructured":"Rosinol, A., et al.: Kimera: from slam to spatial perception with 3D dynamic scene graphs. Int. J. Robot. Res. (IJRR) 40, 1510\u20131546 (2021)","DOI":"10.1177\/02783649211056674"},{"key":"16_CR79","unstructured":"Schuhmann, C., et\u00a0al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR80","doi-asserted-by":"crossref","unstructured":"Schult, J., Engelmann, F., Hermans, A., Litany, O., Tang, S., Leibe, B.: Mask3D: mask transformer for 3D semantic instance segmentation. In: Proceedings of International Conference on Robotics and Automation (ICRA) (2023)","DOI":"10.1109\/ICRA48891.2023.10160590"},{"issue":"1\u20132","key":"16_CR81","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1162\/1064546053278973","volume":"11","author":"L Smith","year":"2005","unstructured":"Smith, L., Gasser, M.: The development of embodied cognition: six lessons from babies. Artif. Life 11(1\u20132), 13\u201329 (2005)","journal-title":"Artif. Life"},{"key":"16_CR82","unstructured":"Takmaz, A., Fedele, E., Sumner, R.W., Pollefeys, M., Tombari, F., Engelmann, F.: OpenMask3D: open-vocabulary 3D instance segmentation. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"16_CR83","unstructured":"Touvron, H., et\u00a0al.: LLaMA: Open and efficient foundation language models (2023). arXiv preprint arXiv:2302.13971"},{"key":"16_CR84","doi-asserted-by":"crossref","unstructured":"Vu, T., Kim, K., Luu, T.M., Nguyen, T., Yoo, C.D.: SoftGroup for 3D instance segmentation on point clouds. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"16_CR85","doi-asserted-by":"crossref","unstructured":"Wald, J., Avetisyan, A., Navab, N., Tombari, F., Nie\u00dfner, M.: RIO: 3D object instance re-localization in changing indoor environments. In: Proceedings of International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00775"},{"key":"16_CR86","doi-asserted-by":"crossref","unstructured":"Wald, J., Dhamo, H., Navab, N., Tombari, F.: Learning 3D semantic scene graphs from 3D indoor reconstructions. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"16_CR87","doi-asserted-by":"crossref","unstructured":"Wang, T., et\u00a0al.: EmbodiedScan: a holistic multi-modal 3D perception suite towards embodied AI. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01868"},{"key":"16_CR88","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00679"},{"key":"16_CR89","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Move as you say interact as you can: language-guided human motion generation with scene affordance. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00049"},{"key":"16_CR90","unstructured":"Wang, Z., Chen, Y., Liu, T., Zhu, Y., Liang, W., Huang, S.: HUMANISE: language-conditioned human motion generation in 3D scenes. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR91","doi-asserted-by":"crossref","unstructured":"Wu, T., et\u00a0al.: OmniObject3D: large-vocabulary 3D object dataset for realistic perception, reconstruction and generation. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00084"},{"key":"16_CR92","doi-asserted-by":"crossref","unstructured":"Wu, Y., Cheng, X., Zhang, R., Cheng, Z., Zhang, J.: EDA: explicit text-decoupling and dense alignment for 3D visual grounding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01843"},{"key":"16_CR93","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: ULIP: learning a unified representation of language, images, and point clouds for 3D understanding. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"16_CR94","doi-asserted-by":"crossref","unstructured":"Yang, J., Ding, R., Wang, Z., Qi, X.: RegionPLC: Regional point-language contrastive learning for open-world 3D scene understanding (2023). arXiv preprint arXiv:2304.00962","DOI":"10.1109\/CVPR52733.2024.01874"},{"key":"16_CR95","doi-asserted-by":"crossref","unstructured":"Yang, Y., Jia, B., Zhi, P., Huang, S.: PhyScene: physically interactable 3D scene synthesis for embodied AI. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01539"},{"key":"16_CR96","unstructured":"Yang, Y.Q., et al.: Swin3D: A pretrained transformer backbone for 3D indoor scene understanding (2023). arXiv preprint arXiv:2304.06906"},{"key":"16_CR97","doi-asserted-by":"crossref","unstructured":"Yang, Z., Zhang, S., Wang, L., Luo, J.: SAT: 2D semantics assisted training for 3D visual grounding. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"16_CR98","doi-asserted-by":"crossref","unstructured":"Yeshwanth, C., Liu, Y.C., Nie\u00dfner, M., Dai, A.: ScanNet++: a high-fidelity dataset of 3D indoor scenes. In: Proceedings of International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00008"},{"key":"16_CR99","doi-asserted-by":"crossref","unstructured":"Yuan, Z., et al.: X-Trans2Cap: cross-modal knowledge transfer using transformer for 3D dense captioning. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00837"},{"key":"16_CR100","doi-asserted-by":"crossref","unstructured":"Yuan, Z., et al.: InstanceRefer: cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"16_CR101","unstructured":"Zhang, H., et al.: GLIPv2: unifying localization and vision-language understanding. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"16_CR102","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: PointClip: point cloud understanding by clip. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"16_CR103","doi-asserted-by":"crossref","unstructured":"Zhang, R., Wang, L., Qiao, Y., Gao, P., Li, H.: Learning 3D representations from 2D pre-trained models via image-to-point masked autoencoders. In: Proceedings of Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.02085"},{"key":"16_CR104","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Gong, Z., Chang, A.X.: Multi3DRefer: grounding text description to multiple 3D objects. In: Proceedings of International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01397"},{"key":"16_CR105","doi-asserted-by":"crossref","unstructured":"Zhao, L., Cai, D., Sheng, L., Xu, D.: 3DVG-transformer: relation modeling for visual grounding on point clouds. In: Proceedings of International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"16_CR106","doi-asserted-by":"crossref","unstructured":"Zheng, J., Zhang, J., Li, J., Tang, R., Gao, S., Zhou, Z.: Structured3D: a large photo-realistic dataset for structured 3D modeling. In: Proceedings of European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58545-7_30"},{"key":"16_CR107","unstructured":"Zhu, W., et al.: Multimodal C4: An open, billion-scale corpus of images interleaved with text (2023). arXiv preprint arXiv:2304.06939"},{"issue":"3","key":"16_CR108","doi-asserted-by":"publisher","first-page":"310","DOI":"10.1016\/j.eng.2020.01.011","volume":"6","author":"Y Zhu","year":"2020","unstructured":"Zhu, Y., et al.: Dark, beyond deep: a paradigm shift to cognitive AI with humanlike common sense. Engineering 6(3), 310\u2013345 (2020)","journal-title":"Engineering"},{"key":"16_CR109","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Ma, X., Chen, Y., Deng, Z., Huang, S., Li, Q.: 3D-VisTA: pre-trained transformer for 3D vision and text alignment. In: Proceedings of International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00272"},{"key":"16_CR110","doi-asserted-by":"crossref","unstructured":"Zhu, Z., et al.: Unifying 3D vision-language understanding via promptable queries. In: Proceedings of European Conference on Computer Vision (ECCV) (2024)","DOI":"10.1007\/978-3-031-72784-9_11"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72673-6_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T16:08:58Z","timestamp":1729526938000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72673-6_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,22]]},"ISBN":["9783031726729","9783031726736"],"references-count":110,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72673-6_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,22]]},"assertion":[{"value":"22 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}