{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:26:44Z","timestamp":1771957604772,"version":"3.50.1"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726545","type":"print"},{"value":"9783031726552","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72655-2_11","type":"book-chapter","created":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T10:13:15Z","timestamp":1733393595000},"page":"186-202","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Agent3D-Zero: An Agent for\u00a0Zero-Shot 3D Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1814-6783","authenticated-orcid":false,"given":"Sha","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8712-8747","authenticated-orcid":false,"given":"Di","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9624-7451","authenticated-orcid":false,"given":"Jiajun","family":"Deng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0067-339X","authenticated-orcid":false,"given":"Shixiang","family":"Tang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9163-2761","authenticated-orcid":false,"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2772-9320","authenticated-orcid":false,"given":"Tong","family":"He","sequence":"additional","affiliation":[]},{"given":"Yanyong","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,6]]},"reference":[{"key":"11_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Azuma, D., Miyanishi, T., Kurita, S., Kawanabe, M.: ScanQA: 3D question answering for spatial scene understanding. In: proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19129\u201319139 (2022)","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"11_CR4","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. 
In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"11_CR5","unstructured":"Cadena, C., Dick, A.R., Reid, I.D.: Multi-modal auto-encoders as joint estimators for robotics scene understanding. In: Robotics: Science and Systems, vol.\u00a05 (2016)"},{"key":"11_CR6","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Chen, B., et al.: SpatialVLM: endowing vision-language models with spatial reasoning capabilities. arXiv preprint arXiv:2401.12168 (2024)","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"11_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-58565-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"DZ Chen","year":"2020","unstructured":"Chen, D.Z., Chang, A.X., Nie\u00dfner, M.: ScanRefer: 3D object localization in RGB-D scans using natural language. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 202\u2013221. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_13"},{"issue":"1","key":"11_CR9","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"FL Chen","year":"2023","unstructured":"Chen, F.L., et al.: VLP: a survey on vision-language pre-training. Mach. Intell. Res. 20(1), 38\u201356 (2023)","journal-title":"Mach. Intell. Res."},{"key":"11_CR10","unstructured":"Chen, G., et\u00a0al.: VideoLLM: modeling video sequence with large language models. arXiv preprint arXiv:2305.13292 (2023)"},{"key":"11_CR11","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"issue":"240","key":"11_CR12","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., et al.: PaLM: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"11_CR13","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Curless, B., Levoy, M.: A volumetric method for building complex models from range images. In: Proceedings of the 23rd Annual Conference on Computer Graphics and Interactive Techniques, pp. 303\u2013312 (1996)","DOI":"10.1145\/237170.237269"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5828\u20135839 (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Deitke, M., et al.: Objaverse: a universe of annotated 3d objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
13142\u201313153 (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"11_CR17","unstructured":"Deng, J., Zhang, S., Dayoub, F., Ouyang, W., Zhang, Y., Reid, I.: PoIFusion: multi-modal 3D object detection via fusion at points of interest. arXiv preprint arXiv:2403.09212 (2024)"},{"key":"11_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-32248-9_23","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2019","author":"Z Ding","year":"2019","unstructured":"Ding, Z., Han, X., Niethammer, M.: VoteNet: a deep learning label fusion method for multi-atlas segmentation. In: Shen, D., et al. (eds.) MICCAI 2019. LNCS, vol. 11766, pp. 202\u2013210. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-32248-9_23"},{"key":"11_CR19","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: The KITTI vision benchmark suite. 2(5) (2015). http:\/\/www.cvlibs.net\/datasets\/kitti"},{"key":"11_CR20","unstructured":"Gong, T., et al.: Multimodal-GPT: a vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790 (2023)"},{"issue":"4","key":"11_CR21","doi-asserted-by":"publisher","first-page":"471","DOI":"10.3390\/electronics10040471","volume":"10","author":"Z Guo","year":"2021","unstructured":"Guo, Z., Huang, Y., Hu, X., Wei, H., Zhao, B.: A survey on deep learning based approaches for scene understanding in autonomous driving. Electronics 10(4), 471 (2021)","journal-title":"Electronics"},{"key":"11_CR22","unstructured":"Hong, Y., et al.: 3D-LLM: injecting the 3D world into large language models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"issue":"7","key":"11_CR23","doi-asserted-by":"publisher","first-page":"1005","DOI":"10.1007\/s00138-016-0784-4","volume":"27","author":"R Horaud","year":"2016","unstructured":"Horaud, R., Hansard, M., Evangelidis, G., M\u00e9nier, C.: An overview of depth cameras and range scanners based on time-of-flight technologies. Mach. Vis. Appl. 27(7), 1005\u20131020 (2016)","journal-title":"Mach. Vis. Appl."},{"key":"11_CR24","unstructured":"Hua, B.S., et\u00a0al.: SHREC\u201917: RGB-D to CAD retrieval with ObjectNN dataset. In: Proceedings of the Eurographic Workshop 3D Object Retrieval, pp. 25\u201332 (2017)"},{"key":"11_CR25","doi-asserted-by":"crossref","unstructured":"Jatavallabhula, K.M., et\u00a0al.: ConceptFusion: open-set multimodal 3D mapping. arXiv preprint arXiv:2302.07241 (2023)","DOI":"10.15607\/RSS.2023.XIX.066"},{"issue":"2","key":"11_CR26","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1109\/TVCG.2008.96","volume":"15","author":"D Kalkofen","year":"2008","unstructured":"Kalkofen, D., Mendez, E., Schmalstieg, D.: Comprehensible visualization for augmented reality. IEEE Trans. Visual Comput. Graph. 15(2), 193\u2013204 (2008)","journal-title":"IEEE Trans. Visual Comput. Graph."},{"key":"11_CR27","unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision (2021)"},{"issue":"2","key":"11_CR28","first-page":"24","volume":"21","author":"M Lemmens","year":"2007","unstructured":"Lemmens, M.: Airborne lidar sensors. GIM Int. 21(2), 24\u201327 (2007)","journal-title":"GIM Int."},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Li, C., et\u00a0al.: mPLUG: effective and efficient vision-language learning by cross-modal skip-connections. 
arXiv preprint arXiv:2205.12005 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"11_CR30","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"11_CR31","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: 3DMIT: 3D multi-modal instruction tuning for scene understanding. arXiv preprint arXiv:2401.03201 (2024)","DOI":"10.1109\/ICMEW63481.2024.10645462"},{"key":"11_CR33","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"11_CR34","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"11_CR35","doi-asserted-by":"crossref","unstructured":"Naeem, M.F., et al.: I2MVFormer: large language model generated multi-view document supervision for zero-shot image classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15169\u201315179 (2023)","DOI":"10.1109\/CVPR52729.2023.01456"},{"key":"11_CR36","doi-asserted-by":"publisher","first-page":"1859","DOI":"10.1109\/ACCESS.2018.2886133","volume":"7","author":"M Naseer","year":"2018","unstructured":"Naseer, M., Khan, S., Porikli, F.: Indoor scene understanding in 2.5\/3D for autonomous agents: a survey. IEEE Access 7, 1859\u20131887 (2018)","journal-title":"IEEE Access"},{"key":"11_CR37","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"11_CR38","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR39","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"11_CR40","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"11_CR41","unstructured":"Ramakrishnan, S.K., et\u00a0al.: Habitat-Matterport 3D dataset (HM3D): 1000 large-scale 3D environments for embodied AI. arXiv preprint arXiv:2109.08238 (2021)"},{"key":"11_CR42","doi-asserted-by":"crossref","unstructured":"Rasheed, H., et al.: GLaMM: pixel grounding large multimodal model. arXiv preprint arXiv:2311.03356 (2023)","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"11_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1007\/978-3-031-20074-8_9","volume-title":"Computer Vision \u2013 ECCV 2022","author":"D Schwenk","year":"2022","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-OKVQA: a benchmark for visual question answering using world knowledge. 
In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13668, pp. 146\u2013162. Springer, Cham (2022)"},{"key":"11_CR44","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"11_CR45","unstructured":"Wang, Z., Huang, H., Zhao, Y., Zhang, Z., Zhao, Z.: Chat-3D: data-efficiently tuning large language model for universal dialogue of 3D scenes. arXiv preprint arXiv:2308.08769 (2023)"},{"key":"11_CR46","unstructured":"Wu, W., Yao, H., Zhang, M., Song, Y., Ouyang, W., Wang, J.: GPT4Vis: what can GPT-4 do for zero-shot visual recognition? arXiv preprint arXiv:2311.15732 (2023)"},{"key":"11_CR47","unstructured":"Wu, Z., et al.: 3D ShapeNets: a deep representation for volumetric shapes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1912\u20131920 (2015)"},{"key":"11_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"615","DOI":"10.1007\/978-3-031-19836-6_35","volume-title":"Computer Vision \u2013 ECCV 2022","author":"X Xu","year":"2022","unstructured":"Xu, X., Qiu, J., Wang, X., Wang, Z.: Relationship spatialization for depth estimation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13697, pp. 615\u2013637. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_35"},{"key":"11_CR49","unstructured":"Yang, J., Zhang, H., Li, F., Zou, X., Li, C., Gao, J.: Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V. arXiv preprint arXiv:2310.11441 (2023)"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q.: Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6281\u20136290 (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"11_CR51","unstructured":"Zhang, R., et al.: Llama-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"11_CR52","doi-asserted-by":"crossref","unstructured":"Zhang, S., Deng, J., Bai, L., Li, H., Ouyang, W., Zhang, Y.: HVDistill: transferring knowledge from images to point clouds via unsupervised hybrid-view distillation. Int. J. Comput. 
Vision 1\u201315 (2024)","DOI":"10.1007\/s11263-023-01981-w"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72655-2_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T11:29:57Z","timestamp":1733398197000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72655-2_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"ISBN":["9783031726545","9783031726552"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72655-2_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"6 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
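For readers who want to consume a record like the one above programmatically rather than by eye, here is a minimal sketch. It assumes network access to the public Crossref REST API at api.crossref.org; the response envelope (`{"status":"ok", ..., "message":{...}}`) and the `message` field names used below (`title`, `author`, `page`, `references-count`, `reference`) all match the record shown here. The helper name `fetch_work` is illustrative, not part of any library.

```python
# Minimal sketch: fetch a Crossref work record and pick out a few fields.
# Uses only the Python standard library; assumes the public Crossref API
# is reachable and returns the envelope shape seen in the record above.
import json
import urllib.request

DOI = "10.1007/978-3-031-72655-2_11"  # the Agent3D-Zero chapter, per the record above

def fetch_work(doi: str) -> dict:
    """GET https://api.crossref.org/works/<doi> and return its 'message' object."""
    url = f"https://api.crossref.org/works/{doi}"
    with urllib.request.urlopen(url) as resp:
        payload = json.load(resp)  # the response body is JSON
    if payload.get("status") != "ok":
        raise RuntimeError(f"unexpected Crossref status: {payload.get('status')}")
    return payload["message"]

work = fetch_work(DOI)
# "title" is a list of strings; "author" entries carry "given" and "family".
print(work["title"][0])
print(", ".join(f"{a.get('given', '')} {a['family']}".strip() for a in work["author"]))
print(f"pages {work['page']}, {work['references-count']} references")

# The deposited reference list is a JSON array; each entry has a "key" and,
# when the citation resolves, a "DOI" alongside its "unstructured" string.
refs = work.get("reference", [])
print(f"{sum(1 for r in refs if 'DOI' in r)}/{len(refs)} references carry a DOI")
```

The same field accesses apply if the blob above is loaded directly with `json.loads` instead of fetched over HTTP; the stdlib-only approach is chosen here so the sketch runs without third-party dependencies.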