{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,30]],"date-time":"2026-05-30T02:51:55Z","timestamp":1780109515942,"version":"3.54.0"},"publisher-location":"Cham","reference-count":162,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031727740","type":"print"},{"value":"9783031727757","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72775-7_13","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"214-238","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":41,"title":["ShapeLLM: Universal 3D Object Understanding for\u00a0Embodied Interaction"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2554-5141","authenticated-orcid":false,"given":"Zekun","family":"Qi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1104-7897","authenticated-orcid":false,"given":"Runpei","family":"Dong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5550-905X","authenticated-orcid":false,"given":"Shaochen","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7828-3241","authenticated-orcid":false,"given":"Haoran","family":"Geng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9725-280X","authenticated-orcid":false,"given":"Chunrui","family":"Han","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8630-8270","authenticated-orcid":false,"given":"Zheng","family":"Ge","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9319-0354","authenticated-orcid":false,"given":"Li","family":"Yi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9226-3366","authenticated-orcid":false,"given":"Kaisheng","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"13_CR1","unstructured":"Achlioptas, P., Diamanti, O., Mitliagkas, I., Guibas, L.J.: Learning representations and generative models for 3D point clouds. In: International Conference on Machine Learning (ICML) (2018)"},{"key":"13_CR2","unstructured":"Alayrac, J., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Bai, Y., et al.: Sequential modeling enables scalable learning for large vision models. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02157"},{"key":"13_CR4","unstructured":"Betker, J., et al.: Improving image generation with better captions (2023)"},{"key":"13_CR5","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. CoRR abs\/2108.07258 (2021)"},{"key":"13_CR6","doi-asserted-by":"publisher","unstructured":"Bradski, G., Grossberg, S.: Recognition of 3-D objects from multiple 2-D views by a self-organizing neural architecture. In: Cherkassky, V., Friedman, J.H., Wechsler, H. (eds.) NATO ASI Series, vol. 136, pp. 349\u2013375. Springer, Heidelberg (1994). https:\/\/doi.org\/10.1007\/978-3-642-79119-2_17","DOI":"10.1007\/978-3-642-79119-2_17"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Bronstein, A.M., Bronstein, M.M., Guibas, L.J., Ovsjanikov, M.: Shape google: geometric words and expressions for invariant shape retrieval. ACM Trans. Graph. 30(1), 1:1\u20131:20 (2011)","DOI":"10.1145\/1899404.1899405"},{"key":"13_CR8","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"13_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"13_CR10","unstructured":"Chang, A.X., et al.: ShapeNet: an information-rich 3D model repository. CoRR abs\/1512.03012 (2015)"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Chang, M., et al.: GOAT: GO to any thing. In: Robotics: Science and Systems (RSS) (2024)","DOI":"10.15607\/RSS.2024.XX.073"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Chen, B., et al.: SpatialVLM: endowing vision-language models with spatial reasoning capabilities. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"13_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-58565-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"DZ Chen","year":"2020","unstructured":"Chen, D.Z., Chang, A.X., Nie\u00dfner, M.: ScanRefer: 3D object localization in RGB-D scans using natural language. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 202\u2013221. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_13"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Chen, D.Z., Gholami, A., Nie\u00dfner, M., Chang, A.X.: Scan2Cap: context-aware dense captioning in RGB-D scans. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"13_CR15","unstructured":"Chen, G., Wang, M., Yang, Y., Yu, K., Yuan, L., Yue, Y.: PointGPT: auto-regressively generative pre-training from point clouds. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR16","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. CoRR abs\/2306.15195 (2023)"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Chen, S., Garcia, R., Laptev, I., Schmid, C.: SUGAR: pre-training 3D visual representations for robotics. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18049\u201318060 (2024)","DOI":"10.1109\/CVPR52733.2024.01709"},{"key":"13_CR18","unstructured":"Chen, X., et al.: PaLI-X: on scaling up a multilingual vision and language model. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"13_CR19","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality, March 2023. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"13_CR20","doi-asserted-by":"crossref","unstructured":"Collins, J., et\u00a0al.: ABO: dataset and benchmarks for real-world 3D object understanding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.02045"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"13_CR22","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"issue":"5","key":"13_CR23","doi-asserted-by":"publisher","first-page":"1242","DOI":"10.1109\/TPAMI.2018.2828437","volume":"41","author":"A Das","year":"2019","unstructured":"Das, A., et al.: Visual dialog. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) 41(5), 1242\u20131256 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI)"},{"key":"13_CR24","doi-asserted-by":"crossref","unstructured":"Davison, J., Feldman, J., Rush, A.M.: Commonsense knowledge mining from pretrained models. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, 3\u20137 November 2019 (2019)","DOI":"10.18653\/v1\/D19-1109"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Deitke, M., et al.: Objaverse: a universe of annotated 3d objects. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"13_CR26","doi-asserted-by":"crossref","unstructured":"Ding, Y., Zhang, X., Paxton, C., Zhang, S.: Task and motion planning with large language models for object rearrangement. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS) (2023)","DOI":"10.1109\/IROS55552.2023.10342169"},{"key":"13_CR27","unstructured":"Dong, R., et al.: DreamLLM: synergistic multimodal comprehension and creation. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR28","unstructured":"Dong, R., et al.: Autoencoders as cross-modal teachers: can pretrained 2D image transformers help 3D representation learning? In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"13_CR29","unstructured":"Driess, D., et al.: PaLM-E: an embodied multimodal language model. In: International Conference on Machine Learning (ICML) (2023)"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Fan, G., Qi, Z., Shi, W., Ma, K.: Point-GCC: universal self-supervised 3D scene pre-training via geometry-color contrast. CoRR abs\/2305.19623 (2023)","DOI":"10.1145\/3664647.3681343"},{"key":"13_CR31","doi-asserted-by":"publisher","first-page":"3313","DOI":"10.1007\/s11263-021-01534-z","volume":"129","author":"H Fu","year":"2021","unstructured":"Fu, H., et al.: 3D-future: 3D furniture shape with texture. Int. J. Comput. Vision 129, 3313\u20133337 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"Gao, Y., Wang, Z., Zheng, W.S., Xie, C., Zhou, Y.: Sculpting holistic 3D representation in contrastive language-image-3D pre-training. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02170"},{"key":"13_CR33","unstructured":"Ge, Y., Ge, Y., Zeng, Z., Wang, X., Shan, Y.: Planting a SEED of vision in large language model. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR34","doi-asserted-by":"crossref","unstructured":"Geng, H., Li, Z., Geng, Y., Chen, J., Dong, H., Wang, H.: PartManip: learning cross-category generalizable part manipulation policy from point cloud observations. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00291"},{"key":"13_CR35","doi-asserted-by":"crossref","unstructured":"Geng, H., Wei, S., Deng, C., Shen, B., Wang, H., Guibas, L.: SAGE: bridging semantic and actionable parts for generalizable articulated-object manipulation under language instructions. In: Robotics: Science and Systems (RSS) (2024)","DOI":"10.15607\/RSS.2024.XX.016"},{"key":"13_CR36","doi-asserted-by":"crossref","unstructured":"Geng, H., et al.: GAPartNet: cross-category domain-generalizable object perception and manipulation via generalizable and actionable parts. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00684"},{"key":"13_CR37","doi-asserted-by":"crossref","unstructured":"Geng, Y., An, B., Geng, H., Chen, Y., Yang, Y., Dong, H.: RLAfford: end-to-end affordance learning for robotic manipulation. In: IEEE International Conference on Robotics and Automation (ICRA) (2023)","DOI":"10.1109\/ICRA48891.2023.10161571"},{"key":"13_CR38","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"13_CR39","doi-asserted-by":"crossref","unstructured":"Gong, R., et al.: ARNOLD: a benchmark for language-grounded task learning with continuous states in realistic 3D scenes. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01873"},{"key":"13_CR40","doi-asserted-by":"crossref","unstructured":"Grabner, H., Gall, J., Gool, L.V.: What makes a chair a chair? In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2011)","DOI":"10.1109\/CVPR.2011.5995327"},{"key":"13_CR41","unstructured":"Guo, Z., et al.: Point-bind & point-LLM: aligning point cloud with multi-modality for 3D understanding, generation, and instruction following. CoRR abs\/2309.00615 (2023)"},{"key":"13_CR42","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"13_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/978-3-319-10584-0_23","volume-title":"Computer Vision \u2013 ECCV 2014","author":"S Gupta","year":"2014","unstructured":"Gupta, S., Girshick, R., Arbel\u00e1ez, P., Malik, J.: Learning rich features from RGB-D images for object detection and segmentation. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 345\u2013360. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10584-0_23"},{"key":"13_CR44","doi-asserted-by":"crossref","unstructured":"Gupta, T., Kembhavi, A.: Visual programming: compositional visual reasoning without training. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"13_CR45","doi-asserted-by":"crossref","unstructured":"Hamdi, A., Giancola, S., Ghanem, B.: MVTN: multi-view transformation network for 3D shape recognition. In: International Conference on Computer Vision (ICCV), pp. 1\u201311. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00007"},{"key":"13_CR46","unstructured":"Hong, Y., et al.: 3D-LLM: injecting the 3D world into large language models. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR47","doi-asserted-by":"crossref","unstructured":"Hou, J., Xie, S., Graham, B., Dai, A., Nie\u00dfner, M.: Pri3D: can 3D priors help 2D representation learning? In: International Conference on Computer Vision (ICCV), pp. 5673\u20135682. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00564"},{"key":"13_CR48","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"13_CR49","doi-asserted-by":"crossref","unstructured":"Hu, R., van Kaick, O., Wu, B., Huang, H., Shamir, A., Zhang, H.: Learning how objects function via co-analysis of interactions. ACM Trans. Graph. 35(4), 47:1\u201347:13 (2016)","DOI":"10.1145\/2897824.2925870"},{"key":"13_CR50","doi-asserted-by":"crossref","unstructured":"Hu, R., Li, W., van Kaick, O., Shamir, A., Zhang, H., Huang, H.: Learning to predict part mobility from a single static snapshot. ACM Trans. Graph. 36(6), 227:1\u2013227:13 (2017)","DOI":"10.1145\/3130800.3130811"},{"key":"13_CR51","doi-asserted-by":"crossref","unstructured":"Hu, R., Zhu, C., van Kaick, O., Liu, L., Shamir, A., Zhang, H.: Interaction context (ICON): towards a geometric functionality descriptor. ACM Trans. Graph. 34(4), 83:1\u201383:12 (2015)","DOI":"10.1145\/2766914"},{"key":"13_CR52","unstructured":"Huang, J., et al.: An embodied generalist agent in 3D world. In: International Conference on Machine Learning (ICML) (2024)"},{"key":"13_CR53","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: CLIP2Point: transfer CLIP to point cloud classification with image-depth pre-training. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"13_CR54","unstructured":"Huang, W., Mordatch, I., Pathak, D.: One policy to control them all: shared modular policies for agent-agnostic control. In: International Conference on Machine Learning (ICML) (2020)"},{"key":"13_CR55","unstructured":"Huang, W., Wang, C., Zhang, R., Li, Y., Wu, J., Fei-Fei, L.: VoxPoser: composable 3D value maps for robotic manipulation with language models. In: Annual Conference on Robot Learnin (CoRL) (2023)"},{"key":"13_CR56","unstructured":"Huang, W., et al.: Inner monologue: embodied reasoning through planning with language models. In: Annual Conference on Robot Learning (CoRL) (2022)"},{"key":"13_CR57","unstructured":"Ichter, B., et al.: Do as I can, not as I say: grounding language in robotic affordances. In: Annual Conference on Robot Learnin (CoRL) (2022)"},{"key":"13_CR58","unstructured":"Ilharco, G., et al.: OpenCLIP, July 2021"},{"key":"13_CR59","doi-asserted-by":"publisher","unstructured":"Jia, M., et al.: Visual prompt tuning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13693, pp. 709\u2013727. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_41","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"13_CR60","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1162\/tacl_a_00324","volume":"8","author":"Z Jiang","year":"2020","unstructured":"Jiang, Z., Xu, F.F., Araki, J., Neubig, G.: How can we know what language models know. Trans. Assoc. Comput. Linguistics 8, 423\u2013438 (2020)","journal-title":"Trans. Assoc. Comput. Linguistics"},{"issue":"9","key":"13_CR61","doi-asserted-by":"publisher","first-page":"920","DOI":"10.1109\/34.310690","volume":"16","author":"T Kanade","year":"1994","unstructured":"Kanade, T., Okutomi, M.: A stereo matching algorithm with an adaptive window: theory and experiment. IEEE Trans. Pattern Anal. Mach. Intell. 16(9), 920\u2013932 (1994)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"13_CR62","doi-asserted-by":"crossref","unstructured":"Kim, V.G., Chaudhuri, S., Guibas, L.J., Funkhouser, T.A.: Shape2Pose: human-centric shape analysis. ACM Trans. Graph. 33(4), 120:1\u2013120:12 (2014)","DOI":"10.1145\/2601097.2601117"},{"key":"13_CR63","unstructured":"Koh, J.Y., Fried, D., Salakhutdinov, R.: Generating images with multimodal language models. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"issue":"1\u20132","key":"13_CR64","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"HW Kuhn","year":"1955","unstructured":"Kuhn, H.W.: The Hungarian method for the assignment problem. Naval Res. Logistics Q. 2(1\u20132), 83\u201397 (1955)","journal-title":"Naval Res. Logistics Q."},{"key":"13_CR65","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning (ICML) (2023)"},{"key":"13_CR66","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers) (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"13_CR67","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, H., Yi, L., Guibas, L.J., Abbott, A.L., Song, S.: Category-level articulated object pose estimation. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00376"},{"key":"13_CR68","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: ManipLLM: embodied multimodal large language model for object-centric robotic manipulation (2023)","DOI":"10.1109\/CVPR52733.2024.01710"},{"key":"13_CR69","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, X., Wen, J.R.: Evaluating object hallucination in large vision-language models. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 292\u2013305. Association for Computational Linguistics, Singapore (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"13_CR70","doi-asserted-by":"crossref","unstructured":"Liang, Y., et\u00a0al.: TaskMatrix.AI: completing tasks by connecting foundation models with millions of APIs. Intell. Comput. 3, 0063 (2024)","DOI":"10.34133\/icomputing.0063"},{"issue":"8","key":"13_CR71","doi-asserted-by":"publisher","first-page":"1345","DOI":"10.1007\/s10514-023-10131-7","volume":"47","author":"K Lin","year":"2023","unstructured":"Lin, K., Agia, C., Migimatsu, T., Pavone, M., Bohg, J.: Text2Motion: from natural language instructions to feasible plans. Auton. Robot. 47(8), 1345\u20131365 (2023)","journal-title":"Auton. Robot."},{"key":"13_CR72","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"13_CR73","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR74","unstructured":"Liu, M., et al.: OpenShape: scaling up 3D shape representation towards open-world understanding. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR75","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, B., Wang, H., Yi, L.: Few-shot physically-aware articulated mesh generation via hierarchical deformation. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00085"},{"key":"13_CR76","unstructured":"Liu, X., Yi, L.: GeneOH diffusion: towards generalizable hand-object interaction denoising via denoising diffusion. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR77","unstructured":"Liu, X., Zhang, J., Hu, R., Huang, H., Wang, H., Yi, L.: Self-supervised category-level articulated object pose estimation with part-level SE(3) equivariance. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"13_CR78","doi-asserted-by":"crossref","unstructured":"Liu, Y., Fan, B., Xiang, S., Pan, C.: Relation-shape convolutional neural network for point cloud analysis. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00910"},{"key":"13_CR79","unstructured":"Liu, Y., et al.: MMBench: is your multi-modal model an all-around player? CoRR abs\/2307.06281 (2023)"},{"key":"13_CR80","unstructured":"Liu, Y., et al.: SyncDreamer: generating multiview-consistent images from a single-view image. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR81","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, J., Zhang, Z., Huang, J., Yi, L.: LeaF: learning frames for 4D point cloud sequence understanding. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00062"},{"key":"13_CR82","doi-asserted-by":"crossref","unstructured":"Lu, C., et al.: Beyond holistic object recognition: enriching image understanding with part states. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00727"},{"key":"13_CR83","unstructured":"Luo, T., Rockwell, C., Lee, H., Johnson, J.: Scalable 3D captioning with pretrained models. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR84","unstructured":"Ma, X., et al.: SQA3D: situated question answering in 3D scenes. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"13_CR85","unstructured":"Ma, X., Qin, C., You, H., Ran, H., Fu, Y.: Rethinking network design and local geometry in point cloud: a simple residual MLP framework. In: International Conference on Learning Representations (ICLR). OpenReview.net (2022)"},{"key":"13_CR86","doi-asserted-by":"crossref","unstructured":"Mo, K., et al.: PartNet: a large-scale benchmark for fine-grained and hierarchical part-level 3D object understanding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00100"},{"key":"13_CR87","unstructured":"Mu, Y., et al.: EmbodiedGPT: vision-language pre-training via embodied chain of thought. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR88","unstructured":"OpenAI: GPT-4 technical report. CoRR abs\/2303.08774 (2023). https:\/\/openai.com\/research\/gpt-4"},{"key":"13_CR89","unstructured":"OpenAI: GPT-4V(ision) system card (2023). https:\/\/openai.com\/research\/gpt-4v-system-card"},{"key":"13_CR90","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"13_CR91","unstructured":"Pan, X., Dong, L., Huang, S., Peng, Z., Chen, W., Wei, F.: Kosmos-G: generating images in context with multimodal large language models. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR92","doi-asserted-by":"publisher","unstructured":"Pang, Y., Wang, W., Tay, F.E.H., Liu, W., Tian, Y., Yuan, L.: Masked autoencoders for point cloud self-supervised learning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13662, pp. 604\u2013621. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20086-1_35","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"13_CR93","unstructured":"Peng, B., Li, C., He, P., Galley, M., Gao, J.: Instruction tuning with GPT-4. CoRR abs\/2304.03277 (2023)"},{"key":"13_CR94","doi-asserted-by":"crossref","unstructured":"Peng, S., Genova, K., Jiang, C.M., Tagliasacchi, A., Pollefeys, M., Funkhouser, T.A.: OpenScene: 3D scene understanding with open vocabularies. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"13_CR95","unstructured":"Peng, Z., et al.: KOSMOS-2: grounding multimodal large language models to the world. CoRR abs\/2306.14824 (2023)"},{"key":"13_CR96","doi-asserted-by":"crossref","unstructured":"Petroni, F., et al.: Language models as knowledge bases? In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, 3\u20137 November 2019 (2019)","DOI":"10.18653\/v1\/D19-1250"},{"key":"13_CR97","doi-asserted-by":"crossref","unstructured":"Pirk, S., et al.: Understanding and exploiting object interaction landscapes. ACM Trans. Graph. 36(3), 31:1\u201331:14 (2017)","DOI":"10.1145\/3083725"},{"key":"13_CR98","doi-asserted-by":"crossref","unstructured":"Qi, C.R., Su, H., Mo, K., Guibas, L.J.: PointNet: deep learning on point sets for 3D classification and segmentation. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 77\u201385 (2017)","DOI":"10.1109\/CVPR.2017.16"},{"key":"13_CR99","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: PointNet++: deep hierarchical feature learning on point sets in a metric space. In: Advances in Neural Information Processing Systems, pp. 5099\u20135108 (2017)"},{"key":"13_CR100","unstructured":"Qi, H., Kumar, A., Calandra, R., Ma, Y., Malik, J.: In-hand object rotation via rapid motor adaptation. In: Annual Conference on Robot Learning (CoRL) (2023)"},{"key":"13_CR101","unstructured":"Qi, Z., et al.: Contrast with reconstruct: contrastive 3D representation learning guided by generative pretraining. In: International Conference on Machine Learning (ICML) (2023)"},{"key":"13_CR102","unstructured":"Qi, Z., Yu, M., Dong, R., Ma, K.: VPP: efficient conditional 3D generation via voxel-point progressive representation. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR103","doi-asserted-by":"crossref","unstructured":"Qi, Z., et al.: GPT4Point: a unified framework for point-language understanding and generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26417\u201326427 (2024)","DOI":"10.1109\/CVPR52733.2024.02495"},{"key":"13_CR104","unstructured":"Qian, G., et al.: PointNeXt: revisiting PointNet++ with improved training and scaling strategies. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"13_CR105","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML). Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021)"},{"key":"13_CR106","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training (2018)"},{"issue":"8","key":"13_CR107","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"13_CR108","unstructured":"Ren, J., Pan, L., Liu, Z.: Benchmarking and analyzing point cloud classification under corruptions. In: International Conference on Machine Learning (ICML) (2022)"},{"key":"13_CR109","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Hendricks, L.A., Burns, K., Darrell, T., Saenko, K.: Object hallucination in image captioning. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, Brussels, Belgium, 31 October\u20134 November 2018 (2018)","DOI":"10.18653\/v1\/D18-1437"},{"key":"13_CR110","unstructured":"Shen, W., Yang, G., Yu, A., Wong, J., Kaelbling, L.P., Isola, P.: Distilled feature fields enable few-shot language-guided manipulation. In: Annual Conference on Robot Learning (CoRL) (2023)"},{"key":"13_CR111","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: HuggingGPT: solving AI tasks with ChatGPT and its friends in HuggingFace. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR112","unstructured":"Shi, H., Xu, H., Clarke, S., Li, Y., Wu, J.: RoboCook: long-horizon elasto-plastic object manipulation with diverse tools. In: Annual Conference on Robot Learning (CoRL) (2023)"},{"key":"13_CR113","unstructured":"Shutterstock: Turbosquid. https:\/\/www.turbosquid.com\/"},{"key":"13_CR114","doi-asserted-by":"crossref","unstructured":"Su, H., Maji, S., Kalogerakis, E., Learned-Miller, E.G.: Multi-view convolutional neural networks for 3D shape recognition. In: International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.114"},{"key":"13_CR115","unstructured":"Sun, J., Zhang, Q., Kailkhura, B., Yu, Z., Xiao, C., Mao, Z.M.: ModelNet40-C: a robustness benchmark for 3D point cloud recognition under corruption. In: ICLR 2022 Workshop on Socially Responsible Machine Learning (2022)"},{"key":"13_CR116","doi-asserted-by":"crossref","unstructured":"Sun, Q., et al.: Generative multimodal models are in-context learners. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"13_CR117","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., Cao, Y.: EVA-CLIP: improved training techniques for CLIP at scale. CoRR abs\/2303.15389 (2023)"},{"key":"13_CR118","unstructured":"Sun, Q., et al.: Emu: generative pretraining in multimodality. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR119","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Menon, S., Vondrick, C.: ViperGPT: visual inference via Python execution for reasoning. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"13_CR120","unstructured":"Taori, R., et al.: Stanford Alpaca: an instruction-following LLaMA model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"13_CR121","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. CoRR abs\/2302.13971 (2023)"},{"key":"13_CR122","doi-asserted-by":"crossref","unstructured":"Uy, M.A., Pham, Q.H., Hua, B.S., Nguyen, T., Yeung, S.K.: Revisiting point cloud classification: a new benchmark dataset and classification model on real-world data. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1588\u20131597 (2019)","DOI":"10.1109\/ICCV.2019.00167"},{"key":"13_CR123","doi-asserted-by":"crossref","unstructured":"Wan, W., et al.: UniDexGrasp++: improving dexterous grasping policy learning via geometry-aware curriculum and iterative generalist-specialist learning. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00360"},{"key":"13_CR124","unstructured":"Wang, G., et al.: Voyager: an open-ended embodied agent with large language models. T. Mach. Learn. Res. (TMLR) (2024)"},{"key":"13_CR125","doi-asserted-by":"crossref","unstructured":"Wang, H., Sridhar, S., Huang, J., Valentin, J., Song, S., Guibas, L.J.: Normalized object coordinate space for category-level 6d object pose and size estimation. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00275"},{"key":"13_CR126","doi-asserted-by":"crossref","unstructured":"Wang, Y., Sun, Y., Liu, Z., Sarma, S.E., Bronstein, M.M., Solomon, J.M.: Dynamic graph CNN for learning on point clouds. ACM Trans. Graph. 38(5), 146:1\u2013146:12 (2019)","DOI":"10.1145\/3326362"},{"key":"13_CR127","doi-asserted-by":"crossref","unstructured":"Wang, Z., Yu, X., Rao, Y., Zhou, J., Lu, J.: Take-a-photo: 3D-to-2D generative pre-training of point cloud models. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00519"},{"key":"13_CR128","doi-asserted-by":"publisher","unstructured":"Wen, H., Liu, Y., Huang, J., Duan, B., Yi, L.: Point primitive transformer for long-term 4D point cloud video understanding. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13689, pp. 19\u201335. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19818-2_2","DOI":"10.1007\/978-3-031-19818-2_2"},{"key":"13_CR129","doi-asserted-by":"crossref","unstructured":"Weng, Y., et al.: CAPTRA: category-level pose tracking for rigid and articulated objects from point clouds. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01296"},{"key":"13_CR130","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. CoRR abs\/2303.04671 (2023)"},{"key":"13_CR131","unstructured":"Wu, S., Fei, H., Qu, L., Ji, W., Chua, T.: Next-GPT: any-to-any multimodal LLM. In: International Conference on Machine Learning (ICML) (2024)"},{"key":"13_CR132","unstructured":"Wu, Z., et al.: 3D ShapeNets: a deep representation for volumetric shapes. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1912\u20131920 (2015)"},{"key":"13_CR133","doi-asserted-by":"crossref","unstructured":"Xu, R., Wang, X., Wang, T., Chen, Y., Pang, J., Lin, D.: PointLLM: empowering large language models to understand point clouds. CoRR abs\/2308.16911 (2023)","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"13_CR134","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: UniDexGrasp: universal robotic dexterous grasping via learning diverse proposal generation and goal-conditioned policy. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00459"},{"key":"13_CR135","doi-asserted-by":"crossref","unstructured":"Xu, Z., Shen, Y., Huang, L.: MULTIINSTRUCT: improving multi-modal zero-shot learning via instruction tuning. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL) (Volume 1: Long Papers) (2023)","DOI":"10.18653\/v1\/2023.acl-long.641"},{"key":"13_CR136","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: ULIP: learning unified representation of language, image and point cloud for 3D understanding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"13_CR137","doi-asserted-by":"crossref","unstructured":"Xue, L., et al.: ULIP-2: towards scalable multimodal pre-training for 3D understanding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02558"},{"key":"13_CR138","unstructured":"Yang, R., et al.: GPT4Tools: teaching large language model to use tools via self-instruction. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR139","unstructured":"Yang, Z., et al.: MM-REACT: prompting ChatGPT for multimodal reasoning and action. CoRR abs\/2303.11381 (2023)"},{"key":"13_CR140","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality. CoRR abs\/2304.14178 (2023)"},{"key":"13_CR141","unstructured":"Ye, S., Chen, D., Han, S., Liao, J.: 3D question answering. IEEE Trans. Vis. Comput. Graph. (2022)"},{"issue":"6","key":"13_CR142","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1145\/3272127.3275027","volume":"37","author":"L Yi","year":"2018","unstructured":"Yi, L., Huang, H., Liu, D., Kalogerakis, E., Su, H., Guibas, L.J.: Deep part induction from articulated object pairs. ACM Trans. Graph. 37(6), 209 (2018)","journal-title":"ACM Trans. Graph."},{"key":"13_CR143","unstructured":"You, Y., Shen, B., Deng, C., Geng, H., Wang, H., Guibas, L.J.: Make a donut: language-guided hierarchical EMD-space planning for zero-shot deformable object manipulation. CoRR abs\/2311.02787 (2023)"},{"key":"13_CR144","unstructured":"Yu, W., et al.: MM-Vet: evaluating large multimodal models for integrated capabilities. In: International Conference on Machine Learning (ICML) (2024)"},{"key":"13_CR145","doi-asserted-by":"crossref","unstructured":"Yu, X., Tang, L., Rao, Y., Huang, T., Zhou, J., Lu, J.: Point-BERT: pre-training 3D point cloud transformers with masked point modeling. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"13_CR146","doi-asserted-by":"publisher","unstructured":"Zeid, K.A., Schult, J., Hermans, A., Leibe, B.: Point2Vec for self-supervised representation learning on point clouds. In: K\u00f6the, U., Rother, C. (eds.) DAGM GCPR 2023. LNCS, vol. 14264, pp. 131\u2013146. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-54605-1_9","DOI":"10.1007\/978-3-031-54605-1_9"},{"key":"13_CR147","doi-asserted-by":"crossref","unstructured":"Zhang, J., Dong, R., Ma, K.: CLIP-FO3D: learning free open-world 3D scene representations from 2D dense CLIP. In: International Conference on Computer Vision (ICCV Workshop) (2023)","DOI":"10.1109\/ICCVW60793.2023.00219"},{"key":"13_CR148","unstructured":"Zhang, R., et al.: Point-m2AE: multi-scale masked autoencoders for hierarchical point cloud pre-training. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"13_CR149","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: PointCLIP: point cloud understanding by CLIP. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"13_CR150","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of language models with zero-init attention. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR151","doi-asserted-by":"crossref","unstructured":"Zhang, R., Wang, L., Qiao, Y., Gao, P., Li, H.: Learning 3D representations from 2D pre-trained models via image-to-point masked autoencoders. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.02085"},{"key":"13_CR152","unstructured":"Zhang, S., et al.: GPT4RoI: instruction tuning large language model on region-of-interest. CoRR abs\/2307.03601 (2023)"},{"key":"13_CR153","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Cao, S., Wang, Y.: TAMM: TriAdapter multi-modal learning for 3D shape understanding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02023"},{"key":"13_CR154","doi-asserted-by":"crossref","unstructured":"Zhao, L., et al.: ChatSpot: bootstrapping multimodal LLMs via precise referring instruction tuning. In: International Joint Conference on Artificial Intelligence (IJCAI) (2024)","DOI":"10.24963\/ijcai.2024\/193"},{"key":"13_CR155","doi-asserted-by":"crossref","unstructured":"Zhao, X., Wang, H., Komura, T.: Indexing 3D scenes using the interaction bisector surface. ACM Trans. Graph. 33(3), 22:1\u201322:14 (2014)","DOI":"10.1145\/2574860"},{"key":"13_CR156","doi-asserted-by":"crossref","unstructured":"Zheng, J., Zheng, Q., Fang, L., Liu, Y., Yi, L.: CAMS: canonicalized manipulation spaces for category-level functional hand-object manipulation synthesis. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00064"},{"key":"13_CR157","unstructured":"Zheng, L., et al.: Judging LLM-as-a-judge with MT-bench and chatbot arena. In: Advances in Neural Information Processing Systems (NeurIPS) (2024)"},{"key":"13_CR158","unstructured":"Zhou, J., Wang, J., Ma, B., Liu, Y., Huang, T., Wang, X.: Uni3D: exploring unified 3D representation at scale. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR159","unstructured":"Zhou, Y., et al.: Analyzing and mitigating object hallucination in large vision-language models. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR160","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"13_CR161","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: PointCLIP V2: prompting CLIP and GPT for powerful 3D open-world learning. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00249"},{"key":"13_CR162","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Ma, X., Chen, Y., Deng, Z., Huang, S., Li, Q.: 3D-VisTA: pre-trained transformer for 3D vision and text alignment. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00272"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72775-7_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:24:11Z","timestamp":1732829051000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72775-7_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031727740","9783031727757"],"references-count":162,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72775-7_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}