{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T10:40:01Z","timestamp":1745404801353,"version":"3.40.4"},"reference-count":62,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1016\/j.neucom.2025.130072","type":"journal-article","created":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T11:50:25Z","timestamp":1743421825000},"page":"130072","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["3DAxisPrompt: Promoting the 3D grounding and reasoning in GPT-4o"],"prefix":"10.1016","volume":"637","author":[{"given":"Dingning","family":"Liu","sequence":"first","affiliation":[]},{"given":"Cheng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Renrui","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xinzhu","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Meng","sequence":"additional","affiliation":[]},{"given":"Zhihui","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"year":"2020","series-title":"Language models are few-shot learners","author":"Brown","key":"10.1016\/j.neucom.2025.130072_b1"},{"year":"2022","series-title":"PaLM: Scaling language modeling with pathways","author":"Chowdhery","key":"10.1016\/j.neucom.2025.130072_b2"},{"year":"2023","series-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","key":"10.1016\/j.neucom.2025.130072_b3"},{"year":"2024","series-title":"GPT-4 technical report","author":"OpenAI","key":"10.1016\/j.neucom.2025.130072_b4"},{"year":"2024","series-title":"Gemini: A family of highly capable multimodal models","author":"GeminiTeam","key":"10.1016\/j.neucom.2025.130072_b5"},{"year":"2024","series-title":"Hello gpt-4o","author":"OpenAI","key":"10.1016\/j.neucom.2025.130072_b6"},{"year":"2023","series-title":"Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V","author":"Yang","key":"10.1016\/j.neucom.2025.130072_b7"},{"year":"2024","series-title":"Dettoolchain: A new prompting paradigm to unleash detection ability of mllm","author":"Wu","key":"10.1016\/j.neucom.2025.130072_b8"},{"year":"2023","series-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","key":"10.1016\/j.neucom.2025.130072_b9"},{"key":"10.1016\/j.neucom.2025.130072_b10","series-title":"2024 IEEE International Conference on Image Processing","first-page":"3341","article-title":"LiSD: An efficient multi-task learning framework for lidar segmentation and detection","author":"Xu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b11","unstructured":"L. Wen, D. Fu, X. Li, X. Cai, M. Tao, P. Cai, M. Dou, B. Shi, L. He, Y. Qiao, DiLu: A Knowledge-Driven Approach to Autonomous Driving with Large Language Models, in: The Twelfth International Conference on Learning Representations, 2024."},{"key":"10.1016\/j.neucom.2025.130072_b12","doi-asserted-by":"crossref","unstructured":"C. Cui, Y. Ma, X. Cao, W. Ye, Y. Zhou, K. Liang, J. Chen, J. Lu, Z. Yang, K.-D. Liao, et al., A survey on multimodal large language models for autonomous driving, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024, pp. 958\u2013979.","DOI":"10.1109\/WACVW60836.2024.00106"},{"year":"2023","series-title":"Pointllm: Empowering large language models to understand point clouds","author":"Xu","key":"10.1016\/j.neucom.2025.130072_b13"},{"key":"10.1016\/j.neucom.2025.130072_b14","doi-asserted-by":"crossref","unstructured":"A. Kirillov, E. Mintun, N. Ravi, H. Mao, C. Rolland, L. Gustafson, T. Xiao, S. Whitehead, A.C. Berg, W.-Y. Lo, et al., Segment anything, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4015\u20134026.","DOI":"10.1109\/ICCV51070.2023.00371"},{"year":"2022","series-title":"OPT: Open pre-trained transformer language models","author":"Zhang","key":"10.1016\/j.neucom.2025.130072_b15"},{"year":"2023","series-title":"Large language models as commonsense knowledge for large-scale task planning","author":"Zhao","key":"10.1016\/j.neucom.2025.130072_b16"},{"year":"2023","series-title":"Visual instruction tuning","author":"Liu","key":"10.1016\/j.neucom.2025.130072_b17"},{"year":"2024","series-title":"DeepSeek-VL: Towards real-world vision-language understanding","author":"Lu","key":"10.1016\/j.neucom.2025.130072_b18"},{"year":"2023","series-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","key":"10.1016\/j.neucom.2025.130072_b19"},{"year":"2021","series-title":"Learning transferable visual models from natural language supervision","author":"Radford","key":"10.1016\/j.neucom.2025.130072_b20"},{"key":"10.1016\/j.neucom.2025.130072_b21","unstructured":"The claude 3 model family: Opus, sonnet, haiku. URL https:\/\/api.semanticscholar.org\/CorpusID:268232499."},{"year":"2023","series-title":"An early evaluation of GPT-4V(ision)","author":"Wu","key":"10.1016\/j.neucom.2025.130072_b22"},{"year":"2023","series-title":"The dawn of LMMs: Preliminary explorations with GPT-4V(ision)","author":"Yang","key":"10.1016\/j.neucom.2025.130072_b23"},{"year":"2023","series-title":"A challenger to GPT-4V? Early explorations of gemini in visual expertise","author":"Fu","key":"10.1016\/j.neucom.2025.130072_b24"},{"year":"2023","series-title":"Towards generic anomaly detection and understanding: Large-scale visual-linguistic model (GPT-4V) takes the lead","author":"Cao","key":"10.1016\/j.neucom.2025.130072_b25"},{"year":"2024","series-title":"Open X-Embodiment: Robotic learning datasets and RT-X models","author":"Collaboration","key":"10.1016\/j.neucom.2025.130072_b26"},{"year":"2023","series-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","key":"10.1016\/j.neucom.2025.130072_b27"},{"year":"2024","series-title":"SpatialVLM: Endowing vision-language models with spatial reasoning capabilities","author":"Chen","key":"10.1016\/j.neucom.2025.130072_b28"},{"year":"2024","series-title":"A survey on in-context learning","author":"Dong","key":"10.1016\/j.neucom.2025.130072_b29"},{"year":"2023","series-title":"Tree of thoughts: Deliberate problem solving with large language models","author":"Yao","key":"10.1016\/j.neucom.2025.130072_b30"},{"year":"2022","series-title":"CPT: Colorful prompt tuning for pre-trained vision-language models","author":"Yao","key":"10.1016\/j.neucom.2025.130072_b31"},{"year":"2023","series-title":"What does CLIP know about a red circle? Visual prompt engineering for VLMs","author":"Shtedritski","key":"10.1016\/j.neucom.2025.130072_b32"},{"year":"2023","series-title":"Fine-grained visual prompting","author":"Yang","key":"10.1016\/j.neucom.2025.130072_b33"},{"year":"2024","series-title":"Scaffolding coordinates to promote vision-language coordination in large multi-modal models","author":"Lei","key":"10.1016\/j.neucom.2025.130072_b34"},{"year":"2024","series-title":"Compositional chain-of-thought prompting for large multimodal models","author":"Mitra","key":"10.1016\/j.neucom.2025.130072_b35"},{"year":"2023","series-title":"Pushing boundaries: Exploring zero shot object classification with large multimodal models","author":"Islam","key":"10.1016\/j.neucom.2025.130072_b36"},{"year":"2024","series-title":"PIVOT: Iterative visual prompting elicits actionable knowledge for VLMs","author":"Nasiriany","key":"10.1016\/j.neucom.2025.130072_b37"},{"year":"2024","series-title":"Coarse correspondence elicit 3D spacetime understanding in multimodal language model","author":"Liu","key":"10.1016\/j.neucom.2025.130072_b38"},{"year":"2023","series-title":"GPT-4V in wonderland: Large multimodal models for zero-shot smartphone GUI navigation","author":"Yan","key":"10.1016\/j.neucom.2025.130072_b39"},{"year":"2024","series-title":"GPT-4V(ision) is a generalist web agent, if grounded","author":"Zheng","key":"10.1016\/j.neucom.2025.130072_b40"},{"year":"2023","series-title":"Multimodal ChatGPT for medical applications: an experimental study of GPT-4V","author":"Yan","key":"10.1016\/j.neucom.2025.130072_b41"},{"year":"2023","series-title":"Holistic evaluation of GPT-4V for biomedical imaging","author":"Liu","key":"10.1016\/j.neucom.2025.130072_b42"},{"year":"2024","series-title":"Assessing the effectiveness of GPT-4o in climate change evidence synthesis and systematic assessments: Preliminary insights","author":"Joe","key":"10.1016\/j.neucom.2025.130072_b43"},{"year":"2024","series-title":"GPT-4o: Visual perception performance of multimodal large language models in piglet activity understanding","author":"Wu","key":"10.1016\/j.neucom.2025.130072_b44"},{"year":"2024","series-title":"Putting GPT-4o to the sword: A comprehensive evaluation of language, vision, speech, and multimodal proficiency","author":"Shahriar","key":"10.1016\/j.neucom.2025.130072_b45"},{"year":"2024","series-title":"The power of combining data and knowledge: GPT-4o is an effective interpreter of machine learning models in predicting lymph node metastasis of lung cancer","author":"Hu","key":"10.1016\/j.neucom.2025.130072_b46"},{"year":"2025","series-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning","author":"Guo","key":"10.1016\/j.neucom.2025.130072_b47"},{"year":"2025","series-title":"Claude 3.7 sonnet and Claude code","key":"10.1016\/j.neucom.2025.130072_b48"},{"year":"2025","series-title":"Grok 3 beta \u2014 The age of reasoning agents","key":"10.1016\/j.neucom.2025.130072_b49"},{"key":"10.1016\/j.neucom.2025.130072_b50","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1016\/j.softx.2015.04.001","article-title":"The Visualization Toolkit (VTK): Rewriting the rendering code for modern graphics cards","volume":"1","author":"Hanwell","year":"2015","journal-title":"SoftwareX"},{"key":"10.1016\/j.neucom.2025.130072_b51","doi-asserted-by":"crossref","unstructured":"Z. Chen, A. Tagliasacchi, H. Zhang, Bsp-net: Generating compact meshes via binary space partitioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 45\u201354.","DOI":"10.1109\/CVPR42600.2020.00012"},{"year":"2022","series-title":"Grounded language-image pre-training","author":"Li","key":"10.1016\/j.neucom.2025.130072_b52"},{"key":"10.1016\/j.neucom.2025.130072_b53","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4104","article-title":"Structure-from-motion revisited","author":"Sch\u00f6nberger","year":"2016"},{"key":"10.1016\/j.neucom.2025.130072_b54","doi-asserted-by":"crossref","unstructured":"J.R. Shue, E.R. Chan, R. Po, Z. Ankner, J. Wu, G. Wetzstein, 3d neural field generation using triplane diffusion, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 20875\u201320886.","DOI":"10.1109\/CVPR52729.2023.02000"},{"key":"10.1016\/j.neucom.2025.130072_b55","series-title":"European Conference on Computer Vision","first-page":"202","article-title":"Scanrefer: 3d object localization in rgb-d scans using natural language","author":"Chen","year":"2020"},{"key":"10.1016\/j.neucom.2025.130072_b56","doi-asserted-by":"crossref","unstructured":"A. Dai, A.X. Chang, M. Savva, M. Halber, T. Funkhouser, M. Nie\u00dfner, Scannet: Richly-annotated 3d reconstructions of indoor scenes, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 5828\u20135839.","DOI":"10.1109\/CVPR.2017.261"},{"year":"2024","series-title":"FMB: a functional manipulation benchmark for generalizable robotic learning","author":"Luo","key":"10.1016\/j.neucom.2025.130072_b57"},{"key":"10.1016\/j.neucom.2025.130072_b58","series-title":"CVPR","article-title":"NuScenes: A multimodal dataset for autonomous driving","author":"Caesar","year":"2020"},{"year":"2017","series-title":"ScanNet: Richly-annotated 3D reconstructions of indoor scenes","author":"Dai","key":"10.1016\/j.neucom.2025.130072_b59"},{"key":"10.1016\/j.neucom.2025.130072_b60","series-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2022"},{"year":"2023","series-title":"LLM-grounder: Open-vocabulary 3D visual grounding with large language model as an agent","author":"Yang","key":"10.1016\/j.neucom.2025.130072_b61"},{"year":"2015","series-title":"ShapeNet: An information-rich 3D model repository","author":"Chang","key":"10.1016\/j.neucom.2025.130072_b62"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231225007441?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231225007441?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T10:21:17Z","timestamp":1745403677000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231225007441"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":62,"alternative-id":["S0925231225007441"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2025.130072","relation":{},"ISSN":["0925-2312"],"issn-type":[{"type":"print","value":"0925-2312"}],"subject":[],"published":{"date-parts":[[2025,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"3DAxisPrompt: Promoting the 3D grounding and reasoning in GPT-4o","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2025.130072","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"130072"}}