{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T11:16:05Z","timestamp":1768648565293,"version":"3.49.0"},"reference-count":62,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["DUT24GF311"],"award-info":[{"award-number":["DUT24GF311"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013804","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013804","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476040"],"award-info":[{"award-number":["62476040"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372080"],"award-info":[{"award-number":["62372080"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1016\/j.neucom.2025.130072","type":"journal-article","created":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T07:50:25Z","timestamp":1743407425000},"page":"130072","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["3DAxisPrompt: Promoting the 3D grounding and reasoning in GPT-4o"],"prefix":"10.1016","volume":"637","author":[{"given":"Dingning","family":"Liu","sequence":"first","affiliation":[]},{"given":"Cheng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Renrui","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xinzhu","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Meng","sequence":"additional","affiliation":[]},{"given":"Zhihui","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2025.130072_b1","series-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"10.1016\/j.neucom.2025.130072_b2","series-title":"PaLM: Scaling language modeling with pathways","author":"Chowdhery","year":"2022"},{"key":"10.1016\/j.neucom.2025.130072_b3","series-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b4","series-title":"GPT-4 technical report","author":"OpenAI","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b5","series-title":"Gemini: A family of highly capable multimodal models","author":"GeminiTeam","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b6","series-title":"Hello gpt-4o","author":"OpenAI","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b7","series-title":"Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V","author":"Yang","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b8","series-title":"Dettoolchain: A new prompting paradigm to unleash detection ability of mllm","author":"Wu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b9","series-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b10","series-title":"2024 IEEE International Conference on Image Processing","first-page":"3341","article-title":"LiSD: An efficient multi-task learning framework for lidar segmentation and detection","author":"Xu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b11","unstructured":"L. Wen, D. Fu, X. Li, X. Cai, M. Tao, P. Cai, M. Dou, B. Shi, L. He, Y. Qiao, DiLu: A Knowledge-Driven Approach to Autonomous Driving with Large Language Models, in: The Twelfth International Conference on Learning Representations, 2024."},{"key":"10.1016\/j.neucom.2025.130072_b12","doi-asserted-by":"crossref","unstructured":"C. Cui, Y. Ma, X. Cao, W. Ye, Y. Zhou, K. Liang, J. Chen, J. Lu, Z. Yang, K.-D. Liao, et al., A survey on multimodal large language models for autonomous driving, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024, pp. 958\u2013979.","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"10.1016\/j.neucom.2025.130072_b13","series-title":"Pointllm: Empowering large language models to understand point clouds","author":"Xu","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b14","doi-asserted-by":"crossref","unstructured":"A. Kirillov, E. Mintun, N. Ravi, H. Mao, C. Rolland, L. Gustafson, T. Xiao, S. Whitehead, A.C. Berg, W.-Y. Lo, et al., Segment anything, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4015\u20134026.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"10.1016\/j.neucom.2025.130072_b15","series-title":"OPT: Open pre-trained transformer language models","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neucom.2025.130072_b16","series-title":"Large language models as commonsense knowledge for large-scale task planning","author":"Zhao","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b17","series-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b18","series-title":"DeepSeek-VL: Towards real-world vision-language understanding","author":"Lu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b19","series-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b20","series-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2025.130072_b21","unstructured":"The claude 3 model family: Opus, sonnet, haiku. URL https:\/\/api.semanticscholar.org\/CorpusID:268232499."},{"key":"10.1016\/j.neucom.2025.130072_b22","series-title":"An early evaluation of GPT-4V(ision)","author":"Wu","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b23","series-title":"The dawn of LMMs: Preliminary explorations with GPT-4V(ision)","author":"Yang","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b24","series-title":"A challenger to GPT-4V? Early explorations of gemini in visual expertise","author":"Fu","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b25","series-title":"Towards generic anomaly detection and understanding: Large-scale visual-linguistic model (GPT-4V) takes the lead","author":"Cao","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b26","series-title":"Open X-Embodiment: Robotic learning datasets and RT-X models","author":"Collaboration","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b27","series-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b28","series-title":"SpatialVLM: Endowing vision-language models with spatial reasoning capabilities","author":"Chen","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b29","series-title":"A survey on in-context learning","author":"Dong","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b30","series-title":"Tree of thoughts: Deliberate problem solving with large language models","author":"Yao","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b31","series-title":"CPT: Colorful prompt tuning for pre-trained vision-language models","author":"Yao","year":"2022"},{"key":"10.1016\/j.neucom.2025.130072_b32","series-title":"What does CLIP know about a red circle? Visual prompt engineering for VLMs","author":"Shtedritski","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b33","series-title":"Fine-grained visual prompting","author":"Yang","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b34","series-title":"Scaffolding coordinates to promote vision-language coordination in large multi-modal models","author":"Lei","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b35","series-title":"Compositional chain-of-thought prompting for large multimodal models","author":"Mitra","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b36","series-title":"Pushing boundaries: Exploring zero shot object classification with large multimodal models","author":"Islam","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b37","series-title":"PIVOT: Iterative visual prompting elicits actionable knowledge for VLMs","author":"Nasiriany","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b38","series-title":"Coarse correspondence elicit 3D spacetime understanding in multimodal language model","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b39","series-title":"GPT-4V in wonderland: Large multimodal models for zero-shot smartphone GUI navigation","author":"Yan","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b40","series-title":"GPT-4V(ision) is a generalist web agent, if grounded","author":"Zheng","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b41","series-title":"Multimodal ChatGPT for medical applications: an experimental study of GPT-4V","author":"Yan","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b42","series-title":"Holistic evaluation of GPT-4V for biomedical imaging","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b43","series-title":"Assessing the effectiveness of GPT-4o in climate change evidence synthesis and systematic assessments: Preliminary insights","author":"Joe","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b44","series-title":"GPT-4o: Visual perception performance of multimodal large language models in piglet activity understanding","author":"Wu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b45","series-title":"Putting GPT-4o to the sword: A comprehensive evaluation of language, vision, speech, and multimodal proficiency","author":"Shahriar","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b46","series-title":"The power of combining data and knowledge: GPT-4o is an effective interpreter of machine learning models in predicting lymph node metastasis of lung cancer","author":"Hu","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b47","series-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning","author":"Guo","year":"2025"},{"key":"10.1016\/j.neucom.2025.130072_b48","series-title":"Claude 3.7 sonnet and Claude code","year":"2025"},{"key":"10.1016\/j.neucom.2025.130072_b49","series-title":"Grok 3 beta \u2014 The age of reasoning agents","year":"2025"},{"key":"10.1016\/j.neucom.2025.130072_b50","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1016\/j.softx.2015.04.001","article-title":"The Visualization Toolkit (VTK): Rewriting the rendering code for modern graphics cards","volume":"1","author":"Hanwell","year":"2015","journal-title":"SoftwareX"},{"key":"10.1016\/j.neucom.2025.130072_b51","doi-asserted-by":"crossref","unstructured":"Z. Chen, A. Tagliasacchi, H. Zhang, Bsp-net: Generating compact meshes via binary space partitioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 45\u201354.","DOI":"10.1109\/CVPR42600.2020.00012"},{"key":"10.1016\/j.neucom.2025.130072_b52","series-title":"Grounded language-image pre-training","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2025.130072_b53","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4104","article-title":"Structure-from-motion revisited","author":"Sch\u00f6nberger","year":"2016"},{"key":"10.1016\/j.neucom.2025.130072_b54","doi-asserted-by":"crossref","unstructured":"J.R. Shue, E.R. Chan, R. Po, Z. Ankner, J. Wu, G. Wetzstein, 3d neural field generation using triplane diffusion, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 20875\u201320886.","DOI":"10.1109\/CVPR52729.2023.02000"},{"key":"10.1016\/j.neucom.2025.130072_b55","series-title":"European Conference on Computer Vision","first-page":"202","article-title":"Scanrefer: 3d object localization in rgb-d scans using natural language","author":"Chen","year":"2020"},{"key":"10.1016\/j.neucom.2025.130072_b56","doi-asserted-by":"crossref","unstructured":"A. Dai, A.X. Chang, M. Savva, M. Halber, T. Funkhouser, M. Nie\u00dfner, Scannet: Richly-annotated 3d reconstructions of indoor scenes, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 5828\u20135839.","DOI":"10.1109\/CVPR.2017.261"},{"key":"10.1016\/j.neucom.2025.130072_b57","series-title":"FMB: a functional manipulation benchmark for generalizable robotic learning","author":"Luo","year":"2024"},{"key":"10.1016\/j.neucom.2025.130072_b58","series-title":"CVPR","article-title":"NuScenes: A multimodal dataset for autonomous driving","author":"Caesar","year":"2020"},{"key":"10.1016\/j.neucom.2025.130072_b59","series-title":"ScanNet: Richly-annotated 3D reconstructions of indoor scenes","author":"Dai","year":"2017"},{"key":"10.1016\/j.neucom.2025.130072_b60","series-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","article-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2022"},{"key":"10.1016\/j.neucom.2025.130072_b61","series-title":"LLM-grounder: Open-vocabulary 3D visual grounding with large language model as an agent","author":"Yang","year":"2023"},{"key":"10.1016\/j.neucom.2025.130072_b62","series-title":"ShapeNet: An information-rich 3D model repository","author":"Chang","year":"2015"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231225007441?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231225007441?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T05:48:36Z","timestamp":1762321716000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231225007441"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":62,"alternative-id":["S0925231225007441"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2025.130072","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2025,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"3DAxisPrompt: Promoting the 3D grounding and reasoning in GPT-4o","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2025.130072","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"130072"}}