{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:31:28Z","timestamp":1777656688672,"version":"3.51.4"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306046"],"award-info":[{"award-number":["62306046"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128671","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"9490-9498","source":"Crossref","is-referenced-by-count":19,"title":["SpatialBot: Precise Spatial Understanding with Vision Language Models"],"prefix":"10.1109","author":[{"given":"Wenxiao","family":"Cai","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Shanghai Jiao Tong University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Iaroslav","family":"Ponomarenko","sequence":"additional","affiliation":[{"name":"Peking University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianhao","family":"Yuan","sequence":"additional","affiliation":[{"name":"University of Oxford,England"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoqi","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wankou","family":"Yang","sequence":"additional","affiliation":[{"name":"Southeast University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Dong","sequence":"additional","affiliation":[{"name":"Peking University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Shanghai Jiao Tong University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"The dawn of lmms: Preliminary explorations with gpt-4v(ision)","volume":"abs\/2309.17421","author":"Yang","year":"2023","journal-title":"ArXiv"},{"key":"ref2","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref3","article-title":"Visual instruction tuning","volume":"abs\/2304.08485","author":"Liu","year":"2023","journal-title":"ArXiv"},{"key":"ref4","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International Conference on Machine Learning","author":"Li","year":"2023"},{"key":"ref5","volume-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023"},{"key":"ref6","article-title":"Efficient multimodal learning from data-centric perspective","volume":"abs\/2402.11530","author":"He","year":"2024","journal-title":"ArXiv"},{"key":"ref7","article-title":"Spatial as deep: Spatial cnn for traffic scene understanding","volume-title":"AAAI Conference on Artificial Intelligence","author":"Pan","year":"2017"},{"key":"ref8","first-page":"19107","article-title":"Scanqa: 3d question answering for spatial scene understanding","volume-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Azuma"},{"key":"ref9","article-title":"3d spatial measurement for model reconstruction: A review","author":"Flores-Fuentes","year":"2022","journal-title":"Measurement"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"330","DOI":"10.3390\/ijgi9050330","article-title":"A review of techniques for 3d reconstruction of indoor environments","volume":"9","author":"Kang","year":"2020","journal-title":"ISPRS Int. J. Geo Inf."},{"issue":"11","key":"ref11","doi-asserted-by":"crossref","first-page":"2021","DOI":"10.1364\/JOSAA.32.002021","article-title":"Computational photography with plenoptic camera and light field capture: tutorial","volume":"32","author":"Lam","year":"2015","journal-title":"Journal of the Optical Society of America. A, Optics, image science, and vision"},{"key":"ref12","article-title":"Cliport: What and where pathways for robotic manipulation","volume":"abs\/2109.12098","author":"Shridhar","year":"2021","journal-title":"ArXiv"},{"key":"ref13","article-title":"Open x-embodiment: Robotic learning datasets and rt-x models","author":"Padalkar","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01710"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610090"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2025.3543137"},{"key":"ref17","article-title":"Learning robotic navigation from experience: principles, methods and recent results","volume":"378","author":"Levine","year":"2022","journal-title":"Philosophical Transactions of the Royal Society B"},{"key":"ref18","article-title":"Spatial cognition: The role of landmark, route, and survey knowledge in human and robot navigation","author":"Werner","year":"1997","journal-title":"GI Jahrestagung"},{"key":"ref19","doi-asserted-by":"crossref","DOI":"10.3390\/app10020497","article-title":"Semantic information for robot navigation: A survey","author":"Crespo","year":"2020","journal-title":"Applied Sciences"},{"key":"ref20","article-title":"Rag-driver: Generalisable driving explanations with retrieval-augmented in-context learning in multi-modal large language model","author":"Yuan","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref21","article-title":"Learning multiple probabilistic decisions from latent world model in autonomous driving","author":"Xiao","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref22","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","volume":"abs\/2306.13394","author":"Fu","year":"2023","journal-title":"ArXiv"},{"key":"ref23","article-title":"Mmbench: Is your multi-modal model an all-around player?","volume":"abs\/2307.06281","author":"Liu","year":"2023","journal-title":"ArXiv"},{"key":"ref24","article-title":"Efficient multimodal large language models: A survey","volume":"abs\/2405.10739","author":"Jin","year":"2024","journal-title":"ArXiv"},{"key":"ref25","article-title":"Yi: Open foundation models by 01. ai","author":"Young","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref26","article-title":"Mm1: Methods, analysis & insights from multimodal 1 lm pre-training","author":"McKinzie","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref27","article-title":"Mini-gemini: Mining the potential of multi-modality vision language models","volume":"abs\/2403.18814","author":"Li","year":"2024","journal-title":"ArXiv"},{"key":"ref28","article-title":"Svit: Scaling up visual instruction tuning","author":"Zhao","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref29","article-title":"Llama: Open and efficient foundation language models","volume":"abs\/2302.13971","author":"Touvron","year":"2023","journal-title":"ArXiv"},{"key":"ref30","article-title":"Phi3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024","journal-title":"arXiv preprint arXiv"},{"issue":"3","key":"ref31","first-page":"6","volume":"2","author":"Chiang","year":"2023","journal-title":"Vicuna: An opensource chatbot impressing gpt-4 with 90% chatgpt quality"},{"key":"ref32","article-title":"Qwen technical report","author":"Bai","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2025.104429"},{"key":"ref34","article-title":"Seed-bench: Benchmarking multimodal 1 lms with generative comprehension","volume":"abs\/2307.16125","author":"Li","year":"2023","journal-title":"ArXiv"},{"key":"ref35","article-title":"Mmmu: A massive multidiscipline multimodal understanding and reasoning benchmark for expert agi","volume":"abs\/2311.16502","author":"Yue","year":"2023","journal-title":"ArXiv"},{"key":"ref36","article-title":"Llava-grounding: Grounded visual chat with large multimodal models","volume":"abs\/2312.02949","author":"Zhang","year":"2023","journal-title":"ArXiv"},{"key":"ref37","article-title":"Ferret: Refer and ground anything anywhere at any granularity","volume":"abs\/2310.07704","author":"You","year":"2023","journal-title":"ArXiv"},{"key":"ref38","article-title":"Lisa: Reasoning segmentation via large language model","volume":"abs\/2308.00692","author":"Lai","year":"2023","journal-title":"ArXiv"},{"key":"ref39","article-title":"Osprey: Pixel understanding with visual instruction tuning","volume":"abs\/2312.10032","author":"Yuan","year":"2023","journal-title":"ArXiv"},{"key":"ref40","article-title":"Next-chat: An 1mm for chat, detection and segmentation","volume":"abs\/2311.04498","author":"Zhang","year":"2023","journal-title":"ArXiv"},{"key":"ref41","article-title":"V-irl: Grounding virtual intelligence in real life","volume":"abs\/2402.03310","author":"Yang","year":"2024","journal-title":"ArXiv"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32188"},{"key":"ref43","article-title":"Uncertainty quantification in stereo matching","author":"Cai","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref44","article-title":"Vcoder: Versatile vision encoders for multimodal large language models","volume":"abs\/2312.14233","author":"Jain","year":"2023","journal-title":"ArXiv"},{"key":"ref45","doi-asserted-by":"crossref","first-page":"11441","DOI":"10.1109\/CVPR42600.2020.01146","article-title":"Graspnet-1billion: A large-scale benchmark for general object grasping","volume-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Fang","year":"2020"},{"key":"ref46","doi-asserted-by":"crossref","DOI":"10.1109\/IROS58592.2024.10801782","volume-title":"Manifoundation model for general-purpose robotic manipulation of contact synthesis with arbitrary objects and robots","author":"Xu","year":"2024"},{"key":"ref47","article-title":"Proximity qa: Unleashing the power of multi-modal large language models for spatial proximity analysis","volume":"abs\/2401.17862","author":"Li","year":"2024","journal-title":"ArXiv"},{"key":"ref48","volume-title":"Languageimage models with 3d understanding","author":"Cho","year":"2024"},{"key":"ref49","article-title":"Spatialvlm: Endowing visionlanguage models with spatial reasoning capabilities","volume":"abs\/2401.12168","author":"Chen","year":"2024","journal-title":"ArXiv"},{"key":"ref50","volume-title":"Spatialrgpt: Grounded spatial reasoning in vision language model","author":"Cheng","year":"2024"},{"key":"ref51","article-title":"Depth anything: Unleashing the power of large-scale unlabeled data","volume":"abs\/2401.10891","author":"Yang","year":"2024","journal-title":"ArXiv"},{"key":"ref52","article-title":"Zoedepth: Zero-shot transfer by combining relative and metric depth","volume":"abs\/2302.12288","author":"Bhat","year":"2023","journal-title":"ArXiv"},{"key":"ref53","article-title":"Repurposing diffusion-based image generators for monocular depth estimation","volume":"abs\/2312.02145","author":"Ke","year":"2023","journal-title":"ArXiv"},{"key":"ref54","doi-asserted-by":"crossref","first-page":"1623","DOI":"10.1109\/TPAMI.2020.3019967","article-title":"Towards robust monocular depth estimation: Mixing datasets for zeroshot cross-dataset transfer","volume":"44","author":"Ranftl","year":"2019","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"ref55","article-title":"Rt-1: Robotics transformer for real-world control at scale","volume":"abs\/2212.06817","author":"Brohan","year":"2022","journal-title":"ArXiv"},{"key":"ref56","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","volume":"abs\/2307.15818","author":"Brohan","year":"2023","journal-title":"ArXiv"},{"key":"ref57","article-title":"Octo: An open-source generalist robot policy","author":"Team","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref58","article-title":"Openvla: An open-source vision-language-action model","author":"Kim","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref59","doi-asserted-by":"crossref","DOI":"10.1145\/2047196.2047270","article-title":"Kinectfusion: real-time 3d reconstruction and interaction using a moving depth camera","volume-title":"Proceedings of the 24th annual ACM symposium on User interface software and technology","author":"Izadi","year":"2011"},{"key":"ref60","doi-asserted-by":"crossref","first-page":"644","DOI":"10.1016\/j.neucom.2015.10.104","article-title":"Dense 3d reconstruction combining depth and rgb information","volume":"175","author":"Pan","year":"2016","journal-title":"Neurocomputing"},{"key":"ref61","doi-asserted-by":"crossref","first-page":"3354","DOI":"10.1109\/ROBOT.2005.1570628","article-title":"Indoor navigation of a wheeled mobile robot along visual routes","volume-title":"Proceedings of the 2005 IEEE International Conference on Robotics and Automation","author":"Blanc","year":"2005"},{"key":"ref62","doi-asserted-by":"crossref","DOI":"10.1080\/23311916.2019.1632046","article-title":"A comprehensive study for robot navigation techniques","volume":"6","author":"Gul","year":"2019","journal-title":"Cogent Engineering"},{"key":"ref63","doi-asserted-by":"crossref","first-page":"2002","DOI":"10.1109\/CVPR.2018.00214","article-title":"Deep ordinal regression network for monocular depth estimation","volume-title":"2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Fu","year":"2018"},{"key":"ref64","volume-title":"Berkeley UR5 demonstration dataset","author":"Chen"},{"key":"ref65","article-title":"Bridgedata v2: A dataset for robot learning at scale","volume-title":"Conference on Robot Learning","author":"Walke","year":"2023"},{"key":"ref66","article-title":"Finetuning offline world models in the real world","volume":"abs\/2310.16029","author":"Feng","year":"2023","journal-title":"ArXiv"},{"key":"ref67","article-title":"Robocook: Long-horizon elasto-plastic object manipulation with diverse tools","volume":"abs\/2306.14447","author":"Shi","year":"2023","journal-title":"ArXiv"},{"key":"ref68","article-title":"D3fields: Dynamic 3d descriptor fields for zero-shot generalizable robotic manipulation","volume":"abs\/2309.16118","author":"Wang","year":"2023","journal-title":"ArXiv"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.120"},{"key":"ref70","article-title":"Vima: General robot manipulation with multimodal prompts","volume":"abs\/2210.03094","author":"Jiang","year":"2022","journal-title":"ArXiv"},{"key":"ref71","doi-asserted-by":"crossref","first-page":"3992","DOI":"10.1109\/ICCV51070.2023.00371","article-title":"Segment anything","volume-title":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Kirillov","year":"2023"},{"key":"ref72","doi-asserted-by":"crossref","first-page":"398","DOI":"10.1007\/s11263-018-1116-0","article-title":"Making the V in vqa matter: Elevating the role of image understanding in visual question answering","volume":"127","author":"Goyal","year":"2016","journal-title":"International Journal of Computer Vision"},{"key":"ref73","doi-asserted-by":"crossref","first-page":"6693","DOI":"10.1109\/CVPR.2019.00686","article-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering","volume-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Hudson","year":"2019"},{"key":"ref74","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.emnlp-main.20","article-title":"Evaluating object hallucination in large vision-language models","volume-title":"Conference on Empirical Methods in Natural Language Processing","author":"Li","year":"2023"},{"key":"ref75","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128671.pdf?arnumber=11128671","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T04:45:19Z","timestamp":1769489119000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128671\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128671","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}