{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T07:33:10Z","timestamp":1764401590354,"version":"3.46.0"},"reference-count":11,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T00:00:00Z","timestamp":1750809600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T00:00:00Z","timestamp":1750809600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,25]]},"DOI":"10.1109\/snpd65828.2025.11254066","type":"proceedings-article","created":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T18:40:29Z","timestamp":1764355229000},"page":"472-478","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Spatial Reasoning in Multimodal Vision-Language Models via Depth-Aware Feature Integration"],"prefix":"10.1109","author":[{"given":"Hiroo","family":"Tsuji","sequence":"first","affiliation":[{"name":"Otemon Gakuin University,Department of Information Engineering,Ibaraki,Japan"}]}],"member":"263","reference":[{"article-title":"Learning Transferable Visual Models From Natural Language Supervision","year":"2021","author":"Radford","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.568"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4293"},{"article-title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","year":"2023","author":"Brohan","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341422"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"article-title":"RoboFlamingo-Plus: Fusion of Depth and RGB Perception with Vision-Language Models for Enhanced Robotic Manipulation","year":"2025","author":"Wang","key":"ref7"},{"article-title":"Refining CLIP\u2019s Spatial Awareness: A Visual-Centric Perspective","year":"2025","author":"Qiu","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00566"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"}],"event":{"name":"2025 IEEE\/ACIS 29th International Conference on Software Engineering, Artificial Intelligence, Networking and Parallel\/Distributed Computing (SNPD)","start":{"date-parts":[[2025,6,25]]},"location":"Busan, Korea, Republic of","end":{"date-parts":[[2025,6,27]]}},"container-title":["2025 IEEE\/ACIS 29th International Conference on Software Engineering, Artificial Intelligence, Networking and Parallel\/Distributed Computing (SNPD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11251382\/11252538\/11254066.pdf?arnumber=11254066","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T07:26:30Z","timestamp":1764401190000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11254066\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,25]]},"references-count":11,"URL":"https:\/\/doi.org\/10.1109\/snpd65828.2025.11254066","relation":{},"subject":[],"published":{"date-parts":[[2025,6,25]]}}}