{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T14:59:59Z","timestamp":1775228399362,"version":"3.50.1"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,9,24]],"date-time":"2024-09-24T00:00:00Z","timestamp":1727136000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,24]],"date-time":"2024-09-24T00:00:00Z","timestamp":1727136000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,9,24]]},"DOI":"10.1109\/itsc58415.2024.10919532","type":"proceedings-article","created":{"date-parts":[[2025,3,21]],"date-time":"2025-03-21T19:00:11Z","timestamp":1742583611000},"page":"476-481","source":"Crossref","is-referenced-by-count":6,"title":["AirVista: Empowering UAVs with 3D Spatial Reasoning Abilities Through a Multimodal Large Language Model Agent"],"prefix":"10.1109","author":[{"given":"Fei","family":"Lin","sequence":"first","affiliation":[{"name":"Macau University of Science and Technology,Faculty of Innovation Engineering,Department of Engineering Science,Macau,China,999078"}]},{"given":"Yonglin","family":"Tian","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Management and Control of Complex Systems, Institute of Automation,Chinese Academy of Sciences,Beijing,China,100190"}]},{"given":"Yunzhe","family":"Wang","sequence":"additional","affiliation":[{"name":"Capital University of Economics and Business,Beijing,China,100070"}]},{"given":"Tengchao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Macau University of Science and Technology,Faculty of Innovation Engineering,Department of Engineering Science,Macau,China,999078"}]},{"given":"Xinyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences,Department of Artificial Intelligence,Beijing,China,100049"}]},{"given":"Fei-Yue","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Management and Control of Complex Systems,Chinese Academy of Sciences,Beijing,100190"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2022.3223728"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2023.3307012"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.3390\/systems11080400"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/MITS.2020.3014079"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.3390\/rs13152868"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2022.3197815"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2017.7510598"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"ref9","author":"Zeng","year":"2023","journal-title":"Large language models for robotics: A survey"},{"key":"ref10","author":"Zhao","year":"2023","journal-title":"Agent as cerebrum, con-troller as cerebellum: Implementing an embodied lmm-based agent on drones"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00131"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.01411"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"ref14","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref15","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"},{"key":"ref16","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref17","author":"Liu","year":"2023","journal-title":"Improved baselines with visual instruction tuning"},{"key":"ref18","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International Conference on Machine Learning","author":"Li","year":"2023"},{"key":"ref19","author":"Casper","year":"2023","journal-title":"Open problems and fundamental limitations of reinforcement learning from human feedback"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-67361-5_40"},{"key":"ref22","first-page":"1","article-title":"CARLA: An open urban driving simulator","volume-title":"Conference on Robot Learning","author":"Dosovitskiy","year":"2017"},{"key":"ref23","first-page":"20482","article-title":"3d-llm: Injecting the 3d world into large language models","volume":"36","author":"Hong","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref25","volume-title":"Vqasynth","year":"2024"},{"key":"ref26","author":"Bhat","year":"2023","journal-title":"Zoedepth: Zero-shot transfer by combining relative and metric depth"},{"key":"ref27","volume-title":"LLaVA-NeXT: Improved reasoning, OCR, and world knowledge","author":"Liu","year":"2024"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref31","author":"Kannan","year":"2023","journal-title":"Smart-llm: Smart multi-agent robot task planning using large language models"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00235"},{"key":"ref33","author":"Hu","year":"2021","journal-title":"Lora: Low-rank adaptation of large language models"},{"key":"ref34","first-page":"2023","article-title":"Gpt-4v(ision) system card","volume-title":"OpenAI","year":"2023"},{"key":"ref35","article-title":"The claude 3 model family: Opus, sonnet, haiku","year":"2024","journal-title":"An-thropic"},{"key":"ref36","author":"Team","year":"2023","journal-title":"Gemini: a family of highly capable multimodal models"},{"key":"ref37","volume-title":"https:\/\/openrouter.ai"}],"event":{"name":"2024 IEEE 27th International Conference on Intelligent Transportation Systems (ITSC)","location":"Edmonton, AB, Canada","start":{"date-parts":[[2024,9,24]]},"end":{"date-parts":[[2024,9,27]]}},"container-title":["2024 IEEE 27th International Conference on Intelligent Transportation Systems (ITSC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10919469\/10919190\/10919532.pdf?arnumber=10919532","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T17:44:36Z","timestamp":1742838276000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10919532\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,24]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/itsc58415.2024.10919532","relation":{},"subject":[],"published":{"date-parts":[[2024,9,24]]}}}