{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T12:42:00Z","timestamp":1776775320438,"version":"3.51.2"},"reference-count":52,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100012542","name":"Sichuan Province Science and Technology Support Program","doi-asserted-by":"publisher","award":["2024NSFSC0497"],"award-info":[{"award-number":["2024NSFSC0497"]}],"id":[{"id":"10.13039\/100012542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100012542","name":"Sichuan Province Science and Technology Support Program","doi-asserted-by":"publisher","award":["2024NSFTD0040"],"award-info":[{"award-number":["2024NSFTD0040"]}],"id":[{"id":"10.13039\/100012542","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.neucom.2026.133492","type":"journal-article","created":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T03:52:56Z","timestamp":1774583576000},"page":"133492","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["DroneNav: Unified text-visual representation and structured spatial reasoning for robust UAV vision-and-language navigation"],"prefix":"10.1016","volume":"683","author":[{"given":"Fangming","family":"Liu","sequence":"first","affiliation":[]},{"given":"Guohua","family":"Li","sequence":"additional","affiliation":[]},{"given":"Linfeng","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Yadong","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8331-5186","authenticated-orcid":false,"given":"Peng","family":"Cheng","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133492_bib0005","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"3674","article-title":"Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments","author":"Anderson","year":"2018"},{"key":"10.1016\/j.neucom.2026.133492_bib0010","author":"Bai"},{"key":"10.1016\/j.neucom.2026.133492_bib0015","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133492_bib0020","author":"Cai"},{"key":"10.1016\/j.neucom.2026.133492_bib0025","series-title":"2013 American control conference","first-page":"2568","article-title":"Uav flight path planning in time varying complex wind-fields","author":"Chakrabarty","year":"2013"},{"key":"10.1016\/j.neucom.2026.133492_bib0030","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"12875","article-title":"Neural topological slam for visual navigation","author":"Chaplot","year":"2020"},{"key":"10.1016\/j.neucom.2026.133492_bib0035","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12538","article-title":"TOUCHDOWN: natural language navigation and spatial reasoning in visual street environments","author":"Chen","year":"2019"},{"key":"10.1016\/j.neucom.2026.133492_bib0040","author":"Fan"},{"key":"10.1016\/j.neucom.2026.133492_bib0045","doi-asserted-by":"crossref","first-page":"21311","DOI":"10.1109\/TASE.2025.3604018","article-title":"Visual and textual commonsense-enhanced layout learning for vision-and-language navigation","volume":"22","author":"Gao","year":"2025","journal-title":"IEEE Trans. Autom. Sci. Eng."},{"key":"10.1016\/j.neucom.2026.133492_bib0050","doi-asserted-by":"crossref","first-page":"10874","DOI":"10.1109\/LRA.2024.3483042","article-title":"Enhancing scene understanding for vision-and-language navigation by knowledge awareness","volume":"9","author":"Gao","year":"2024","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.neucom.2026.133492_bib0055","author":"Gao"},{"key":"10.1016\/j.neucom.2026.133492_bib0060","doi-asserted-by":"crossref","first-page":"12","DOI":"10.1109\/MPRV.2008.80","article-title":"OpenStreetMap: user-generated street maps","volume":"7","author":"Haklay","year":"2008","journal-title":"IEEE Pervasive Comput."},{"key":"10.1016\/j.neucom.2026.133492_bib0065","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"13137","article-title":"Towards learning a generic agent for vision-and-language navigation via pre-training","author":"Hao","year":"2020"},{"key":"10.1016\/j.neucom.2026.133492_bib0070","first-page":"3","article-title":"LoRA: low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"10.1016\/j.neucom.2026.133492_bib0075","doi-asserted-by":"crossref","first-page":"316","DOI":"10.1007\/s11263-021-01554-9","article-title":"SensatUrban: learning semantics from urban-scale photogrammetric point clouds","volume":"130","author":"Hu","year":"2022","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neucom.2026.133492_bib0080","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114190","article-title":"LLVM-drone: a synergistic framework integrating large language models and vision models for visual tasks in unmanned aerial vehicles","author":"Hu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.neucom.2026.133492_bib0085","author":"Hurst"},{"key":"10.1016\/j.neucom.2026.133492_bib0090","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"14921","article-title":"Iterative vision-and-language navigation","author":"Krantz","year":"2023"},{"key":"10.1016\/j.neucom.2026.133492_bib0095","series-title":"European Conference on Computer Vision","first-page":"104","article-title":"Beyond the Nav-Graph: vision-and-language navigation in continuous environments","author":"Krantz","year":"2020"},{"key":"10.1016\/j.neucom.2026.133492_bib0100","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"5912","article-title":"CityNav: a large-scale dataset for real-world aerial navigation","author":"Lee","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0105","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"8592","article-title":"LLaVA-ST: a multimodal large language model for fine-grained spatial-temporal understanding","author":"Li","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0110","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15407","article-title":"EnvEdit: environment editing for vision-and-language navigation","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2026.133492_bib0115","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4860","article-title":"REGNav: room expert guided image-goal navigation","author":"Li","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0120","author":"Liu"},{"key":"10.1016\/j.neucom.2026.133492_bib0125","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"1644","article-title":"Vision-language navigation with random environmental mixup","author":"Liu","year":"2021"},{"key":"10.1016\/j.neucom.2026.133492_bib0130","series-title":"Advances in Neural Information Processing Systems","first-page":"34892","article-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2026.133492_bib0135","series-title":"European conference on computer vision","first-page":"38","article-title":"Grounding DINO: marrying dino with grounded pre-training for open-set object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.133492_bib0140","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"15384","article-title":"AerialVLN: vision-and-language navigation for uavs","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2026.133492_bib0145","author":"Loshchilov"},{"key":"10.1016\/j.neucom.2026.133492_bib0150","doi-asserted-by":"crossref","first-page":"21","DOI":"10.1080\/10095020.2017.1420509","article-title":"A survey on vision-based uav navigation","volume":"21","author":"Lu","year":"2018","journal-title":"Geo-spat. Inf. Sci."},{"key":"10.1016\/j.neucom.2026.133492_bib0155","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"15942","article-title":"Episodic transformer for vision-and-language navigation","author":"Pashevich","year":"2021"},{"key":"10.1016\/j.neucom.2026.133492_bib0160","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2026.133492_bib0165","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations","first-page":"270","article-title":"Dialogpt: large-scale generative pre-training for conversational response generation","author":"Zhang","year":"2020"},{"key":"10.1016\/j.neucom.2026.133492_bib0170","series-title":"2015 International Conference on Unmanned Aircraft Systems (ICUAS)","first-page":"1138","article-title":"Performance-aware flight path planning for unmanned aircraft in uniform wind fields","author":"Schopferer","year":"2015"},{"key":"10.1016\/j.neucom.2026.133492_bib0175","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"14664","article-title":"Enhancing multi-robot semantic navigation through multimodal chain-of-thought score collaboration","author":"Shen","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0180","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"14673","article-title":"Towards audio-visual navigation in noisy environments: a large-scale benchmark dataset and an architecture considering multiple sound-sources","author":"Shi","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0185","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10740","article-title":"ALFRED: a benchmark for interpreting grounded instructions for everyday tasks","author":"Shridhar","year":"2020"},{"key":"10.1016\/j.neucom.2026.133492_bib0190","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"7060","article-title":"Learning fine-grained alignment for aerial vision-dialog navigation","author":"Su","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0195","author":"Team"},{"key":"10.1016\/j.neucom.2026.133492_bib0200","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133492_bib0205","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"11414","article-title":"A method for visual spatial description based on large language model fine-tuning","author":"Wang","year":"2024"},{"key":"10.1016\/j.neucom.2026.133492_bib0210","doi-asserted-by":"crossref","first-page":"6516","DOI":"10.1109\/TNNLS.2024.3398300","article-title":"Discovering intrinsic subgoals for vision-and-language navigation via hierarchical reinforcement learning","volume":"36","author":"Wang","year":"2024","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.neucom.2026.133492_bib0215","author":"Wang"},{"key":"10.1016\/j.neucom.2026.133492_bib0220","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6629","article-title":"Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation","author":"Wang","year":"2019"},{"key":"10.1016\/j.neucom.2026.133492_bib0225","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.129457","article-title":"Instruction-guided path planning with 3d semantic maps for vision-language navigation","volume":"625","author":"Wang","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133492_bib0230","doi-asserted-by":"crossref","first-page":"8406","DOI":"10.1109\/TMM.2024.3358112","article-title":"Vision-and-language navigation via latent semantic alignment learning","volume":"26","author":"Wu","year":"2024","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.neucom.2026.133492_bib0235","author":"Xu"},{"key":"10.1016\/j.neucom.2026.133492_bib0240","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"9005","article-title":"FLAME: learning to navigate with multimodal llm in urban environments","author":"Xu","year":"2025"},{"key":"10.1016\/j.neucom.2026.133492_bib0245","author":"Zhang"},{"key":"10.1016\/j.neucom.2026.133492_bib0250","article-title":"Causal learning with uncertainty-aware transformer for vision-and-language navigation","author":"Zhang","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133492_bib0255","author":"Zhao"},{"key":"10.1016\/j.neucom.2026.133492_bib0260","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"7641","article-title":"NavGPT: explicit reasoning in vision-and-language navigation with large language models","author":"Zhou","year":"2024"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226008891?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226008891?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T11:48:00Z","timestamp":1776772080000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226008891"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":52,"alternative-id":["S0925231226008891"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133492","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"DroneNav: Unified text-visual representation and structured spatial reasoning for robust UAV vision-and-language navigation","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133492","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"133492"}}