{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:55:35Z","timestamp":1759334135832,"version":"build-2065373602"},"reference-count":63,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62422204","62125201","62402152","62472133","U24B20174"],"award-info":[{"award-number":["62422204","62125201","62402152","62472133","U24B20174"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LZ23F020007","LQN25F020017","LDT23F02025F02"],"award-info":[{"award-number":["LZ23F020007","LQN25F020017","LDT23F02025F02"]}]},{"DOI":"10.13039\/100022963","name":"Key Research and Development Program of Zhejiang Province","doi-asserted-by":"publisher","award":["2025C01026"],"award-info":[{"award-number":["2025C01026"]}],"id":[{"id":"10.13039\/100022963","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Geosci. Remote Sensing"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tgrs.2025.3606214","type":"journal-article","created":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T18:22:00Z","timestamp":1757010120000},"page":"1-12","source":"Crossref","is-referenced-by-count":0,"title":["Benchmarking and Enhancing Geospatial Visual Reasoning Over Street Maps"],"prefix":"10.1109","volume":"63","author":[{"given":"Wenwen","family":"Pan","sequence":"first","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, the School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"given":"Haiting","family":"Zhou","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, the School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3069-9347","authenticated-orcid":false,"given":"Zhenwei","family":"Shao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, the School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3252-3455","authenticated-orcid":false,"given":"Shuai","family":"Shao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, the School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"given":"Suguo","family":"Zhu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, the School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1842-4050","authenticated-orcid":false,"given":"Min","family":"Tan","sequence":"additional","affiliation":[{"name":"Key Laboratory of Complex Systems Modeling and Simulation, the School of Computer Science, Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1922-7283","authenticated-orcid":false,"given":"Jun","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8407-1137","authenticated-orcid":false,"given":"Zhou","family":"Yu","sequence":"additional","affiliation":[{"name":"with the School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen 310018, China, Hangzhou Dianzi University, Shenzhen, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref4","article-title":"Sparks of artificial general intelligence: Early experiments with GPT-4","author":"Bubeck","year":"2023","journal-title":"arXiv:2303.12712"},{"key":"ref5","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"article-title":"Introducing Meta LLaMA 3: The most capable openly available LLM to date","volume-title":"Meta AI","year":"2024","key":"ref6"},{"article-title":"4v(ision) system card","volume-title":"OpenAI","year":"2023","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref9","first-page":"1","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. 12th Int. Conf. Learn. Represent. (ICLR)","author":"Zhu","year":"2024"},{"volume-title":"Hello GPT-4o","year":"2024","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"ref17","article-title":"ChartX & ChartVLM: A versatile benchmark and foundation model for complicated chart reasoning","author":"Xia","year":"2024","journal-title":"arXiv:2402.12185"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MPRV.2008.80"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CW64301.2024.00058"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3254205"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3562422"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4235-6"},{"key":"ref26","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.528"},{"key":"ref29","first-page":"1","article-title":"MathVista: Evaluating mathematical reasoning of foundation models in visual contexts","volume-title":"Proc. 12th Int. Conf. Learn. Represent. (ICLR)","author":"Lu","year":"2024"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref31","article-title":"Evaluation and analysis of hallucination in large vision-language models","author":"Wang","year":"2023","journal-title":"arXiv:2308.15126"},{"key":"ref32","first-page":"54111","article-title":"On evaluating adversarial robustness of large vision-language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zhao"},{"key":"ref33","article-title":"Holistic analysis of hallucination in GPT-4 V(ision): Bias and interference challenges","author":"Cui","year":"2023","journal-title":"arXiv:2311.03287"},{"key":"ref34","article-title":"HallusionBench: An advanced diagnostic suite for entangled language hallucination and visual illusion in large vision-language models","author":"Guan","year":"2023","journal-title":"arXiv:2310.14566"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-47243-5_15"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-30796-7_12"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.3390\/ijgi13010010"},{"key":"ref38","article-title":"Geoqamap-geographic question answering with maps leveraging LLM and open knowledge base (short paper)","volume-title":"Proc. 12th Int. Conf. Geographic Inf. Sci., GISci.","volume":"277","author":"Feng"},{"key":"ref39","first-page":"1","article-title":"GeoLLM: Extracting geospatial knowledge from large language models","volume-title":"Proc. 12th Int. Conf. Learn. Represent. (ICLR)","author":"Manvi"},{"article-title":"Language models are unsupervised multitask learners","year":"2019","author":"Radford","key":"ref40"},{"key":"ref41","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ouyang"},{"key":"ref42","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","author":"Brown"},{"key":"ref43","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref44","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"issue":"70","key":"ref45","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2022","journal-title":"J. Mach. Learn. Res."},{"volume-title":"Vicuna: An Open-Source Chatbot Impressing GPT-4 With 90%* ChatGPT Quality","year":"2023","author":"Chiang","key":"ref46"},{"key":"ref47","first-page":"49250","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai"},{"key":"ref48","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref49","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 36, Annu. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Liu"},{"key":"ref50","first-page":"24185","article-title":"InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","author":"Zhu","year":"2024"},{"key":"ref51","article-title":"Yi: Open foundation models by 01.AI","volume-title":"arXiv.2403.04652","author":"Young","year":"2024"},{"article-title":"LLaVA-NeXT: Improved reasoning, OCR, and world knowledge","year":"2024","author":"Liu","key":"ref52"},{"key":"ref53","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","volume-title":"arXiv:2404.14219","author":"Abdin","year":"2024"},{"key":"ref54","article-title":"InternLM-XComposer2-4KHD: A pioneering large vision-language model handling resolutions from 336 pixels to 4K HD","author":"Dong","year":"2024","journal-title":"arXiv:2404.06512"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"ref56","article-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv:2409.12191"},{"key":"ref57","article-title":"LLaVA-OneVision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv:2408.03326"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2025.3574657"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01282"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02221"},{"key":"ref61","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"139","author":"Radford"},{"key":"ref62","article-title":"Imp: Highly capable large multimodal models for mobile devices","author":"Shao","year":"2024","journal-title":"arXiv:2405.12107"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"}],"container-title":["IEEE Transactions on Geoscience and Remote Sensing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/36\/10807682\/11151549.pdf?arnumber=11151549","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T13:03:26Z","timestamp":1759237406000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11151549\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":63,"URL":"https:\/\/doi.org\/10.1109\/tgrs.2025.3606214","relation":{},"ISSN":["0196-2892","1558-0644"],"issn-type":[{"type":"print","value":"0196-2892"},{"type":"electronic","value":"1558-0644"}],"subject":[],"published":{"date-parts":[[2025]]}}}