{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T19:44:20Z","timestamp":1765568660897,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,3]]},"DOI":"10.1145\/3748636.3762755","type":"proceedings-article","created":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T19:07:30Z","timestamp":1765566450000},"page":"440-453","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CartoMapQA: A Fundamental Benchmark Dataset Evaluating Vision-Language Models on Cartographic Map Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9238-8601","authenticated-orcid":false,"given":"Huy Quang","family":"Ung","sequence":"first","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3364-5863","authenticated-orcid":false,"given":"Guillaume","family":"Habault","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4487-6285","authenticated-orcid":false,"given":"Yasutaka","family":"Nishimura","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5623-9470","authenticated-orcid":false,"given":"Hao","family":"Niu","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8909-635X","authenticated-orcid":false,"given":"Roberto","family":"Legaspi","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4927-076X","authenticated-orcid":false,"given":"Tomoki","family":"Oya","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3128-7781","authenticated-orcid":false,"given":"Ryoichi","family":"Kojima","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4911-4289","authenticated-orcid":false,"given":"Masato","family":"Taya","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6410-1359","authenticated-orcid":false,"given":"Chihiro","family":"Ono","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8856-7813","authenticated-orcid":false,"given":"Atsunori","family":"Minamikawa","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc., Fujimino, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7055-9518","authenticated-orcid":false,"given":"Yan","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Southern California, Los Angeles, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Large language models for mathematical reasoning: Progresses and challenges. arXiv preprint arXiv:2402.00157","author":"Ahn Janice","year":"2024","unstructured":"Janice Ahn, Rishu Verma, Renze Lou, Di Liu, Rui Zhang, and Wenpeng Yin. 2024. Large language models for mathematical reasoning: Progresses and challenges. arXiv preprint arXiv:2402.00157 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems 35 (2022) 23716\u201323736."},{"key":"e_1_3_2_1_4_1","unstructured":"Anthropic. 2025. Claude 3.7 Sonnet. https:\/\/www.anthropic.com\/claude\/sonnet"},{"key":"e_1_3_2_1_5_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589132.3625625"},{"key":"e_1_3_2_1_8_1","volume-title":"Modeling and Analyzing Urban Networks and Amenities with OSMnx. Working paper","author":"Boeing Geoff","year":"2024","unstructured":"Geoff Boeing. 2024. Modeling and Analyzing Urban Networks and Amenities with OSMnx. Working paper (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024. How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_11_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"16th International Conference on Spatial Information Theory (COSIT","author":"Cohn Anthony G","year":"2024","unstructured":"Anthony G Cohn and Robert E Blackwell. 2024. Evaluating the Ability of Large Language Models to Reason About Cardinal Directions. In 16th International Conference on Spatial Information Theory (COSIT 2024). Schloss Dagstuhl-Leibniz-Zentrum f\u00fcr Informatik."},{"key":"e_1_3_2_1_13_1","unstructured":"Google DeepMind. 2025. Gemini 2.5: Our most intelligent AI model. https:\/\/blog.google\/technology\/google-deepmind\/gemini-model-thinking-updates-march-2025\/#gemini-2-5-thinking"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635772"},{"key":"e_1_3_2_1_15_1","volume-title":"MapEval: A Map-Based Evaluation of Geo-Spatial Reasoning in Foundation Models. In International Conference on Machine Learning.","author":"Dihan Mahir Labib","year":"2025","unstructured":"Mahir Labib Dihan, Md Tanvir Hassan, Md Tanvir Parvez, Md Hasebul Hasan, Md Almash Alam, Muhammad Aamir Cheema, Mohammed Eunus Ali, and Md Rizwan Parvez. 2025. MapEval: A Map-Based Evaluation of Geo-Spatial Reasoning in Foundation Models. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_16_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"CityBench: Evaluating the Capabilities of Large Language Model as World Model. arXiv preprint arXiv:2406.13945","author":"Feng Jie","year":"2024","unstructured":"Jie Feng, Jun Zhang, Junbo Yan, Xin Zhang, Tianjian Ouyang, Tianhui Liu, Yuwei Du, Siqi Guo, and Yong Li. 2024. CityBench: Evaluating the Capabilities of Large Language Model as World Model. arXiv preprint arXiv:2406.13945 (2024)."},{"key":"e_1_3_2_1_18_1","unstructured":"Peng Gao Jiaming Han Renrui Zhang Ziyi Lin Shijie Geng Aojun Zhou Wei Zhang Pan Lu Conghui He Xiangyu Yue et al. 2023. Llama-adapter v2: Parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)."},{"key":"e_1_3_2_1_19_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"Language Models Represent Space and Time. In The Twelfth International Conference on Learning Representations.","author":"Gurnee Wes","year":"2024","unstructured":"Wes Gurnee and Max Tegmark. 2024. Language Models Represent Space and Time. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","unstructured":"Aric Hagberg Pieter J Swart and Daniel A Schult. 2008. Exploring network structure dynamics and function using NetworkX. Technical Report. Los Alamos National Laboratory (LANL) Los Alamos NM (United States)."},{"key":"e_1_3_2_1_22_1","volume-title":"MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework. In The Twelfth International Conference on Learning Representations.","author":"Hong Sirui","year":"2024","unstructured":"Sirui Hong, Mingchen Zhuge, Jonathan Chen, Xiawu Zheng, Yuheng Cheng, Jinlin Wang, Ceyao Zhang, Zili Wang, Steven Ka Shing Yau, Zijuan Lin, et al. 2024. MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","unstructured":"Wenyi Hong Weihan Wang Ming Ding Wenmeng Yu Qingsong Lv Yan Wang Yean Cheng Shiyu Huang Junhui Ji Zhao Xue et al. 2024. Cogvlm2: Visual language models for image and video understanding. arXiv preprint arXiv:2408.16500 (2024)."},{"key":"e_1_3_2_1_24_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_25_1","unstructured":"Aaron Jaech Adam Kalai Adam Lerer Adam Richardson Ahmed El-Kishky Aiden Low Alec Helyar Aleksander Madry Alex Beutel Alex Carney et al. 2024. Openai o1 system card. arXiv preprint arXiv:2412.16720 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 27831\u201327840","author":"Kuckreja Kartik","year":"2024","unstructured":"Kartik Kuckreja, Muhammad Sohail Danish, Muzammal Naseer, Abhijit Das, Salman Khan, and Fahad Shahbaz Khan. 2024. Geochat: Grounded large vision-language model for remote sensing. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 27831\u201327840."},{"key":"e_1_3_2_1_27_1","unstructured":"Bo Li Kaichen Zhang Hao Zhang Dong Guo Renrui Zhang Feng Li Yuanhan Zhang Ziwei Liu and Chunyuan Li. 2024. LLaVA-NeXT: Stronger LLMs Supercharge Multimodal Capabilities in the Wild. https:\/\/llava-vl.github.io\/blog\/2024-05-10-llava-next-stronger-llms\/"},{"key":"e_1_3_2_1_28_1","volume-title":"LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024. LLaVA-OneVision: Easy Visual Task Transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_31_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems 36 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2988782"},{"key":"e_1_3_2_1_33_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Manvi Rohin","year":"2024","unstructured":"Rohin Manvi, Samar Khanna, Gengchen Mai, Marshall Burke, David B Lobell, and Stefano Ermon. 2024. GeoLLM: Extracting Geospatial Knowledge from Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_34_1","unstructured":"Meta. 2024. Llama-3.2-11b-vision-instruct. https:\/\/huggingface.co\/meta-llama\/Llama-3.2-11B-Vision-Instruct"},{"key":"e_1_3_2_1_35_1","unstructured":"Meta. 2024. Llama-3.2-90b-vision-instruct. https:\/\/huggingface.co\/meta-llama\/Llama-3.2-90B-Vision-Instruct"},{"key":"e_1_3_2_1_36_1","unstructured":"Meta. 2025. Llama-4-Scout-17B-16E-Instruct. https:\/\/huggingface.co\/meta-llama\/Llama-4-Scout-17B-16E-Instruct"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3615886.3627745"},{"key":"e_1_3_2_1_38_1","unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_39_1","unstructured":"OpenAI. 2025. OpenAI o3 and o4-mini System Card. https:\/\/cdn.openai.com\/pdf\/2221c875-02dc-4789-800b-e7758f3722c1\/o3-and-o4-mini-system-card.pdf"},{"key":"e_1_3_2_1_40_1","unstructured":"OpenStreetMap contributors. 2017. Planet dump retrieved from https:\/\/planet.osm.org. https:\/\/www.openstreetmap.org."},{"key":"e_1_3_2_1_41_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"python visualization. 2020. Folium. https:\/\/python-visualization.github.io\/folium\/"},{"key":"e_1_3_2_1_43_1","volume-title":"International conference on machine learning. PmLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_2_1_44_1","volume-title":"GPT4GEO: How a Language Model Sees the World's Geography. arXiv preprint arXiv:2306.00020","author":"Roberts Jonathan","year":"2023","unstructured":"Jonathan Roberts, Timo L\u00fcddecke, Sowmen Das, Kai Han, and Samuel Albanie. 2023. GPT4GEO: How a Language Model Sees the World's Geography. arXiv preprint arXiv:2306.00020 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.824"},{"key":"e_1_3_2_1_46_1","volume-title":"GRASP: A Grid-Based Benchmark for Evaluating Commonsense Spatial Reasoning. arXiv preprint arXiv:2407.01892","author":"Tang Zhisheng","year":"2024","unstructured":"Zhisheng Tang and Mayank Kejriwal. 2024. GRASP: A Grid-Based Benchmark for Evaluating Commonsense Spatial Reasoning. arXiv preprint arXiv:2407.01892 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems 36","author":"Wang Wenhai","year":"2024","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, et al. 2024. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2023","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, et al. 2023. Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824\u201324837."},{"key":"e_1_3_2_1_50_1","volume-title":"Multi-SpatialMLLM: Multi-Frame Spatial Understanding with Multi-Modal Large Language Models. arXiv preprint arXiv:2505.17015","author":"Xu Runsen","year":"2025","unstructured":"Runsen Xu, Weiyao Wang, Hao Tang, Xingyu Chen, Xiaodong Wang, Fu-Jen Chu, Dahua Lin, Matt Feiszli, and Kevin J Liang. 2025. Multi-SpatialMLLM: Multi-Frame Spatial Understanding with Multi-Modal Large Language Models. arXiv preprint arXiv:2505.17015 (2025)."},{"key":"e_1_3_2_1_51_1","volume-title":"Jungo Kasai, and Ilker Yildirim.","author":"Yamada Yutaro","year":"2024","unstructured":"Yutaro Yamada, Yihan Bao, Andrew Kyle Lampinen, Jungo Kasai, and Ilker Yildirim. 2024. Evaluating Spatial Understanding of Large Language Models. Transactions on Machine Learning Research (2024)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645378"},{"key":"e_1_3_2_1_53_1","volume-title":"V-irl: Grounding virtual intelligence in real life. arXiv preprint arXiv:2402.03310","author":"Yang Jihan","year":"2024","unstructured":"Jihan Yang, Runyu Ding, Ellis Brown, Xiaojuan Qi, and Saining Xie. 2024. V-irl: Grounding virtual intelligence in real life. arXiv preprint arXiv:2402.03310 (2024)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"e_1_3_2_1_55_1","volume-title":"Good at captioning, bad at counting: Benchmarking gpt-4v on earth observation data. arXiv preprint arXiv:2401.17600","author":"Zhang Chenhui","year":"2024","unstructured":"Chenhui Zhang and Sherrie Wang. 2024. Good at captioning, bad at counting: Benchmarking gpt-4v on earth observation data. arXiv preprint arXiv:2401.17600 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. In The Twelfth International Conference on Learning Representations."}],"event":{"name":"SIGSPATIAL '25: 33rd ACM International Conference on Advances in Geographic Information Systems","location":"The Graduate Hotel Minneapolis Minneapolis MN USA","acronym":"SIGSPATIAL '25","sponsor":["SIGSPATIAL ACM Special Interest Group on Spatial Information"]},"container-title":["Proceedings of the 33rd ACM International Conference on Advances in Geographic Information Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3748636.3762755","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T19:10:20Z","timestamp":1765566620000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3748636.3762755"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,3]]},"references-count":56,"alternative-id":["10.1145\/3748636.3762755","10.1145\/3748636"],"URL":"https:\/\/doi.org\/10.1145\/3748636.3762755","relation":{},"subject":[],"published":{"date-parts":[[2025,11,3]]},"assertion":[{"value":"2025-12-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}