{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T14:53:19Z","timestamp":1776955999773,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758235","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"12905-12911","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["RSVLM-QA: A Benchmark Dataset for Remote Sensing Vision Language Model-based Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4265-2205","authenticated-orcid":false,"given":"Xing","family":"Zi","sequence":"first","affiliation":[{"name":"School of Computer Science, The University of Technology Sydney, Sydney, New South Wales, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0837-9068","authenticated-orcid":false,"given":"Jinghao","family":"Xiao","sequence":"additional","affiliation":[{"name":"School of Computer Science, The University of Technology Sydney, Sydney, New South Wales, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1516-015X","authenticated-orcid":false,"given":"Yunxiao","family":"Shi","sequence":"additional","affiliation":[{"name":"SEDE, University of Technology Sydney, Sydney, New South Wales, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5834-5181","authenticated-orcid":false,"given":"Xian","family":"Tao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1336-2241","authenticated-orcid":false,"given":"Jun","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2561-6496","authenticated-orcid":false,"given":"Ali","family":"Braytee","sequence":"additional","affiliation":[{"name":"School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7745-9667","authenticated-orcid":false,"given":"Mukesh","family":"Prasad","sequence":"additional","affiliation":[{"name":"School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2025.03.028"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2018.2858817"},{"key":"e_1_3_2_1_4_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_5_1","volume-title":"VRSBench: A Versatile Vision-Language Benchmark Dataset for Remote Sensing Image Understanding. In The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Li Xiang","unstructured":"Xiang Li, Jian Ding, and Mohamed Elhoseiny. [n.d.]. VRSBench: A Versatile Vision-Language Benchmark Dataset for Remote Sensing Image Understanding. In The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs12060939"},{"key":"e_1_3_2_1_7_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81."},{"key":"e_1_3_2_1_8_1","volume-title":"Remoteclip: A vision language foundation model for remote sensing","author":"Liu Fan","year":"2024","unstructured":"Fan Liu, Delong Chen, Zhangqingyun Guan, Xiaocong Zhou, Jiale Zhu, Qiaolin Ye, Liyong Fu, and Jun Zhou. 2024. Remoteclip: A vision language foundation model for remote sensing. IEEE Transactions on Geoscience and Remote Sensing (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2988782"},{"key":"e_1_3_2_1_11_1","volume-title":"Ovis: Structural Embedding Alignment for Multimodal Large Language Model. arXiv:2405.20797 [cs.CV] https:\/\/arxiv.org\/abs\/2405.20797","author":"Lu Shiyin","year":"2024","unstructured":"Shiyin Lu, Yang Li, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, and Han-Jia Ye. 2024. Ovis: Structural Embedding Alignment for Multimodal Large Language Model. arXiv:2405.20797 [cs.CV] https:\/\/arxiv.org\/abs\/2405.20797"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"e_1_3_2_1_13_1","volume-title":"When Large Vision-Language Model Meets Large Remote Sensing Imagery: Coarse-to-Fine Text-Guided Token Pruning. CoRR","author":"Luo Junwei","year":"2025","unstructured":"Junwei Luo, Yingying Zhang, Xue Yang, Kang Wu, Qi Zhu, Lei Liang, Jingdong Chen, and Yansheng Li. 2025. When Large Vision-Language Model Meets Large Remote Sensing Imagery: Coarse-to-Fine Text-Guided Token Pruning. CoRR (2025)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2017.8127684"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CITS.2016.7546397"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs17010162"},{"key":"e_1_3_2_1_18_1","unstructured":"Gemma Team Aishwarya Kamath Johan Ferret Shreya Pathak Nino Vieillard Ramona Merhej Sarah Perrin Tatiana Matejovicova Alexandre Ram\u00e9 Morgane Rivi\u00e8re et al. 2025. Gemma 3 technical report. arXiv preprint arXiv:2503.19786 (2025)."},{"key":"e_1_3_2_1_19_1","volume-title":"Alignment and Multimodal Reasoning for Remote Sensing Visual Question Answering. In IGARSS 2024-2024 IEEE International Geoscience and Remote Sensing Symposium. IEEE, 7115-7118","author":"Tian Yumin","year":"2024","unstructured":"Yumin Tian, Haojie Xu, Di Wang, Ke Li, and Lin Zhao. 2024. Alignment and Multimodal Reasoning for Remote Sensing Visual Question Answering. In IGARSS 2024-2024 IEEE International Geoscience and Remote Sensing Symposium. IEEE, 7115-7118."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks, J. Vanschoren and S. Yeung (Eds.)","volume":"1","author":"Wang Junjue","year":"2021","unstructured":"Junjue Wang, Zhuo Zheng, Ailong Ma, Xiaoyan Lu, and Yanfei Zhong. 2021. LoveDA: A Remote Sensing Land-Cover Dataset for Domain Adaptive Semantic Segmentation. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks, J. Vanschoren and S. Yeung (Eds.), Vol. 1. Curran Associates, Inc. https:\/\/datasets-benchmarks-proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/4e732ced3463d06de0ca9a15b6153677-Paper-round2.pdf"},{"key":"e_1_3_2_1_21_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. 28-37","author":"Zamir Syed Waqas","year":"2019","unstructured":"Syed Waqas Zamir, Aditya Arora, Akshita Gupta, Salman Khan, Guolei Sun, Fahad Shahbaz Khan, Fan Zhu, Ling Shao, Gui-Song Xia, and Xiang Bai. 2019. isaid: A large-scale dataset for instance segmentation in aerial images. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops. 28-37."},{"key":"e_1_3_2_1_23_1","first-page":"1","article-title":"Chatearthnet: A global-scale image-text dataset empowering vision-language geo-foundation models","volume":"2024","author":"Yuan Zhenghang","year":"2024","unstructured":"Zhenghang Yuan, Zhitong Xiong, Lichao Mou, and Xiao Xiang Zhu. 2024a. Chatearthnet: A global-scale image-text dataset empowering vision-language geo-foundation models. Earth System Science Data Discussions, Vol. 2024 (2024), 1-24.","journal-title":"Earth System Science Data Discussions"},{"key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Chatearthnet: A global-scale image-text dataset empowering vision-language geo-foundation models","volume":"2024","author":"Yuan Zhenghang","year":"2024","unstructured":"Zhenghang Yuan, Zhitong Xiong, Lichao Mou, and Xiao Xiang Zhu. 2024b. Chatearthnet: A global-scale image-text dataset empowering vision-language geo-foundation models. Earth System Science Data Discussions, Vol. 2024 (2024), 1-24.","journal-title":"Earth System Science Data Discussions"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2023.3312479"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3449154"},{"key":"e_1_3_2_1_28_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Yuchen Duan Hao Tian Weijie Su Jie Shao et al. 2025. Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models. arXiv preprint arXiv:2504.10479 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758235","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:00:36Z","timestamp":1765342836000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758235"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":28,"alternative-id":["10.1145\/3746027.3758235","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758235","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}