{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T10:01:18Z","timestamp":1777370478605,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Funda\u00e7\u00e3o para a Ci\u00eancia e Tecnologia","award":["UIDB\/50021\/2020,UIDP\/04516\/2020"],"award-info":[{"award-number":["UIDB\/50021\/2020,UIDP\/04516\/2020"]}]},{"name":"Center for Responsible AI","award":["C645008882-00000055"],"award-info":[{"award-number":["C645008882-00000055"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,29]]},"DOI":"10.1145\/3678717.3691318","type":"proceedings-article","created":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T06:29:21Z","timestamp":1732256961000},"page":"220-232","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Multilingual Vision-Language Pre-training for the Remote Sensing Domain"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6474-7822","authenticated-orcid":false,"given":"Jo\u00e3o Daniel","family":"Silva","sequence":"first","affiliation":[{"name":"INESC-ID, Instituto Superior T\u00e9cnico, University of Lisbon, Lisbon, Portugal"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6290-5719","authenticated-orcid":false,"given":"Jo\u00e3o","family":"Magalh\u00e3es","sequence":"additional","affiliation":[{"name":"NOVA-LINCS, NOVA University, Lisbon, Portugal"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0374-2459","authenticated-orcid":false,"given":"Devis","family":"Tuia","sequence":"additional","affiliation":[{"name":"EPFL ENAC IIE ECEO, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne, Sion, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3856-2936","authenticated-orcid":false,"given":"Bruno","family":"Martins","sequence":"additional","affiliation":[{"name":"INESC-ID &amp; LUMLIS (Lisbon ELLIS Unit), Instituto Superior T\u00e9cnico, University of Lisbon, Lisbon, Portugal"}]}],"member":"320","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Tower: An Open Multilingual Large Language Model for Translation-Related Tasks. arXiv:2402.17733","author":"Alves Duarte M.","year":"2024","unstructured":"Duarte M. Alves, Jos\u00e9 Pombal, Nuno M. Guerreiro, Pedro H. Martins, Jo\u00e3o Alves, Amin Farajian, Ben Peters, Ricardo Rei, Patrick Fernandes, Sweta Agrawal, et al. 2024. Tower: An Open Multilingual Large Language Model for Translation-Related Tasks. arXiv:2402.17733 (2024)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_4_1","volume-title":"MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. 2023. MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"A Simple Framework for Contrastive Learning of Visual Representations. arXiv:2002.05709","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A Simple Framework for Contrastive Learning of Visual Representations. arXiv:2002.05709 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS52108.2023.10281814"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2675998"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3201474"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_2_1_11_1","volume-title":"Unsupervised cross-lingual representation learning at scale. arXiv:1911.02116","author":"Conneau Alexis","year":"2019","unstructured":"Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzm\u00e1n, Edouard Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Unsupervised cross-lingual representation learning at scale. arXiv:1911.02116 (2019)."},{"key":"e_1_3_2_1_12_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"e_1_3_2_1_14_1","volume-title":"CLIP itself is a strong fine-tuner: Achieving 85.7% and 88.0% top-1 accuracy with ViT-B and ViT-L on imagenet. arXiv:2212.06138","author":"Dong Xiaoyi","year":"2022","unstructured":"Xiaoyi Dong, Jianmin Bao, Ting Zhang, Dongdong Chen, Shuyang Gu, Weiming Zhang, Lu Yuan, Dong Chen, Fang Wen, and Nenghai Yu. 2022. CLIP itself is a strong fine-tuner: Achieving 85.7% and 88.0% top-1 accuracy with ViT-B and ViT-L on imagenet. arXiv:2212.06138 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-eacl.88"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the Conference on Neural Information Processing Systems.","author":"Fan Lijie","year":"2024","unstructured":"Lijie Fan, Dilip Krishnan, Phillip Isola, Dina Katabi, and Yonglong Tian. 2024. Improving CLIP training with language rewrites. In Proceedings of the Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_18_1","volume-title":"Ismail Ben Ayed, and Jose Dolz","author":"Hajimiri Sina","year":"2024","unstructured":"Sina Hajimiri, Ismail Ben Ayed, and Jose Dolz. 2024. Pay Attention to Your Neighbours: Training-Free Open-Vocabulary Semantic Segmentation. arXiv:2404.08181 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"EuroSat: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification","author":"Helber Patrick","year":"2019","unstructured":"Patrick Helber, Benjamin Bischke, Andreas Dengel, and Damian Borth. 2019. EuroSat: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing (2019)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.3013818"},{"key":"e_1_3_2_1_21_1","volume-title":"RSGPT: A Remote Sensing Vision Language Model and Benchmark. arXiv:2307.15266","author":"Hu Yuan","year":"2023","unstructured":"Yuan Hu, Jianlong Yuan, Congcong Wen, Xiaonan Lu, and Xiang Li. 2023. RSGPT: A Remote Sensing Vision Language Model and Benchmark. arXiv:2307.15266 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.5143773"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Koh Jing Yu","year":"2023","unstructured":"Jing Yu Koh, Ruslan Salakhutdinov, and Daniel Fried. 2023. Grounding language models to images for multimodal inputs and outputs. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_24_1","volume-title":"Muzammal Naseer, Abhijit Das, Salman Khan, and Fahad Shahbaz Khan.","author":"Kuckreja Kartik","year":"2023","unstructured":"Kartik Kuckreja, Muhammad Sohail Danish, Muzammal Naseer, Abhijit Das, Salman Khan, and Fahad Shahbaz Khan. 2023. GeoChat: Grounded Large Vision-Language Model for Remote Sensing. arXiv:2311.15826 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"RSI-CB: A large-scale remote sensing image classification benchmark using crowdsourced data. Sensors 20, 6","author":"Li Haifeng","year":"2020","unstructured":"Haifeng Li, Xin Dou, Chao Tao, Zhixiang Wu, Jie Chen, Jian Peng, Min Deng, and Ling Zhao. 2020. RSI-CB: A large-scale remote sensing image classification benchmark using crowdsourced data. Sensors 20, 6 (2020)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2023.103497"},{"key":"e_1_3_2_1_27_1","volume-title":"A Comprehensive Analysis of Data, Architecture, and Training Strategies. arXiv:2404.08197","author":"Li Zichao","year":"2024","unstructured":"Zichao Li, Cihang Xie, and Ekin Dogus Cubuk. 2024. Scaling (Down) CLIP: A Comprehensive Analysis of Data, Architecture, and Training Strategies. arXiv:2404.08197 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"RemoteCLIP: A Vision Language Foundation Model for Remote Sensing","author":"Liu Fan","year":"2024","unstructured":"Fan Liu, Delong Chen, Zhangqingyun Guan, Xiaocong Zhou, Jiale Zhu, Qiaolin Ye, Liyong Fu, and Jun Zhou. 2024. RemoteCLIP: A Vision Language Foundation Model for Remote Sensing. IEEE Transactions on Geoscience and Remote Sensing 62 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Visual instruction tuning. arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"e_1_3_2_1_31_1","volume-title":"Knowledge-aware Text-Image Retrieval for Remote Sensing Images. arXiv:2405.03373","author":"Mi Li","year":"2024","unstructured":"Li Mi, Xianjie Dai, Javiera Castillo-Navarro, and Devis Tuia. 2024. Knowledge-aware Text-Image Retrieval for Remote Sensing Images. arXiv:2405.03373 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the Workshop on Complex Data Challenges in Earth Observation.","author":"Mi Li","year":"2022","unstructured":"Li Mi, Siran Li, Christel Chappuis, and Devis Tuia. 2022. Knowledge-Aware Cross-Modal Text-Image Retrieval for Remote Sensing Images. In Proceedings of the Workshop on Complex Data Challenges in Earth Observation."},{"key":"e_1_3_2_1_33_1","volume-title":"Ali Alilooee, Ser-Nam Lim, and Rajiv Ramnath.","author":"Monsefi Amin Karimi","year":"2024","unstructured":"Amin Karimi Monsefi, Kishore Prakash Sailaja, Ali Alilooee, Ser-Nam Lim, and Rajiv Ramnath. 2024. DetailCLIP: Detail-Oriented CLIP for Fine-Grained Tasks. arXiv:2409.06809 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"e_1_3_2_1_35_1","volume-title":"LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal Language Model. arXiv:2402.02544","author":"Muhtar Dilxat","year":"2024","unstructured":"Dilxat Muhtar, Zhenshi Li, Feng Gu, Xueliang Zhang, and Pengfeng Xiao. 2024. LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal Language Model. arXiv:2402.02544 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Luc Van Gool, and Federico Tombari","author":"Naeem Muhammad Ferjad","year":"2023","unstructured":"Muhammad Ferjad Naeem, Yongqin Xian, Xiaohua Zhai, Lukas Hoyer, Luc Van Gool, and Federico Tombari. 2023. SILC: Improving vision language pretraining with self-distillation. arXiv:2310.13355 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Fine Tuning CLIP with Remote Sensing (Satellite) Images and Captions. HuggingFace Blog","author":"Pal Sujit","year":"2021","unstructured":"Sujit Pal, Artashes Arutiunian, Goutham Venkatesh, Ritobrata Ghosh, Dev Vidhani, and Mayank Bhaskar. 2021. Fine Tuning CLIP with Remote Sensing (Satellite) Images and Captions. HuggingFace Blog (2021)."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the Conference on Neural Information Processing Systems.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. In Proceedings of the Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_39_1","volume-title":"MLRSNet: A multi-label high spatial resolution remote sensing dataset for semantic scene understanding. ISPRS Journal of Photogrammetry and Remote Sensing 169","author":"Qi Xiaoman","year":"2020","unstructured":"Xiaoman Qi, Panpan Zhu, Yuebin Wang, Liqiang Zhang, Junhuan Peng, Mengfan Wu, Jialong Chen, Xudong Zhao, Ning Zang, and P Takis Mathiopoulos. 2020. MLRSNet: A multi-label high spatial resolution remote sensing dataset for semantic scene understanding. ISPRS Journal of Photogrammetry and Remote Sensing 169 (2020)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CITS.2016.7546397"},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3215803"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.104"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the Conference on Neural Information Processing Systems","author":"Salimans Tim","year":"2016","unstructured":"Tim Salimans and Durk P Kingma. 2016. Weight normalization: A simple reparameterization to accelerate training of deep neural networks. Proceedings of the Conference on Neural Information Processing Systems (2016)."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the Conference on Neural Information Processing Systems","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. LAION-5b: An open large-scale dataset for training next generation image-text models. Proceedings of the Conference on Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_46_1","volume-title":"Large Language Models for Captioning and Retrieving Remote Sensing Images. arXiv:2402.06475","author":"Silva Jo\u00e3o Daniel","year":"2024","unstructured":"Jo\u00e3o Daniel Silva, Jo\u00e3o Magalh\u00e3es, Devis Tuia, and Bruno Martins. 2024. Large Language Models for Captioning and Retrieving Remote Sensing Images. arXiv:2402.06475 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Representation Learning with Contrastive Predictive Coding. arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_48_1","volume-title":"CLIP with Quality Captions: A Strong Pretraining for Vision Tasks. arXiv:2405.08911","author":"Anasosalu Vasu Pavan Kumar","year":"2024","unstructured":"Pavan Kumar Anasosalu Vasu, Hadi Pouransari, Fartash Faghri, and Oncel Tuzel. 2024. CLIP with Quality Captions: A Strong Pretraining for Vision Tasks. arXiv:2405.08911 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference. arXiv:2312.01597","author":"Wang Feng","year":"2023","unstructured":"Feng Wang, Jieru Mei, and Alan Yuille. 2023. SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference. arXiv:2312.01597 (2023)."},{"key":"e_1_3_2_1_50_1","article-title":"Scene classification with recurrent attention of VHR remote sensing images","volume":"57","author":"Wang Qi","year":"2018","unstructured":"Qi Wang, Shaoteng Liu, Jocelyn Chanussot, and Xuelong Li. 2018. Scene classification with recurrent attention of VHR remote sensing images. IEEE Transactions on Geoscience and Remote Sensing 57, 2 (2018).","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_51_1","volume-title":"Diffusion Feedback Helps CLIP See Better. arXiv:2407.20171","author":"Wang Wenxuan","year":"2024","unstructured":"Wenxuan Wang, Quan Sun, Fan Zhang, Yepeng Tang, Jing Liu, and Xinlong Wang. 2024. Diffusion Feedback Helps CLIP See Better. arXiv:2407.20171 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"MedCLIP: Contrastive learning from unpaired medical images and text. arXiv:2210.10163","author":"Wang Zifeng","year":"2022","unstructured":"Zifeng Wang, Zhenbang Wu, Dinesh Agarwal, and Jimeng Sun. 2022. MedCLIP: Contrastive learning from unpaired medical images and text. arXiv:2210.10163 (2022)."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops.","author":"Zamir Syed Waqas","year":"2019","unstructured":"Syed Waqas Zamir, Aditya Arora, Akshita Gupta, Salman Khan, Guolei Sun, Fahad Shahbaz Khan, Fan Zhu, Ling Shao, Gui-Song Xia, and Xiang Bai. 2019. iSAID: A Large-scale Dataset for Instance Segmentation in Aerial Images. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2685945"},{"key":"e_1_3_2_1_55_1","first-page":"298","article-title":"Structural high-resolution satellite image indexing","volume":"38","author":"Xia Gui-Song","year":"2010","unstructured":"Gui-Song Xia, Wen Yang, Julie Delon, Yann Gousseau, Hong Sun, and Henri Maitre. 2010. Structural high-resolution satellite image indexing. In International Archives of Photogrammetry and Remote Sensing, Vol. 38. 298--303.","journal-title":"International Archives of Photogrammetry and Remote Sensing"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"e_1_3_2_1_57_1","volume-title":"Text-image matching for cross-modal remote sensing image retrieval via graph neural network","author":"Yu Hongfeng","year":"2022","unstructured":"Hongfeng Yu, Fanglong Yao, Wanxuan Lu, Nayu Liu, Peiguang Li, Hongjian You, and Xian Sun. 2022. Text-image matching for cross-modal remote sensing image retrieval via graph neural network. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing 16 (2022)."},{"key":"e_1_3_2_1_58_1","volume-title":"Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval. arXiv:2308.12509","author":"Yuan Yuan","year":"2023","unstructured":"Yuan Yuan, Yang Zhan, and Zhitong Xiong. 2023. Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval. arXiv:2308.12509 (2023)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/JURSE57346.2023.10144189"},{"key":"e_1_3_2_1_60_1","volume-title":"Exploring a fine-grained multiscale method for cross-modal remote sensing image retrieval. arXiv:2204.09868","author":"Yuan Zhiqiang","year":"2022","unstructured":"Zhiqiang Yuan, Wenkai Zhang, Kun Fu, Xuan Li, Chubo Deng, Hongqi Wang, and Xian Sun. 2022. Exploring a fine-grained multiscale method for cross-modal remote sensing image retrieval. arXiv:2204.09868 (2022)."},{"key":"e_1_3_2_1_61_1","volume-title":"Remote sensing cross-modal textimage retrieval based on global and local information","author":"Yuan Zhiqiang","year":"2022","unstructured":"Zhiqiang Yuan, Wenkai Zhang, Changyuan Tian, Xuee Rong, Zhengyuan Zhang, Hongqi Wang, Kun Fu, and Xian Sun. 2022. Remote sensing cross-modal textimage retrieval based on global and local information. IEEE Transactions on Geoscience and Remote Sensing 60 (2022)."},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Yuksekgonul Mert","year":"2022","unstructured":"Mert Yuksekgonul, Federico Bianchi, Pratyusha Kalluri, Dan Jurafsky, and James Zou. 2022. When and why vision-language models behave like bags-of-words, and what to do about it?. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_63_1","volume-title":"Mind the Modality Gap: Towards a Remote Sensing Vision-Language Model via Cross-modal Alignment. arXiv:2402.09816","author":"Zavras Angelos","year":"2024","unstructured":"Angelos Zavras, Dimitrios Michail, Beg\u00fcm Demir, and Ioannis Papoutsis. 2024. Mind the Modality Gap: Towards a Remote Sensing Vision-Language Model via Cross-modal Alignment. arXiv:2402.09816 (2024)."},{"key":"e_1_3_2_1_64_1","unstructured":"Andy Zeng Maria Attarian Brian Ichter Krzysztof Choromanski Adrian Wong Stefan Welker Federico Tombari Aveek Purohit Michael Ryoo Vikas Sindhwani et al. 2022. Socratic models: Composing zero-shot multimodal reasoning with language. arXiv:2204.00598 (2022)."},{"key":"e_1_3_2_1_65_1","volume-title":"SkyEyeGPT: Unifying Remote Sensing Vision-Language Tasks via Instruction Tuning with Large Language Model. arXiv:2401.09712","author":"Zhan Yang","year":"2024","unstructured":"Yang Zhan, Zhitong Xiong, and Yuan Yuan. 2024. SkyEyeGPT: Unifying Remote Sensing Vision-Language Tasks via Instruction Tuning with Large Language Model. arXiv:2401.09712 (2024)."},{"key":"e_1_3_2_1_66_1","volume-title":"Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Finegrained Understanding. arXiv:2306.08832","author":"Zhang Le","year":"2023","unstructured":"Le Zhang, Rabiul Awal, and Aishwarya Agrawal. 2023. Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Finegrained Understanding. arXiv:2306.08832 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"RS5M and GeoRSCLIP: A Large Scale Vision-Language Dataset and A Large Vision-Language Model for Remote Sensing. arXiv:2306.11300","author":"Zhang Zilun","year":"2024","unstructured":"Zilun Zhang, Tiancheng Zhao, Yulong Guo, and Jianwei Yin. 2024. RS5M and GeoRSCLIP: A Large Scale Vision-Language Dataset and A Large Vision-Language Model for Remote Sensing. arXiv:2306.11300 (2024)."},{"key":"e_1_3_2_1_68_1","article-title":"Dirichlet-derived multiple topic scene classification model for high spatial resolution remote sensing imagery","volume":"54","author":"Zhao Bei","year":"2015","unstructured":"Bei Zhao, Yanfei Zhong, Gui-Song Xia, and Liangpei Zhang. 2015. Dirichlet-derived multiple topic scene classification model for high spatial resolution remote sensing imagery. IEEE Transactions on Geoscience and Remote Sensing 54, 4 (2015).","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_69_1","volume-title":"MMICL: Empowering vision-language model with multi-modal in-context learning. arXiv:2309.07915","author":"Zhao Haozhe","year":"2023","unstructured":"Haozhe Zhao, Zefan Cai, Shuzheng Si, Xiaojian Ma, Kaikai An, Liang Chen, Zixuan Liu, Sheng Wang, Wenjuan Han, and Baobao Chang. 2023. MMICL: Empowering vision-language model with multi-modal in-context learning. arXiv:2309.07915 (2023)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1117\/1.JRS.10.035004"},{"key":"e_1_3_2_1_71_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Zheng Ding Zhuowen Tu","year":"2023","unstructured":"Zhuowen Tu Zheng Ding, Jieke Wang. 2023. Open-Vocabulary Universal Image Segmentation with MaskCLIP. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_72_1","volume-title":"Chen Change Loy, and Bo Dai","author":"Zhou Chong","year":"2021","unstructured":"Chong Zhou, Chen Change Loy, and Bo Dai. 2021. Extract Free Dense Labels from CLIP. arXiv:2112.01071 (2021)."},{"key":"e_1_3_2_1_73_1","volume-title":"Pattern-Net: A benchmark dataset for performance evaluation of remote sensing image retrieval. ISPRS Journal of Photogrammetry and Remote Sensing 145","author":"Zhou Weixun","year":"2018","unstructured":"Weixun Zhou, Shawn Newsam, Congmin Li, and Zhenfeng Shao. 2018. Pattern-Net: A benchmark dataset for performance evaluation of remote sensing image retrieval. ISPRS Journal of Photogrammetry and Remote Sensing 145 (2018)."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01075"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2015.2475299"}],"event":{"name":"SIGSPATIAL '24: The 32nd ACM International Conference on Advances in Geographic Information Systems","location":"Atlanta GA USA","acronym":"SIGSPATIAL '24","sponsor":["SIGSPATIAL ACM Special Interest Group on Spatial Information"]},"container-title":["Proceedings of the 32nd ACM International Conference on Advances in Geographic Information Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3678717.3691318","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3678717.3691318","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:42:07Z","timestamp":1755859327000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3678717.3691318"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"references-count":75,"alternative-id":["10.1145\/3678717.3691318","10.1145\/3678717"],"URL":"https:\/\/doi.org\/10.1145\/3678717.3691318","relation":{},"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"2024-11-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}