{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:46:37Z","timestamp":1780058797490,"version":"3.54.0"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372155"],"award-info":[{"award-number":["62372155"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013088","name":"Qinglan Project of Jiangsu Province of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013088","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Changzhou Science and Technology Bureau Project","award":["20231313"],"award-info":[{"award-number":["20231313"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754950","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"3027-3036","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["RemoteSAM: Towards Segment Anything for Earth Observation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4588-3658","authenticated-orcid":false,"given":"Liang","family":"Yao","sequence":"first","affiliation":[{"name":"Hohai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8746-9845","authenticated-orcid":false,"given":"Fan","family":"Liu","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8172-2894","authenticated-orcid":false,"given":"Delong","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8724-5796","authenticated-orcid":false,"given":"Chuanyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1379-7975","authenticated-orcid":false,"given":"Yijun","family":"Wang","sequence":"additional","affiliation":[{"name":"HoHai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6081-1547","authenticated-orcid":false,"given":"Ziyun","family":"Chen","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9976-4433","authenticated-orcid":false,"given":"Wei","family":"Xu","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7394-0082","authenticated-orcid":false,"given":"Shimin","family":"Di","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4408-3800","authenticated-orcid":false,"given":"Yuhui","family":"Zheng","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"3920","article-title":"FMARS: Annotating Remote Sensing Images for Disaster Management using Foundation Models","author":"Arnaudo Edoardo","year":"2024","unstructured":"Edoardo Arnaudo, Jacopo Lungo Vaschetti, Lorenzo Innocenti, Luca Barco, Davide Lisi, Vanina Fissore, and Claudio Rossi. 2024. FMARS: Annotating Remote Sensing Images for Disaster Management using Foundation Models. In IEEE IGARSS. IEEE, 3920-3924.","journal-title":"IEEE IGARSS. IEEE"},{"key":"e_1_3_2_1_2_1","volume-title":"Hisham Cholakkal, Mubarak Shah, Ming-Hsuan Yang, and Fahad Shahbaz Khan.","author":"Awais Muhammad","year":"2025","unstructured":"Muhammad Awais, Muzammal Naseer, Salman Khan, Rao Muhammad Anwer, Hisham Cholakkal, Mubarak Shah, Ming-Hsuan Yang, and Fahad Shahbaz Khan. 2025. Foundation Models Defining a New Era in Vision: a Survey and Outlook. IEEE TPAMI (2025)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00367"},{"key":"e_1_3_2_1_4_1","volume-title":"Multi-Temporal Earth Observation Imagery. In ICLR Workshop on Machine Learning for Remote Sensing (ML4RS).","author":"Brown Christopher","year":"2024","unstructured":"Christopher Brown, Michal Kazmierski, William Rucklidge, Valerie Pasquarella, and Evan Shelhamer. 2024. Learned Embedding Fields for Multi-Source, Multi-Temporal Earth Observation Imagery. In ICLR Workshop on Machine Learning for Remote Sensing (ML4RS)."},{"key":"e_1_3_2_1_5_1","volume-title":"Yejin Bang, and Pascale Fung.","author":"Chen Delong","year":"2024","unstructured":"Delong Chen, Samuel Cahyawijaya, Etsuko Ishii, Ho Shu Chan, Yejin Bang, and Pascale Fung. 2024a. What Makes for Good Image Captions? arXiv:2405.00485 [cs.CV] https:\/\/arxiv.org\/abs\/2405.00485"},{"key":"e_1_3_2_1_6_1","volume-title":"Subobject-level image tokenization. arXiv preprint arXiv:2402.14327","author":"Chen Delong","year":"2024","unstructured":"Delong Chen, Samuel Cahyawijaya, Jianfeng Liu, Baoyuan Wang, and Pascale Fung. 2024b. Subobject-level image tokenization. arXiv preprint arXiv:2402.14327 (2024)."},{"key":"e_1_3_2_1_7_1","first-page":"114108","article-title":"Remote sensing of diverse urban environments: From the single city to multiple cities","volume":"305","author":"Chen Gang","year":"2024","unstructured":"Gang Chen, Yuyu Zhou, James A Voogt, and Eleanor C Stokes. 2024c. Remote sensing of diverse urban environments: From the single city to multiple cities. RSE, Vol. 305 (2024), 114108.","journal-title":"RSE"},{"key":"e_1_3_2_1_8_1","unstructured":"Ting Chen Saurabh Saxena Lala Li David J. Fleet and Geoffrey Hinton. 2022a. Pix2seq: A Language Modeling Framework for Object Detection. arXiv:2109.10852 [cs.CV] https:\/\/arxiv.org\/abs\/2109.10852"},{"key":"e_1_3_2_1_9_1","unstructured":"Ting Chen Saurabh Saxena Lala Li Tsung-Yi Lin David J. Fleet and Geoffrey Hinton. 2022b. A Unified Sequence Interface for Vision Tasks. arXiv:2206.07669 [cs.CV] https:\/\/arxiv.org\/abs\/2206.07669"},{"key":"e_1_3_2_1_10_1","first-page":"26573","article-title":"Mask grounding for referring image segmentation","author":"Chng Yong Xien","year":"2024","unstructured":"Yong Xien Chng, Henry Zheng, Yizeng Han, Xuchong Qiu, and Gao Huang. 2024. Mask grounding for referring image segmentation. In CVPR. 26573-26583.","journal-title":"CVPR."},{"key":"e_1_3_2_1_11_1","first-page":"5823","volume-title":"IEEE TMM","volume":"26","author":"Cho Yubin","year":"2023","unstructured":"Yubin Cho, Hyunwoo Yu, and Suk-Ju Kang. 2023. Cross-aware early fusion with stage-divided vision and language transformer encoders for referring image segmentation. IEEE TMM, Vol. 26 (2023), 5823-5833."},{"key":"e_1_3_2_1_12_1","first-page":"6172","article-title":"Functional map of the world","author":"Christie Gordon","year":"2018","unstructured":"Gordon Christie, Neil Fendley, James Wilson, and Ryan Mukherjee. 2018. Functional map of the world. In CVPR. 6172-6180.","journal-title":"CVPR."},{"key":"e_1_3_2_1_13_1","first-page":"197","article-title":"Satmae: Pre-training transformers for temporal and multi-spectral satellite imagery","volume":"35","author":"Cong Yezhen","year":"2022","unstructured":"Yezhen Cong, Samar Khanna, Chenlin Meng, Patrick Liu, Erik Rozi, Yutong He, Marshall Burke, David Lobell, and Stefano Ermon. 2022. Satmae: Pre-training transformers for temporal and multi-spectral satellite imagery. NeurIPS, Vol. 35 (2022), 197-211.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_14_1","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248-255.","journal-title":"CVPR. Ieee"},{"key":"e_1_3_2_1_15_1","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In NAACL-HLT. 4171-4186.","journal-title":"NAACL-HLT."},{"key":"e_1_3_2_1_16_1","volume-title":"Cross-modal bidirectional interaction model for referring remote sensing image segmentation. arXiv preprint arXiv:2410.08613","author":"Dong Zhe","year":"2024","unstructured":"Zhe Dong, Yuzhe Sun, Yanfeng Gu, and Tianzhu Liu. 2024. Cross-modal bidirectional interaction model for referring remote sensing image segmentation. arXiv preprint arXiv:2410.08613 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395","author":"Hu Shengding","year":"2024","unstructured":"Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Xiang Long, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, et al., 2024. Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395 (2024)."},{"key":"e_1_3_2_1_18_1","first-page":"4067","article-title":"Beyond one-to-one: Rethinking the referring image segmentation","author":"Hu Yutao","year":"2023","unstructured":"Yutao Hu, Qixiong Wang, Wenqi Shao, Enze Xie, Zhenguo Li, Jungong Han, and Ping Luo. 2023. Beyond one-to-one: Rethinking the referring image segmentation. In ICCV. 4067-4077.","journal-title":"ICCV."},{"key":"e_1_3_2_1_19_1","first-page":"4424","article-title":"Bi-directional relationship inferring network for referring image segmentation","author":"Hu Zhiwei","year":"2020","unstructured":"Zhiwei Hu, Guang Feng, Jiayu Sun, Lihe Zhang, and Huchuan Lu. 2020. Bi-directional relationship inferring network for referring image segmentation. In CVPR. 4424-4433.","journal-title":"CVPR."},{"key":"e_1_3_2_1_20_1","first-page":"16888","article-title":"Look before you leap: Learning landmark features for one-stage visual grounding","author":"Huang Binbin","year":"2021","unstructured":"Binbin Huang, Dongze Lian, Weixin Luo, and Shenghua Gao. 2021. Look before you leap: Learning landmark features for one-stage visual grounding. In CVPR. 16888-16897.","journal-title":"CVPR."},{"key":"e_1_3_2_1_21_1","first-page":"10488","article-title":"Referring image segmentation via cross-modal progressive comprehension","author":"Huang Shaofei","year":"2020","unstructured":"Shaofei Huang, Tianrui Hui, Si Liu, Guanbin Li, Yunchao Wei, Jizhong Han, Luoqi Liu, and Bo Li. 2020. Referring image segmentation via cross-modal progressive comprehension. In CVPR. 10488-10497.","journal-title":"CVPR."},{"key":"e_1_3_2_1_22_1","volume-title":"Linguistic structure guided context modeling for referring image segmentation","author":"Hui Tianrui","unstructured":"Tianrui Hui, Si Liu, Shaofei Huang, Guanbin Li, Sansi Yu, Faxi Zhang, and Jizhong Han. 2020. Linguistic structure guided context modeling for referring image segmentation. In ECCV. Springer, 59-75."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127599"},{"key":"e_1_3_2_1_24_1","first-page":"27831","article-title":"Geochat: Grounded large vision-language model for remote sensing","author":"Kuckreja Kartik","year":"2024","unstructured":"Kartik Kuckreja, Muhammad Sohail Danish, Muzammal Naseer, Abhijit Das, Salman Khan, and Fahad Shahbaz Khan. 2024. Geochat: Grounded large vision-language model for remote sensing. In CVPR. 27831-27840.","journal-title":"CVPR."},{"key":"e_1_3_2_1_25_1","volume-title":"Clearclip: Decomposing clip representations for dense vision-language inference","author":"Lan Mengcheng","year":"2024","unstructured":"Mengcheng Lan, Chaofeng Chen, Yiping Ke, Xinjiang Wang, Litong Feng, and Wayne Zhang. 2024. Clearclip: Decomposing clip representations for dense vision-language inference. In ECCV. Springer, 143-160."},{"key":"e_1_3_2_1_26_1","volume-title":"Exploring fine-grained image-text alignment for referring remote sensing image segmentation","author":"Lei Sen","year":"2024","unstructured":"Sen Lei, Xinyu Xiao, Tianlin Zhang, Heng-Chao Li, Zhenwei Shi, and Qing Zhu. 2024. Exploring fine-grained image-text alignment for referring remote sensing image segmentation. IEEE TGRS (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Xiangyong Cao, Xueru Bai, Feng Zhou, Deyu Meng, and Zhi Wang.","author":"Li Kaiyu","year":"2024","unstructured":"Kaiyu Li, ruixun Liu, Xiangyong Cao, Xueru Bai, Feng Zhou, Deyu Meng, and Zhi Wang. 2024d. SegEarth-OV: Towards Training-Free Open-Vocabulary Segmentation for Remote Sensing Images. arXiv preprint arXiv:2410.01768 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2019.11.023"},{"key":"e_1_3_2_1_29_1","first-page":"19652","article-title":"Referring transformer: A one-step approach to multi-task visual grounding","volume":"34","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. NeurIPS, Vol. 34 (2021), 19652-19664.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3354783"},{"key":"e_1_3_2_1_31_1","volume-title":"Toward open vocabulary aerial object detection with clip-activated student-teacher learning","author":"Li Yan","unstructured":"Yan Li, Weiwei Guo, Xue Yang, Ning Liao, Dunyun He, Jiaqi Zhou, and Wenxian Yu. 2024b. Toward open vocabulary aerial object detection with clip-activated student-teacher learning. In ECCV. Springer, 431-448."},{"key":"e_1_3_2_1_32_1","volume-title":"Masked angle-aware autoencoder for remote sensing images","author":"Li Zhihao","unstructured":"Zhihao Li, Biao Hou, Siteng Ma, Zitong Wu, Xianpeng Guo, Bo Ren, and Licheng Jiao. 2024c. Masked angle-aware autoencoder for remote sensing images. In ECCV. Springer, 260-278."},{"key":"e_1_3_2_1_33_1","first-page":"10880","article-title":"A real-time cross-modality correlation filtering method for referring expression comprehension","author":"Liao Yue","year":"2020","unstructured":"Yue Liao, Si Liu, Guanbin Li, Fei Wang, Yanjie Chen, Chen Qian, and Bo Li. 2020. A real-time cross-modality correlation filtering method for referring expression comprehension. In CVPR. 10880-10889.","journal-title":"CVPR."},{"key":"e_1_3_2_1_34_1","first-page":"4266","article-title":"Progressive language-customized visual feature learning for one-stage visual grounding","volume":"31","author":"Liao Yue","year":"2022","unstructured":"Yue Liao, Aixi Zhang, Zhiyuan Chen, Tianrui Hui, and Si Liu. 2022. Progressive language-customized visual feature learning for one-stage visual grounding. IEEE TIP, Vol. 31 (2022), 4266-4277.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_35_1","volume-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575","author":"Lin Ziyi","year":"2023","unstructured":"Ziyi Lin, Chris Liu, Renrui Zhang, Peng Gao, Longtian Qiu, Han Xiao, Han Qiu, Chen Lin, Wenqi Shao, Keqin Chen, et al., 2023. Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575 (2023)."},{"key":"e_1_3_2_1_36_1","first-page":"23592","article-title":"Gres: Generalized referring expression segmentation","author":"Liu Chang","year":"2023","unstructured":"Chang Liu, Henghui Ding, and Xudong Jiang. 2023a. Gres: Generalized referring expression segmentation. In CVPR. 23592-23601.","journal-title":"CVPR."},{"key":"e_1_3_2_1_37_1","volume-title":"Remoteclip: A vision language foundation model for remote sensing","author":"Liu Fan","year":"2024","unstructured":"Fan Liu, Delong Chen, Zhangqingyun Guan, Xiaocong Zhou, Jiale Zhu, Qiaolin Ye, Liyong Fu, and Jun Zhou. 2024a. Remoteclip: A vision language foundation model for remote sensing. IEEE TGRS (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Scale-Invariant Feature Disentanglement via Adversarial Learning for UAV-based Object Detection. arXiv preprint arXiv:2405.15465","author":"Liu Fan","year":"2024","unstructured":"Fan Liu, Liang Yao, Chuanyi Zhang, Ting Wu, Xinlei Zhang, Xiruo Jiang, and Jun Zhou. 2024c. Scale-Invariant Feature Disentanglement via Adversarial Learning for UAV-based Object Detection. arXiv preprint arXiv:2405.15465 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-024-10915-y"},{"key":"e_1_3_2_1_40_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023b. Visual Instruction Tuning."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3079993"},{"key":"e_1_3_2_1_42_1","first-page":"26658","article-title":"Rotated multi-scale interaction network for referring remote sensing image segmentation","author":"Liu Sihan","year":"2024","unstructured":"Sihan Liu, Yiwei Ma, Xiaoqing Zhang, Haowei Wang, Jiayi Ji, Xiaoshuai Sun, and Rongrong Ji. 2024b. Rotated multi-scale interaction network for referring remote sensing image segmentation. In CVPR. 26658-26668.","journal-title":"CVPR."},{"key":"e_1_3_2_1_43_1","first-page":"779","article-title":"CARIS: Context-aware referring image segmentation","author":"Liu Sun-Ao","year":"2023","unstructured":"Sun-Ao Liu, Yiheng Zhang, Zhaofan Qiu, Hongtao Xie, Yongdong Zhang, and Ting Yao. 2023c. CARIS: Context-aware referring image segmentation. In ACM MM. 779-788.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_44_1","first-page":"10012","article-title":"Swin transformer: Hierarchical vision transformer using shifted windows","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021b. Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV. 10012-10022.","journal-title":"ICCV."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3070368"},{"key":"e_1_3_2_1_46_1","first-page":"5261","article-title":"Change-aware sampling and contrastive learning for satellite images","author":"Mall Utkarsh","year":"2023","unstructured":"Utkarsh Mall, Bharath Hariharan, and Kavita Bala. 2023. Change-aware sampling and contrastive learning for satellite images. In CVPR. 5261-5270.","journal-title":"CVPR."},{"key":"e_1_3_2_1_47_1","first-page":"9414","article-title":"Seasonal contrast: Unsupervised pre-training from uncurated remote sensing data","author":"Manas Oscar","year":"2021","unstructured":"Oscar Manas, Alexandre Lacoste, Xavier Gir\u00f3-i Nieto, David Vazquez, and Pau Rodriguez. 2021. Seasonal contrast: Unsupervised pre-training from uncurated remote sensing data. In ICCV. 9414-9423.","journal-title":"ICCV."},{"key":"e_1_3_2_1_48_1","first-page":"1","article-title":"Prompting DirectSAM for Semantic Contour Extraction in Remote Sensing Images","author":"Miao Shiyu","year":"2025","unstructured":"Shiyu Miao, Delong Chen, Fan Liu, Chuanyi Zhang, Yanhui Gu, Shengjie Guo, and Jun Zhou. 2025. Prompting DirectSAM for Semantic Contour Extraction in Remote Sensing Images. In ICASSP. IEEE, 1-5.","journal-title":"ICASSP. IEEE"},{"key":"e_1_3_2_1_49_1","volume-title":"Lhrs-bot: Empowering remote sensing with vgi-enhanced large multimodal language model","author":"Muhtar Dilxat","year":"2024","unstructured":"Dilxat Muhtar, Zhenshi Li, Feng Gu, Xueliang Zhang, and Pengfeng Xiao. 2024. Lhrs-bot: Empowering remote sensing with vgi-enhanced large multimodal language model. In ECCV. Springer, 440-457."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02627"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs12142291"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00378"},{"key":"e_1_3_2_1_53_1","first-page":"4088","article-title":"Scale-mae: A scale-aware masked autoencoder for multiscale geospatial representation learning","author":"Reed Colorado J","year":"2023","unstructured":"Colorado J Reed, Ritwik Gupta, Shufan Li, Sarah Brockman, Christopher Funk, Brian Clipp, Kurt Keutzer, Salvatore Candido, Matt Uyttendaele, and Trevor Darrell. 2023b. Scale-mae: A scale-aware masked autoencoder for multiscale geospatial representation learning. In ICCV. 4088-4099.","journal-title":"ICCV."},{"key":"e_1_3_2_1_54_1","unstructured":"Tianhe Ren Shilong Liu Ailing Zeng Jing Lin Kunchang Li He Cao Jiayu Chen Xinyu Huang Yukang Chen Feng Yan Zhaoyang Zeng Hao Zhang Feng Li Jie Yang Hongyang Li Qing Jiang and Lei Zhang. 2024. Grounded SAM: Assembling Open-World Models for Diverse Visual Tasks. arXiv:2401.14159 [cs.CV]"},{"key":"e_1_3_2_1_55_1","volume-title":"SATIN: A multi-task metadataset for classifying satellite imagery using vision-language models. arXiv preprint arXiv:2304.11619","author":"Roberts Jonathan","year":"2023","unstructured":"Jonathan Roberts, Kai Han, and Samuel Albanie. 2023. SATIN: A multi-task metadataset for classifying satellite imagery using vision-language models. arXiv preprint arXiv:2304.11619 (2023)."},{"key":"e_1_3_2_1_56_1","unstructured":"Fu Rong Meng Lan Qian Zhang and Lefei Zhang. 2025. Customized SAM 2 for Referring Remote Sensing Image Segmentation. arXiv:2503.07266 [cs.CV] https:\/\/arxiv.org\/abs\/2503.07266"},{"key":"e_1_3_2_1_57_1","first-page":"4694","article-title":"Zero-shot grounding of objects from natural language queries","author":"Sadhu Arka","year":"2019","unstructured":"Arka Sadhu, Kan Chen, and Ram Nevatia. 2019. Zero-shot grounding of objects from natural language queries. In ICCV. 4694-4703.","journal-title":"ICCV."},{"key":"e_1_3_2_1_58_1","first-page":"1","volume-title":"IEEE TGRS","volume":"61","author":"Sun Xian","year":"2022","unstructured":"Xian Sun, Peijin Wang, Wanxuan Lu, Zicong Zhu, Xiaonan Lu, Qibin He, Junxi Li, Xuee Rong, Zhujun Yang, Hao Chang, et al., 2022. RingMo: A remote sensing foundation model with masked image modeling. IEEE TGRS, Vol. 61 (2022), 1-22."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3194732"},{"key":"e_1_3_2_1_60_1","volume-title":"Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, Olivier H\u00e9naff, Jeremiah Harmsen, Andreas Steiner, and Xiaohua Zhai.","author":"Tschannen Michael","year":"2025","unstructured":"Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, Olivier H\u00e9naff, Jeremiah Harmsen, Andreas Steiner, and Xiaohua Zhai. 2025. SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features. arXiv preprint arXiv:2502.14786 (2025)."},{"key":"e_1_3_2_1_61_1","volume-title":"Mrinalini Kochupillai, Sa\u0161o D\u017eeroski, Jan N van Rijn, Holger H Hoos, Fabio Del Frate, Mihai Datcu, et al.","author":"Tuia Devis","year":"2024","unstructured":"Devis Tuia, Konrad Schindler, Beg\u00fcm Demir, Xiao Xiang Zhu, Mrinalini Kochupillai, Sa\u0161o D\u017eeroski, Jan N van Rijn, Holger H Hoos, Fabio Del Frate, Mihai Datcu, et al., 2024. Artificial Intelligence to Advance Earth Observation: A review of models, recent trends, and pathways forward. IEEE GRSM (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"Delphin Raj Kesari Mary, Ramalingam Murugan, Rajeswari Chengoden, Thippa Reddy Gadekallu, Nitin Rakesh, Yaodong Zhu, and Jeongyeup Paek.","author":"Victor Nancy","year":"2024","unstructured":"Nancy Victor, Praveen Kumar Reddy Maddikunta, Delphin Raj Kesari Mary, Ramalingam Murugan, Rajeswari Chengoden, Thippa Reddy Gadekallu, Nitin Rakesh, Yaodong Zhu, and Jeongyeup Paek. 2024. Remote Sensing for Agriculture in the Era of Industry 5.0-A survey. IEEE JSTARS (2024)."},{"key":"e_1_3_2_1_63_1","first-page":"8815","article-title":"Samrs: Scaling-up remote sensing segmentation dataset with segment anything model","volume":"36","author":"Wang Di","year":"2023","unstructured":"Di Wang, Jing Zhang, Bo Du, Minqiang Xu, Lin Liu, Dacheng Tao, and Liangpei Zhang. 2023c. Samrs: Scaling-up remote sensing segmentation dataset with segment anything model. NeurIPS, Vol. 36 (2023), 8815-8827.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_64_1","first-page":"1","volume-title":"IEEE TGRS","volume":"61","author":"Wang Di","year":"2022","unstructured":"Di Wang, Qiming Zhang, Yufei Xu, Jing Zhang, Bo Du, Dacheng Tao, and Liangpei Zhang. 2022b. Advancing plain vision transformer toward remote sensing foundation model. IEEE TGRS, Vol. 61 (2022), 1-15."},{"key":"e_1_3_2_1_65_1","volume-title":"SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference. arXiv preprint arXiv:2312.01597","author":"Wang Feng","year":"2023","unstructured":"Feng Wang, Jieru Mei, and Alan Yuille. 2023b. SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference. arXiv preprint arXiv:2312.01597 (2023)."},{"key":"e_1_3_2_1_66_1","volume-title":"LoveDA: A remote sensing land-cover dataset for domain adaptive semantic segmentation. arXiv preprint arXiv:2110.08733","author":"Wang Junjue","year":"2021","unstructured":"Junjue Wang, Zhuo Zheng, Ailong Ma, Xiaoyan Lu, and Yanfei Zhong. 2021. LoveDA: A remote sensing land-cover dataset for domain adaptive semantic segmentation. arXiv preprint arXiv:2110.08733 (2021)."},{"key":"e_1_3_2_1_67_1","first-page":"23318","article-title":"Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022a. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In ICML. PMLR, 23318-23340.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2023.3281651"},{"key":"e_1_3_2_1_69_1","first-page":"28","article-title":"isaid: A large-scale dataset for instance segmentation in aerial images","author":"Zamir Syed Waqas","year":"2019","unstructured":"Syed Waqas Zamir, Aditya Arora, Akshita Gupta, Salman Khan, Guolei Sun, Fahad Shahbaz Khan, Fan Zhu, Ling Shao, Gui-Song Xia, and Xiang Bai. 2019. isaid: A large-scale dataset for instance segmentation in aerial images. In CVPRW. 28-37.","journal-title":"CVPRW."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3371348"},{"key":"e_1_3_2_1_71_1","first-page":"3974","article-title":"DOTA: A large-scale dataset for object detection in aerial images","author":"Xia Gui-Song","year":"2018","unstructured":"Gui-Song Xia, Xiang Bai, Jian Ding, Zhen Zhu, Serge Belongie, Jiebo Luo, Mihai Datcu, Marcello Pelillo, and Liangpei Zhang. 2018. DOTA: A large-scale dataset for object detection in aerial images. In CVPR. 3974-3983.","journal-title":"CVPR."},{"key":"e_1_3_2_1_72_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu and Chengyuan Li et al. 2025. Qwen2.5 Technical Report. arXiv:2412.15115 [cs.CL] https:\/\/arxiv.org\/abs\/2412.15115"},{"key":"e_1_3_2_1_73_1","first-page":"18155","article-title":"Lavt: Language-aware vision transformer for referring image segmentation","author":"Yang Zhao","year":"2022","unstructured":"Zhao Yang, Jiaqi Wang, Yansong Tang, Kai Chen, Hengshuang Zhao, and Philip HS Torr. 2022. Lavt: Language-aware vision transformer for referring image segmentation. In CVPR. 18155-18165.","journal-title":"CVPR."},{"key":"e_1_3_2_1_74_1","volume-title":"Falcon: A Remote Sensing Vision-Language Foundation Model. arXiv preprint arXiv:2503.11070","author":"Yao Kelu","year":"2025","unstructured":"Kelu Yao, Nuo Xu, Rong Yang, Yingying Xu, Zhuoyan Gao, Titinunt Kitrungrotsakul, Yi Ren, Pu Zhang, Jin Wang, Ning Wei, et al., 2025. Falcon: A Remote Sensing Vision-Language Foundation Model. arXiv preprint arXiv:2503.11070 (2025)."},{"key":"e_1_3_2_1_75_1","volume-title":"Domain-Invariant Progressive Knowledge Distillation for UAV-Based Object Detection","author":"Yao Liang","year":"2024","unstructured":"Liang Yao, Fan Liu, Chuanyi Zhang, Zhiquan Ou, and Ting Wu. 2024. Domain-Invariant Progressive Knowledge Distillation for UAV-Based Object Detection. IEEE GRSL (2024)."},{"key":"e_1_3_2_1_76_1","first-page":"15502","article-title":"Shifting more attention to visual backbone: Query-modulated refinement networks for end-to-end visual grounding","author":"Ye Jiabo","year":"2022","unstructured":"Jiabo Ye, Junfeng Tian, Ming Yan, Xiaoshan Yang, Xuwu Wang, Ji Zhang, Liang He, and Xin Lin. 2022. Shifting more attention to visual backbone: Query-modulated refinement networks for end-to-end visual grounding. In CVPR. 15502-15512.","journal-title":"CVPR."},{"key":"e_1_3_2_1_77_1","volume-title":"Rrsis: Referring remote sensing image segmentation","author":"Yuan Zhenghang","year":"2024","unstructured":"Zhenghang Yuan, Lichao Mou, Yuansheng Hua, and Xiao Xiang Zhu. 2024. Rrsis: Referring remote sensing image segmentation. IEEE TGRS (2024)."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2023.3250471"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2025.01.020"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_81_1","volume-title":"Earthgpt: A universal multi-modal large language model for multi-sensor image comprehension in remote sensing domain","author":"Zhang Wei","year":"2024","unstructured":"Wei Zhang, Miaoxin Cai, Tong Zhang, Yin Zhuang, and Xuerui Mao. 2024a. Earthgpt: A universal multi-modal large language model for multi-sensor image comprehension in remote sensing domain. IEEE TGRS (2024)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2900302"},{"key":"e_1_3_2_1_83_1","volume-title":"RS5M and GeoRSCLIP: A large scale vision-language dataset and a large vision-language model for remote sensing","author":"Zhang Zilun","year":"2024","unstructured":"Zilun Zhang, Tiancheng Zhao, Yulong Guo, and Jianwei Yin. 2024c. RS5M and GeoRSCLIP: A large scale vision-language dataset and a large vision-language model for remote sensing. IEEE TGRS (2024)."},{"key":"e_1_3_2_1_84_1","volume-title":"Chen Change Loy, and Bo Dai","author":"Zhou Chong","year":"2022","unstructured":"Chong Zhou, Chen Change Loy, and Bo Dai. 2022. Extract Free Dense Labels from CLIP. In ECCV."},{"key":"e_1_3_2_1_85_1","volume-title":"Geoground: A unified large vision-language model. for remote sensing visual grounding. arXiv preprint arXiv:2411.11904","author":"Zhou Yue","year":"2024","unstructured":"Yue Zhou, Mengcheng Lan, Xiang Li, Yiping Ke, Xue Jiang, Litong Feng, and Wayne Zhang. 2024. Geoground: A unified large vision-language model. for remote sensing visual grounding. arXiv preprint arXiv:2411.11904 (2024)."},{"key":"e_1_3_2_1_86_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754950","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:26Z","timestamp":1765339586000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754950"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":86,"alternative-id":["10.1145\/3746027.3754950","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754950","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}