{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T07:16:23Z","timestamp":1764141383514,"version":"3.46.0"},"reference-count":68,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2021YFB3900504"],"award-info":[{"award-number":["2021YFB3900504"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Appl. Earth Observations Remote Sensing"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/jstars.2025.3625958","type":"journal-article","created":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T17:34:09Z","timestamp":1761672849000},"page":"29113-29130","source":"Crossref","is-referenced-by-count":0,"title":["DGTRSD and DGTRSCLIP: A Dual-Granularity Remote Sensing Image\u2013Text Dataset and Vision\u2013Language Foundation Model for Alignment"],"prefix":"10.1109","volume":"18","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0607-7294","authenticated-orcid":false,"given":"Weizhi","family":"Chen","sequence":"first","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8396-0338","authenticated-orcid":false,"given":"Yupeng","family":"Deng","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"given":"Wei","family":"Jin","sequence":"additional","affiliation":[{"name":"PLA Unit 32021, Beijing, China"}]},{"given":"Jingbo","family":"Chen","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8281-2528","authenticated-orcid":false,"given":"Jiansheng","family":"Chen","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1588-7524","authenticated-orcid":false,"given":"Yuman","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Information Network Security, People&#x2019;s Public Security University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6508-0983","authenticated-orcid":false,"given":"Zhihao","family":"Xi","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7025-2137","authenticated-orcid":false,"given":"Diyou","family":"Liu","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7161-434X","authenticated-orcid":false,"given":"Kai","family":"Li","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9316-6686","authenticated-orcid":false,"given":"Yu","family":"Meng","sequence":"additional","affiliation":[{"name":"Aerospace Information Research Institute, Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/762"},{"key":"ref3","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2023.103497"},{"article-title":"Remote sensing vision-language foundation models without annotations via ground remote alignment","year":"2023","author":"Mall","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.11834\/jrs.20244503"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2024.3492069"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2024.3454054"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3390838"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28393"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3449154"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72983-6_18"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3390\/rs12030405"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2021.3078451"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/1869790.1869829"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CITS.2016.7546397"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i6.32683"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2022.3207171"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/mgrs.2025.3576766"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"article-title":"Representation learning with contrastive predictive coding","year":"2018","author":"Oord","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72904-1_26"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28017"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01059"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321501"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.258"},{"key":"ref32","first-page":"8690","article-title":"Geoclip: Clip-inspired alignment between locations and images for effective worldwide geo-localization","volume":"36","author":"Cepeda","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32457"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72649-1_5"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.90"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2075"},{"key":"ref37","article-title":"Tulip: Token-length upgraded clip","volume-title":"Thirteenth Int. Conf. Learn. Representations","author":"Najdenkoska","year":"2024"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3070368"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2020.534696"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00646"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00577"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.440"},{"key":"ref43","article-title":"Loveda: A remote sensing land-cover dataset for domain adaptive semantic segmentation","volume-title":"Proc. Neural Inf. Process. Syst. Track Datasets Benchmarks 1, NeurIPS Datasets Benchmarks, Virtual","author":"Wang","year":"2021"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2019.8900532"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.3390\/app10217622"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.446"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2019.11.023"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00418"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2900302"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.5220\/0006120603240331"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.3390\/rs12101662"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-021-02893-3"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_33"},{"article-title":"Vision meets drones: A challenge","year":"2018","author":"Zhu","key":"ref54"},{"key":"ref55","first-page":"28","article-title":"isaid: A large-scale dataset for instance segmentation in aerial images","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. Workshops","author":"Zamir","year":"2019"},{"article-title":"Gemini: A family of highly capable multimodal models","year":"2023","author":"Team","key":"ref58"},{"key":"ref59","article-title":"Lora: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu","year":"2022"},{"article-title":"Qwen2. 5-omni technical report","year":"2025","author":"Xu","key":"ref60"},{"article-title":"Qwen2. 5 technical report","year":"2024","author":"Yang","key":"ref61"},{"key":"ref62","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2675998"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2018.01.004"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2015.2496185"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2018.2864987"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1117\/1.JRS.10.035004"}],"container-title":["IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/4609443\/10766875\/11219074.pdf?arnumber=11219074","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T07:12:07Z","timestamp":1764141127000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11219074\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":68,"URL":"https:\/\/doi.org\/10.1109\/jstars.2025.3625958","relation":{},"ISSN":["1939-1404","2151-1535"],"issn-type":[{"type":"print","value":"1939-1404"},{"type":"electronic","value":"2151-1535"}],"subject":[],"published":{"date-parts":[[2025]]}}}