{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T16:59:08Z","timestamp":1780937948836,"version":"3.54.1"},"reference-count":73,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100003995","name":"Anhui Provincial Natural Science Foundation","doi-asserted-by":"publisher","award":["2508085QF250"],"award-info":[{"award-number":["2508085QF250"]}],"id":[{"id":"10.13039\/501100003995","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010814","name":"Anhui Province Department of Education","doi-asserted-by":"publisher","award":["2024AH050451"],"award-info":[{"award-number":["2024AH050451"]}],"id":[{"id":"10.13039\/501100010814","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.eswa.2026.132140","type":"journal-article","created":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T16:39:52Z","timestamp":1774543192000},"page":"132140","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["From insufficient to sufficient: Hierarchical semantic alignment for remote sensing image-text retrieval"],"prefix":"10.1016","volume":"321","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9531-8418","authenticated-orcid":false,"given":"Yuan","family":"Shi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3059-6629","authenticated-orcid":false,"given":"Wentao","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6265-0767","authenticated-orcid":false,"given":"Shaofan","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8865-3353","authenticated-orcid":false,"given":"Lu","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3231-9983","authenticated-orcid":false,"given":"Guolong","family":"Shi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4256-5153","authenticated-orcid":false,"given":"Zhongyang","family":"Yao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7196-0944","authenticated-orcid":false,"given":"Wei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8932-0825","authenticated-orcid":false,"given":"Lichuan","family":"Gu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"3","key":"10.1016\/j.eswa.2026.132140_bib0001","doi-asserted-by":"crossref","first-page":"405","DOI":"10.3390\/rs12030405","article-title":"TextRS: Deep bidirectional triplet network for matching text to remote sensing images","volume":"12","author":"Abdullah","year":"2020","journal-title":"Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0002","series-title":"2025 International conference on communication technologies (comtech)","first-page":"1","article-title":"Remote sensing cross-modal text-image retrieval using multi-grade dynamic feature fusion","author":"Ali","year":"2025"},{"key":"10.1016\/j.eswa.2026.132140_bib0003","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2022.3192460","article-title":"Bi-modal transformer-based approach for visual question answering in remote sensing imagery","volume":"60","author":"Bazi","year":"2022","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0004","first-page":"1","article-title":"Scale-aware adaptive refinement and cross interaction for remote sensing audio-visual cross-modal retrieval","volume":"62","author":"Chen","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0005","first-page":"1","article-title":"Multiscale salient alignment learning for remote-sensing image\u2013text retrieval","volume":"62","author":"Chen","year":"2023","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0006","first-page":"1","article-title":"Thread the needle: Cues-driven multi-association for remote sensing cross-modal retrieval","volume":"62","author":"Chen","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0007","first-page":"1","article-title":"Integrating multisubspace joint learning with multilevel guidance for cross-modal retrieval of remote sensing images","volume":"62","author":"Chen","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0008","doi-asserted-by":"crossref","first-page":"4284","DOI":"10.1109\/JSTARS.2021.3070872","article-title":"A deep semantic alignment network for the cross-modal image-text retrieval in remote sensing","volume":"14","author":"Cheng","year":"2021","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"issue":"11","key":"10.1016\/j.eswa.2026.132140_bib0009","doi-asserted-by":"crossref","first-page":"2207","DOI":"10.1109\/JPROC.2016.2598228","article-title":"Big data for remote sensing: Challenges and opportunities","volume":"104","author":"Chi","year":"2016","journal-title":"Proceedings of the IEEE"},{"key":"10.1016\/j.eswa.2026.132140_bib0010","series-title":"Proceedings of the 2025 3rd international conference on communication networks and machine learning","first-page":"103","article-title":"An adaptive fusion network integrating multi-source feature for remote sensing image-text retrieval","author":"Deng","year":"2025"},{"issue":"Nov","key":"10.1016\/j.eswa.2026.132140_bib0011","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der","year":"2008","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.eswa.2026.132140_bib0012","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.eswa.2026.132140_bib0013","unstructured":"Faghri, F., Fleet, D. J., Kiros, J. R., & Fidler, S. (2017). VSE++: Improving visual-semantic embeddings with hard negatives. arXiv: 1707.05612,."},{"issue":"2","key":"10.1016\/j.eswa.2026.132140_bib0014","doi-asserted-by":"crossref","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","article-title":"Clip-adapter: Better vision-language models with feature adapters","volume":"132","author":"Gao","year":"2024","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.eswa.2026.132140_bib0015","doi-asserted-by":"crossref","first-page":"26418","DOI":"10.52202\/068431-1916","article-title":"Wukong: A 100 million large-scale chinese cross-modal pre-training benchmark","volume":"35","author":"Gu","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2026.132140_bib0016","first-page":"1","article-title":"Visual global-salient-guided network for remote sensing image-text retrieval","volume":"62","author":"He","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0017","first-page":"1","article-title":"A novel SVM-based decoder for remote sensing image captioning","volume":"60","author":"Hoxha","year":"2021","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0018","first-page":"1","article-title":"Knowledge-aided momentum contrastive learning for remote-sensing image text retrieval","volume":"61","author":"Ji","year":"2023","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0019","series-title":"International conference on machine learning","first-page":"5583","article-title":"ViLT: Vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021"},{"key":"10.1016\/j.eswa.2026.132140_bib0020","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"201","article-title":"Stacked cross attention for image-text matching","author":"Lee","year":"2018"},{"key":"10.1016\/j.eswa.2026.132140_bib0021","doi-asserted-by":"crossref","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., & Zettlemoyer, L. (2019). BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv: 1910.13461,.","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"10.1016\/j.eswa.2026.132140_bib0022","series-title":"Computer vision\u2013ECCV 2020: 16th European conference, glasgow, UK, august 23\u201328, 2020, proceedings, part XXX 16","first-page":"121","article-title":"OSCAR: Object-semantics aligned pre-training for vision-language tasks","author":"Li","year":"2020"},{"key":"10.1016\/j.eswa.2026.132140_bib0023","doi-asserted-by":"crossref","first-page":"94","DOI":"10.1016\/j.inffus.2020.10.008","article-title":"Image retrieval from remote sensing big data: A survey","volume":"67","author":"Li","year":"2021","journal-title":"Information Fusion"},{"issue":"4","key":"10.1016\/j.eswa.2026.132140_bib0024","doi-asserted-by":"crossref","first-page":"2183","DOI":"10.1109\/TGRS.2017.2776321","article-title":"Exploring models and data for remote sensing image caption generation","volume":"56","author":"Lu","year":"2017","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0025","first-page":"1","article-title":"Fusion-based correlation learning model for cross-modal remote sensing image retrieval","volume":"19","author":"Lv","year":"2021","journal-title":"IEEE Geoscience and Remote Sensing Letters"},{"key":"10.1016\/j.eswa.2026.132140_bib0026","first-page":"1","article-title":"Direction-oriented visual\u2013semantic embedding model for remote sensing image\u2013text retrieval","volume":"62","author":"Ma","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"issue":"5","key":"10.1016\/j.eswa.2026.132140_bib0027","doi-asserted-by":"crossref","first-page":"7150","DOI":"10.1109\/TNNLS.2022.3214208","article-title":"Query-adaptive late fusion for hierarchical fine-grained video-text retrieval","volume":"35","author":"Ma","year":"2022","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"issue":"10","key":"10.1016\/j.eswa.2026.132140_bib0028","doi-asserted-by":"crossref","first-page":"5486","DOI":"10.1109\/TCSVT.2023.3257193","article-title":"Using multimodal contrastive knowledge distillation for video-text retrieval","volume":"33","author":"Ma","year":"2023","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132140_bib0029","article-title":"TSGR2: Image-text matching via triple-level scene graph relation reasoning","volume":"187","author":"Ma","year":"2025","journal-title":"Applied Soft Computing"},{"key":"10.1016\/j.eswa.2026.132140_bib0030","doi-asserted-by":"crossref","first-page":"5065","DOI":"10.1109\/TMM.2023.3330091","article-title":"FedSH: Towards privacy-preserving text-based person re-identification","volume":"26","author":"Ma","year":"2023","journal-title":"IEEE Transactions on Multimedia"},{"issue":"1","key":"10.1016\/j.eswa.2026.132140_bib0031","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2022.103119","article-title":"Adaptive multi-feature fusion via cross-entropy normalization for effective image retrieval","volume":"60","author":"Ma","year":"2023","journal-title":"Information Processing & Management"},{"key":"10.1016\/j.eswa.2026.132140_bib0032","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2022.108213","article-title":"Joint-attention feature fusion network and dual-adaptive NMS for object detection","volume":"241","author":"Ma","year":"2022","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.eswa.2026.132140_bib0033","doi-asserted-by":"crossref","first-page":"47","DOI":"10.1016\/j.future.2014.10.029","article-title":"Remote sensing big data computing: Challenges and opportunities","volume":"51","author":"Ma","year":"2015","journal-title":"Future Generation Computer Systems"},{"key":"10.1016\/j.eswa.2026.132140_bib0034","series-title":"Proceedings of the 2023 ACM international conference on multimedia retrieval","first-page":"398","article-title":"Reducing semantic confusion: Scene-aware aggregation network for remote sensing cross-modal retrieval","author":"Pan","year":"2023"},{"issue":"4","key":"10.1016\/j.eswa.2026.132140_bib0035","doi-asserted-by":"crossref","first-page":"12363","DOI":"10.1007\/s11042-023-15798-9","article-title":"SAM: Cross-modal semantic alignments module for image-text retrieval","volume":"83","author":"Park","year":"2024","journal-title":"Multimedia Tools and Applications"},{"key":"10.1016\/j.eswa.2026.132140_bib0036","series-title":"2016 international conference on computer, information and telecommunication systems (cits)","first-page":"1","article-title":"Deep semantic understanding of high resolution remote sensing image","author":"Qu","year":"2016"},{"key":"10.1016\/j.eswa.2026.132140_bib0037","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.eswa.2026.132140_bib0038","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"14974","article-title":"Prompting large language models with answer heuristics for knowledge-based visual question answering","author":"Shao","year":"2023"},{"issue":"6","key":"10.1016\/j.eswa.2026.132140_bib0039","doi-asserted-by":"crossref","first-page":"3623","DOI":"10.1109\/TGRS.2017.2677464","article-title":"Can a machine generate humanlike language descriptions for a remote sensing image?","volume":"55","author":"Shi","year":"2017","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0040","series-title":"European conference on computer vision","first-page":"111","article-title":"Enhancing recipe retrieval with foundation models: A data augmentation perspective","author":"Song","year":"2024"},{"issue":"8","key":"10.1016\/j.eswa.2026.132140_bib0041","doi-asserted-by":"crossref","first-page":"6922","DOI":"10.1109\/TGRS.2020.3031111","article-title":"SD-RSIC: Summarization-driven deep remote sensing image captioning","volume":"59","author":"Sumbul","year":"2020","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0042","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6398","article-title":"Circle loss: A unified perspective of pair similarity optimization","author":"Sun","year":"2020"},{"key":"10.1016\/j.eswa.2026.132140_bib0043","first-page":"1","article-title":"Cross-modal prealigned method with global and local information for remote sensing image and text retrieval","volume":"62","author":"Sun","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0044","first-page":"1","article-title":"Prior-experience-based vision-language model for remote sensing image-text retrieval","volume":"62","author":"Tang","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0045","doi-asserted-by":"crossref","first-page":"405","DOI":"10.1016\/j.inffus.2022.08.032","article-title":"Multispectral and hyperspectral image fusion in remote sensing: A survey","volume":"89","author":"Vivone","year":"2023","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2026.132140_bib0046","first-page":"1","article-title":"Graph-based hierarchical semantic consistency network for remote sensing image-text retrieval","volume":"99","author":"Wang","year":"2025","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0047","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.isprsjprs.2021.11.020","article-title":"Multi-label semantic feature fusion for remote sensing image captioning","volume":"184","author":"Wang","year":"2022","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0048","series-title":"Proceedings of the 27th ACM international conference on multimedia","first-page":"12","article-title":"Matching images and text with multi-modal tensor fusion and re-ranking","author":"Wang","year":"2019"},{"issue":"6","key":"10.1016\/j.eswa.2026.132140_bib0049","doi-asserted-by":"crossref","first-page":"11384","DOI":"10.1109\/TNNLS.2024.3458898","article-title":"Cross-modal remote sensing image\u2013text retrieval via context and uncertainty-aware prompt","volume":"36","author":"Wang","year":"2024","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.eswa.2026.132140_bib0050","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"5764","article-title":"CAMP: Cross-modal adaptive message passing for text-image retrieval","author":"Wang","year":"2019"},{"key":"10.1016\/j.eswa.2026.132140_bib0051","doi-asserted-by":"crossref","first-page":"36","DOI":"10.1016\/j.inffus.2022.09.008","article-title":"Review of pixel-level remote sensing image fusion based on deep learning","volume":"90","author":"Wang","year":"2023","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2026.132140_bib0052","first-page":"1","article-title":"Spatial\u2013channel attention transformer with pseudo regions for remote sensing image-text retrieval","volume":"62","author":"Wu","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0053","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"6162","article-title":"Text-based occluded person re-identification via multi-granularity contrastive consistency learning","volume":"vol. 38","author":"Wu","year":"2024"},{"key":"10.1016\/j.eswa.2026.132140_bib0054","doi-asserted-by":"crossref","first-page":"297","DOI":"10.1016\/j.inffus.2023.03.021","article-title":"From degrade to upgrade: Learning a self-supervised degradation guided adaptive network for blind remote sensing image super-resolution","volume":"96","author":"Xiao","year":"2023","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2026.132140_bib0055","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10714","article-title":"Vid2Seq: Large-scale pretraining of a visual language model for dense video captioning","author":"Yang","year":"2023"},{"key":"10.1016\/j.eswa.2026.132140_bib0056","first-page":"1","article-title":"Remote sensing image-text retrieval with implicit-explicit relation reasoning","volume":"62","author":"Yang","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0057","first-page":"1","article-title":"Transcending fusion: A multi-scale alignment method for remote sensing image-text retrieval","volume":"62","author":"Yang","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"issue":"3","key":"10.1016\/j.eswa.2026.132140_bib0058","doi-asserted-by":"crossref","first-page":"503","DOI":"10.3390\/rs17030503","article-title":"Remote sensing cross-modal text-image retrieval based on attention correction and filtering","volume":"17","author":"Yang","year":"2025","journal-title":"Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0059","first-page":"5753","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","volume":"32","author":"Yang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"10.1016\/j.eswa.2026.132140_bib0060","doi-asserted-by":"crossref","first-page":"688","DOI":"10.1109\/JSTARS.2022.3226325","article-title":"Hypergraph-enhanced textual-visual matching network for cross-modal remote sensing image retrieval via dynamic hypergraph learning","volume":"16","author":"Yao","year":"2022","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0061","unstructured":"Yao, L., Huang, R., Hou, L., Lu, G., Niu, M., Xu, H., Liang, X., Li, Z., Jiang, X., & Xu, C. (2021). Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv: 2111.07783,."},{"key":"10.1016\/j.eswa.2026.132140_bib0062","first-page":"1","article-title":"Parameter-efficient transfer learning for remote sensing image\u2013text retrieval","volume":"61","author":"Yuan","year":"2023","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0063","doi-asserted-by":"crossref","unstructured":"Yuan, Z., Zhang, W., Fu, K., Li, X., Deng, C., Wang, H., & Sun, X. (2022a). Exploring a fine-grained multiscale method for cross-modal remote sensing image retrieval. arXiv preprint arXiv: 2204.09868,.","DOI":"10.1109\/TGRS.2021.3078451"},{"key":"10.1016\/j.eswa.2026.132140_bib0064","first-page":"1","article-title":"A lightweight multi-scale crossmodal text-image retrieval method in remote sensing","volume":"60","author":"Yuan","year":"2021","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0065","first-page":"1","article-title":"Remote sensing cross-modal text-image retrieval based on global and local information","volume":"60","author":"Yuan","year":"2022","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"issue":"8","key":"10.1016\/j.eswa.2026.132140_bib0066","doi-asserted-by":"crossref","first-page":"5625","DOI":"10.1109\/TPAMI.2024.3369699","article-title":"Vision-language models for vision tasks: A survey","volume":"46","author":"Zhang","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132140_bib0067","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"3536","article-title":"Context-aware attention network for image-text retrieval","author":"Zhang","year":"2020"},{"key":"10.1016\/j.eswa.2026.132140_bib0068","first-page":"1","article-title":"Hypersphere-based remote sensing cross-modal text\u2013image retrieval via curriculum learning","volume":"61","author":"Zhang","year":"2023","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"issue":"18","key":"10.1016\/j.eswa.2026.132140_bib0069","doi-asserted-by":"crossref","first-page":"4637","DOI":"10.3390\/rs15184637","article-title":"A fusion encoder with multi-task guidance for cross-modal text\u2013image retrieval in remote sensing","volume":"15","author":"Zhang","year":"2023","journal-title":"Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0070","first-page":"1","article-title":"RS5M and GeoRSCLIP: A large scale vision-language dataset and a large vision-language model for remote sensing","volume":"62","author":"Zhang","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0071","first-page":"1","article-title":"Masking-based cross-modal remote sensing image-text retrieval via dynamic contrastive learning","volume":"62","author":"Zhao","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132140_bib0072","series-title":"European conference on computer vision","first-page":"73","article-title":"DreamLIP: Language-image pre-training with long captions","author":"Zheng","year":"2024"},{"key":"10.1016\/j.eswa.2026.132140_bib0073","first-page":"1","article-title":"SIRS: Multi-task joint learning for remote sensing foreground-entity image-text retrieval","volume":"62","author":"Zhu","year":"2024","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426010535?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426010535?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T16:09:53Z","timestamp":1780934993000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417426010535"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":73,"alternative-id":["S0957417426010535"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132140","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"From insufficient to sufficient: Hierarchical semantic alignment for remote sensing image-text retrieval","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132140","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"132140"}}