{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T02:02:49Z","timestamp":1779933769337,"version":"3.53.1"},"reference-count":74,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171332"],"award-info":[{"award-number":["62171332"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276197"],"award-info":[{"award-number":["62276197"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Fund of the National Key Laboratory of Science and Technology on Remote Sensing Information and imagery Analysis, Beijing Research Institute of Uranium Geology","award":["6142A010301"],"award-info":[{"award-number":["6142A010301"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Geosci. Remote Sensing"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/tgrs.2023.3280546","type":"journal-article","created":{"date-parts":[[2023,5,29]],"date-time":"2023-05-29T17:34:28Z","timestamp":1685381668000},"page":"1-15","source":"Crossref","is-referenced-by-count":54,"title":["Interacting-Enhancing Feature Transformer for Cross-Modal Remote-Sensing Image and Text Retrieval"],"prefix":"10.1109","volume":"61","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1375-0778","authenticated-orcid":false,"given":"Xu","family":"Tang","sequence":"first","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0862-6564","authenticated-orcid":false,"given":"Yijing","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jingjing","family":"Ma","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0379-2042","authenticated-orcid":false,"given":"Xiangrong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9752-9530","authenticated-orcid":false,"given":"Fang","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Systems for High-Dimensional Information of the Ministry of Education, School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3354-9617","authenticated-orcid":false,"given":"Licheng","family":"Jiao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref13","first-page":"1","article-title":"AR2Det: An accurate and real-time rotational one-stage ship detector in remote sensing images","volume":"60","author":"yang","year":"2021","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3169479"},{"key":"ref12","first-page":"1","article-title":"An unsupervised remote sensing change detection method based on multiscale graph convolutional network and metric learning","volume":"60","author":"tang","year":"2021","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3172371"},{"key":"ref15","first-page":"1","article-title":"ABNet: Adaptive balanced network for multiscale object detection in remote sensing imagery","volume":"60","author":"liu","year":"2021","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00564"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3005431"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3168697"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3155665"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3157671"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS46834.2022.9883252"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3224815"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3163706"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3194505"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3131592"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3078451"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.3390\/rs13030516"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746251"},{"key":"ref45","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"li","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst (NeurIPS)"},{"key":"ref48","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAIRE blog"},{"key":"ref47","article-title":"Improving language understanding by generative pre-training","author":"radford","year":"2018"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2967597"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"ref44","first-page":"1","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"lu","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148470"},{"key":"ref49","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"brown","year":"2020","journal-title":"Proc Adv Neur Inf Process Sys"},{"key":"ref8","first-page":"1","article-title":"A lightweight multi-scale crossmodal text-image retrieval method in remote sensing","volume":"60","author":"yuan","year":"2021","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref7","first-page":"1","article-title":"Meta-hashing for remote sensing image retrieval","volume":"60","author":"tang","year":"2021","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3070872"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.10.008"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2021.01.020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3007533"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.3390\/rs10081243"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00645"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00601"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1049\/ipr2.12176"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.209"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0274-8"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.01.008"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00750"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.3005403"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.3390\/app9102110"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2985716"},{"key":"ref71","article-title":"Generating long sequences with sparse transformers","author":"child","year":"2019","journal-title":"arXiv 1904 10509"},{"key":"ref70","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref73","first-page":"4271","article-title":"Funnel-Transformer: Filtering out sequential redundancy for efficient language processing","volume":"33","author":"dai","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00827"},{"key":"ref24","first-page":"1","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref68","article-title":"Google&#x2019;s neural machine translation system: Bridging the gap between human and machine translation","author":"wu","year":"2016","journal-title":"arXiv 1609 08144"},{"key":"ref23","first-page":"5583","article-title":"ViLT: Vision-and-language transformer without convolution or region supervision","author":"kim","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref26","article-title":"An image is worth 16&#x00D7;16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"arXiv 2010 11929"},{"key":"ref25","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"arXiv 1810 04805"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"ref20","article-title":"VSE++: Improving visual-semantic embeddings with hard negatives","author":"faghri","year":"2017","journal-title":"arXiv 1707 05612"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2014.2357078"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref66","first-page":"1","article-title":"Im2Text: Describing images using 1 million captioned photographs","volume":"24","author":"ordonez","year":"2011","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref21","first-page":"35946","article-title":"Masked autoencoders as spatiotemporal learners","volume":"35","author":"feichtenhofer","year":"2022","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00170"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/1869790.1869829"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CITS.2016.7546397"}],"container-title":["IEEE Transactions on Geoscience and Remote Sensing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/36\/10006360\/10138021.pdf?arnumber=10138021","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,3]],"date-time":"2023-07-03T18:11:21Z","timestamp":1688407881000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10138021\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/tgrs.2023.3280546","relation":{},"ISSN":["0196-2892","1558-0644"],"issn-type":[{"value":"0196-2892","type":"print"},{"value":"1558-0644","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]}}}