{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T06:38:22Z","timestamp":1759991902163,"version":"3.44.0"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T00:00:00Z","timestamp":1747872000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T00:00:00Z","timestamp":1747872000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["41971365"],"award-info":[{"award-number":["41971365"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Earth Sci Inform"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s12145-025-01917-7","type":"journal-article","created":{"date-parts":[[2025,5,21]],"date-time":"2025-05-21T23:36:58Z","timestamp":1747870618000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A question-type guided and progressive self-attention network for remote sensing visual question answering"],"prefix":"10.1007","volume":"18","author":[{"given":"Jiangfan","family":"Feng","sequence":"first","affiliation":[]},{"given":"Hui","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shaokang","family":"Dong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,22]]},"reference":[{"issue":"2","key":"1917_CR1","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1007\/s41651-023-00148-y","volume":"7","author":"MT Amare","year":"2023","unstructured":"Amare MT, Demissie ST, Beza SA, Erena SH (2023) Land cover change detection and prediction in the fafan catchment of ethiopia. J Geovisualization Spat Anal 7(2):19","journal-title":"J Geovisualization Spat Anal"},{"key":"1917_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102229","volume":"74","author":"S Ansith","year":"2022","unstructured":"Ansith S, Bini A (2022) Land use classification of high resolution remote sensing images using an encoder based modified gan architecture. Displays 74:102229","journal-title":"Displays"},{"key":"1917_CR3","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick CL, Parikh D (2015) VQA: Visual Question Answering. In: Proceedings of the IEEE international conference on computer vision, pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"issue":"9","key":"1917_CR4","doi-asserted-by":"publisher","first-page":"1477","DOI":"10.3390\/rs16091477","volume":"16","author":"Y Bazi","year":"2024","unstructured":"Bazi Y, Bashmal L, Al Rahhal MM, Ricci R, Melgani F (2024) Rs-llava: A large vision-language model for joint captioning and question answering in remote sensing imagery. Remote Sens 16(9):1477","journal-title":"Remote Sens"},{"key":"1917_CR5","doi-asserted-by":"publisher","DOI":"10.1016\/j.jhydrol.2022.128817","volume":"616","author":"S Chen","year":"2023","unstructured":"Chen S, Fu YH, Wu Z, Hao F, Hao Z, Guo Y, Geng X, Li X, Zhang X, Tang J et al (2023) Informing the swat model with remote sensing detected vegetation phenology for improved modeling of ecohydrological processes. J Hydrol 616:128817","journal-title":"J Hydrol"},{"key":"1917_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.110706","volume":"275","author":"C Chen","year":"2023","unstructured":"Chen C, Han D, Shen X (2023) Clvin: Complete language-vision interaction network for visual question answering. Knowl-Based Syst 275:110706","journal-title":"Knowl-Based Syst"},{"issue":"11","key":"1917_CR7","doi-asserted-by":"publisher","first-page":"8125","DOI":"10.1007\/s00521-022-08092-6","volume":"35","author":"C Ding","year":"2023","unstructured":"Ding C, Wang M, Zhou Z, Huang T, Wang X, Li J (2023) Siamese transformer network-based similarity metric learning for cross-source remote sensing image retrieval. Neural Comput Appl 35(11):8125\u20138142","journal-title":"Neural Comput Appl"},{"key":"1917_CR8","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"1917_CR9","doi-asserted-by":"crossref","unstructured":"Feng J, Tang E, Zeng M, Gu Z, Kou P, Zheng W (2023) Improving visual question answering for remote sensing via alternate-guided attention and combined loss. Int J Appl Earth Obs Geoinformation. Publisher: Elsevier, 122:103427","DOI":"10.1016\/j.jag.2023.103427"},{"key":"1917_CR10","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102329","volume":"75","author":"J Feng","year":"2022","unstructured":"Feng J, Liu R (2022) Lrb-net: Improving vqa via division of labor strategy and multimodal classifiers. Displays 75:102329","journal-title":"Displays"},{"key":"1917_CR11","volume":"126","author":"J Feng","year":"2024","unstructured":"Feng J, Wang H (2024) A multi-scale contextual attention network for remote sensing visual question answering. Int J Appl Earth Obs Geoinf 126:103641","journal-title":"Int J Appl Earth Obs Geoinf"},{"key":"1917_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.102822","volume":"87","author":"T Fountoukidou","year":"2023","unstructured":"Fountoukidou T, Sznitman R (2023) A reinforcement learning approach for vqa validation: An application to diabetic macular edema grading. Med Image Anal 87:102822","journal-title":"Med Image Anal"},{"key":"1917_CR13","doi-asserted-by":"crossref","unstructured":"Gan Z, Cheng Y, Kholy AE, Li L, Liu J, Gao J (2019) Multi-step reasoning via recurrent dual attention for visual dialog. arXiv preprint arXiv:1902.00579","DOI":"10.18653\/v1\/P19-1648"},{"key":"1917_CR14","doi-asserted-by":"crossref","unstructured":"Goyal Y, Khot T, Summers-Stay D, Batra D, Parikh D (2017) Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6904\u20136913","DOI":"10.1109\/CVPR.2017.670"},{"key":"1917_CR15","doi-asserted-by":"crossref","unstructured":"Guo Y, Huang Y (2022) Capturing global and local information in remote sensing visual question answering. In: IGARSS 2022-2022 IEEE International geoscience and remote sensing symposium. IEEE, pp 6340\u20136343","DOI":"10.1109\/IGARSS46834.2022.9884926"},{"key":"1917_CR16","unstructured":"Ilievski I, Feng J (2017) Multimodal learning and reasoning for visual question answering. Adv Neural Inform Process Syst 30"},{"key":"1917_CR17","doi-asserted-by":"crossref","unstructured":"Ishmam MF, Shovon MSH, Mridha M, Dey N (2024) From image to language: A critical analysis of visual question answering (vqa) approaches, challenges, and opportunities. Inform Fusion p 102270","DOI":"10.1016\/j.inffus.2024.102270"},{"key":"1917_CR18","doi-asserted-by":"crossref","unstructured":"Kuckreja K, Danish MS, Naseer M, Das A, Khan S, Khan FS (2024) Geochat: Grounded large vision-language model for remote sensing. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 27831\u201327840","DOI":"10.1109\/CVPR52733.2024.02629"},{"key":"1917_CR19","unstructured":"Leng Z, Tan M, Liu C, Cubuk ED, Shi X, Cheng S, Anguelov D (2022) Polyloss: A polynomial expansion perspective of classification loss functions. arXiv preprint arXiv:2204.12511"},{"key":"1917_CR20","doi-asserted-by":"crossref","unstructured":"Li Y, Ma Y, Liu G, Wei Q, Chen Y, Shang R, Jiao L (2024) Enhancing remote sensing visual question answering: A mask-based dual-stream feature mutual attention network. IEEE Geosci Remote Sens Lett","DOI":"10.1109\/LGRS.2024.3389042"},{"key":"1917_CR21","doi-asserted-by":"crossref","unstructured":"Li M, Moens M-F (2022) Dynamic key-value memory enhanced multi-step graph reasoning for knowledge-based visual question answering. In: Proceedings of the AAAI conference on artificial intelligence, vol 36, pp 10983\u201310992","DOI":"10.1609\/aaai.v36i10.21346"},{"issue":"3","key":"1917_CR22","doi-asserted-by":"publisher","first-page":"464","DOI":"10.1007\/s11442-023-2092-z","volume":"33","author":"C Li","year":"2023","unstructured":"Li C, Zhuang D, He J, Wen K (2023) Spatiotemporal variations in remote sensing phenology of vegetation and its responses to temperature change of boreal forest in tundra-taiga transitional zone in the eastern siberia. J Geog Sci 33(3):464\u2013482","journal-title":"J Geog Sci"},{"key":"1917_CR23","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Goyal P, Girshick R, He K, Doll\u00e1r P (2017) Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision, pp 2980\u20132988","DOI":"10.1109\/ICCV.2017.324"},{"key":"1917_CR24","doi-asserted-by":"crossref","unstructured":"Ling S, You S, Bao B-K (2024) Two-stage reasoning network with modality decomposition for text vqa. In: International conference on multimedia modeling. Springer, pp 127\u2013140","DOI":"10.1007\/978-3-031-53311-2_10"},{"key":"1917_CR25","doi-asserted-by":"crossref","unstructured":"Lobry S, Marcos D, Murray J, Tuia D (2020) RSVQA: Visual question answering for remote sensing data. IEEE Trans Geosci Remote Sens. IEEE, 58(12):8555\u20138566","DOI":"10.1109\/TGRS.2020.2988782"},{"key":"1917_CR26","doi-asserted-by":"crossref","unstructured":"Luo H, Guo Z, Wu Z, Teng F, Li T (2024) Transformer-based vision-language alignment for robot navigation and question answering. Inform Fusion 108:102351","DOI":"10.1016\/j.inffus.2024.102351"},{"key":"1917_CR27","doi-asserted-by":"crossref","unstructured":"Lv Z, Zhong P, Wang W, You Z, Falco N (2023) Multi-scale attention network guided with change gradient image for land cover change detection using remote sensing images. IEEE Geosci Remote Sens Lett","DOI":"10.1109\/LGRS.2023.3267879"},{"key":"1917_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.rse.2023.113924","volume":"301","author":"Y Ma","year":"2024","unstructured":"Ma Y, Chen S, Ermon S, Lobell DB (2024) Transfer learning in environmental remote sensing. Remote Sens Environ 301:113924","journal-title":"Remote Sens Environ"},{"issue":"2","key":"1917_CR29","doi-asserted-by":"publisher","first-page":"1745","DOI":"10.1007\/s12145-023-01208-z","volume":"17","author":"SN MohanRajan","year":"2024","unstructured":"MohanRajan SN, Loganathan A, Manoharan P, Alenizi FA (2024) Fuzzy swin transformer for land use\/land cover change detection using liss-iii satellite data. Earth Sci Inf 17(2):1745\u20131764","journal-title":"Earth Sci Inf"},{"key":"1917_CR30","doi-asserted-by":"crossref","unstructured":"Muhtar D, Li Z, Gu F, Zhang X, Xiao P (2024) Lhrs-bot: Empowering remote sensing with vgi-enhanced large multimodal language model. arXiv preprint arXiv:2402.02544","DOI":"10.1007\/978-3-031-72904-1_26"},{"key":"1917_CR31","doi-asserted-by":"crossref","unstructured":"Munyati C (2024) Relating urban land surface temperature to vegetation leafing using thermal imagery and vegetation indices. Earth Sci Inform pp 1\u201318","DOI":"10.1007\/s12145-024-01443-y"},{"key":"1917_CR32","unstructured":"Pang C, Wu J, Li J, Liu Y, Sun J, Li W, Weng X, Wang S, Feng L, Xia G-S et al (2024) H2rsvlm: Towards helpful and honest remote sensing large vision language model. arXiv preprint arXiv:2403.20213"},{"key":"1917_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2023.3276293","volume":"61","author":"A Sarkar","year":"2023","unstructured":"Sarkar A, Chowdhury T, Murphy RR, Gangopadhyay A, Rahnemoonfar M (2023) Sam-vqa: Supervised attention-based visual question answering model for post-disaster damage assessment on remote sensing imagery. IEEE Trans Geosci Remote Sens 61:1\u201316","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"1917_CR34","doi-asserted-by":"crossref","unstructured":"Seenivasan L, Islam M, Kannan G, Ren H (2023) Surgicalgpt: end-to-end language-vision gpt for visual question answering in surgery. In: International conference on medical image computing and computer-assisted intervention. Springer, pp 281\u2013290","DOI":"10.1007\/978-3-031-43996-4_27"},{"key":"1917_CR35","doi-asserted-by":"publisher","first-page":"545","DOI":"10.1007\/s12145-018-0347-5","volume":"11","author":"A Shamsoddini","year":"2018","unstructured":"Shamsoddini A, Raval S (2018) Mapping red edge-based vegetation health indicators using landsat tm data for australian native vegetation cover. Earth Sci Inf 11:545\u2013552","journal-title":"Earth Sci Inf"},{"key":"1917_CR36","doi-asserted-by":"crossref","unstructured":"Suleymanov A, Shagaliev R, Belan L, Bogdan E, Tuktarova I, Nagaev E, Muftakhina D (2024) Forest growing stock volume mapping with accompanying uncertainty in heterogeneous landscapes using remote sensing data. Earth Sci Inform pp 1\u201311","DOI":"10.1007\/s12145-024-01457-6"},{"key":"1917_CR37","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inform Process Syst 30"},{"key":"1917_CR38","doi-asserted-by":"crossref","unstructured":"Wang Y, Ghamisi P (2024) Rsadapter: Adapting multimodal models for remote sensing visual question answering. IEEE Trans Geosci Remote Sens","DOI":"10.1109\/TGRS.2024.3413174"},{"key":"1917_CR39","doi-asserted-by":"crossref","unstructured":"Wang Z, Prabha R, Huang T, Wu J, Rajagopal R (2024) Skyscript: A large and semantically diverse vision-language dataset for remote sensing. In: Proceedings of the AAAI conference on artificial intelligence, vol 38, pp 5805\u20135813","DOI":"10.1609\/aaai.v38i6.28393"},{"key":"1917_CR40","doi-asserted-by":"crossref","unstructured":"Wang J, Zheng Z, Chen Z, Ma A, Zhong Y (2024) Earthvqa: Towards queryable earth via relational reasoning-based remote sensing visual question answering. In: Proceedings of the AAAI conference on artificial intelligence, vol 38, pp 5481\u20135489","DOI":"10.1609\/aaai.v38i6.28357"},{"key":"1917_CR41","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1016\/j.isprsjprs.2024.05.001","volume":"212","author":"J Wang","year":"2024","unstructured":"Wang J, Ma A, Chen Z, Zheng Z, Wan Y, Zhang L, Zhong Y (2024) Earthvqanet: Multi-task visual question answering for remote sensing image understanding. ISPRS J Photogramm Remote Sens 212:422\u2013439","journal-title":"ISPRS J Photogramm Remote Sens"},{"key":"1917_CR42","doi-asserted-by":"crossref","unstructured":"Xi Y, Zhang Y, Ding S, Wan S (2020) Visual question answering model based on visual relationship detection. Signal Process: Image Commun 80:115648","DOI":"10.1016\/j.image.2019.115648"},{"key":"1917_CR43","doi-asserted-by":"crossref","unstructured":"Yang Z, He X, Gao J, Deng L, Smola A (2016) Stacked attention networks for image question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 21\u201329","DOI":"10.1109\/CVPR.2016.10"},{"key":"1917_CR44","doi-asserted-by":"crossref","unstructured":"Ying X, Wang Q, Li X, Yu M, Jiang H, Gao J, Liu Z, Yu R (2019) Multi-attention object detection model in remote sensing images based on multi-scale. IEEE Access, IEEE, 7:94508\u201394519","DOI":"10.1109\/ACCESS.2019.2928522"},{"key":"1917_CR45","doi-asserted-by":"crossref","unstructured":"Yuan Z, Mou L, Wang Q, Zhu XX (2022) From easy to hard: Learning language-guided curriculum for visual question answering on remote sensing data. IEEE Trans Geosci Remote Sens. IEEE, 60:1\u201311","DOI":"10.1109\/TGRS.2022.3173811"},{"key":"1917_CR46","doi-asserted-by":"crossref","unstructured":"Yuan Z, Mou L, Zhu XX (2023) Multilingual augmentation for robust visual question answering in remote sensing images. In: 2023 Joint Urban remote sensing event (JURSE). IEEE, pp 1\u20134","DOI":"10.1109\/JURSE57346.2023.10144189"},{"key":"1917_CR47","doi-asserted-by":"publisher","DOI":"10.1016\/j.scitotenv.2022.159741","volume":"858","author":"S Yuan","year":"2023","unstructured":"Yuan S, Li Y, Bao F, Xu H, Yang Y, Yan Q, Zhong S, Yin H, Xu J, Huang Z et al (2023) Marine environmental monitoring with unmanned vehicle platforms: Present applications and future prospects. Sci Total Environ 858:159741","journal-title":"Sci Total Environ"},{"key":"1917_CR48","doi-asserted-by":"crossref","unstructured":"Zhang M, Chen F, Li B (2023) Multi-step question-driven visual question answering for remote sensing. IEEE Trans Geosci Remote Sens","DOI":"10.1109\/TGRS.2023.3312479"},{"key":"1917_CR49","doi-asserted-by":"crossref","unstructured":"Zhang Z, Jiao L, Li L, Liu X, Chen P, Liu F, Li Y, Guo Z (2023) A spatial hierarchical reasoning network for remote sensing visual question answering. IEEE Trans Geosci Remote Sens. IEEE, 61:1\u201315","DOI":"10.1109\/TGRS.2023.3237606"},{"key":"1917_CR50","doi-asserted-by":"crossref","unstructured":"Zhang H, Wu W, Zhang M (2022) Efficient multi-step reasoning attention network for visual question answering. In: Thirteenth international conference on graphics and image processing (ICGIP 2021). SPIE, vol 12083, pp 560\u2013566","DOI":"10.1117\/12.2623218"},{"key":"1917_CR51","doi-asserted-by":"crossref","unstructured":"Zhang Y, Zheng X, Lu X (2023) Remote sensing image retrieval by deep attention hashing with distance-adaptive ranking. IEEE J Sel Top Appl Earth Obs Remote Sens","DOI":"10.1109\/JSTARS.2023.3271303"},{"key":"1917_CR52","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1016\/j.inffus.2019.03.005","volume":"52","author":"D Zhang","year":"2019","unstructured":"Zhang D, Cao R, Wu S (2019) Information fusion in visual question answering: A survey. Inform Fusion 52:268\u2013280","journal-title":"Inform Fusion"},{"key":"1917_CR53","volume":"127","author":"K Zhao","year":"2024","unstructured":"Zhao K, Xiong W (2024) Exploring region features in remote sensing image captioning. Int J Appl Earth Obs Geoinf 127:103672","journal-title":"Int J Appl Earth Obs Geoinf"}],"container-title":["Earth Science Informatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12145-025-01917-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s12145-025-01917-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12145-025-01917-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T15:16:28Z","timestamp":1757171788000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s12145-025-01917-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,22]]},"references-count":53,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1917"],"URL":"https:\/\/doi.org\/10.1007\/s12145-025-01917-7","relation":{},"ISSN":["1865-0473","1865-0481"],"issn-type":[{"type":"print","value":"1865-0473"},{"type":"electronic","value":"1865-0481"}],"subject":[],"published":{"date-parts":[[2025,5,22]]},"assertion":[{"value":"17 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"409"}}