{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T09:46:19Z","timestamp":1773913579318,"version":"3.50.1"},"reference-count":68,"publisher":"Springer Science and Business Media LLC","issue":"35-36","license":[{"start":{"date-parts":[[2020,7,17]],"date-time":"2020-07-17T00:00:00Z","timestamp":1594944000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,7,17]],"date-time":"2020-07-17T00:00:00Z","timestamp":1594944000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2020,9]]},"DOI":"10.1007\/s11042-020-09294-7","type":"journal-article","created":{"date-parts":[[2020,7,17]],"date-time":"2020-07-17T14:34:43Z","timestamp":1594996483000},"page":"26661-26682","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":50,"title":["Remote sensing image caption generation via transformer and reinforcement learning"],"prefix":"10.1007","volume":"79","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9825-7853","authenticated-orcid":false,"given":"Xiangqing","family":"Shen","sequence":"first","affiliation":[]},{"given":"Bing","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Jiaqi","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,7,17]]},"reference":[{"key":"9294_CR1","doi-asserted-by":"publisher","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF Conference on computer vision and pattern recognition. IEEE https:\/\/doi.org\/10.1109\/cvpr.2018.00636","DOI":"10.1109\/cvpr.2018.00636"},{"key":"9294_CR2","unstructured":"Bahdanau D, Cho K, Bengio Y (2015) Neural machine translation by jointly learning to align and translate. In: Bengio Y, LeCun Y (eds) 3rd International conference on learning representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, conference track proceedings"},{"key":"9294_CR3","unstructured":"Banerjee S, Lavie A (2005) METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization; association for computational linguistics: Ann Arbor, Michigan, pp 65\u201372"},{"issue":"Nips 2015","key":"9294_CR4","first-page":"28","volume":"28","author":"S Bengio","year":"2015","unstructured":"Bengio S, Vinyals O, Jaitly N, Shazeer N (2015) Scheduled sampling for sequence prediction with recurrent neural networks. AdvX Neural Inf Process Syst 28(Nips 2015):28","journal-title":"AdvX Neural Inf Process Syst"},{"key":"9294_CR5","unstructured":"Chen K, Zhou Z, Guo J, Zhang D, Sun X (2013) Semantic scene understanding oriented high resolution remote sensing image change information analysis. In: Proceedings of the annual conference on high resolution earth observation, Beijing, China, pp 1\u201312"},{"key":"9294_CR6","doi-asserted-by":"crossref","unstructured":"Chen X, Zitnick CL (2015) Mind\u2019s eye: a recurrent visual representation for image caption generation. In: 2015 IEEE conference on computer vision and pattern recognition (Cvpr), pp 2422\u20132431","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"9294_CR7","doi-asserted-by":"publisher","first-page":"1865","DOI":"10.1109\/JPROC.2017.2675998","volume":"105","author":"G Cheng","year":"2017","unstructured":"Cheng G, Han J, Lu X (2017) Remote sensing image scene classification: benchmark and state of the art. Proc IEEE 105:1865\u20131883. https:\/\/doi.org\/10.1109\/jproc.2017.2675998","journal-title":"Proc IEEE"},{"key":"9294_CR8","doi-asserted-by":"publisher","first-page":"2811","DOI":"10.1109\/TGRS.2017.2783902","volume":"56","author":"G Cheng","year":"2018","unstructured":"Cheng G, Yang CY, Yao XW, Guo L, Han JW (2018) When deep learning meets metric learning: remote sensing image scene classification via learning discriminative CNNs. IEEE Trans Geosci Remote Sens 56:2811\u20132821. https:\/\/doi.org\/10.1109\/Tgrs.2017.2783902","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9294_CR9","doi-asserted-by":"publisher","unstructured":"Das A, Kottur S, Gupta K, Singh A, Yadav D, Moura J, Parikh D (2017) Visual Dialog. 1080\u20131089. https:\/\/doi.org\/10.1109\/CVPR.2017.121","DOI":"10.1109\/CVPR.2017.121"},{"key":"9294_CR10","doi-asserted-by":"publisher","unstructured":"Dong L, S M, Shan J, Liu B, Yu Y, Yan T (2019) Computation offloading for mobile-edge computing with multi-user. 841\u2013850. https:\/\/doi.org\/10.1109\/ICDCS.2019.00088","DOI":"10.1109\/ICDCS.2019.00088"},{"key":"9294_CR11","doi-asserted-by":"publisher","unstructured":"Farhadi A, Hejrati M, Sadeghi MA, Young P, Rashtchian C, Hockenmaier J, Forsyth D (2010) Every picture tells a story: generating sentences from images. In: Computer vision \u2013 ECCV 2010. https:\/\/doi.org\/10.1007\/978-3-642-15561-1_2. Springer, Berlin, pp 15\u201329","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"9294_CR12","doi-asserted-by":"publisher","unstructured":"Feng Q, Wu Y, Fan H, Yan C, Xu M, Yang Y (2020) Cascaded revision network for novel object captioning. IEEE Trans. Circuits Syst. Video Technol 1\u20131. https:\/\/doi.org\/10.1109\/tcsvt.2020.2965966","DOI":"10.1109\/tcsvt.2020.2965966"},{"key":"9294_CR13","doi-asserted-by":"crossref","unstructured":"Gerber R, Nagel NH (1996) Knowledge representation for the generation of quantified natural language descriptions of vehicle traffic in image sequences. In: Proceedings of 3rd IEEE international conference on image processing, vol 2. IEEE, pp 805\u2013808","DOI":"10.1109\/ICIP.1996.561027"},{"key":"9294_CR14","doi-asserted-by":"publisher","unstructured":"Gong Y, Wang L, Hodosh M, Hockenmaier J, Lazebnik S (2014) Improving image-sentence embeddings using large weakly annotated photo collections. In: Computer Vision \u2013 ECCV 2014. https:\/\/doi.org\/10.1007\/978-3-319-10593-2_35. Springer International Publishing, Cham, pp 529\u2013545","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"9294_CR15","doi-asserted-by":"crossref","unstructured":"Guo J, Sun Z, Tang H, Jia X, Wang S, Yan X, Ye G, Wu G (2016) Hybrid optimization algorithm of particle swarm optimization and cuckoo search for preventive maintenance period optimization. Discret Dyn Nat Soc","DOI":"10.1155\/2016\/1516271"},{"key":"9294_CR16","doi-asserted-by":"publisher","unstructured":"Han XB, Zhong YF, Zhang LP (2017) An efficient and robust integrated geospatial object detection framework for high spatial resolution remote sensing imagery. Remote Sens, 9. https:\/\/doi.org\/10.3390\/rs9070666","DOI":"10.3390\/rs9070666"},{"key":"9294_CR17","doi-asserted-by":"crossref","unstructured":"He KM, Zhang XY, Ren SQ, Sun J (2016) Deep residual learning for image recognition. In: 2016 Ieee conference on computer vision and pattern recognition (Cvpr), pp 770\u2013778, https:\/\/doi.org\/10.1109\/ Cvpr.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"9294_CR18","unstructured":"Hinton GE, Srivastava N, Krizhevsky A, Sutskever I, Salakhutdinov R Improving neural networks by preventing co-adaptation of feature detectors. arXiv:1207.0580"},{"key":"9294_CR19","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899. https:\/\/doi.org\/10.1613\/jair.3994","journal-title":"J Artif Intell Res"},{"issue":"Nips 2014","key":"9294_CR20","first-page":"27","volume":"27","author":"A Karpathy","year":"2014","unstructured":"Karpathy A, Joulin A, Li FF (2014) Deep fragment embeddings for bidirectional image sentence mapping. Adv Neural Inf Process Syst 27(Nips 2014):27","journal-title":"Adv Neural Inf Process Syst"},{"key":"9294_CR21","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: Bengio Y, LeCun Y (eds) 3rd International conference on learning representations, ICLR 2015, San Diego, CA, USA May 7-9, 2015, conference track proceedings"},{"key":"9294_CR22","doi-asserted-by":"publisher","unstructured":"Kulkarni G, Premraj V, Dhar S, Li S, Choi Y, Berg AC, Berg TL (2011) Baby talk: understanding and generating simple image descriptions. CVPR 2011 IEEE, https:\/\/doi.org\/10.1109\/cvpr.2011.5995466","DOI":"10.1109\/cvpr.2011.5995466"},{"key":"9294_CR23","first-page":"58","volume":"7","author":"H Kundra","year":"2015","unstructured":"Kundra H, Sadawarti H (2015) Hybrid algorithm of cuckoo search and particle swarm optimization for natural terrain feature extraction. Res J Inf Technol 7:58\u201369","journal-title":"Res J Inf Technol"},{"key":"9294_CR24","volume-title":"Target detection method of high resolution remote sensing image based on semantic model","author":"Y Li","year":"2012","unstructured":"Li Y (2012) Target detection method of high resolution remote sensing image based on semantic model. Graduate University of Chinese Academy of Sciences, Beijing"},{"key":"9294_CR25","unstructured":"Li S, Kulkarni G, Berg TL, Berg AC, Choi Y (2011) Composing simple image descriptions using web-scale n-grams. In: Proceedings of the fifteenth conference on computational natural language learning; association for computational linguistics: Stroudsburg, PA, USA, CoNLL \u201911, pp 220\u2013228"},{"key":"9294_CR26","unstructured":"Lin CY (2004) ROUGE: A package for automatic evaluation of summaries. text summarization branches out; association for computational linguistics: Barcelona, Spain, 74\u201381"},{"key":"9294_CR27","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1042\/BSR20080061","volume":"34","author":"T Liu","year":"2009","unstructured":"Liu T., Li P., Zhang L., Chen X. (2009) A remote sensing image retrieval model based on semantic mining. Geomatics Inf Sci Wuhan Univ 34:684\u2013687. https:\/\/doi.org\/10.1042\/BSR20080061","journal-title":"Geomatics Inf Sci Wuhan Univ"},{"key":"9294_CR28","doi-asserted-by":"publisher","first-page":"2183","DOI":"10.1109\/TGRS.2017.2776321","volume":"56","author":"XX Lu","year":"2018","unstructured":"Lu XX, Wang BQ, Zheng XT, Li XL (2018) Exploring models and data for remote sensing image caption generation. IEEE Trans Geosci Remote Sens 56:2183\u20132195. https:\/\/doi.org\/10.1109\/Tgrs.2017.2776321","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9294_CR29","doi-asserted-by":"publisher","unstructured":"Lu JS, Xiong CM, Parikh D, Socher R (2017) Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: 30th Ieee conference on computer vision and pattern recognition (Cvpr 2017), pp 3242\u20133250. https:\/\/doi.org\/10.1109\/Cvpr.2017.345","DOI":"10.1109\/Cvpr.2017.345"},{"key":"9294_CR30","doi-asserted-by":"publisher","first-page":"5148","DOI":"10.1109\/TGRS.2017.2702596","volume":"55","author":"XQ Lu","year":"2017","unstructured":"Lu XQ, Zheng XT, Yuan Y (2017) Remote sensing scene classification by unsupervised representation learning. IEEE Trans Geosci Remote Sens 55:5148\u20135157. https:\/\/doi.org\/10.1109\/Tgrs.2017.2702596","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9294_CR31","doi-asserted-by":"publisher","first-page":"645","DOI":"10.1109\/TGRS.2016.2612821","volume":"55","author":"E Maggiori","year":"2017","unstructured":"Maggiori E, Tarabalka Y, Charpiat G, Alliez P (2017) Convolutional neural networks for large-scale remote-sensing image classification. IEEE Trans Geosci Remote Sens 55:645\u2013657. https:\/\/doi.org\/10.1109\/Tgrs.2016.2612821","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9294_CR32","unstructured":"Mao J, Xu W, Yang Y, Wang J, Yuille AL (2015) Deep captioning with multimodal recurrent neural networks (m-RNN). In: Bengio Y, LeCun Y (eds) 3rd International conference on learning representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, conference track proceedings"},{"key":"9294_CR33","doi-asserted-by":"crossref","unstructured":"Nie GY, Cheng MM, Liu Y, Liang Z, Fan DP, Liu Y, Wang Y (2019) Multi-level context ultra-aggregation for stereo matching. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3283\u20133291","DOI":"10.1109\/CVPR.2019.00340"},{"key":"9294_CR34","doi-asserted-by":"publisher","first-page":"369","DOI":"10.1016\/j.apm.2019.03.016","volume":"72","author":"T Pant","year":"2019","unstructured":"Pant T, Han C, Wang H (2019) Examination of errors of table integration in flamelet\/progress variable modeling of a turbulent non-premixed jet flame. Appl Math Model 72:369\u2013384","journal-title":"Appl Math Model"},{"key":"9294_CR35","doi-asserted-by":"publisher","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJBLEU (2001) Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics. https:\/\/doi.org\/10.3115\/1073083.1073135https:\/\/doi.org\/10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135 10.3115\/1073083.1073135"},{"key":"9294_CR36","doi-asserted-by":"crossref","unstructured":"Qu B, Li XL, Tao DC, Lu XQ (2016) Deep semantic understanding of high resolution remote sensing image. In: 2016 International conference on computer, information and telecommunication systems (Cits), pp 124\u2013128","DOI":"10.1109\/CITS.2016.7546397"},{"key":"9294_CR37","doi-asserted-by":"publisher","unstructured":"R JA, Raimond K (2015) A review on availability of remote sensing data. IEEE Technological innovation in ICT for agriculture and rural development (TIAR). IEEE, 2015. https:\/\/doi.org\/10.1109\/tiar.2015.7358548https:\/\/doi.org\/10.1109\/tiar.2015.7358548","DOI":"10.1109\/tiar.2015.7358548 10.1109\/tiar.2015.7358548"},{"key":"9294_CR38","doi-asserted-by":"crossref","unstructured":"Rahaman KR, Hassan QK (2016) Application of remote sensing to quantify local warming trends: a review. In: 2016 5th International conference on informatics, electronics and vision (Iciev), pp 256\u2013261","DOI":"10.1109\/ICIEV.2016.7760006"},{"key":"9294_CR39","unstructured":"Ranzato M, Chopra S, Auli M, Zaremba W (2016) Sequence level training with recurrent neural networks. In: Bengio Y, LeCun Y (eds) 4th International conference on learning representations, ICLR 2016, San Juan, Puerto Rico May 2-4, 2016, conference track proceedings"},{"key":"9294_CR40","doi-asserted-by":"publisher","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: 30th IEEE conference on computer vision and pattern recognition (Cvpr 2017), pp 1179\u20131195. https:\/\/doi.org\/10.1109\/Cvpr.2017.131","DOI":"10.1109\/Cvpr.2017.131"},{"key":"9294_CR41","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H, Krause J, Satheesh S, Ma S, Huang ZH, Karpathy A, Khosla A, Bernstein M, Berg AC, Fei-Fei L (2015) Imagenet Large Scale Visual Recognition Challenge. Int J Comput Vis 115:211\u2013252. https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int J Comput Vis"},{"key":"9294_CR42","doi-asserted-by":"publisher","first-page":"3623","DOI":"10.1109\/TGRS.2017.2677464","volume":"55","author":"ZW Shi","year":"2017","unstructured":"Shi ZW, Zou ZX (2017) Can a machine generate humanlike language descriptions for a remote sensing image?. IEEE Trans Geosci Remote Sens 55:3623\u20133634. https:\/\/doi.org\/10.1109\/Tgrs.2017.2677464","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9294_CR43","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1162\/089892904322984526","volume":"16","author":"MW Spratling","year":"2004","unstructured":"Spratling MW, Johnson MH (2004) A feedback model of visual attention. J Cogn Neurosci 16:219\u2013237. https:\/\/doi.org\/10.1162\/089892904322984526","journal-title":"J Cogn Neurosci"},{"key":"9294_CR44","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton G, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15:1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"9294_CR45","doi-asserted-by":"crossref","unstructured":"Sun C, Gan C, Nevatia R (2015) Automatic concept discovery from parallel text and visual corpora. In: 2015 Ieee international conference on computer vision (Iccv). https:\/\/doi.org\/10.1109\/Iccv.2015.298, pp 2596\u20132604","DOI":"10.1109\/ICCV.2015.298"},{"key":"9294_CR46","volume-title":"Reinforcement learning: an introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton RS, Barto AG (2018) Reinforcement learning: an introduction. MIT press, Cambridge"},{"key":"9294_CR47","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1016\/j.isprsjprs.2015.10.004","volume":"115","author":"C Toth","year":"2016","unstructured":"Toth C, J\u00f3\u017ak\u00f3w G (2016) Remote sensing platforms and sensors: a survey. Isprs J Photogramm Remote Sens 115:22\u201336. https:\/\/doi.org\/10.1016\/j.isprsjprs.2015.10.004","journal-title":"Isprs J Photogramm Remote Sens"},{"key":"9294_CR48","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser Lu, Polosukhin I (2017) Attention is all you need. In: Guyon I, Luxburg UV, Bengio S, Wallach H, Fergus R, Vishwanathan S, Garnett R (eds) Advances in neural information processing systems 30. Curran Associates, Inc, pp 5998\u20136008"},{"key":"9294_CR49","doi-asserted-by":"publisher","unstructured":"Vedantam R, Zitnick C, Parikh D (2015) CIDEr: consensus-based image description evaluation 4566\u20134575. https:\/\/doi.org\/10.1109\/CVPR.2015.7299087","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"9294_CR50","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2017","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2017) Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Trans Pattern Anal Mach Intell 39:652\u2013663. https:\/\/doi.org\/10.1109\/Tpami.2016.2587640","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9294_CR51","doi-asserted-by":"publisher","unstructured":"Wang B, Lu X, Zheng X, Li X (2019) Semantic descriptions of high-resolution remote sensing images. IEEE Geosci Remote Sens Lett 1\u20135. https:\/\/doi.org\/10.1109\/LGRS.2019.2893772","DOI":"10.1109\/LGRS.2019.2893772"},{"key":"9294_CR52","first-page":"48","volume":"40","author":"J Wang","year":"2012","unstructured":"Wang J, Zhou H (2012) Research on key technologies of remote sensing image data retrieval based on semantics. Comput Digit Eng 40:48\u201350","journal-title":"Comput Digit Eng"},{"key":"9294_CR53","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1007\/BF00992696","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams RJ (1992) Simple statistical gradient-following algorithms for connectionist reinforcement learning. Mach Learn 8:229\u2013256. https:\/\/doi.org\/10.1007\/BF00992696","journal-title":"Mach Learn"},{"key":"9294_CR54","doi-asserted-by":"publisher","first-page":"3984","DOI":"10.1109\/TIP.2020.2967584","volume":"29","author":"Y Wu","year":"2020","unstructured":"Wu Y, Jiang L, Yang Y (2020) Revisiting embodiedQA: A Simple Baseline and Beyond. IEEE Trans Image Process 29:3984\u20133992. https:\/\/doi.org\/10.1109\/tip.2020.2967584","journal-title":"IEEE Trans Image Process"},{"key":"9294_CR55","doi-asserted-by":"publisher","unstructured":"Wu Q, Shen CH, Liu LQ, Dick A, van den Hengel A (2016) What value do explicit high level concepts have in vision to language problems?. In: 2016 Ieee conference on computer vision and pattern recognition (Cvpr). https:\/\/doi.org\/10.1109\/Cvpr.2016.29, pp 203\u2013212","DOI":"10.1109\/Cvpr.2016.29"},{"key":"9294_CR56","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2018","unstructured":"Wu Q, Shen CH, Wang P, Dick A, van den Hengel A (2018) Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans Pattern Anal Mach Intell 40:1367\u20131381. https:\/\/doi.org\/10.1109\/Tpami.2017.2708709","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9294_CR57","doi-asserted-by":"publisher","unstructured":"Wu Y, Zhu L, Jiang L, Yang Y (2018) Decoupled novel object captioner. 2018 ACM multimedia conference on multimedia conference. ACM Press. https:\/\/doi.org\/10.1145\/3240508.3240640","DOI":"10.1145\/3240508.3240640"},{"key":"9294_CR58","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention. proceedings of the 32nd international conference on machine learning. In: Bach F, Blei D (eds) Proceedings of machine learning research. PMLR: Lille, France, vol 37, pp 2048\u20132057"},{"key":"9294_CR59","doi-asserted-by":"publisher","unstructured":"Yang J, Jiang Z, Zhou Q, Zhang H, Shi J (2015) Remote sensing image semantic labeling based on conditional random field 36:3069\u20133081. https:\/\/doi.org\/10.7527\/S1000-6893.2014.0356","DOI":"10.7527\/S1000-6893.2014.0356"},{"key":"9294_CR60","doi-asserted-by":"publisher","unstructured":"Yang Y, Newsam S (2010) Bag-of-visual-words and spatial extensions for land-use classification. In: Proceedings of the 18th SIGSPATIAL international conference on advances in geographic information systems; ACM: New York, NY, USA, 2010; GIS\u201910, pp 270\u2013279. https:\/\/doi.org\/10.1145\/1869790.1869829https:\/\/doi.org\/10.1145\/1869790.1869829","DOI":"10.1145\/1869790.1869829 10.1145\/1869790.1869829"},{"key":"9294_CR61","doi-asserted-by":"publisher","unstructured":"Yao T, Pan YW, Li YH, Qiu ZF, Mei T (2017) Boosting image captioning with attributes. In: 2017 Ieee international conference on computer vision (Iccv), pp 4904\u20134912. https:\/\/doi.org\/10.1109\/Iccv.2017.524https:\/\/doi.org\/10.1109\/Iccv.2017.524","DOI":"10.1109\/Iccv.2017.524 10.1109\/Iccv.2017.524"},{"key":"9294_CR62","doi-asserted-by":"publisher","first-page":"1485","DOI":"10.1109\/JPROC.2010.2050411","volume":"98","author":"BZ Yao","year":"2010","unstructured":"Yao BZ, Yang X, Lin LA, Lee MW, Zhu SC (2010) I2t: image parsing to text description. Proc Ieee 98:1485\u20131508. https:\/\/doi.org\/10.1109\/Jproc.2010.2050411","journal-title":"Proc Ieee"},{"key":"9294_CR63","doi-asserted-by":"publisher","unstructured":"You QZ, Jin HL, Wang ZW, Fang C, Luo JB (2016) Image captioning with semantic attention. In: 2016 Ieee conference on computer vision and pattern recognition (Cvpr 2017), pp 4651\u20134659. https:\/\/doi.org\/10.1109\/Cvpr.2016.503","DOI":"10.1109\/Cvpr.2016.503"},{"key":"9294_CR64","doi-asserted-by":"publisher","first-page":"2175","DOI":"10.1109\/TGRS.2014.2357078","volume":"53","author":"F Zhang","year":"2015","unstructured":"Zhang F, Du B, Zhang LP (2015) Saliency-guided unsupervised feature learning for scene classification. IEEE Trans Geosci Remote Sens 53:2175\u20132184. https:\/\/doi.org\/10.1109\/Tgrs.2014.2357078","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9294_CR65","doi-asserted-by":"publisher","unstructured":"Zhang XR, Wang X, Tang X, Zhou HY, Li C (2019) Description generation for remote sensing images using attribute attention mechanism. Remote Sens 11. https:\/\/doi.org\/10.3390\/rs11060612","DOI":"10.3390\/rs11060612"},{"key":"9294_CR66","doi-asserted-by":"publisher","first-page":"1511","DOI":"10.1109\/JSTARS.2016.2620900","volume":"10","author":"LB Zhang","year":"2017","unstructured":"Zhang LB, Zhang YY (2017) Airport detection and aircraft recognition based on two-layer saliency model in high spatial resolution remote-sensing images. IEEE J Sel Top Appl Earth Obs Remote Sens 10:1511\u20131524. https:\/\/doi.org\/10.1109\/Jstars.2016.2620900","journal-title":"IEEE J Sel Top Appl Earth Obs Remote Sens"},{"key":"9294_CR67","doi-asserted-by":"publisher","first-page":"409","DOI":"10.1007\/s11263-017-1033-7","volume":"124","author":"L Zhu","year":"2017","unstructured":"Zhu L, Xu Z, Yang Y, Hauptmann AG (2017) Uncovering the temporal context for video question answering. Int J Comput Vis 124:409\u2013421. https:\/\/doi.org\/10.1007\/s11263-017-1033-7","journal-title":"Int J Comput Vis"},{"key":"9294_CR68","doi-asserted-by":"publisher","unstructured":"Zhu QQ, Zhong YF, Zhang LP (2014) Multi-feature probability topic scene classifier for high spatial resolution remote sensing imagery 2014. IEEE International Geoscience and Remote Sensing Symposium (Igarss). https:\/\/doi.org\/10.1109\/Igarss.2014.6947071","DOI":"10.1109\/Igarss.2014.6947071"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09294-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-020-09294-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09294-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,7,16]],"date-time":"2021-07-16T23:58:26Z","timestamp":1626479906000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-020-09294-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7,17]]},"references-count":68,"journal-issue":{"issue":"35-36","published-print":{"date-parts":[[2020,9]]}},"alternative-id":["9294"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-09294-7","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,7,17]]},"assertion":[{"value":"18 July 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 June 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 June 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 July 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with Ethical Standards"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Conflict of interests"}}]}}