{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T05:25:14Z","timestamp":1769059514473,"version":"3.49.0"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,8,27]],"date-time":"2022-08-27T00:00:00Z","timestamp":1661558400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,8,27]],"date-time":"2022-08-27T00:00:00Z","timestamp":1661558400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100013098","name":"Scientific and Innovative Action Plan of Shanghai","doi-asserted-by":"publisher","award":["20511100600"],"award-info":[{"award-number":["20511100600"]}],"id":[{"id":"10.13039\/501100013098","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai","doi-asserted-by":"publisher","award":["22ZR1418400"],"award-info":[{"award-number":["22ZR1418400"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2023,1]]},"DOI":"10.1007\/s00521-022-07726-z","type":"journal-article","created":{"date-parts":[[2022,8,27]],"date-time":"2022-08-27T16:02:48Z","timestamp":1661616168000},"page":"2429-2442","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Hierarchical decoding with latent context for image captioning"],"prefix":"10.1007","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6270-7771","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yingshuai","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Kangkang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zhe","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Wen","family":"Du","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,8,27]]},"reference":[{"key":"7726_CR1","doi-asserted-by":"publisher","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 3156\u20133164. https:\/\/doi.org\/10.1109\/CVPR.2015.7298935","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"7726_CR2","doi-asserted-by":"publisher","first-page":"6209","DOI":"10.1109\/TIP.2020.2988435","volume":"29","author":"J Zhang","year":"2020","unstructured":"Zhang J, Peng Y (2020) Video captioning with object-aware spatio-temporal correlation and aggregation. IEEE Trans Image Process 29:6209\u20136222","journal-title":"IEEE Trans Image Process"},{"key":"7726_CR3","doi-asserted-by":"publisher","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 3242\u20133250. https:\/\/doi.org\/10.1109\/CVPR.2017.345","DOI":"10.1109\/CVPR.2017.345"},{"key":"7726_CR4","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"7726_CR5","doi-asserted-by":"crossref","unstructured":"Yang L, Tang KD, Yang J, Li L (2017) Dense captioning with joint inference and visual context. In: IEEE conference on computer vision and pattern recognition, pp 1978\u20131987","DOI":"10.1109\/CVPR.2017.214"},{"key":"7726_CR6","doi-asserted-by":"crossref","unstructured":"Kim D, Choi J, Oh T, Kweon IS (2019) Dense relational captioning: Triple-stream networks for relationship-based captioning. In: IEEE conference on computer vision and pattern recognition, pp 6271\u20136280","DOI":"10.1109\/CVPR.2019.00643"},{"key":"7726_CR7","doi-asserted-by":"crossref","unstructured":"Zhang J, Peng Y (2019) Hierarchical vision-language alignment for video captioning. In: MultiMedia modeling\u201425th international conference, vol 11295, pp 42\u201354","DOI":"10.1007\/978-3-030-05710-7_4"},{"key":"7726_CR8","unstructured":"Simonyan K, Zisserman A (2015) Very deep convolutional networks for large-scale image recognition. In: Proceedings of the 3rd international conference on learning representations, ICLR. http:\/\/arxiv.org\/abs\/1409.1556"},{"key":"7726_CR9","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"7726_CR10","unstructured":"Ren S, He K, Girshick RB, Sun J (2015) Faster R-CNN: towards real-time object detection with region proposal networks. In: Proceedings of the 28th advances in neural information processing systems, NIPS, pp 91\u201399"},{"key":"7726_CR11","doi-asserted-by":"publisher","unstructured":"Jia X, Gavves E, Fernando B, Tuytelaars T (2015) Guiding the long-short term memory model for image caption generation. In: Proceedings of the IEEE international conference on computer vision, ICCV, pp 2407\u20132415. https:\/\/doi.org\/10.1109\/ICCV.2015.277","DOI":"10.1109\/ICCV.2015.277"},{"key":"7726_CR12","doi-asserted-by":"publisher","unstructured":"Guo Y, Liu Y, M. H. T. de Boer, Liu L, Lew MS (2018) A dual prediction network for image captioning. In: Proceedings of the IEEE international conference on multimedia and expo, ICME, pp 1\u20136. https:\/\/doi.org\/10.1109\/ICME.2018.8486491","DOI":"10.1109\/ICME.2018.8486491"},{"key":"7726_CR13","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville AC, Salakhutdinov R, Zemel RS, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention. In: Proceedings of the 32nd international conference on machine learning, ICML, pp 2048\u20132057. http:\/\/proceedings.mlr.press\/v37\/xuc15.html"},{"key":"7726_CR14","doi-asserted-by":"publisher","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J (2016) Image captioning with semantic attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 4651\u20134659. https:\/\/doi.org\/10.1109\/CVPR.2016.503","DOI":"10.1109\/CVPR.2016.503"},{"key":"7726_CR15","doi-asserted-by":"publisher","unstructured":"Chen L, Zhang H, Xiao J, Nie L, Shao J, Liu W, Chua T (2017) SCA-CNN: spatial and channel-wise attention in convolutional networks for image captioning. In: Proceedings of IEEE conference on computer vision and pattern recognition, CVPR, pp 6298\u20136306. https:\/\/doi.org\/10.1109\/CVPR.2017.667","DOI":"10.1109\/CVPR.2017.667"},{"key":"7726_CR16","doi-asserted-by":"publisher","unstructured":"Chen S, Zhao Q (2018) Boosted attention: leveraging human attention for image captioning. In: Proceedings of the European conference on computer vision, ECCV, pp 72\u201388. https:\/\/doi.org\/10.1007\/978-3-030-01252-6_5","DOI":"10.1007\/978-3-030-01252-6_5"},{"issue":"6","key":"7726_CR17","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2018","unstructured":"Wu Q, Shen C, Wang P, Dick AR, van den Hengel A (2018) Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans Pattern Anal Mach Intell 40(6):1367\u20131381. https:\/\/doi.org\/10.1109\/TPAMI.2017.2708709","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"7726_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.107075","volume":"98","author":"J Wang","year":"2020","unstructured":"Wang J, Wang W, Wang L, Wang Z, Feng DD, Tan T (2020) Learning visual relationship and context-aware attention for image captioning. Pattern Recogn 98:107075. https:\/\/doi.org\/10.1016\/j.patcog.2019.107075","journal-title":"Pattern Recogn"},{"key":"7726_CR19","doi-asserted-by":"publisher","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision, ECCV, pp 711\u2013727. https:\/\/doi.org\/10.1007\/978-3-030-01264-9_42","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"7726_CR20","doi-asserted-by":"publisher","unstructured":"Guo L, Liu J, Tang J, Li J, Luo W, Lu H (2019) Aligning linguistic words and visual semantic units for image captioning. In: Proceedings of the 27th ACM international conference on multimedia, MM, pp 765\u2013773. https:\/\/doi.org\/10.1145\/3343031.3350943","DOI":"10.1145\/3343031.3350943"},{"key":"7726_CR21","doi-asserted-by":"publisher","unstructured":"Yang J, Lu J, Lee S, Batra D, Parikh D (2018) Graph R-CNN for scene graph generation. In: Proceedings of the European conference on computer vision, ECCV, pp 690\u2013706. https:\/\/doi.org\/10.1007\/978-3-030-01246-5_41","DOI":"10.1007\/978-3-030-01246-5_41"},{"key":"7726_CR22","doi-asserted-by":"publisher","unstructured":"Xu D, Zhu Y, Choy CB, Fei-Fei L (2017) Scene graph generation by iterative message passing. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 3097\u20133106. https:\/\/doi.org\/10.1109\/CVPR.2017.330","DOI":"10.1109\/CVPR.2017.330"},{"key":"7726_CR23","unstructured":"Hamilton WL, Ying Z, Leskovec J (2017) Inductive representation learning on large graphs. In: Proceedings of the 30th advances in neural information processing systems, NIPS, pp 1024\u20131034"},{"issue":"11","key":"7726_CR24","doi-asserted-by":"publisher","first-page":"2942","DOI":"10.1109\/TMM.2019.2915033","volume":"21","author":"X Xiao","year":"2019","unstructured":"Xiao X, Wang L, Ding K, Xiang S, Pan C (2019) Deep hierarchical encoder-decoder network for image captioning. IEEE Trans Multimed 21(11):2942\u20132956. https:\/\/doi.org\/10.1109\/TMM.2019.2915033","journal-title":"IEEE Trans Multimed"},{"key":"7726_CR25","doi-asserted-by":"crossref","unstructured":"Zhang J, Peng Y (2019) Object-aware aggregation with bidirectional temporal graph for video captioning. In: IEEE conference on computer vision and pattern recognition, pp 8327\u20138336","DOI":"10.1109\/CVPR.2019.00852"},{"key":"7726_CR26","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1016\/j.neucom.2019.12.073","volume":"387","author":"Y Wei","year":"2020","unstructured":"Wei Y, Wang L, Cao H, Shao M, Wu C (2020) Multi-attention generative adversarial network for image captioning. Neurocomputing 387:91\u201399. https:\/\/doi.org\/10.1016\/j.neucom.2019.12.073","journal-title":"Neurocomputing"},{"key":"7726_CR27","doi-asserted-by":"publisher","first-page":"104146","DOI":"10.1016\/j.imavis.2021.104146","volume":"10","author":"Z Zhang","year":"2021","unstructured":"Zhang Z, Wu Q, Wang Y, Chen F (2021) Exploring region relationships implicitly: image captioning with visual relationship attention. Image Vis Comput 10:104146","journal-title":"Image Vis Comput"},{"issue":"7","key":"7726_CR28","doi-asserted-by":"publisher","first-page":"1681","DOI":"10.1109\/TMM.2018.2888822","volume":"21","author":"Z Zhang","year":"2019","unstructured":"Zhang Z, Wu Q, Wang Y, Chen F (2019) High-quality image captioning with fine-grained and semantic-guided visual attention. IEEE Trans Multimed 21(7):1681\u20131693. https:\/\/doi.org\/10.1109\/TMM.2018.2888822","journal-title":"IEEE Trans Multimed"},{"key":"7726_CR29","doi-asserted-by":"publisher","first-page":"66680","DOI":"10.1109\/ACCESS.2019.2917979","volume":"7","author":"S Wang","year":"2019","unstructured":"Wang S, Lan L, Zhang X, Dong G, Luo Z (2019) Cascade semantic fusion for image captioning. IEEE Access 7:66680\u201366688. https:\/\/doi.org\/10.1109\/ACCESS.2019.2917979","journal-title":"IEEE Access"},{"key":"7726_CR30","doi-asserted-by":"publisher","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T (2017) Boosting image captioning with attributes. In: Proceedings of IEEE international conference on computer vision, ICCV, pp 4904\u20134912. https:\/\/doi.org\/10.1109\/ICCV.2017.524","DOI":"10.1109\/ICCV.2017.524"},{"key":"7726_CR31","doi-asserted-by":"crossref","unstructured":"Guan Z, Liu K, Ma Y, Xu Q, J T (2018) Middle-level attribute-based language retouching for image caption generation. In: Proceedings of the IEEE international conference on acoustics, speech and signal processing, ICASSP, vol 8, pp 3081\u20133085","DOI":"10.3390\/app8101850"},{"issue":"2","key":"7726_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TCYB.2019.2904052","volume":"51","author":"X Li","year":"2019","unstructured":"Li X, Yuan A, Lu X (2019) Vision-to-language tasks based on attributes and attention mechanism. IEEE Trans Cybern 51(2):1\u201314","journal-title":"IEEE Trans Cybern"},{"key":"7726_CR33","doi-asserted-by":"publisher","unstructured":"Li Y, Ouyang W, Zhou B, Wang K, Wang X (2017) Scene graph generation from objects, phrases and region captions. In: Proceedings of IEEE international conference on computer vision, ICCV, pp 1270\u20131279. https:\/\/doi.org\/10.1109\/ICCV.2017.142","DOI":"10.1109\/ICCV.2017.142"},{"key":"7726_CR34","doi-asserted-by":"crossref","unstructured":"Zellers R, Yatskar M, Thomson S, Choi Y (2018) Neural motifs: scene graph parsing with global context. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 5831\u20135840. http:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Zellers_Neural_Motifs_Scene_CVPR_2018_paper.html","DOI":"10.1109\/CVPR.2018.00611"},{"key":"7726_CR35","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, Cai J (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 10685\u201310694. http:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Yang_Auto-Encoding_Scene_Graphs_for_Image_Captioning_CVPR_2019_paper.html","DOI":"10.1109\/CVPR.2019.01094"},{"issue":"1","key":"7726_CR36","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L, Shamma DA, Bernstein MS, Fei-Fei L (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis 123(1):32\u201373. https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int J Comput Vis"},{"issue":"3","key":"7726_CR37","doi-asserted-by":"publisher","first-page":"808","DOI":"10.1109\/TMM.2019.2931815","volume":"22","author":"L Wu","year":"2020","unstructured":"Wu L, Xu M, Wang J, Perry SW (2020) Recall what you see continually using gridlstm in image captioning. IEEE Trans Multimed 22(3):808\u2013818. https:\/\/doi.org\/10.1109\/TMM.2019.2931815","journal-title":"IEEE Trans Multimed"},{"key":"7726_CR38","doi-asserted-by":"crossref","unstructured":"Qin Y, Du J, Zhang Y, Lu H (2019) Look back and predict forward in image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 8367\u20138375. http:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Qin_Look_Back_and_Predict_Forward_in_Image_Captioning_CVPR_2019_paper.html","DOI":"10.1109\/CVPR.2019.00856"},{"issue":"8","key":"7726_CR39","doi-asserted-by":"publisher","first-page":"116183","DOI":"10.1016\/j.image.2021.116183","volume":"94","author":"Y Wei","year":"2021","unstructured":"Wei Y, Wu C, Jia ZY, Hu XF, Shi H (2021) Past is important: improved image captioning by looking back in time. Signal Process Image Commun 94(8):116183","journal-title":"Signal Process Image Commun"},{"issue":"5","key":"7726_CR40","first-page":"1112","volume":"42","author":"L Gao","year":"2020","unstructured":"Gao L, Li X, Song J, Shen HT (2020) Hierarchical LSTMs with adaptive attention for visual captioning. IEEE Trans Pattern Anal Mach Intell 42(5):1112\u20131131","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"7726_CR41","doi-asserted-by":"publisher","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 1179\u20131195. https:\/\/doi.org\/10.1109\/CVPR.2017.131","DOI":"10.1109\/CVPR.2017.131"},{"key":"7726_CR42","doi-asserted-by":"publisher","unstructured":"Liu S, Zhu Z, Ye N, Guadarrama S, Murphy K (2017) Improved image captioning via policy gradient optimization of spider. In: Proceedings of the IEEE international conference on computer vision, ICCV, pp 873\u2013881. https:\/\/doi.org\/10.1109\/ICCV.2017.100","DOI":"10.1109\/ICCV.2017.100"},{"key":"7726_CR43","doi-asserted-by":"publisher","unstructured":"Vedantam R, Zitnick CL, Parikh D (2015) Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 4566\u20134575. https:\/\/doi.org\/10.1109\/CVPR.2015.7299087","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"7726_CR44","doi-asserted-by":"publisher","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) SPICE: semantic propositional image caption evaluation. In: Proceedings of the European conference on computer vision, ECCV, pp 382\u2013398. https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24","DOI":"10.1007\/978-3-319-46454-1_24"},{"issue":"5","key":"7726_CR45","doi-asserted-by":"publisher","first-page":"1372","DOI":"10.1109\/TMM.2019.2941820","volume":"22","author":"N Xu","year":"2020","unstructured":"Xu N, Zhang H, Liu A-A, Nie W, Su Y, Nie J, Zhan Y (2020) Multi-level policy and reward-based deep reinforcement learning framework for image captioning. IEEE Trans Multimed 22(5):1372\u20131383","journal-title":"IEEE Trans Multimed"},{"key":"7726_CR46","first-page":"2413","volume":"99","author":"J Wu","year":"2020","unstructured":"Wu J, Chen T, Wu H, Yang Z, Lin L (2020) Fine-grained image captioning with global-local discriminative objective. IEEE Trans Multimed 99:2413\u20132427","journal-title":"IEEE Trans Multimed"},{"issue":"8","key":"7726_CR47","doi-asserted-by":"publisher","first-page":"2149","DOI":"10.1109\/TMM.2019.2951226","volume":"22","author":"L Guo","year":"2020","unstructured":"Guo L, Liu J, Lu S, Lu H (2020) Show, tell, and polish: ruminant decoding for image captioning. IEEE Trans Multimed 22(8):2149\u20132162","journal-title":"IEEE Trans Multimed"},{"key":"7726_CR48","unstructured":"Velickovic P, Cucurull G, Casanova A, Romero A, Li\u00f2 P, Bengio Y (2018) Graph attention networks. In: Proceedings of the 6th international conference on learning representations, ICLR. https:\/\/openreview.net\/forum?id=rJXMpikCZ"},{"key":"7726_CR49","unstructured":"Kipf TN, Welling M (2017) Semi-supervised classification with graph convolutional networks. In: Proceedings of the 5th international conference on learning representations, ICLR. https:\/\/openreview.net\/forum?id=SJU4ayYgl"},{"key":"7726_CR50","doi-asserted-by":"publisher","unstructured":"Lin T, Maire M, Belongie SJ, Hays J, Perona P, Ramanan D, P. Doll\u00e1r, Zitnick CL (2014) Microsoft COCO: common objects in context. In: Proceedings of the European conference on computer vision, ECCV, pp. 740\u2013755. https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"7726_CR51","doi-asserted-by":"publisher","unstructured":"Karpathy A, Li FF (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, CVPR, pp 3128\u20133137. https:\/\/doi.org\/10.1109\/CVPR.2015.7298932","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"7726_CR52","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics, ACL, pp 311\u2013318. https:\/\/www.aclweb.org\/anthology\/P02-1040\/","DOI":"10.3115\/1073083.1073135"},{"key":"7726_CR53","doi-asserted-by":"publisher","unstructured":"Denkowski MJ, Lavie A (2014) Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the 9th workshop on statistical machine translation, WMT, pp 376\u2013380. https:\/\/doi.org\/10.3115\/v1\/w14-3348","DOI":"10.3115\/v1\/w14-3348"},{"key":"7726_CR54","unstructured":"Lin C-Y (2004) ROUGE: a package for automatic evaluation of summaries. In: Text summarization branches out, association for computational linguistics, Barcelona, Spain, pp 74\u201381. https:\/\/www.aclweb.org\/anthology\/W04-1013"},{"key":"7726_CR55","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: Proceedings of the 3rd international conference on learning representations, ICLR. http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"7726_CR56","doi-asserted-by":"crossref","unstructured":"Gu J, Cai J, Wang G, Chen T (2018) Stack-captioning: coarse-to-fine learning for image captioning. In: Proceedings of the 32nd association for the advancement of artificial intelligence, AAAI, pp 6837\u20136844. https:\/\/www.aaai.org\/ocs\/index.php\/AAAI\/AAAI18\/paper\/view\/16465","DOI":"10.1609\/aaai.v32i1.12266"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-022-07726-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-022-07726-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-022-07726-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,16]],"date-time":"2023-01-16T05:09:57Z","timestamp":1673845797000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-022-07726-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,27]]},"references-count":56,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,1]]}},"alternative-id":["7726"],"URL":"https:\/\/doi.org\/10.1007\/s00521-022-07726-z","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8,27]]},"assertion":[{"value":"10 January 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 August 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 August 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}