{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T21:40:34Z","timestamp":1774388434630,"version":"3.50.1"},"reference-count":58,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,1,29]],"date-time":"2024-01-29T00:00:00Z","timestamp":1706486400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,29]],"date-time":"2024-01-29T00:00:00Z","timestamp":1706486400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1007\/s00530-023-01249-w","type":"journal-article","created":{"date-parts":[[2024,1,29]],"date-time":"2024-01-29T06:02:09Z","timestamp":1706508129000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":29,"title":["GVA: guided visual attention approach for automatic image caption generation"],"prefix":"10.1007","volume":"30","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8546-5426","authenticated-orcid":false,"given":"Md. Bipul","family":"Hossen","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3207-2258","authenticated-orcid":false,"given":"Zhongfu","family":"Ye","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9426-3415","authenticated-orcid":false,"given":"Amr","family":"Abdussalam","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8030-8379","authenticated-orcid":false,"given":"Md. Imran","family":"Hossain","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,29]]},"reference":[{"key":"1249_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.10.059","author":"A Yuan","year":"2019","unstructured":"Yuan, A., Li, X., Lu, X.: 3G structure for image caption generation. Neurocomputing (2019). https:\/\/doi.org\/10.1016\/j.neucom.2018.10.059","journal-title":"Neurocomputing"},{"key":"1249_CR2","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2023","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Cascianelli, S., Fiameni, G., Cucchiara, R.: From show to tell: a survey on deep learning-based image captioning. IEEE Trans. Pattern Anal. Mach. Intell. 45, 539\u2013559 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2022.3148210","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1249_CR3","doi-asserted-by":"crossref","unstructured":"Jiang, W., Ma, L., Jiang, Y.-G., Liu, W., Zhang, T.: Recurrent fusion network for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp. 499\u2013515 (2018)","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"1249_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2020.103068","author":"H Wei","year":"2020","unstructured":"Wei, H., Li, Z., Zhang, C., Ma, H.: The synergy of double attention: combine sentence-level and word-level attention for image captioning. Comput. Vis. Image Underst. (2020). https:\/\/doi.org\/10.1016\/j.cviu.2020.103068","journal-title":"Comput. Vis. 
Image Underst."},{"key":"1249_CR5","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D.  Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition pp. 3156\u20133164 (2015).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1249_CR6","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2917771","author":"K Wang","year":"2019","unstructured":"Wang, K., Zhang, X., Wang, F., Wu, T.Y., Chen, C.M.: Multilayer dense attention model for image caption. IEEE Access (2019). https:\/\/doi.org\/10.1109\/ACCESS.2019.2917771","journal-title":"IEEE Access"},{"key":"1249_CR7","doi-asserted-by":"publisher","first-page":"18413","DOI":"10.1007\/s11042-021-10578-9","volume":"80","author":"C Sur","year":"2021","unstructured":"Sur, C.: MRRC: multiple role representation crossover interpretation for image captioning with R-CNN feature distribution composition (FDC). Multimed. Tools Appl. 80, 18413\u201318443 (2021). https:\/\/doi.org\/10.1007\/s11042-021-10578-9","journal-title":"Multimed. Tools Appl."},{"key":"1249_CR8","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Hu, Z., Zhac, Y., Liu, X., & Hong, R.: Enhanced text-guided attention model for image captioning. In 2018 IEEE fourth international conference on multimedia big data (BigMM) pp. 1\u20135 (2018)","DOI":"10.1109\/BigMM.2018.8499172"},{"key":"1249_CR9","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3042086","author":"W Zhao","year":"2021","unstructured":"Zhao, W., Wu, X., Luo, J.: Cross-domain image captioning via cross-modal retrieval and model adaptation. IEEE Trans. Image Process. (2021). https:\/\/doi.org\/10.1109\/TIP.2020.3042086","journal-title":"IEEE Trans. Image Process."},{"key":"1249_CR10","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3202690","author":"M Al-Qatf","year":"2022","unstructured":"Al-Qatf, M., Wang, X., Hawbani, A., Abdusallam, A., Alsamhi, S.H.: Image captioning with novel topics guidance and retrieval-based topics re-weighting. IEEE Trans. Multimed. (2022). https:\/\/doi.org\/10.1109\/TMM.2022.3202690","journal-title":"IEEE Trans. Multimed."},{"key":"1249_CR11","doi-asserted-by":"publisher","DOI":"10.1145\/3409388","author":"X Liu","year":"2021","unstructured":"Liu, X., Xu, Q.: Adaptive attention-based high-level semantic introduction for image caption. ACM Trans. Multimed. Comput. Commun. Appl. (2021). https:\/\/doi.org\/10.1145\/3409388","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"1249_CR12","doi-asserted-by":"publisher","first-page":"154953","DOI":"10.1109\/ACCESS.2020.3018752","volume":"8","author":"L Cheng","year":"2020","unstructured":"Cheng, L., Wei, W., Mao, X., Liu, Y., Miao, C.: Stack-VS: stacked visual-semantic attention for image caption generation. IEEE Access 8, 154953\u2013154965 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.3018752","journal-title":"IEEE Access"},{"key":"1249_CR13","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom\u2013up and top\u2013down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086. 
IEEE (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1249_CR14","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-021-11293-1","author":"K Deorukhkar","year":"2022","unstructured":"Deorukhkar, K., Ket, S.: A detailed review of prevailing image captioning methods using deep learning techniques. Multimed. Tools Appl. (2022). https:\/\/doi.org\/10.1007\/s11042-021-11293-1","journal-title":"Multimed. Tools Appl."},{"key":"1249_CR15","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: Proceedings\u201430th IEEE Conference on Computer Vision and Pattern Recognition. CVPR (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"1249_CR16","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-00937-3","author":"T do Carmo Nogueira","year":"2022","unstructured":"do Carmo Nogueira, T., Vinhal, C.D.N., da Cruz J\u00fanior, G., Ullmann, M.R.D., Marques, T.C.: A reference-based model using deep learning for image captioning. Multimed. Syst. (2022). https:\/\/doi.org\/10.1007\/s00530-022-00937-3","journal-title":"Multimed. Syst."},{"key":"1249_CR17","doi-asserted-by":"publisher","first-page":"11531","DOI":"10.1007\/s11042-019-08567-0","volume":"79","author":"S Wang","year":"2020","unstructured":"Wang, S., Lan, L., Zhang, X., Luo, Z.: GateCap: gated spatial and semantic attention model for image captioning. Multimed. Tools Appl. 79, 11531\u201311549 (2020). https:\/\/doi.org\/10.1007\/s11042-019-08567-0","journal-title":"Multimed. Tools Appl."},{"key":"1249_CR18","doi-asserted-by":"publisher","first-page":"3157","DOI":"10.1007\/s11063-022-10759-z","volume":"54","author":"F Xiao","year":"2022","unstructured":"Xiao, F., Xue, W., Shen, Y., Gao, X.: A new attention-based LSTM for image captioning. Neural. Process. Lett. 54, 3157\u20133171 (2022). https:\/\/doi.org\/10.1007\/s11063-022-10759-z","journal-title":"Neural. Process. Lett."},{"key":"1249_CR19","doi-asserted-by":"publisher","first-page":"1223","DOI":"10.1007\/s11042-022-13279-z","volume":"82","author":"D Zhao","year":"2023","unstructured":"Zhao, D., Yang, R., Wang, Z., Qi, Z.: A cooperative approach based on self-attention with interactive attribute for image caption. Multimed. Tools Appl. 82, 1223\u20131236 (2023). https:\/\/doi.org\/10.1007\/s11042-022-13279-z","journal-title":"Multimed. Tools Appl."},{"key":"1249_CR20","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-023-00693-9","author":"R Sasibhooshan","year":"2023","unstructured":"Sasibhooshan, R., Kumaraswamy, S., Sasidharan, S.: Image caption generation using visual attention prediction and contextual spatial relation extraction. J. Big Data (2023). https:\/\/doi.org\/10.1186\/s40537-023-00693-9","journal-title":"J. Big Data"},{"key":"1249_CR21","doi-asserted-by":"crossref","unstructured":"Zhou, D., Yang, J., Zhang, C., Tang, Y.: Joint Science Network and attention-guided for image captioning. In: Proceedings\u2014IEEE International Conference on Data Mining. ICDM (2021)","DOI":"10.1109\/ICDM51629.2021.00201"},{"key":"1249_CR22","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-Linear attention networks for image captioning. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 10968\u201310977. 
IEEE Computer Society (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"1249_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2022.104570","author":"Z Wang","year":"2022","unstructured":"Wang, Z., Shi, S., Zhai, Z., Wu, Y., Yang, R.: ArCo: attention-reinforced transformer with contrastive learning for image captioning. Image Vis. Comput. (2022). https:\/\/doi.org\/10.1016\/j.imavis.2022.104570","journal-title":"Image Vis. Comput."},{"key":"1249_CR24","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TMM.2020.3011317","volume":"23","author":"J Wu","year":"2021","unstructured":"Wu, J., Chen, T., Wu, H., Yang, Z., Luo, G., Lin, L.: Fine-grained image captioning with global-local discriminative objective. IEEE Trans. Multimed. 23, 2413\u20132427 (2021). https:\/\/doi.org\/10.1109\/TMM.2020.3011317","journal-title":"IEEE Trans. Multimed."},{"key":"1249_CR25","doi-asserted-by":"crossref","unstructured":"Fang, Z., Wang, J., Hu, X., Liang, L., Gan, Z., Wang, L., Yang, Y., Liu, Z.: Injecting semantic concepts into end-to-end image captioning. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition pp. 18009\u201318019 (2022)","DOI":"10.1109\/CVPR52688.2022.01748"},{"key":"1249_CR26","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1249_CR27","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031","author":"S Ren","year":"2017","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. (2017). https:\/\/doi.org\/10.1109\/TPAMI.2016.2577031","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1249_CR28","doi-asserted-by":"crossref","unstructured":"Chen, L., Zhang, H., Xiao, J., Nie, L., Shao, J., Liu, W., Chua, T.-S.: SCA-CNN: Spatial and channel-wise attention in convolutional networks for image captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6298\u20136306. IEEE (2017)","DOI":"10.1109\/CVPR.2017.667"},{"key":"1249_CR29","doi-asserted-by":"publisher","first-page":"134","DOI":"10.1109\/ACCESS.2022.3232508","volume":"11","author":"H Zhang","year":"2023","unstructured":"Zhang, H., Ma, C., Jiang, Z., Lian, J.: Image caption generation using contextual information fusion with Bi-LSTM-s. IEEE Access 11, 134\u2013143 (2023). https:\/\/doi.org\/10.1109\/ACCESS.2022.3232508","journal-title":"IEEE Access"},{"key":"1249_CR30","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-09128-6","author":"N Naqvi","year":"2020","unstructured":"Naqvi, N., Ye, Z.F.: Image captions: global-local and joint signals attention model (GL-JSAM). Multimed. Tools Appl. (2020). https:\/\/doi.org\/10.1007\/s11042-020-09128-6","journal-title":"Multimed. Tools Appl."},{"key":"1249_CR31","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-13793-0","author":"H Sharma","year":"2022","unstructured":"Sharma, H., Srivastava, S.: Multilevel attention and relation network based image captioning model. Multimed. Tools Appl. (2022). https:\/\/doi.org\/10.1007\/s11042-022-13793-0","journal-title":"Multimed. 
Tools Appl."},{"key":"1249_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3460474","volume":"17","author":"W Jiang","year":"2021","unstructured":"Jiang, W., Wang, W., Hu, H.: Bi-directional co-attention network for image captioning. ACM Trans. Multimed. Comput. Commun. Appl. 17, 1\u201320 (2021). https:\/\/doi.org\/10.1145\/3460474","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"1249_CR33","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2021.103138","author":"X Zhong","year":"2021","unstructured":"Zhong, X., Nie, G., Huang, W., Liu, W., Ma, B., Lin, C.W.: Attention-guided image captioning with adaptive global and local feature fusion. J. Vis. Commun. Image Represent. (2021). https:\/\/doi.org\/10.1016\/j.jvcir.2021.103138","journal-title":"J. Vis. Commun. Image Represent."},{"key":"1249_CR34","unstructured":"Xu, K., Ba, J.L., Kiros, R., Cho, K., Courville, A., Salakhutdinov, R., Zemel, R.S., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: 32nd International Conference on Machine Learning. ICML (2015)"},{"key":"1249_CR35","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-01036-z","author":"J Li","year":"2022","unstructured":"Li, J., Wang, Y., Zhao, D.: Layer-wise enhanced transformer with multi-modal fusion for image caption. Multimed. Syst. (2022). https:\/\/doi.org\/10.1007\/s00530-022-01036-z","journal-title":"Multimed. Syst."},{"key":"1249_CR36","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.107075","author":"J Wang","year":"2020","unstructured":"Wang, J., Wang, W., Wang, L., Wang, Z., Feng, D.D., Tan, T.: Learning visual relationship and context-aware attention for image captioning. Pattern Recognit. (2020). https:\/\/doi.org\/10.1016\/j.patcog.2019.107075","journal-title":"Pattern Recognit."},{"key":"1249_CR37","doi-asserted-by":"publisher","first-page":"66680","DOI":"10.1109\/ACCESS.2019.2917979","volume":"7","author":"S Wang","year":"2019","unstructured":"Wang, S., Lan, L., Zhang, X., Dong, G., Luo, Z.: Cascade semantic fusion for image captioning. IEEE Access 7, 66680\u201366688 (2019). https:\/\/doi.org\/10.1109\/ACCESS.2019.2917979","journal-title":"IEEE Access"},{"key":"1249_CR38","doi-asserted-by":"publisher","first-page":"57943","DOI":"10.1109\/ACCESS.2020.2981513","volume":"8","author":"C Wu","year":"2020","unstructured":"Wu, C., Yuan, S., Cao, H., Wei, Y., Wang, L.: Hierarchical attention-based fusion for image caption with multi-grained rewards. IEEE Access 8, 57943\u201357951 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.2981513","journal-title":"IEEE Access"},{"key":"1249_CR39","doi-asserted-by":"publisher","first-page":"2117","DOI":"10.1109\/TMM.2019.2896516","volume":"21","author":"X Li","year":"2019","unstructured":"Li, X., Jiang, S.: Know more say less: image captioning based on scene graphs. IEEE Trans. Multimed. 21, 2117\u20132130 (2019). https:\/\/doi.org\/10.1109\/TMM.2019.2896516","journal-title":"IEEE Trans. Multimed."},{"key":"1249_CR40","doi-asserted-by":"publisher","first-page":"694","DOI":"10.1109\/TIP.2019.2928144","volume":"29","author":"L Zhou","year":"2020","unstructured":"Zhou, L., Zhang, Y., Jiang, Y.G., Zhang, T., Fan, W.: Re-caption: saliency-enhanced image captioning through two-phase learning. IEEE Trans. Image Process. 29, 694\u2013709 (2020). https:\/\/doi.org\/10.1109\/TIP.2019.2928144","journal-title":"IEEE Trans. 
Image Process."},{"key":"1249_CR41","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1109\/TCSVT.2021.3067449","volume":"32","author":"C Yan","year":"2022","unstructured":"Yan, C., Hao, Y., Li, L., Yin, J., Liu, A., Mao, Z., Chen, Z., Gao, X.: Task-adaptive attention for image captioning. IEEE Trans. Circuits Syst. Video Technol. 32, 43\u201351 (2022). https:\/\/doi.org\/10.1109\/TCSVT.2021.3067449","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1249_CR42","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3242\u20133250. IEEE (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"1249_CR43","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2894139","author":"L Gao","year":"2020","unstructured":"Gao, L., Li, X., Song, J., Shen, H.T.: Hierarchical LSTMs with adaptive attention for visual captioning. IEEE Trans. Pattern Anal. Mach. Intell. (2020). https:\/\/doi.org\/10.1109\/TPAMI.2019.2894139","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1249_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.12.026","author":"YH Tan","year":"2019","unstructured":"Tan, Y.H., Chan, C.S.: Phrase-based image caption generator with hierarchical LSTM network. Neurocomputing (2019). https:\/\/doi.org\/10.1016\/j.neucom.2018.12.026","journal-title":"Neurocomputing"},{"key":"1249_CR45","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition pp. 770-778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1249_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119774","volume":"223","author":"H Parvin","year":"2023","unstructured":"Parvin, H., Naghsh-Nilchi, A.R., Mohammadi, H.M.: Transformer-based local-global guidance for image captioning. Expert Syst. Appl. 223, 119774 (2023). https:\/\/doi.org\/10.1016\/j.eswa.2023.119774","journal-title":"Expert Syst. Appl."},{"key":"1249_CR47","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: Common objects in context. In: Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics) pp. 740-755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1249_CR48","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7","author":"BA Plummer","year":"2017","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. Int. J. Comput. Vis. (2017). https:\/\/doi.org\/10.1007\/s11263-016-0965-7","journal-title":"Int. J. Comput. Vis."},{"key":"1249_CR49","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.: BLEU: a method for automatic evaluation of machine translation. In: Computational Linguistics pp. 311-318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1249_CR50","doi-asserted-by":"crossref","unstructured":"Lavie, A., Agarwal, A.: METEOR: An automatic metric for MT evaluation with high levels of correlation with human judgments. 
In: Proceedings of the Second Workshop on Statistical Machine Translation (2007)","DOI":"10.3115\/1626355.1626389"},{"key":"1249_CR51","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. Proceedings of the workshop on text summarization branches out (WAS 2004) pp. 74-81 (2004)"},{"key":"1249_CR52","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition pp. 4566-4575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1249_CR53","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., & Gould, S.:  Spice: Semantic propositional image caption evaluation. In Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14 pp. 382-398 (2016).","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1249_CR54","unstructured":"Kingma, D.P., Ba, J.L.: Adam: a method for stochastic optimization. In: 3rd International Conference on Learning Representations, ICLR 2015\u2014Conference Track Proceedings (2015) arXiv preprint arXiv:1412.6980."},{"key":"1249_CR55","unstructured":"Cohen, E., Beck, J.C.: Empirical analysis of beam search performance degradation in neural sequence models. In: 36th International Conference on Machine Learning, ICML 2019 pp. 1290-1299 (2019)"},{"key":"1249_CR56","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2888822","author":"Z Zhang","year":"2019","unstructured":"Zhang, Z., Wu, Q., Wang, Y., Chen, F.: High-quality image captioning with fine-grained and semantic-guided visual attention. IEEE Trans. Multimed. (2019). https:\/\/doi.org\/10.1109\/TMM.2018.2888822","journal-title":"IEEE Trans. Multimed."},{"key":"1249_CR57","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3576927","volume":"19","author":"A Abdussalam","year":"2023","unstructured":"Abdussalam, A., Ye, Z., Hawbani, A., Al-Qatf, M., Khan, R.: NumCap: a number-controlled multi-caption image captioning network. ACM Trans. Multimed. Comput. Commun. Appl. 19, 1\u201324 (2023). https:\/\/doi.org\/10.1145\/3576927","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"1249_CR58","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-018-5856-1","author":"X Li","year":"2018","unstructured":"Li, X., Yuan, A., Lu, X.: Multi-modal gated recurrent units for image description. Multimed. Tools Appl. (2018). https:\/\/doi.org\/10.1007\/s11042-018-5856-1","journal-title":"Multimed. 
Tools Appl."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01249-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-023-01249-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01249-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,14]],"date-time":"2024-02-14T06:23:48Z","timestamp":1707891828000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-023-01249-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,29]]},"references-count":58,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,2]]}},"alternative-id":["1249"],"URL":"https:\/\/doi.org\/10.1007\/s00530-023-01249-w","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,1,29]]},"assertion":[{"value":"12 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 December 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 January 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"50"}}