{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T15:41:43Z","timestamp":1778168503188,"version":"3.51.4"},"reference-count":111,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T00:00:00Z","timestamp":1708387200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T00:00:00Z","timestamp":1708387200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-18307-8","type":"journal-article","created":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T07:03:26Z","timestamp":1708412606000},"page":"34219-34268","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":44,"title":["A comprehensive literature review on image captioning methods and metrics based on deep learning technique"],"prefix":"10.1007","volume":"83","author":[{"given":"Ahmad Sami","family":"Al-Shamayleh","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Omar","family":"Adwan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohammad A.","family":"Alsharaiah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abdelrahman H.","family":"Hussein","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qasem M.","family":"Kharma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christopher Ifeanyi","family":"Eke","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,2,20]]},"reference":[{"key":"18307_CR1","doi-asserted-by":"publisher","first-page":"28121","DOI":"10.1007\/s11042-018-5971-z","volume":"77","author":"AS Al-Shamayleh","year":"2018","unstructured":"Al-Shamayleh AS, Ahmad R, Abushariah MA, Alam KA, Jomhari N (2018) A systematic literature review on vision based gesture recognition techniques. Multimed Tools Appl 77:28121\u201328184","journal-title":"Multimed Tools Appl"},{"key":"18307_CR2","doi-asserted-by":"publisher","unstructured":"Anderson, P, Fernando, B, Johnson, M, Gould, S (2016) Spice: Semantic propositional image caption evaluation. Paper presented at the European conference on computer vision. https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"18307_CR3","doi-asserted-by":"crossref","unstructured":"Aneja J, Deshpande A, Schwing AG (2018) Convolutional image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5561\u20135570","DOI":"10.1109\/CVPR.2018.00583"},{"issue":"3","key":"18307_CR4","first-page":"1638","volume":"12","author":"V Atliha","year":"2022","unstructured":"Atliha V, \u0160e\u0161ok DJAS (2022) Image-Captioning Model Compression 12(3):1638","journal-title":"Image-Captioning Model Compression"},{"key":"18307_CR5","first-page":"291","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai S, An SJN (2018) A survey on automatic image caption generation 311:291\u2013304","journal-title":"A survey on automatic image caption generation"},{"key":"18307_CR6","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"18307_CR7","doi-asserted-by":"publisher","unstructured":"Bernardi, R, Cakici, R, Elliott, D, Erdem, A, Erdem, E, Ikizler-Cinbis, N, . . . Plank, BJJ O AI R (2016) Automatic description generation from images: A survey of models, datasets, and evaluation measures. 55, 409\u2013442. https:\/\/doi.org\/10.1613\/jair.4900","DOI":"10.1613\/jair.4900"},{"key":"18307_CR8","doi-asserted-by":"crossref","unstructured":"Boser BE, Guyon IM, Vapnik VN (1992) A training algorithm for optimal margin classifiers. In Proceedings of the fifth annual workshop on Computational learning theory, pp 144\u2013152","DOI":"10.1145\/130385.130401"},{"key":"18307_CR9","doi-asserted-by":"crossref","unstructured":"Caglayan O, Madhyastha P, Specia L (2020) Curious case of language generation evaluation metrics: A cautionary tale. arXiv preprint arXiv:2010.13588","DOI":"10.18653\/v1\/2020.coling-main.210"},{"key":"18307_CR10","unstructured":"Callison-Burch C, Osborne M, Koehn P (2006) Re-evaluating the role of BLEU in machine translation research. In 11th conference of the european chapter of the association for computational linguistics, pp 249\u2013256"},{"issue":"4","key":"18307_CR11","first-page":"807","volume":"13","author":"H Chen","year":"2021","unstructured":"Chen H, Ding G, Lin Z, Guo Y, Shan C, Han JJCC (2021) Image Caption Memorized Knowl 13(4):807\u2013820","journal-title":"Image Caption Memorized Knowl"},{"key":"18307_CR12","doi-asserted-by":"crossref","unstructured":"Chen L, Zhang H, Xiao J, Nie L, Shao J, Liu W, Chua TS (2017) Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5659\u20135667","DOI":"10.1109\/CVPR.2017.667"},{"key":"18307_CR13","doi-asserted-by":"crossref","unstructured":"Chen T, Liao YH, Chuang CY, Hsu WT, Fu J, Sun M (2017) Show, adapt and tell: Adversarial training of cross-domain image captioner. In Proceedings of the IEEE international conference on computer vision, pp 521\u2013530","DOI":"10.1109\/ICCV.2017.64"},{"key":"18307_CR14","doi-asserted-by":"publisher","unstructured":"Cho, K, Courville, A, Bengio, YJITOM (2015) Describing multimedia content using attention-based encoder-decoder networks. 17(11), 1875\u20131886. https:\/\/doi.org\/10.1109\/TMM.2015.2477044","DOI":"10.1109\/TMM.2015.2477044"},{"key":"18307_CR15","doi-asserted-by":"crossref","unstructured":"Cornia M, Baraldi L, Cucchiara R (2019) Show, control and tell: A framework for generating controllable and grounded captions. Paper presented at the Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2019.00850"},{"key":"18307_CR16","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. Paper presented at the Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"18307_CR17","doi-asserted-by":"crossref","unstructured":"Cui Y, Yang G, Veit A, Huang X, Belongie S (2018) Learning to evaluate image captioning. Paper presented at the Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2018.00608"},{"key":"18307_CR18","unstructured":"Dai J, Li Y, He K, Sun J (2016) R-FCN: Object detection via region-based fully convolutional networks. In: Advances in neural information processing systems, p 29"},{"key":"18307_CR19","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR'05), vol 1. IEEE, pp 886\u2013893","DOI":"10.1109\/CVPR.2005.177"},{"key":"18307_CR20","doi-asserted-by":"crossref","unstructured":"Dao DC, Nguyen TO, Bressan S (2016) Factors influencing the performance of image captioning model: an evaluation. In: Proceedings of the 14th international conference on advances in mobile computing and multi media, pp 235\u2013243","DOI":"10.1145\/3007120.3007136"},{"key":"18307_CR21","doi-asserted-by":"publisher","unstructured":"Dash, SK, Saha, S, Pakray, P, Gelbukh, AJJOI, Systems, F (2019) Generating image captions through multimodal embedding. 36(5), 4787\u20134796. https:\/\/doi.org\/10.3233\/JIFS-179027","DOI":"10.3233\/JIFS-179027"},{"key":"18307_CR22","doi-asserted-by":"publisher","unstructured":"Deng, C, Ding, N, Tan, M, Wu, Q (2020) Length-controllable image captioning. Paper presented at the European Conference on Computer Vision. https:\/\/doi.org\/10.1007\/978-3-030-58601-0_42","DOI":"10.1007\/978-3-030-58601-0_42"},{"key":"18307_CR23","unstructured":"Denoual E, Lepage Y (2005) BLEU in characters: towards automatic MT evaluation in languages without word delimiters. In: Companion volume to the proceedings of conference including posters\/demos and tutorial abstracts"},{"key":"18307_CR24","doi-asserted-by":"crossref","unstructured":"Deorukhkar K, Ket S (2022) A detailed review of prevailing image captioning methods using deep learning techniques. Multimed Tools Appl 81(1):1313\u20131336","DOI":"10.1007\/s11042-021-11293-1"},{"key":"18307_CR25","doi-asserted-by":"crossref","unstructured":"Donahue, J, Anne Hendricks, L, Guadarrama, S, Rohrbach, M, Venugopalan, S, Saenko, K, Darrell, T (2015) Long-term recurrent convolutional networks for visual recognition and description. Paper presented at the Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.21236\/ADA623249"},{"key":"18307_CR26","doi-asserted-by":"publisher","unstructured":"Dong J, Li X, Snoek CG (2018) Predicting visual features from text for image and video caption retrieval. IEEE Trans Multimed 20(12):3377\u20133388. https:\/\/doi.org\/10.1109\/TMM.2018.2832602","DOI":"10.1109\/TMM.2018.2832602"},{"key":"18307_CR27","unstructured":"Elliott, D, Keller, F (2013) Image description using visual dependency representations. Paper presented at the Proceedings of the 2013 conference on empirical methods in natural language processing"},{"key":"18307_CR28","doi-asserted-by":"publisher","unstructured":"Fang F, Wang H, Chen Y, Tang P (2018) Looking deeper and transferring attention for image captioning. Multimed Tools Appl 77:31159\u201331175. https:\/\/doi.org\/10.1007\/s11042-018-6228-6","DOI":"10.1007\/s11042-018-6228-6"},{"key":"18307_CR29","doi-asserted-by":"crossref","unstructured":"Fei Z (2020) Iterative back modification for faster image captioning. In: Proceedings of the 28th ACM international conference on multimedia, pp 3182\u20133190","DOI":"10.1145\/3394171.3413901"},{"key":"18307_CR30","doi-asserted-by":"crossref","unstructured":"Fu, K, Jin, J, Cui, R, Sha, F, Zhang, CJITOPA, Intelligence, M (2016) Aligning where to see and what to tell: Image captioning with region-based attention and scene-specific contexts. 39(12), 2321\u20132334","DOI":"10.1109\/TPAMI.2016.2642953"},{"key":"18307_CR31","doi-asserted-by":"publisher","unstructured":"Gao, L, Guo, Z, Zhang, H, Xu, X, Shen, HTJITOM (2017) Video captioning with attention-based LSTM and semantic consistency. 19(9), 2045\u20132055. https:\/\/doi.org\/10.1109\/TMM.2017.2729019","DOI":"10.1109\/TMM.2017.2729019"},{"key":"18307_CR32","doi-asserted-by":"crossref","unstructured":"Ghandi T, Pourreza H, Mahyar H (2023) Deep learning approaches on image captioning: A review. ACM Comput Surv 56(3):1\u201339","DOI":"10.1145\/3617592"},{"key":"18307_CR33","doi-asserted-by":"publisher","unstructured":"Gong, Y, Wang, L, Hodosh, M, Hockenmaier, J, Lazebnik, S (2014) Improving image-sentence embeddings using large weakly annotated photo collections. Paper presented at the European conference on computer vision. https:\/\/doi.org\/10.1007\/978-3-319-10593-2_35","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"18307_CR34","unstructured":"Guo L, Liu J, Zhu X, Lu HJAPA (2021) Fast Sequence Generation with Multi-Agent Reinforcement Learning"},{"key":"18307_CR35","doi-asserted-by":"publisher","unstructured":"Guo, R, Ma, S, Han, YJMT, Applications (2019) Image captioning: from structural tetrad to translated sentences. 78(17), 24321\u201324346. https:\/\/doi.org\/10.1007\/s11042-018-7118-7","DOI":"10.1007\/s11042-018-7118-7"},{"issue":"3","key":"18307_CR36","first-page":"6143","volume":"22","author":"M Han","year":"2019","unstructured":"Han M, Chen W, Moges ADJCC (2019) Fast Image Caption Using LSTM 22(3):6143\u20136155","journal-title":"Fast Image Caption Using LSTM"},{"key":"18307_CR37","doi-asserted-by":"publisher","unstructured":"He X, Yang Y, Shi B, Bai X (2019) Vd-san: visual-densely semantic attention network for image caption generation. Neurocomputing 328:48\u201355. https:\/\/doi.org\/10.1016\/j.neucom.2018.02.106","DOI":"10.1016\/j.neucom.2018.02.106"},{"issue":"6","key":"18307_CR38","first-page":"1","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain MZ, Sohel F, Shiratuddin MF, Laga H (2019) A comprehensive survey of deep learning for image captioning. ACM Comput Surv (CsUR) 51(6):1\u201336","journal-title":"A Comprehensive Survey Deep Learn Image Caption"},{"key":"18307_CR39","doi-asserted-by":"crossref","unstructured":"Hosseini R, Xie P (2022) Image understanding by captioning with differentiable architecture search. In: Proceedings of the 30th ACM international conference on multimedia, pp 4665\u20134673","DOI":"10.1145\/3503161.3548150"},{"key":"18307_CR40","doi-asserted-by":"crossref","unstructured":"Johnson J, Krishna R, Stark M, Li LJ, Shamma D, Bernstein M, Fei-Fei L (2015) Image retrieval using scene graphs. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3668\u20133678","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"18307_CR41","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"18307_CR42","doi-asserted-by":"crossref","unstructured":"Kasai, J, Sakaguchi, K, Dunagan, L, Morrison, J, Bras, RL, Choi, Y, Smith, NAJAPA (2021) Transparent human evaluation for image captioning","DOI":"10.18653\/v1\/2022.naacl-main.254"},{"key":"18307_CR43","unstructured":"Kiros, R, Salakhutdinov, R, Zemel, RSJAPA (2014) Unifying visual-semantic embeddings with multimodal neural language models"},{"key":"18307_CR44","doi-asserted-by":"publisher","unstructured":"Kitchenham B, Brereton OP, Budgen D, Turner M, Bailey J, Linkman S (2009) Systematic literature reviews in software engineering\u2013a systematic literature review. Inf Softw Technol 51(1):7\u201315. https:\/\/doi.org\/10.1016\/j.infsof.2008.09.009","DOI":"10.1016\/j.infsof.2008.09.009"},{"issue":"12","key":"18307_CR45","doi-asserted-by":"publisher","first-page":"2049","DOI":"10.1016\/j.infsof.2013.07.010","volume":"55","author":"B Kitchenham","year":"2013","unstructured":"Kitchenham B, Brereton P (2013) A systematic review of systematic review process research in software engineering. Inf Softw Technol 55(12):2049\u20132075","journal-title":"Inf Softw Technol"},{"key":"18307_CR46","unstructured":"Keele S (2007) Guidelines for performing systematic literature reviews in software engineering"},{"key":"18307_CR47","unstructured":"Kitchenham, BJK, UK, Keele University (2004) Procedures for performing systematic reviews. 33(2004), 1\u201326"},{"key":"18307_CR48","doi-asserted-by":"crossref","unstructured":"Kumar, A, Goel, SJIJOHIS (2017) A survey of evolution of image captioning techniques. 14(3), 123\u2013139.","DOI":"10.3233\/HIS-170246"},{"key":"18307_CR49","doi-asserted-by":"publisher","unstructured":"Kuznetsova, P, Ordonez, V, Berg, TL, Choi, YJTOTAFCL (2014) Treetalk: Composition and compression of trees for image descriptions. 2, 351\u2013362 https:\/\/doi.org\/10.1162\/tacl_a_00188","DOI":"10.1162\/tacl_a_00188"},{"key":"18307_CR50","doi-asserted-by":"publisher","unstructured":"LeCun Y, Bottou L, Bengio Y, Haffner P (1998) Gradient-based learning applied to document recognition. Proc IEEE 86(11):2278\u20132324. https:\/\/doi.org\/10.1109\/5.726791","DOI":"10.1109\/5.726791"},{"key":"18307_CR51","doi-asserted-by":"publisher","unstructured":"Li X, Yin X, Li C, Zhang P, Hu X, Zhang L et al (2020) Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Computer vision\u2013ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16. Springer International Publishing, pp 121\u2013137. https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"18307_CR52","unstructured":"Lin CY (2004) Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381"},{"key":"18307_CR53","doi-asserted-by":"crossref","unstructured":"Lin CY, Och FJ (2004) Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics. In: Proceedings of the 42nd annual meeting of the association for computational linguistics (ACL-04), pp 605\u2013612","DOI":"10.3115\/1218955.1219032"},{"key":"18307_CR54","doi-asserted-by":"crossref","unstructured":"Liu, S, Zhu, Z, Ye, N, Guadarrama, S, Murphy, K (2017) Improved image captioning via policy gradient optimization of spider. Paper presented at the Proceedings of the IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2017.100"},{"key":"18307_CR55","unstructured":"Liu, S, Zhu, Z, Ye, N, Guadarrama, S, Murphy, KJAPA (2016). Optimization of image description metrics using policy gradient methods. 5"},{"key":"18307_CR56","doi-asserted-by":"publisher","unstructured":"Liu W, Anguelov D, Erhan D, Szegedy C, Reed S, Fu CY, Berg AC (2016) SSD: Single shot multibox detector. In: Computer vision\u2013ECCV 2016: 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, proceedings, part I 14. Springer International Publishing, pp 21\u201337. https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"18307_CR57","doi-asserted-by":"publisher","unstructured":"Lowe, DGJIJOCV (2004) Distinctive image features from scale-invariant keypoints. 60(2), 91\u2013110. https:\/\/doi.org\/10.1023\/B:VISI.0000029664.99615.94","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"18307_CR58","unstructured":"Mao, J, Xu, W, Yang, Y, Wang, J, Yuille, ALJAPA (2014) Explain images with multimodal recurrent neural networks"},{"key":"18307_CR59","doi-asserted-by":"crossref","unstructured":"Mao, Y, Chen, L, Jiang, Z, Zhang, D, Zhang, Z, Shao, J, Xiao, J (2022) Rethinking the reference-based distinctive image captioning. Paper presented at the Proceedings of the 30th ACM International Conference on Multimedia","DOI":"10.1145\/3503161.3548358"},{"key":"18307_CR60","unstructured":"Mitchell, M, Dodge, J, Goyal, A, Yamaguchi, K, Stratos, K, Han, X, . . . Daum\u00e9 III, H (2012) Midge: Generating image descriptions from computer vision detections. Paper presented at the Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics"},{"key":"18307_CR61","doi-asserted-by":"publisher","unstructured":"Ojala T, Pietik\u00e4inen M, M\u00e4enp\u00e4\u00e4 T (2000) Gray scale and rotation invariant texture classification with local binary patterns. In: Computer vision-ECCV 2000: 6th European conference on computer vision Dublin, Ireland, June 26\u2013July 1, 2000 proceedings, part I 6. Springer, Berlin Heidelberg, pp 404\u2013420. https:\/\/doi.org\/10.1007\/3-540-45054-8_27","DOI":"10.1007\/3-540-45054-8_27"},{"key":"18307_CR62","doi-asserted-by":"publisher","unstructured":"Oluwasanmi A, Aftab MU, Alabdulkreem E, Kumeda B, Baagyere EY, Qin Z (2019) Captionnet: Automatic end-to-end Siamese difference captioning model with attention. IEEE Access 7:106773\u2013106783. https:\/\/doi.org\/10.1109\/ACCESS.2019.2931223","DOI":"10.1109\/ACCESS.2019.2931223"},{"key":"18307_CR63","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y, Mei T (2020) X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10971\u201310980","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"18307_CR64","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Paper presented at the proceedings of the 40th annual meeting of the Association for Computational Linguistics","DOI":"10.3115\/1073083.1073135"},{"key":"18307_CR65","doi-asserted-by":"crossref","unstructured":"Park, CC, Kim, B, Kim, GJITOPA, Intelligence, M (2018) Towards personalized image captioning via multimodal memory networks. 41(4), 999\u20131012","DOI":"10.1109\/TPAMI.2018.2824816"},{"key":"18307_CR66","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"18307_CR67","doi-asserted-by":"publisher","unstructured":"Robertson S (2004) Understanding inverse document frequency: on theoretical arguments for IDF. J Doc 60(5):503\u2013520. https:\/\/doi.org\/10.1108\/00220410410560582","DOI":"10.1108\/00220410410560582"},{"key":"18307_CR68","doi-asserted-by":"crossref","unstructured":"Sammani F, Melas-Kyriazi L (2020) Show, edit and tell: a framework for editing image captions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4808\u20134816","DOI":"10.1109\/CVPR42600.2020.00486"},{"key":"18307_CR69","doi-asserted-by":"crossref","unstructured":"Sargar O, Kinger S (2021) Image captioning methods and metrics. In: 2021 international conference on emerging smart computing and informatics (ESCI). IEEE, pp 522\u2013526","DOI":"10.1109\/ESCI50559.2021.9396839"},{"key":"18307_CR70","doi-asserted-by":"crossref","unstructured":"Schuster, S, Krishna, R, Chang, A, Fei-Fei, L, Manning, CD (2015) Generating semantically precise scene graphs from textual descriptions for improved image retrieval. Paper presented at the Proceedings of the fourth workshop on vision and language","DOI":"10.18653\/v1\/W15-2812"},{"key":"18307_CR71","doi-asserted-by":"crossref","unstructured":"Sharif N, Bennamoun M, White LR, Shah SAA (2018) Learning-based composite metrics for improved caption evaluation. In: 56th annual meeting of association for computational linguistics","DOI":"10.18653\/v1\/P18-3003"},{"key":"18307_CR72","doi-asserted-by":"publisher","unstructured":"Sharif, N, White, L, Bennamoun, M, Shah, SAA (2018) NNEval: Neural network based evaluation metric for image captioning. Paper presented at the Proceedings of the European Conference on Computer Vision (ECCV). https:\/\/doi.org\/10.1007\/978-3-030-01237-3_3","DOI":"10.1007\/978-3-030-01237-3_3"},{"key":"18307_CR73","doi-asserted-by":"crossref","unstructured":"Shetty, R, Rohrbach, M, Anne Hendricks, L, Fritz, M, Schiele, B (2017) Speaking the same language: Matching machine to human captions by adversarial training. Paper presented at the Proceedings of the IEEE International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2017.445"},{"key":"18307_CR74","doi-asserted-by":"crossref","unstructured":"Shuster, K, Humeau, S, Hu, H, Bordes, A, Weston, J (2019) Engaging image captioning via personality. Paper presented at the Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2019.01280"},{"issue":"10","key":"18307_CR75","first-page":"2024","volume":"9","author":"R Stani\u016bt\u0117","year":"2019","unstructured":"Stani\u016bt\u0117 R, \u0160e\u0161ok DJAS (2019) A System Literature Rev Image Caption 9(10):2024","journal-title":"A System Literature Rev Image Caption"},{"key":"18307_CR76","doi-asserted-by":"crossref","unstructured":"Stefanini M, Cornia M, Baraldi L, Cascianelli S, Fiameni G, Cucchiara R (2022) From show to tell: A survey on deep learning-based image captioning. IEEE Trans Pattern Anal Mach Intell 45(1):539\u2013559","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"18307_CR77","first-page":"144","volume":"367","author":"J Su","year":"2019","unstructured":"Su J, Tang J, Lu Z, Han X, Zhang H (2019) A neural image captioning model with caption-to-images semantic constructor. Neurocomputing 367:144\u2013151","journal-title":"A Neural Image Caption Model Caption-to-Images Semantic Construct"},{"key":"18307_CR78","doi-asserted-by":"publisher","unstructured":"Tan JH, Chan CS, Chuah JH(2019) Comic: Toward a compact image captioning model with attention. IEEE Trans Multimed 21(10):2686\u20132696. https:\/\/doi.org\/10.1109\/TMM.2019.2904878","DOI":"10.1109\/TMM.2019.2904878"},{"key":"18307_CR79","doi-asserted-by":"crossref","unstructured":"Tan Y, Lin Z, Fu P, Zheng M, Wang L, Cao Y, Wang W (2022) Detach and attach: Stylized image captioning without paired stylized dataset. In: Proceedings of the 30th ACM international conference on multimedia, pp 4733\u20134741","DOI":"10.1145\/3503161.3548295"},{"key":"18307_CR80","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, Parikh D (2015) Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"18307_CR81","doi-asserted-by":"crossref","unstructured":"Vinyals, O, Toshev, A, Bengio, S, Erhan, D (2015) Show and tell: A neural image caption generator. Paper presented at the Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"18307_CR82","doi-asserted-by":"crossref","unstructured":"Vinyals, O, Toshev, A, Bengio, S, Erhan, DJITOPA, Intelligence, M (2016) Show and tell: Lessons learned from the 2015 mscoco image captioning challenge. 39(4), 652\u2013663","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"18307_CR83","doi-asserted-by":"crossref","unstructured":"Wang C, Yang H, Bartz C, Meinel C (2016) Image captioning with deep bidirectional LSTMs. In: Proceedings of the 24th ACM international conference on multimedia, pp 988\u2013997","DOI":"10.1145\/2964284.2964299"},{"key":"18307_CR84","doi-asserted-by":"publisher","unstructured":"Wang C, Yang H, Meinel C (2018) Image captioning with deep bidirectional LSTMs and multi-task learning. ACM Trans Multimed Comput Commun Appl (TOMM) 14(2s):1\u201320. https:\/\/doi.org\/10.1145\/3115432","DOI":"10.1145\/3115432"},{"key":"18307_CR85","first-page":"66358","volume":"7","author":"EK Wang","year":"2019","unstructured":"Wang EK, Zhang X, Wang F, Wu TY, Chen CM (2019) Multilayer dense attention model for image caption. IEEE Access 7:66358\u201366368","journal-title":"Multilayer dense Attention Model for Image Caption"},{"key":"18307_CR86","doi-asserted-by":"crossref","unstructured":"Wang, Q, Chan, AB (2019) Describing like humans: on diversity in image captioning. Paper presented at the Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR.2019.00432"},{"key":"18307_CR87","unstructured":"Wang, Q, Wan, J, Chan, ABJITOPA, Intelligence, M (2020) On diversity in image captioning: Metrics and methods"},{"key":"18307_CR88","doi-asserted-by":"crossref","unstructured":"Wu, Q, Shen, C, Wang, P, Dick, A, Van Den Hengel, AJITOPA, Intelligence, M (2017) Image captioning and visual question answering based on attributes and external knowledge. 40(6), 1367\u20131381","DOI":"10.1109\/TPAMI.2017.2708709"},{"key":"18307_CR89","doi-asserted-by":"publisher","unstructured":"Xiao, F, Gong, X, Zhang, Y, Shen, Y, Li, J, Gao, XJN (2019) DAA: Dual LSTMs with adaptive attention for image captioning. 364, 322\u2013329. https:\/\/doi.org\/10.1016\/j.neucom.2019.06.085","DOI":"10.1016\/j.neucom.2019.06.085"},{"key":"18307_CR90","doi-asserted-by":"publisher","unstructured":"Xiao X, Wang L, Ding K, Xiang S, Pan C (2019) Deep hierarchical encoder\u2013decoder network for image captioning. IEEE Trans Multimed 21(11):2942\u20132956. https:\/\/doi.org\/10.1109\/TMM.2019.2915033","DOI":"10.1109\/TMM.2019.2915033"},{"key":"18307_CR91","first-page":"285","volume":"90","author":"X Xiao","year":"2019","unstructured":"Xiao X, Wang L, Ding K, Xiang S, Pan CJPR (2019) Dense Semantic Embedding Network for Image Captioning 90:285\u2013296","journal-title":"Dense Semantic Embedding Network for Image Captioning"},{"key":"18307_CR92","unstructured":"Xu, K, Ba, J, Kiros, R, Cho, K, Courville, A, Salakhudinov, R, . . . Bengio, Y (2015) Show, attend and tell: Neural image caption generation with visual attention. Paper presented at the International conference on machine learning"},{"key":"18307_CR93","doi-asserted-by":"crossref","unstructured":"Xu, N, Zhang, H, Liu, A-A, Nie, W, Su, Y, Nie, J, Zhang, YJITOM (2019) Multi-level policy and reward-based deep reinforcement learning framework for image captioning. 22(5), 1372\u20131383","DOI":"10.1109\/TMM.2019.2941820"},{"key":"18307_CR94","first-page":"56","volume":"328","author":"J Yang","year":"2019","unstructured":"Yang J, Sun Y, Liang J, Ren B, Lai S-HJN (2019) Image Caption Incorporating Affect Concepts Learned from both Visual and Textual Components 328:56\u201368","journal-title":"Image Caption Incorporating Affect Concepts Learned from both Visual and Textual Components"},{"key":"18307_CR95","doi-asserted-by":"publisher","unstructured":"Yang, L-C, Yang, C-Y, Hsu, JY-J (2021) Object Relation Attention for Image Paragraph Captioning. Paper presented at the Proceedings of the AAAI Conference on Artificial Intelligence. https:\/\/doi.org\/10.1609\/aaai.v35i4.16423","DOI":"10.1609\/aaai.v35i4.16423"},{"key":"18307_CR96","volume":"189","author":"L Yang","year":"2019","unstructured":"Yang L, Hu H (2019) Visual skeleton and reparative attention for part-of-speech image captioning system. Comput Vis Image Underst 189:102819","journal-title":"Visual Skeleton and Reparative Attention for Part-of-Speech image captioning system"},{"issue":"1","key":"18307_CR97","first-page":"549","volume":"50","author":"L Yang","year":"2019","unstructured":"Yang L, Hu H (2019) Adaptive syncretic attention for constrained image captioning. Neural Process Lett 50:549\u2013564","journal-title":"Adaptive syncretic attention for constrained Image captioning"},{"key":"18307_CR98","doi-asserted-by":"publisher","unstructured":"Yang M, Liu J, Shen Y, Zhao Z, Chen X, Wu Q, Li C (2020) An ensemble of generation-and retrieval-based image captioning with dual generator generative adversarial network. IEEE Trans Image Process 29:9627\u20139640. https:\/\/doi.org\/10.1109\/TIP.2020.3028651","DOI":"10.1109\/TIP.2020.3028651"},{"key":"18307_CR99","doi-asserted-by":"crossref","unstructured":"Yang, M, Zhao, W, Xu, W, Feng, Y, Zhao, Z, Chen, X, Lei, KJITOM (2018) Multitask learning for cross-domain image captioning. 21(4), 1047\u20131061","DOI":"10.1109\/TMM.2018.2869276"},{"key":"18307_CR100","doi-asserted-by":"crossref","unstructured":"You, Q, Jin, H, Wang, Z, Fang, C, Luo, J (2016) Image captioning with semantic attention. Paper presented at the Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.503"},{"key":"18307_CR101","doi-asserted-by":"crossref","unstructured":"Yu, N, Hu, X, Song, B, Yang, J, Zhang, JJITOIP (2018) Topic-oriented image captioning based on order-embedding. 28(6), 2743\u20132754","DOI":"10.1109\/TIP.2018.2889922"},{"key":"18307_CR102","first-page":"132","volume":"392","author":"X Zeng","year":"2020","unstructured":"Zeng X, Wen L, Liu B, Qi XJN (2020) Deep Learning for Ultrasound Image Caption Generation Based on Object Detection 392:132\u2013141","journal-title":"Deep Learning for Ultrasound Image Caption Generation Based on Object Detection"},{"key":"18307_CR103","doi-asserted-by":"publisher","unstructured":"Zhang, J, Li, K, Wang, Z, Zhao, X, Wang, ZJESWA (2021) Visual enhanced gLSTM for image captioning. 184, 115462. https:\/\/doi.org\/10.1016\/j.eswa.2021.115462","DOI":"10.1016\/j.eswa.2021.115462"},{"key":"18307_CR104","doi-asserted-by":"publisher","unstructured":"Zhang J, Li K, Wang Z (2021) Parallel-fusion LSTM with synchronous semantic and visual information for image captioning. J Vis Commun Image Represent 75:103044. https:\/\/doi.org\/10.1016\/j.jvcir.2021.103044","DOI":"10.1016\/j.jvcir.2021.103044"},{"key":"18307_CR105","doi-asserted-by":"publisher","unstructured":"Zhang, T, Huang, M, Zhao, L (2018) Learning structured representation for text classification via reinforcement learning. Paper presented at the Thirty-Second AAAI Conference on Artificial Intelligence. https:\/\/doi.org\/10.1609\/aaai.v32i1.12047","DOI":"10.1609\/aaai.v32i1.12047"},{"key":"18307_CR106","first-page":"212","volume":"395","author":"X Zhang","year":"2020","unstructured":"Zhang X, He S, Song X, Lau RW, Jiao J, Ye QJN (2020) Image Captioning via Semantic Element Embedding 395:212\u2013221","journal-title":"Image Captioning via Semantic Element Embedding"},{"key":"18307_CR107","doi-asserted-by":"crossref","unstructured":"Zhang Z, Wu Q, Wang Y, Chen FJITOM (2018) High-quality image captioning with fine-grained and semantic-guided visual attention. 21(7):1681\u20131693","DOI":"10.1109\/TMM.2018.2888822"},{"key":"18307_CR108","doi-asserted-by":"publisher","unstructured":"Zhang Z, Zhang W, Diao W, Yan M, Gao X, Sun XJIA (2019) VAA: Visual aligning attention model for remote sensing image captioning. 7:137355\u2013137364. https:\/\/doi.org\/10.1109\/ACCESS.2019.2942154","DOI":"10.1109\/ACCESS.2019.2942154"},{"key":"18307_CR109","first-page":"55","volume":"319","author":"X Zhu","year":"2018","unstructured":"Zhu X, Li L, Liu J, Li Z, Peng H, Niu XJN (2018) Image Captioning with Triple-Attention and Stack Parallel LSTM 319:55\u201365","journal-title":"Image Captioning with Triple-Attention and Stack Parallel LSTM"},{"key":"18307_CR110","unstructured":"Zhu X, Wang W, Guo L, Liu J (2020) AutoCaption: Image captioning with neural architecture search. arXiv preprint arXiv:2012.09742"},{"key":"18307_CR111","doi-asserted-by":"crossref","unstructured":"Zohourianshahzadi Z, Kalita JK (2022) Neural attention for image captioning: review of outstanding methods. Artif Intell Rev 55(5):3833\u20133862","DOI":"10.1007\/s10462-021-10092-2"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18307-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-18307-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18307-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,2]],"date-time":"2024-04-02T13:46:32Z","timestamp":1712065592000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-18307-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,20]]},"references-count":111,"journal-issue":{"issue":"12","published-online":{"date-parts":[[2024,4]]}},"alternative-id":["18307"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-18307-8","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,20]]},"assertion":[{"value":"26 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 October 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 January 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 February 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflicts of interest to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}