{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:30:12Z","timestamp":1766068212746,"version":"3.41.2"},"reference-count":88,"publisher":"Springer Science and Business Media LLC","issue":"20","license":[{"start":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T00:00:00Z","timestamp":1748131200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T00:00:00Z","timestamp":1748131200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s00521-025-11199-1","type":"journal-article","created":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T15:01:38Z","timestamp":1748185298000},"page":"15501-15533","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Attention-based transformer model for Arabic image captioning"],"prefix":"10.1007","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1450-4172","authenticated-orcid":false,"given":"Israa","family":"Al Badarneh","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rana Husni","family":"Al Mahmoud","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bassam H.","family":"Hammo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Omar","family":"Al-Kadi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,25]]},"reference":[{"issue":"6","key":"11199_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain MZ, Sohel F, Shiratuddin MF, Laga H (2019) A comprehensive survey of deep learning for image captioning. ACM Comput Surv (CsUR) 51(6):1\u201336","journal-title":"ACM Comput Surv (CsUR)"},{"issue":"4","key":"11199_CR2","doi-asserted-by":"publisher","first-page":"571","DOI":"10.1007\/s13218-020-00679-2","volume":"34","author":"R Biswas","year":"2020","unstructured":"Biswas R, Barz M, Sonntag D (2020) Towards explanatory interactive image captioning using top-down and bottom-up features, beam search and re-ranking. KI-K\u00fcnstliche Intell 34(4):571\u2013584","journal-title":"KI-K\u00fcnstliche Intell"},{"key":"11199_CR3","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai S, An S (2018) A survey on automatic image caption generation. Neurocomputing 311:291\u2013304","journal-title":"Neurocomputing"},{"key":"11199_CR4","doi-asserted-by":"crossref","unstructured":"Ghandi T, Pourreza H, Mahyar H (2022) Deep learning approaches on image captioning: a review. arXiv preprint arXiv:2201.12944","DOI":"10.1145\/3617592"},{"key":"11199_CR5","doi-asserted-by":"crossref","unstructured":"Tien HN, Do T-H, Nguyen V-A (2020) Image captioning in vietnamese language based on deep learning network. In: International conference on computational collective intelligence, Springer, pp 789\u2013800","DOI":"10.1007\/978-3-030-63119-2_64"},{"key":"11199_CR6","doi-asserted-by":"crossref","unstructured":"Cheikh M, Zrigui M (2020) Active learning based framework for image captioning corpus creation. In: International conference on learning and intelligent optimization, Springer, pp 128\u2013142","DOI":"10.1007\/978-3-030-53552-0_14"},{"issue":"12","key":"11199_CR7","doi-asserted-by":"publisher","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","volume":"30","author":"J Yu","year":"2019","unstructured":"Yu J, Li J, Yu Z, Huang Q (2019) Multimodal transformer with multi-view visual representation for image captioning. IEEE Trans Circuits Syst Video Technol 30(12):4467\u20134480","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"11199_CR8","unstructured":"Pa WP, Nwe TL et al (2020) Automatic myanmar image captioning using CNN and LSTM-based language model. In: Proceedings of the 1st joint workshop on spoken language technologies for under-resourced languages (SLTU) and collaboration and computing for under-resourced languages (CCURL), pp 139\u2013143"},{"key":"11199_CR9","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1016\/j.patrec.2025.01.001","volume":"189","author":"AR Alsabbagh","year":"2025","unstructured":"Alsabbagh AR, Mansour T, Al-Kharabsheh M, Ebdah AS, Al-Nahhas S, Mahafza W, Al-Kadi O (2025) Minimedgpt: efficient large vision-language model for medical visual question answering. Patt Recogn Lett 189:8\u201316","journal-title":"Patt Recogn Lett"},{"key":"11199_CR10","doi-asserted-by":"publisher","first-page":"1093","DOI":"10.1109\/TLT.2024.3358864","volume":"17","author":"HY Ayyoub","year":"2024","unstructured":"Ayyoub HY, Al-Kadi OS (2024) Learning style identification using semisupervised self-taught labeling. IEEE Trans Learn Technol 17:1093\u20131106","journal-title":"IEEE Trans Learn Technol"},{"key":"11199_CR11","doi-asserted-by":"publisher","unstructured":"Ahsan H, Bhatt D, Shah K, Bhalla N (2021) Multi-modal image captioning for the visually impaired. In: Proceedings of the 2021 conference of the north American chapter of the association for computational linguistics: student research workshop. Association for Computational Linguistics, pp 53\u201360. https:\/\/doi.org\/10.18653\/v1\/2021.naacl-srw.8 . https:\/\/aclanthology.org\/2021.naacl-srw.8","DOI":"10.18653\/v1\/2021.naacl-srw.8"},{"key":"11199_CR12","doi-asserted-by":"crossref","unstructured":"Fudholi DH, Windiatmoko Y, Afrianto N, Susanto PE, Suyuti M, Hidayatullah AF, Rahmadi R (2021) Image captioning with attention for smart local tourism using efficientnet. In: IOP conference series: materials science and engineering, IOP Publishing, vol. 1077, p 012038","DOI":"10.1088\/1757-899X\/1077\/1\/012038"},{"issue":"4","key":"11199_CR13","doi-asserted-by":"publisher","first-page":"2150044","DOI":"10.1142\/S0219467821500443","volume":"21","author":"M Nivedita","year":"2021","unstructured":"Nivedita M, Chandrashekar P, Mahapatra S, Phamila YAV, Selvaperumal SK (2021) Image captioning for video surveillance system using neural networks. Int J Image Graph 21(4):2150044","journal-title":"Int J Image Graph"},{"key":"11199_CR14","doi-asserted-by":"publisher","first-page":"4462","DOI":"10.1109\/JSTARS.2020.3013818","volume":"13","author":"G Hoxha","year":"2020","unstructured":"Hoxha G, Melgani F, Demir B (2020) Toward remote sensing image retrieval under a deep image captioning perspective. IEEE J Select Top Appl Earth Obs Remote Sens 13:4462\u20134475","journal-title":"IEEE J Select Top Appl Earth Obs Remote Sens"},{"key":"11199_CR15","doi-asserted-by":"crossref","unstructured":"Wang Z, Huang Z, Luo Y (2020) Paic: parallelised attentive image captioning. In: Australasian database conference, Springer, pp 16\u201328","DOI":"10.1007\/978-3-030-39469-1_2"},{"key":"11199_CR16","doi-asserted-by":"crossref","unstructured":"Shuster K, Humeau S, Hu H, Bordes A, Weston J (2019) Engaging image captioning via personality. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 12516\u201312526","DOI":"10.1109\/CVPR.2019.01280"},{"key":"11199_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.iatssr.2019.11.008","author":"H Fujiyoshi","year":"2019","unstructured":"Fujiyoshi H, Hirakawa T, Yamashita T (2019) Deep learning-based image recognition for autonomous driving. IATSS Res. https:\/\/doi.org\/10.1016\/j.iatssr.2019.11.008","journal-title":"IATSS Res"},{"key":"11199_CR18","doi-asserted-by":"crossref","unstructured":"Shah AP, Lamare J-B, Nguyen-Anh T, Hauptmann A (2018) Cadp: a novel dataset for cctv traffic camera based accident analysis. In: 2018 15th IEEE international conference on advanced video and signal based surveillance (AVSS), IEEE, pp 1\u20139","DOI":"10.1109\/AVSS.2018.8639160"},{"key":"11199_CR19","doi-asserted-by":"crossref","unstructured":"Guinness D, Cutrell E, Morris MR (2018) Caption crawler: enabling reusable alternative text descriptions using reverse image search. In: Proceedings of the 2018 CHI conference on human factors in computing systems, pp 1\u201311","DOI":"10.1145\/3173574.3174092"},{"key":"11199_CR20","doi-asserted-by":"crossref","unstructured":"Huang Q, Yang L, Huang H, Wu T, Lin D (2020) Caption-supervised face recognition: training a state-of-the-art face model without manual annotation. In: European conference on computer vision, Springer, pp 139\u2013155","DOI":"10.1007\/978-3-030-58520-4_9"},{"key":"11199_CR21","doi-asserted-by":"publisher","unstructured":"Elhagry A, Kadaoui K (2021) A thorough review on recent deep learning methodologies for image captioning. arXiv e-prints, 2107\u201313114. https:\/\/doi.org\/10.48550\/arXiv.2107.13114, arXiv:2107.13114 [cs.CV]","DOI":"10.48550\/arXiv.2107.13114"},{"key":"11199_CR22","doi-asserted-by":"crossref","unstructured":"Alyafeai Z, Al-Shaibani M (2020) Arbml: democritizing Arabic natural language processing tools. In: Proceedings of second workshop for NLP open source software (NLP-OSS), pp 8\u201313","DOI":"10.18653\/v1\/2020.nlposs-1.2"},{"issue":"41","key":"11199_CR23","doi-asserted-by":"publisher","first-page":"30615","DOI":"10.1007\/s11042-020-09539-5","volume":"79","author":"T Carmo Nogueira","year":"2020","unstructured":"Carmo Nogueira T, Vinhal CDN, Cruz J\u00fanior G, Ullmann MRD (2020) Reference-based model using multimodal gated recurrent units for image captioning. Multimed Tools Appl 79(41):30615\u201330635","journal-title":"Multimed Tools Appl"},{"key":"11199_CR24","doi-asserted-by":"crossref","unstructured":"Zhang W, Nie W, Li X, Yu Y (2019) Image caption generation with adaptive transformer. In: 2019 34rd youth academic annual conference of Chinese association of automation (YAC), IEEE, pp 521\u2013526","DOI":"10.1109\/YAC.2019.8787715"},{"issue":"9","key":"11199_CR25","doi-asserted-by":"publisher","first-page":"6977","DOI":"10.1016\/j.jksuci.2022.02.018","volume":"34","author":"A Abu-Srhan","year":"2022","unstructured":"Abu-Srhan A, Abushariah MA, Al-Kadi OS (2022) The effect of loss function on conditional generative adversarial networks. J King Saud Univ-Comput Inf Sci 34(9):6977\u20136988","journal-title":"J King Saud Univ-Comput Inf Sci"},{"key":"11199_CR26","doi-asserted-by":"publisher","first-page":"110077","DOI":"10.1016\/j.compeleceng.2025.110077","volume":"123","author":"I Al Badarneh","year":"2025","unstructured":"Al Badarneh I, Hammo BH, Al-Kadi O (2025) An ensemble model with attention based mechanism for image captioning. Comput Electr Eng 123:110077","journal-title":"Comput Electr Eng"},{"key":"11199_CR27","doi-asserted-by":"publisher","first-page":"273","DOI":"10.1109\/ACCESS.2020.3041676","volume":"9","author":"MAH Madhfar","year":"2020","unstructured":"Madhfar MAH, Qamar AM (2020) Effective deep learning models for automatic diacritization of Arabic text. IEEE Access 9:273\u2013288","journal-title":"IEEE Access"},{"key":"11199_CR28","doi-asserted-by":"publisher","first-page":"18772","DOI":"10.1109\/ACCESS.2019.2896713","volume":"7","author":"J Zakraoui","year":"2019","unstructured":"Zakraoui J, Elloumi S, Alja\u2019am JM, Yahia SB (2019) Improving Arabic text to image mapping using a robust machine learning technique. IEEE Access 7:18772\u201318782","journal-title":"IEEE Access"},{"key":"11199_CR29","doi-asserted-by":"crossref","unstructured":"ElJundi O, Dhaybi M, Mokadam K, Hajj HM, Asmar DC (2020) Resources and end-to-end neural network models for Arabic image captioning. In: VISIGRAPP (5: VISAPP), pp 233\u2013241","DOI":"10.5220\/0008881202330241"},{"key":"11199_CR30","doi-asserted-by":"crossref","unstructured":"Attai A, Elnagar A (2020) A survey on arabic image captioning systems using deep learning models. In: 2020 14th international conference on innovations in information technology (IIT), IEEE, pp 114\u2013119","DOI":"10.1109\/IIT50501.2020.9299027"},{"key":"11199_CR31","doi-asserted-by":"crossref","unstructured":"Pedersoli M, Lucas T, Schmid C, Verbeek J (2017) Areas of attention for image captioning. In: Proceedings of the IEEE international conference on computer vision, pp 1242\u20131250","DOI":"10.1109\/ICCV.2017.140"},{"key":"11199_CR32","doi-asserted-by":"crossref","unstructured":"He S, Liao W, Tavakoli HR, Yang M, Rosenhahn B, Pugeault N (2020) Image captioning through image transformer. In: Proceedings of the Asian conference on computer vision, pp 1\u201317","DOI":"10.1007\/978-3-030-69538-5_10"},{"key":"11199_CR33","doi-asserted-by":"publisher","first-page":"5538927","DOI":"10.1155\/2021\/5538927","volume":"2021","author":"A Oluwasammi","year":"2021","unstructured":"Oluwasammi A, Aftab MU, Qin Z, Ngo ST, Doan TV, Nguyen SB, Nguyen SH, Nguyen GH (2021) Features to text: a comprehensive survey of deep learning on semantic segmentation and image captioning. Complexity 2021:5538927","journal-title":"Complexity"},{"issue":"7","key":"11199_CR34","doi-asserted-by":"publisher","first-page":"5721","DOI":"10.1002\/cpe.5721","volume":"34","author":"J Chen","year":"2022","unstructured":"Chen J, Zhuge H (2022) A news image captioning approach based on multimodal pointer-generator network. Concurr Comput: Pract Exp 34(7):5721","journal-title":"Concurr Comput: Pract Exp"},{"issue":"1","key":"11199_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-021-00444-8","volume":"8","author":"L Alzubaidi","year":"2021","unstructured":"Alzubaidi L, Zhang J, Humaidi AJ, Al-Dujaili A, Duan Y, Al-Shamma O, Santamar\u00eda J, Fadhel MA, Al-Amidie M, Farhan L (2021) Review of deep learning: concepts, CNN architectures, challenges, applications, future directions. J Big Data 8(1):1\u201374","journal-title":"J Big Data"},{"key":"11199_CR36","doi-asserted-by":"crossref","unstructured":"Faiyaz\u00a0Khan M, Sadiq-Ur-Rahman S, Islam S et al (2021) Improved Bengali image captioning via deep convolutional neural network based encoder-decoder model. In: Proceedings of international joint conference on advances in computational intelligence, Springer, pp 217\u2013229","DOI":"10.1007\/978-981-16-0586-4_18"},{"key":"11199_CR37","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"issue":"10","key":"11199_CR38","doi-asserted-by":"publisher","first-page":"2024","DOI":"10.3390\/app9102024","volume":"9","author":"R Stani\u016bt\u0117","year":"2019","unstructured":"Stani\u016bt\u0117 R, \u0160e\u0161ok D (2019) A systematic literature review on image captioning. Appl Sci 9(10):2024","journal-title":"Appl Sci"},{"key":"11199_CR39","unstructured":"Tan M, Le Q (2019) Efficientnet: rethinking model scaling for convolutional neural networks. In: International conference on machine learning, PMLR, pp 6105\u20136114"},{"key":"11199_CR40","doi-asserted-by":"publisher","first-page":"106691","DOI":"10.1016\/j.asoc.2020.106691","volume":"96","author":"G Marques","year":"2020","unstructured":"Marques G, Agarwal D, Torre D\u00edez I (2020) Automated medical diagnosis of Covid-19 through efficientnet convolutional neural network. Appl Soft Comput 96:106691","journal-title":"Appl Soft Comput"},{"key":"11199_CR41","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems, pp 5998\u20136008"},{"issue":"1","key":"11199_CR42","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2022","unstructured":"Stefanini M, Cornia M, Baraldi L, Cascianelli S, Fiameni G, Cucchiara R (2022) From show to tell: a survey on deep learning-based image captioning. IEEE Trans Patt Anal Mach Intell 45(1):539\u2013559","journal-title":"IEEE Trans Patt Anal Mach Intell"},{"issue":"15","key":"11199_CR43","doi-asserted-by":"publisher","first-page":"764","DOI":"10.1049\/el.2020.0635","volume":"56","author":"D Wang","year":"2020","unstructured":"Wang D, Hu H, Chen D (2020) Transformer with sparse self-attention mechanism for image captioning. Electron Lett 56(15):764\u2013766","journal-title":"Electron Lett"},{"key":"11199_CR44","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) Bert: Pre-training of deep bidirectional transformers for language understanding. In: North American chapter of the association for computational linguistics. https:\/\/api.semanticscholar.org\/CorpusID:52967399"},{"issue":"2","key":"11199_CR45","doi-asserted-by":"publisher","first-page":"111","DOI":"10.3233\/AIC-210172","volume":"35","author":"M Cornia","year":"2022","unstructured":"Cornia M, Baraldi L, Cucchiara R (2022) Explaining transformer-based image captioning models: an empirical analysis. AI Commun 35(2):111\u2013129","journal-title":"AI Commun"},{"key":"11199_CR46","doi-asserted-by":"publisher","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics. Association for Computational Linguistics, Philadelphia, pp 311\u2013318. https:\/\/doi.org\/10.3115\/1073083.1073135 . https:\/\/aclanthology.org\/P02-1040","DOI":"10.3115\/1073083.1073135"},{"key":"11199_CR47","unstructured":"Lin C-Y (2004) Rouge: a package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381"},{"key":"11199_CR48","unstructured":"Banerjee S, Lavie A (2005) Meteor: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the Acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"11199_CR49","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"11199_CR50","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: semantic propositional image caption evaluation. In: European conference on computer vision, Springer, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"11199_CR51","doi-asserted-by":"publisher","unstructured":"Ghandi T, Pourreza H, Mahyar H (2022) Deep learning approaches on image captioning: a review. arXiv e-prints, 2201\u201312944. https:\/\/doi.org\/10.48550\/arXiv.2201.12944arXiv:2201.12944 [cs.CV]","DOI":"10.48550\/arXiv.2201.12944"},{"key":"11199_CR52","unstructured":"Lin C-Y (2004) Rouge: a package for automatic evaluation of summaries, p 10"},{"key":"11199_CR53","doi-asserted-by":"crossref","unstructured":"Jindal V (2017) A deep learning approach for Arabic caption generation using roots-words. In: Proceedings of the AAAI conference on artificial intelligence, vol. 31, pp 4941\u20134942","DOI":"10.1609\/aaai.v31i1.11090"},{"issue":"11","key":"11199_CR54","doi-asserted-by":"publisher","first-page":"5228","DOI":"10.3390\/app11115228","volume":"11","author":"W Almanaseer","year":"2021","unstructured":"Almanaseer W, Alshraideh M, Alkadi O (2021) A deep belief network classification approach for automatic diacritization of Arabic text. Appl Sci 11(11):5228","journal-title":"Appl Sci"},{"key":"11199_CR55","doi-asserted-by":"crossref","unstructured":"Jindal V (2018) Generating image captions in Arabic using root-word based recurrent neural networks and deep neural networks. In: Proceedings of the AAAI conference on artificial intelligence, vol. 32, pp 8093\u20138094","DOI":"10.1609\/aaai.v32i1.12179"},{"key":"11199_CR56","doi-asserted-by":"publisher","DOI":"10.14569\/IJACSA.2018.090610","author":"HA Al-muzaini","year":"2018","unstructured":"Al-muzaini HA, Al-yahya TN, Benhidour H (2018) Automatic Arabic image captioning using RNN-LTM-based language model and CNN. Int J Adv Comput Sci Appl. https:\/\/doi.org\/10.14569\/IJACSA.2018.090610","journal-title":"Int J Adv Comput Sci Appl"},{"issue":"3","key":"11199_CR57","first-page":"205","volume":"6","author":"R Mualla","year":"2018","unstructured":"Mualla R, Alkheir J (2018) Development of an Arabic image description system. Int J Comput Sci Trends Technol (IJCST) 6(3):205\u2013213","journal-title":"Int J Comput Sci Trends Technol (IJCST)"},{"key":"11199_CR58","doi-asserted-by":"crossref","unstructured":"Hejazi H, Shaalan K (2021) Deep learning for Arabic image captioning: a comparative study of main factors and preprocessing recommendations. Int J Adv Comput Sci Appl 12(11)","DOI":"10.14569\/IJACSA.2021.0121105"},{"issue":"7","key":"11199_CR59","first-page":"11","volume":"13","author":"MT Lasheen","year":"2022","unstructured":"Lasheen MT, Barakat NH (2022) Arabic image captioning: the effect of text pre-processing on the attention weights and the bleu-n scores. Int J Adv Comput Sci Appl 13(7):11","journal-title":"Int J Adv Comput Sci Appl"},{"key":"11199_CR60","doi-asserted-by":"crossref","unstructured":"Emami J, Nugues P, Elnagar A, Afyouni I (2022) Arabic image captioning using pre-training of deep bidirectional transformers. In: Proceedings of the 15th international conference on natural language generation, pp 40\u201351","DOI":"10.18653\/v1\/2022.inlg-main.4"},{"key":"11199_CR61","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00521-023-08744-1","volume":"35","author":"S Elbedwehy","year":"2023","unstructured":"Elbedwehy S, Medhat T (2023) Improved Arabic image captioning model using feature concatenation with pre-trained word embedding. Neural Comput Appl 35:1\u201317. https:\/\/doi.org\/10.1007\/s00521-023-08744-1","journal-title":"Neural Comput Appl"},{"key":"11199_CR62","unstructured":"Sabri SM (2021) Arabic image captioning using deep learning with attention. PhD thesis, University of Georgia"},{"key":"11199_CR63","doi-asserted-by":"publisher","DOI":"10.3390\/app13042446","author":"S Cho","year":"2023","unstructured":"Cho S, Oh H (2023) Generalized image captioning for multilingual support. Appl Sci. https:\/\/doi.org\/10.3390\/app13042446","journal-title":"Appl Sci"},{"key":"11199_CR64","unstructured":"Colombo F (2020) Transfer learning analysis of fashion image captioning systems"},{"key":"11199_CR65","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1016\/j.iotcps.2023.04.003","volume":"3","author":"PP Ray","year":"2023","unstructured":"Ray PP (2023) Chatgpt: a comprehensive review on background, applications, key challenges, bias, ethics, limitations and future scope. Internet of Things Cyber-Phys Syst 3:121\u2013154. https:\/\/doi.org\/10.1016\/j.iotcps.2023.04.003","journal-title":"Internet of Things Cyber-Phys Syst"},{"key":"11199_CR66","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J (2014) From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. TACL 2:67\u201378","journal-title":"TACL"},{"key":"11199_CR67","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899","journal-title":"J Artif Intell Res"},{"key":"11199_CR68","unstructured":"van Miltenburg E (October 2019) Pragmatic factors in (automatic) image description. PhD thesis, Vrije Universiteit Amsterdam"},{"key":"11199_CR69","doi-asserted-by":"publisher","first-page":"113598","DOI":"10.1016\/j.eswa.2020.113598","volume":"159","author":"RH AlMahmoud","year":"2020","unstructured":"AlMahmoud RH, Hammo B, Faris H (2020) A modified bond energy algorithm with fuzzy merging and its application to Arabic text document clustering. Expert Syst Appl 159:113598. https:\/\/doi.org\/10.1016\/j.eswa.2020.113598","journal-title":"Expert Syst Appl"},{"key":"11199_CR70","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention. In: International conference on machine learning, PMLR, pp 2048\u20132057"},{"key":"11199_CR71","doi-asserted-by":"crossref","unstructured":"Gong L, Crego JM, Senellart J (2019) Enhanced transformer model for data-to-text generation. In: Proceedings of the 3rd workshop on neural generation and translation, pp 148\u2013156","DOI":"10.18653\/v1\/D19-5615"},{"key":"11199_CR72","doi-asserted-by":"crossref","unstructured":"Wolf T, Chaumond J, Debut L, Sanh V, Delangue C, Moi A, Cistac P, Funtowicz M, Davison J, Shleifer S et al (2020) Transformers: state-of-the-art natural language processing. In: Proceedings of the 2020 conference on empirical methods in natural language processing: system demonstrations, pp 38\u201345","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"11199_CR73","doi-asserted-by":"publisher","first-page":"3062706","DOI":"10.1155\/2020\/3062706","volume":"2020","author":"H Wang","year":"2020","unstructured":"Wang H, Zhang Y, Yu X (2020) An overview of image caption generation methods. Comput Intell Neurosci 2020:3062706","journal-title":"Comput Intell Neurosci"},{"key":"11199_CR74","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai S, An S (2018) A survey on automatic image caption generation. Neurocomputing 311:291\u2013304. https:\/\/doi.org\/10.1016\/j.neucom.2018.05.080","journal-title":"Neurocomputing"},{"issue":"7","key":"11199_CR75","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.5721","volume":"34","author":"J Chen","year":"2019","unstructured":"Chen J, Zhuge H (2019) A news image captioning approach based on multimodal pointer-generator network. Concurr Comput: Pract Exp 34(7):e5721","journal-title":"Concurr Comput: Pract Exp"},{"key":"11199_CR76","unstructured":"Li J, Monroe W, Jurafsky D (2016) A simple, fast diverse decoding algorithm for neural generation. ArXiv:abs\/1611.08562"},{"key":"11199_CR77","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1007\/s11704-019-8208-z","volume":"14","author":"X Dong","year":"2020","unstructured":"Dong X, Yu Z, Cao W, Shi Y, Ma Q (2020) A survey on ensemble learning. Front Comp Sci 14:241\u2013258","journal-title":"Front Comp Sci"},{"key":"11199_CR78","unstructured":"Velioglu R, Rose J (2020) Detecting hate speech in memes using multimodal deep learning approaches: prize-winning solution to hateful memes challenge. arXiv preprint arXiv:2012.12975"},{"key":"11199_CR79","unstructured":"Kingma D, Ba J (2014) Adam: a method for stochastic optimization. In: International conference on learning representations"},{"issue":"9","key":"11199_CR80","doi-asserted-by":"publisher","first-page":"101750","DOI":"10.1016\/j.jksuci.2023.101750","volume":"35","author":"A Alsayed","year":"2023","unstructured":"Alsayed A, Qadah TM, Arif M (2023) A performance analysis of transformer-based deep learning models for Arabic image captioning. J King Saud Univ-Comput Inf Sci 35(9):101750","journal-title":"J King Saud Univ-Comput Inf Sci"},{"key":"11199_CR81","doi-asserted-by":"publisher","first-page":"109420","DOI":"10.1016\/j.patcog.2023.109420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma Y, Ji J, Sun X, Zhou Y, Ji R (2023) Towards local visual modeling for image captioning. Patt Recogn 138:109420","journal-title":"Patt Recogn"},{"key":"11199_CR82","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"11199_CR83","doi-asserted-by":"crossref","unstructured":"Kalimuthu M, Mogadala A, Mosbach M, Klakow D (2021) Fusion models for improved image captioning. In: Pattern recognition. ICPR international workshops and challenges: Virtual Event, January 10\u201315, 2021, Proceedings, Part VI, Springer, pp 381\u2013395","DOI":"10.1007\/978-3-030-68780-9_32"},{"issue":"11","key":"11199_CR84","doi-asserted-by":"publisher","first-page":"1655","DOI":"10.1007\/s00371-018-1565-z","volume":"35","author":"T Jiang","year":"2019","unstructured":"Jiang T, Zhang Z, Yang Y (2019) Modeling coverage with semantic embedding for image caption generation. Vis Comput 35(11):1655\u20131665","journal-title":"Vis Comput"},{"issue":"4","key":"11199_CR85","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3576927","volume":"19","author":"A Abdussalam","year":"2023","unstructured":"Abdussalam A, Ye Z, Hawbani A, Al-Qatf M, Khan R (2023) Numcap: a number-controlled multi-caption image captioning network. ACM Trans Multimed Comput Commun Appl 19(4):1\u201324","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"11199_CR86","doi-asserted-by":"crossref","unstructured":"Shrimal A, Chakraborty T (2021) Attention beam: An image captioning approach (student abstract). In: Proceedings of the AAAI conference on artificial intelligence, vol. 35, pp 15887\u201315888","DOI":"10.1609\/aaai.v35i18.17940"},{"key":"11199_CR87","doi-asserted-by":"publisher","first-page":"1180","DOI":"10.1109\/TIP.2020.3042086","volume":"30","author":"W Zhao","year":"2020","unstructured":"Zhao W, Wu X, Luo J (2020) Cross-domain image captioning via cross-modal retrieval and model adaptation. IEEE Trans Image Process 30:1180\u20131192","journal-title":"IEEE Trans Image Process"},{"key":"11199_CR88","doi-asserted-by":"publisher","first-page":"101712","DOI":"10.1016\/j.bpg.2020.101712","volume":"52","author":"T Eelbode","year":"2021","unstructured":"Eelbode T, Sinonquel P, Maes F, Bisschops R (2021) Pitfalls in training and validation of deep learning systems. Best Pract Res Clin Gastroenterol 52:101712","journal-title":"Best Pract Res Clin Gastroenterol"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-025-11199-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-025-11199-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-025-11199-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T11:06:42Z","timestamp":1751886402000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-025-11199-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,25]]},"references-count":88,"journal-issue":{"issue":"20","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["11199"],"URL":"https:\/\/doi.org\/10.1007\/s00521-025-11199-1","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"type":"print","value":"0941-0643"},{"type":"electronic","value":"1433-3058"}],"subject":[],"published":{"date-parts":[[2025,5,25]]},"assertion":[{"value":"27 November 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declaration"}},{"value":"The authors declare that they have no known competing financial interests or personal relations that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}