{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T12:36:43Z","timestamp":1776861403706,"version":"3.51.2"},"reference-count":157,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004541","name":"Ministry of Education, India","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004541","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1016\/j.neucom.2025.132385","type":"journal-article","created":{"date-parts":[[2025,12,13]],"date-time":"2025-12-13T02:32:05Z","timestamp":1765593125000},"page":"132385","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Pixels to prose: A comprehensive survey of image captioning techniques with deep learning and generative artificial intelligence"],"prefix":"10.1016","volume":"667","author":[{"given":"Aarti","family":"Sharma","sequence":"first","affiliation":[]},{"given":"Hrishikesh","family":"Singh","sequence":"additional","affiliation":[]},{"given":"Millie","family":"Pant","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2025.132385_bib1","series-title":"IEEE International Conference on Computer Vision (ICCV) 2019","first-page":"8947","article-title":"nocaps: novel object captioning at scale","author":"Agrawal","year":"2019"},{"key":"10.1016\/j.neucom.2025.132385_bib2","unstructured":"J.-B. Alayracet al., Flamingo: a visual language model for few-shot learning2022.."},{"key":"10.1016\/j.neucom.2025.132385_bib3","series-title":"European Conference on Computer Vision (ECCV)","first-page":"382","article-title":"Spice: Semantic propositional image caption evaluation","author":"Anderson","year":"2016"},{"key":"10.1016\/j.neucom.2025.132385_bib4","article-title":"Partially-supervised image captioning","volume":"vol. 
31","author":"Anderson","year":"2018"},{"key":"10.1016\/j.neucom.2025.132385_bib5","doi-asserted-by":"crossref","unstructured":"P. Anderson et al., Bottom-up and top-down attention for image captioning and visual question answering 2018.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10.1016\/j.neucom.2025.132385_bib6","doi-asserted-by":"crossref","unstructured":"J. Aneja et al., Convolutional image captioning 2017.","DOI":"10.1109\/CVPR.2018.00583"},{"key":"10.1016\/j.neucom.2025.132385_bib7","first-page":"1877","article-title":"Language models are few-shot learners","volume":"vol. 33","author":"Brown","year":"2020"},{"key":"10.1016\/j.neucom.2025.132385_bib8","series-title":"Proceedings of the 11th Conference of the European Chapter of the Association for Computational Linguistics (EACL)","first-page":"249","article-title":"Re-evaluating the role of BLEU in machine translation research","author":"Callison-Burch","year":"2006"},{"issue":"no. 1","key":"10.1016\/j.neucom.2025.132385_bib9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TPAMI.2021.3137605","article-title":"A comprehensive survey of scene graphs: generation and application","volume":"vol. 45","author":"Chang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"no. 5","key":"10.1016\/j.neucom.2025.132385_bib10","doi-asserted-by":"crossref","DOI":"10.1145\/3465055","article-title":"An attentive survey of attention models","volume":"vol. 12","author":"Chaudhari","year":"2021","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"10.1016\/j.neucom.2025.132385_bib11","doi-asserted-by":"crossref","unstructured":"S. Chen et al., Say as you wish: Fine-grained control of image caption generation with abstract scene graphs 2020.","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"10.1016\/j.neucom.2025.132385_bib12","first-page":"2422","article-title":"Mind\u2019s eye: a recurrent visual representation for image caption generation","author":"Chen","year":"2015","journal-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit."},{"key":"10.1016\/j.neucom.2025.132385_bib13","doi-asserted-by":"crossref","unstructured":"Y.-C. Chen et al., Uniter: Universal image-text representation learning 2020.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"10.1016\/j.neucom.2025.132385_bib14","unstructured":"J. Cho et al., Unifying vision-and-language tasks via text generation 2021."},{"key":"10.1016\/j.neucom.2025.132385_bib15","doi-asserted-by":"crossref","unstructured":"J. Cho et al., Fine-grained image captioning with CLIP reward 2023.","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"10.1016\/j.neucom.2025.132385_bib16","first-page":"1800","article-title":"Xception: Deep learning with depthwise separable convolutions","author":"Chollet","year":"2017","journal-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit."},{"key":"10.1016\/j.neucom.2025.132385_bib17","first-page":"8299","article-title":"Show, control and tell: A framework for generating controllable and grounded captions","author":"Cornia","year":"2019","journal-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit."},{"issue":"no. 2","key":"10.1016\/j.neucom.2025.132385_bib18","doi-asserted-by":"crossref","first-page":"111","DOI":"10.3233\/AIC-210172","article-title":"Explaining transformer-based image captioning models: an empirical analysis","volume":"vol. 35","author":"Cornia","year":"2022","journal-title":"AI Commun."},{"key":"10.1016\/j.neucom.2025.132385_bib19","doi-asserted-by":"crossref","unstructured":"M. 
Cornia et al., Meshed-memory transformer for image captioning 2020.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"10.1016\/j.neucom.2025.132385_bib20","doi-asserted-by":"crossref","unstructured":"S. Datta et al., Align2ground: Weakly supervised phrase grounding guided by image-caption alignment 2019.","DOI":"10.1109\/ICCV.2019.00269"},{"key":"10.1016\/j.neucom.2025.132385_bib21","unstructured":"J. Devlin et al., BERT: Pre-training of deep bidirectional transformers for language understanding 2019."},{"key":"10.1016\/j.neucom.2025.132385_bib22","unstructured":"J. Devlin et al., Exploring nearest neighbor approaches for image captioning 2015."},{"key":"10.1016\/j.neucom.2025.132385_bib23","unstructured":"C. Doersch, Tutorial on variational autoencoders 2021."},{"key":"10.1016\/j.neucom.2025.132385_bib24","doi-asserted-by":"crossref","unstructured":"J. Donahue et al., Long-term recurrent convolutional networks for visual recognition and description 2016.","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"10.1016\/j.neucom.2025.132385_bib25","doi-asserted-by":"crossref","unstructured":"Z.-Y. Dou et al., An empirical study of training end-to-end vision-and-language transformers 2022.","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"10.1016\/j.neucom.2025.132385_bib26","series-title":"European Conference on Computer Vision (ECCV)","first-page":"15","article-title":"Every picture tells a story: Generating sentences from images","author":"Farhadi","year":"2010"},{"key":"10.1016\/j.neucom.2025.132385_bib27","doi-asserted-by":"crossref","unstructured":"Z. Gan et al., Vision-language pre-training: Basics, recent advances, and future trends 2022.","DOI":"10.1561\/9781638281337"},{"issue":"no. 3","key":"10.1016\/j.neucom.2025.132385_bib28","doi-asserted-by":"crossref","DOI":"10.1145\/3617592","article-title":"Deep learning approaches on image captioning: a review","volume":"56","author":"Ghandi","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.neucom.2025.132385_bib29","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"1440","article-title":"Fast R-CNN","author":"Girshick","year":"2015"},{"key":"10.1016\/j.neucom.2025.132385_bib30","doi-asserted-by":"crossref","unstructured":"J. Gu et al., Unpaired image captioning via scene graph alignments 2019.","DOI":"10.1109\/ICCV.2019.01042"},{"key":"10.1016\/j.neucom.2025.132385_bib31","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4199","article-title":"MSCap: Multi-style image captioning with unpaired stylized text","author":"Guo","year":"2019"},{"key":"10.1016\/j.neucom.2025.132385_bib32","doi-asserted-by":"crossref","unstructured":"L. Guo et al., Normalized and geometry-aware self-attention network for image captioning, 2020.","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"10.1016\/j.neucom.2025.132385_bib33","series-title":"European Conference on Computer Vision (ECCV)","first-page":"417","article-title":"Captioning images taken by people who are blind","author":"Gurari","year":"2020"},{"key":"10.1016\/j.neucom.2025.132385_bib34","unstructured":"S. Herdade et al., Image captioning: Transforming objects into words, 2020."},{"issue":"no. 6","key":"10.1016\/j.neucom.2025.132385_bib35","doi-asserted-by":"crossref","DOI":"10.1145\/3295748","article-title":"A comprehensive survey of deep learning for image captioning","volume":"vol. 51","author":"Hossain","year":"2019","journal-title":"ACM Comput. 
Surv."},{"issue":"8","key":"10.1016\/j.neucom.2025.132385_bib36","doi-asserted-by":"crossref","first-page":"5929","DOI":"10.1007\/s10462-020-09838-1","article-title":"A review on the long short-term memory model","volume":"vol. 53","author":"Van Houdt","year":"2020","journal-title":"Artif. Intell. Rev."},{"key":"10.1016\/j.neucom.2025.132385_bib37","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"2261","article-title":"Densely connected convolutional networks","author":"Huang","year":"2017"},{"key":"10.1016\/j.neucom.2025.132385_bib38","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"4633","article-title":"Attention on attention for image captioning","author":"Huang","year":"2019"},{"key":"10.1016\/j.neucom.2025.132385_bib39","doi-asserted-by":"crossref","unstructured":"Z. Huang et al., Seeing out of the box: end-to-end pre-training for vision-language representation learning 2021.","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"10.1016\/j.neucom.2025.132385_bib40","unstructured":"Z. Huang et al., Pixel-BERT: Aligning image pixels with text by deep multi-modal transformers 2020."},{"key":"10.1016\/j.neucom.2025.132385_bib41","unstructured":"S. Ioffe et al., Batch normalization: accelerating deep network training by reducing internal covariate shift 2015."},{"key":"10.1016\/j.neucom.2025.132385_bib42","unstructured":"C. Jin, 2023, Self-supervised image captioning with CLIP."},{"key":"10.1016\/j.neucom.2025.132385_bib43","unstructured":"A. Johnson et al., MIMIC-CXR-JPG, a large publicly available database of labeled chest radiographs,\u201d 2019."},{"key":"10.1016\/j.neucom.2025.132385_bib44","doi-asserted-by":"crossref","unstructured":"A. Karpathy et al., Deep visual-semantic alignments for generating image descriptions 2015.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"10.1016\/j.neucom.2025.132385_bib45","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","volume":"vol. 27","author":"Karpathy","year":"2014","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"8","key":"10.1016\/j.neucom.2025.132385_bib46","doi-asserted-by":"crossref","first-page":"5455","DOI":"10.1007\/s10462-020-09825-6","article-title":"A survey of the recent architectures of deep convolutional neural networks","volume":"vol. 53","author":"Khan","year":"2020","journal-title":"Artif. Intell. Rev."},{"issue":"no. 11","key":"10.1016\/j.neucom.2025.132385_bib47","doi-asserted-by":"crossref","first-page":"7348","DOI":"10.1109\/TPAMI.2021.3119754","article-title":"Dense relational image captioning via multi-task triple-stream networks","volume":"vol. 44","author":"Kim","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2025.132385_bib48","series-title":"Proceedings of the International Conference on Machine Learning (ICML)","first-page":"5583","article-title":"ViLT: Vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021"},{"key":"10.1016\/j.neucom.2025.132385_bib49","unstructured":"D. Kingma, , 2022, Auto-encoding variational Bayes."},{"key":"10.1016\/j.neucom.2025.132385_bib50","series-title":"Proceedings of the 31st International Conference on Machine Learning (ICML)","first-page":"595","article-title":"Multimodal neural language models","author":"Kiros","year":"2014"},{"key":"10.1016\/j.neucom.2025.132385_bib51","unstructured":"R. 
Kiros et al., Unifying visual-semantic embeddings with multimodal neural language models, 2014."},{"key":"10.1016\/j.neucom.2025.132385_bib52","unstructured":"I. Krasin et al., OpenImages: A public dataset for large-scale multi-label and multi-class image classification. Dataset available: \u3008https:\/\/github.com\/openimages\u3009, 2017."},{"key":"10.1016\/j.neucom.2025.132385_bib53","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual Genome: connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neucom.2025.132385_bib54","series-title":"Proceedings of the 2nd Workshop on Statistical Machine Translation","first-page":"228","article-title":"METEOR: An automatic metric for MT evaluation with high levels of correlation with human judgments","author":"Lavie","year":"2007"},{"key":"10.1016\/j.neucom.2025.132385_bib55","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"8927","article-title":"Entangled transformer for image captioning","author":"Li","year":"2019"},{"key":"10.1016\/j.neucom.2025.132385_bib56","doi-asserted-by":"crossref","unstructured":"X. Li et al., Oscar: Object-semantics aligned pre-training for vision-language tasks 2020.","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"10.1016\/j.neucom.2025.132385_bib57","doi-asserted-by":"crossref","unstructured":"Y. Li et al., Scene graph generation from objects, phrases and region captions 2017.","DOI":"10.1109\/ICCV.2017.142"},{"key":"10.1016\/j.neucom.2025.132385_bib58","first-page":"74","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text. Summ. Branches Out."},{"key":"10.1016\/j.neucom.2025.132385_bib59","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"Lin","year":"2014"},{"issue":"no. 6","key":"10.1016\/j.neucom.2025.132385_bib60","doi-asserted-by":"crossref","first-page":"3685","DOI":"10.1109\/TCSVT.2021.3107035","article-title":"Region-aware image captioning via interaction learning","volume":"vol. 32","author":"Liu","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.neucom.2025.132385_bib61","series-title":"Proceedings of the MICCAI","first-page":"637","article-title":"M-FLAG: Medical vision-language pre-training with frozen language models and latent space geometry optimization","author":"Liu","year":"2023"},{"key":"10.1016\/j.neucom.2025.132385_bib62","unstructured":"J. Lu et al., ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks 2019."},{"key":"10.1016\/j.neucom.2025.132385_bib63","doi-asserted-by":"crossref","unstructured":"J. Lu et al., Knowing when to look: Adaptive attention via a visual sentinel for image captioning 2017.","DOI":"10.1109\/CVPR.2017.345"},{"key":"10.1016\/j.neucom.2025.132385_bib64","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2023.107037","article-title":"Scaling-up medical vision-and-language representation learning with federated learning","volume":"vol. 126","author":"Lu","year":"2023","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.neucom.2025.132385_bib65","first-page":"2286","article-title":"Dual-level collaborative transformer for image captioning","volume":"vol. 
35","author":"Luo","year":"2021","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.neucom.2025.132385_bib66","doi-asserted-by":"crossref","unstructured":"J. Mao et al., Generation and comprehension of unambiguous object descriptions 2016.","DOI":"10.1109\/CVPR.2016.9"},{"key":"10.1016\/j.neucom.2025.132385_bib67","unstructured":"J. Mao et al., Deep captioning with multimodal recurrent neural networks (m-RNN), arXiv preprint arXiv:1412.6632, 2014."},{"key":"10.1016\/j.neucom.2025.132385_bib68","doi-asserted-by":"crossref","unstructured":"A. Mathews et al., SentiCap: Generating image descriptions with sentiments 2015.","DOI":"10.1609\/aaai.v30i1.10475"},{"issue":"no. 8","key":"10.1016\/j.neucom.2025.132385_bib69","doi-asserted-by":"crossref","first-page":"1339","DOI":"10.1109\/JAS.2022.105734","article-title":"Visuals to text: a comprehensive review on automatic image captioning","volume":"vol. 9","author":"Ming","year":"2022","journal-title":"IEEE\/CAA J. Autom. Sin."},{"key":"10.1016\/j.neucom.2025.132385_bib70","unstructured":"R. Mokady et al., ClipCap: CLIP prefix for image captioning 2021."},{"issue":"no. 12","key":"10.1016\/j.neucom.2025.132385_bib71","doi-asserted-by":"crossref","first-page":"6070","DOI":"10.1109\/JBHI.2022.3207502","article-title":"Multi-modal understanding and generation for medical images and text via vision-language pre-training","volume":"vol. 26","author":"Moon","year":"2022","journal-title":"IEEE J. Biomed. Health Inf."},{"key":"10.1016\/j.neucom.2025.132385_bib72","series-title":"Proceedings of the ML4H","first-page":"353","article-title":"Med-Flamingo: a multimodal medical few-shot learner","author":"Moor","year":"2023"},{"key":"10.1016\/j.neucom.2025.132385_bib73","doi-asserted-by":"crossref","first-page":"48","DOI":"10.1016\/j.neucom.2021.03.091","article-title":"A review on the attention mechanism of deep learning","volume":"vol. 452","author":"Niu","year":"2021","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2025.132385_bib74","article-title":"\u201cIm2text: Describing images using 1 million captioned photographs","volume":"vol. 24","author":"Ordonez","year":"2011"},{"key":"10.1016\/j.neucom.2025.132385_bib75","doi-asserted-by":"crossref","unstructured":"Y. Pan et al., X-linear attention networks for image captioning 2020.","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"10.1016\/j.neucom.2025.132385_bib76","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL)","first-page":"311","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.neucom.2025.132385_bib77","doi-asserted-by":"crossref","unstructured":"M. Pedersoli et al., Areas of attention for image captioning 2017.","DOI":"10.1109\/ICCV.2017.140"},{"key":"10.1016\/j.neucom.2025.132385_bib78","unstructured":"G. Peng et al., Dynamic fusion with intra- and inter-modality attention flow for visual question answering 2019."},{"key":"10.1016\/j.neucom.2025.132385_bib79","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"2641","article-title":"Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models","author":"Plummer","year":"2015"},{"key":"10.1016\/j.neucom.2025.132385_bib80","unstructured":"A. 
Radford et al., Language models are unsupervised multitask learners 2019."},{"key":"10.1016\/j.neucom.2025.132385_bib81","first-page":"6626","article-title":"An n-state markovian jumping particle swarm optimization algorithm","volume":"51","author":"Rahman","year":"2021"},{"key":"10.1016\/j.neucom.2025.132385_bib82","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: towards real-time object detection with region proposal networks, 2016."},{"key":"10.1016\/j.neucom.2025.132385_bib83","doi-asserted-by":"crossref","unstructured":"Alexander Selivanov, Oleg Y. Rogov, Daniil Chesakov, Artem Shelmanov, Irina Fedulova, and Dmitry V. Dylov. Medical image captioning via generative pretrained transformers, 2022.","DOI":"10.21203\/rs.3.rs-2197859\/v1"},{"key":"10.1016\/j.neucom.2025.132385_bib84","unstructured":"Nan Ding, Piyush Sharma, Sebastian Goodman, and Radu Soricut. Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In Proceedings of ACL, 2018."},{"key":"10.1016\/j.neucom.2025.132385_bib85","doi-asserted-by":"crossref","unstructured":"Kurt Shuster, Samuel Humeau, Hexiang Hu, Antoine Bordes, and Jason Weston. Engaging image captioning via personality, 2019.","DOI":"10.1109\/CVPR.2019.01280"},{"key":"10.1016\/j.neucom.2025.132385_bib86","doi-asserted-by":"crossref","unstructured":"Oleksii Sidorov, Ronghang Hu, Marcus Rohrbach, and Amanpreet Singh. Textcaps: a dataset for image captioning with reading comprehension. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part II 16, pages 742\u2013758. Springer, 2020.","DOI":"10.1007\/978-3-030-58536-5_44"},{"issue":"1","key":"10.1016\/j.neucom.2025.132385_bib87","first-page":"1929","article-title":"Dropout: a simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.neucom.2025.132385_bib88","unstructured":"Matteo Stefanini, Marcella Cornia, Lorenzo Baraldi, Silvia Cascianelli, Giuseppe Fiameni, and Rita Cucchiara. From show to tell: a survey on deep learning-based image captioning, 2021."},{"key":"10.1016\/j.neucom.2025.132385_bib89","unstructured":"Chiranjib Sur. Tpsgtr: Neural-symbolic tensor product scene-graph-triplet representation for image captioning. ArXiv, abs\/1911.10115, 2019."},{"key":"10.1016\/j.neucom.2025.132385_bib90","doi-asserted-by":"crossref","unstructured":"Hao Tan and Mohit Bansal. Lxmert: Learning cross-modality encoder representations from transformers, 2019.","DOI":"10.18653\/v1\/D19-1514"},{"key":"10.1016\/j.neucom.2025.132385_bib91","unstructured":"Maria Tsimpoukelli, Jacob Menick, Serkan Cabi, S.M. Ali Eslami, Oriol Vinyals, and Felix Hill. Multimodal few-shot learning with frozen language models, 2021."},{"key":"10.1016\/j.neucom.2025.132385_bib92","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need, 2023."},{"key":"10.1016\/j.neucom.2025.132385_bib93","doi-asserted-by":"crossref","unstructured":"Ramakrishna Vedantam, C. Lawrence Zitnick, and Devi Parikh. Cider: consensus-based image description evaluation, 2015.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.neucom.2025.132385_bib94","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 
Show and tell: a neural image caption generator, 2015.","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"4","key":"10.1016\/j.neucom.2025.132385_bib95","doi-asserted-by":"crossref","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","article-title":"Show and tell: lessons learned from the 2015 mscoco image captioning challenge","volume":"39","author":"Vinyals","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2025.132385_bib96","unstructured":"Chen Wang, Jin Zhao, and Jiaqi Gong. A survey on large language models from concept to implementation, 2024."},{"key":"10.1016\/j.neucom.2025.132385_bib97","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. Git: a generative image-to-text transformer for vision and language, 2022."},{"key":"10.1016\/j.neucom.2025.132385_bib98","article-title":"Show, reward and tell: automatic generation of narrative paragraph from photo stream by adversarial training","author":"Wang","year":"2018","journal-title":"AAAI Conf. Artif. Intell. (AAAI)"},{"key":"10.1016\/j.neucom.2025.132385_bib99","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2019.107075","article-title":"Learning visual relationship and context-aware attention for image captioning","volume":"98","author":"Wang","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.neucom.2025.132385_bib100","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework, 2022."},{"key":"10.1016\/j.neucom.2025.132385_bib101","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10760","article-title":"Visual commonsense r-cnn","author":"Wang","year":"2020"},{"key":"10.1016\/j.neucom.2025.132385_bib102","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. Simvlm: Simple visual language model pretraining with weak supervision, 2022."},{"key":"10.1016\/j.neucom.2025.132385_bib103","doi-asserted-by":"crossref","first-page":"153651","DOI":"10.1109\/ACCESS.2020.3018151","article-title":"Variations in variational autoencoders - a comparative evaluation","volume":"8","author":"Wei","year":"2020","journal-title":"IEEE Access"},{"key":"10.1016\/j.neucom.2025.132385_bib104","doi-asserted-by":"crossref","unstructured":"Qi Wu, Chunhua Shen, Lingqiao Liu, Anthony Dick, and Anton van den Hengel. What value do explicit high level concepts have in vision to language problems?, 2016.","DOI":"10.1109\/CVPR.2016.29"},{"key":"10.1016\/j.neucom.2025.132385_bib105","doi-asserted-by":"crossref","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Prahal Arora, Masoumeh Aminzadeh, Christoph Feichtenhofer, Florian Metze, and Luke Zettlemoyer. Vlm: task-agnostic video-language model pre-training for video understanding, 2021.","DOI":"10.18653\/v1\/2021.findings-acl.370"},{"issue":"1","key":"10.1016\/j.neucom.2025.132385_bib106","doi-asserted-by":"crossref","DOI":"10.1145\/3614435","article-title":"Diverse image captioning via conditional variational autoencoder and dual contrastive learning","volume":"20","author":"Xu","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. 
Appl."},{"key":"10.1016\/j.neucom.2025.132385_bib107","series-title":"Proceedings of the 32nd International Conference on Machine Learning (ICML\u201915), page 2048\u20132057. JMLR.org","article-title":"attend and tell: neural image caption generation with visual attention","author":"Kelvin Xu","year":"2015"},{"key":"10.1016\/j.neucom.2025.132385_bib108","doi-asserted-by":"crossref","first-page":"477","DOI":"10.1016\/j.jvcir.2018.12.027","article-title":"Scene graph captioner: Image captioning based on structural visual representation","volume":"58","author":"Xu","year":"2019","journal-title":"J. Vis. Commun. Image Represent."},{"issue":"4","key":"10.1016\/j.neucom.2025.132385_bib109","first-page":"3136","article-title":"Object relation attention for image paragraph captioning","volume":"35","author":"Yang","year":"2021","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.neucom.2025.132385_bib110","doi-asserted-by":"crossref","unstructured":"Xu Yang, Kaihua Tang, Hanwang Zhang, and Jianfei Cai. Auto-encoding scene graphs for image captioning, 2018.","DOI":"10.1109\/CVPR.2019.01094"},{"key":"10.1016\/j.neucom.2025.132385_bib111","unstructured":"Yezhou Yang, Ching Teo, Hal Daum\u00e9 III, and Yiannis Aloimonos. Corpus-guided sentence generation of natural images. In Regina Barzilay and Mark Johnson, editors, Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing, pages 444\u2013454, Edinburgh, Scotland, UK., July 2011. Association for Computational Linguistics."},{"key":"10.1016\/j.neucom.2025.132385_bib112","doi-asserted-by":"crossref","unstructured":"Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. Exploring visual relationship for image captioning, 2018.","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"10.1016\/j.neucom.2025.132385_bib113","series-title":"I2019 IEEE International Conference on Data Mining (ICDM)","first-page":"728","article-title":"Automatic generation of medical imaging diagnostic report with hierarchical recurrent neural network","author":"Yin","year":"2019"},{"key":"10.1016\/j.neucom.2025.132385_bib114","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. Coca: Contrastive captioners are image-text foundation models, 2022."},{"key":"10.1016\/j.neucom.2025.132385_bib115","doi-asserted-by":"crossref","unstructured":"Jianbo Yuan, Haofu Liao, Rui Luo, and Jiebo Luo. Automatic radiology report generation based on multi-view image fusion and medical concept enrichment, 2019.","DOI":"10.1007\/978-3-030-32226-7_80"},{"key":"10.1016\/j.neucom.2025.132385_bib116","author":"Yuan","year":"2021","journal-title":"Florence. A N. Found. Model Comput. Vis."},{"key":"10.1016\/j.neucom.2025.132385_bib117","doi-asserted-by":"crossref","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, Lei Zhang, Lijuan Wang, Yejin Choi, and Jianfeng Gao. Vinvl: Revisiting visual representations in vision-language models, 2021.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"10.1016\/j.neucom.2025.132385_bib118","first-page":"15460","article-title":"Rstnet: Captioning with adaptive attention on visual and non-visual words","volume":"2021","author":"Zhang","year":"2021"},{"key":"10.1016\/j.neucom.2025.132385_bib119","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1016\/j.patrec.2020.12.020","article-title":"Image captioning with transformer and knowledge graph","volume":"143","author":"Zhang","year":"2021","journal-title":"Pattern Recognit. 
Lett."},{"issue":"07","key":"10.1016\/j.neucom.2025.132385_bib120","first-page":"12984","article-title":"Memcap: Memorizing style knowledge for image captioning","volume":"34","author":"Zhao","year":"2020","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"issue":"5","key":"10.1016\/j.neucom.2025.132385_bib121","doi-asserted-by":"crossref","first-page":"3833","DOI":"10.1007\/s10462-021-10092-2","article-title":"Neural attention for image captioning: review of outstanding methods","volume":"55","author":"Zohourianshahzadi","year":"2021","journal-title":"Artif. Intell. Rev."},{"key":"10.1016\/j.neucom.2025.132385_bib122","doi-asserted-by":"crossref","first-page":"302","DOI":"10.1016\/j.inffus.2023.01.008","article-title":"Cross-modal text and visual generation: A systematic review. part 1: Image to text","volume":"93","author":"\u017belaszczyk","year":"2023","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.neucom.2025.132385_bib123","series-title":"Proceedings of the 40th International Conference on Machine Learning (ICML\u201923), Honolulu, Hawaii, USA","first-page":"814","article-title":"BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2025.132385_bib124","unstructured":"Liu, H. et al., 2023. Visual instruction tuning. arXiv preprint arXiv:2304.08485. Available at: \u3008https:\/\/arxiv.org\/abs\/2304.08485\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib125","unstructured":"Zhang, X. et al., 2023. GPT-4V(ision) as a generalist evaluator for vision-language tasks. arXiv preprint arXiv:2311.01361. Available at: \u3008https:\/\/arxiv.org\/abs\/2311.01361\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib126","unstructured":"Gemini Team et al., 2025. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805. Available at: \u3008https:\/\/arxiv.org\/abs\/2312.11805\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib127","unstructured":"Ye, Q. et al., 2024. mPLUG-Owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178."},{"key":"10.1016\/j.neucom.2025.132385_bib128","unstructured":"Bai, J. et al., 2023. Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966."},{"key":"10.1016\/j.neucom.2025.132385_bib129","unstructured":"Touvron, H. et al., 2023. LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971. Available at: \u3008https:\/\/arxiv.org\/abs\/2302.13971\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib130","unstructured":"Chen, X. et al., 2023. PaLI: a jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794. Available at: \u3008https:\/\/arxiv.org\/abs\/2209.06794\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib131","unstructured":"Tay, Y. et al., 2023. UL2: unifying language learning paradigms. arXiv preprint arXiv:2205.05131. Available at: \u3008https:\/\/arxiv.org\/abs\/2205.05131\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib132","series-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems (NeurIPS \u201923)","first-page":"49250","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"10.1016\/j.neucom.2025.132385_bib133","unstructured":"Wang, P. et al., 2024. 
Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191. Available at: \u3008https:\/\/arxiv.org\/abs\/2409.12191\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib134","unstructured":"Zhu, Z. et al., 2022. Exploring discrete diffusion models for image captioning. arXiv preprint arXiv:2211.11694. Available at: \u3008https:\/\/arxiv.org\/abs\/2211.11694\u3009."},{"issue":"1","key":"10.1016\/j.neucom.2025.132385_bib135","first-page":"1","article-title":"DiffCap: Diffusion-based real-time human motion capture using sparse IMUs and a monocular camera","volume":"1","author":"Pan","year":"2025","journal-title":"IEEE Trans. Vis. Comput. Graph."},{"key":"10.1016\/j.neucom.2025.132385_bib136","unstructured":"Wang, Y. et al., 2024. LaDiC: are diffusion models really inferior to autoregressive counterparts for image-to-text generation? arXiv preprint arXiv:2404.10763. Available at: \u3008https:\/\/arxiv.org\/abs\/2404.10763\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib137","unstructured":"Chen, T. et al., 2023. Analog bits: generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202."},{"issue":"12","key":"10.1016\/j.neucom.2025.132385_bib138","doi-asserted-by":"crossref","first-page":"2939","DOI":"10.3390\/rs14122939","article-title":"A mask-guided transformer network with topic token for remote sensing image captioning","volume":"14","author":"Ren","year":"2022","journal-title":"Remote Sens."},{"issue":"35\u201336","key":"10.1016\/j.neucom.2025.132385_bib139","doi-asserted-by":"crossref","first-page":"26661","DOI":"10.1007\/s11042-020-09294-7","article-title":"Remote sensing image caption generation via transformer and reinforcement learning","volume":"79","author":"Shen","year":"2020","journal-title":"Multimed. Tools Appl."},{"key":"10.1016\/j.neucom.2025.132385_bib140","series-title":"Proceedings of the International Joint Conference on Neural Networks (IJCNN 2020), art. no. 9207381.","article-title":"Scene attention mechanism for remote sensing image caption generation","author":"Wu","year":"2020"},{"issue":"11","key":"10.1016\/j.neucom.2025.132385_bib141","doi-asserted-by":"crossref","first-page":"2001","DOI":"10.1109\/LGRS.2020.3009243","article-title":"Multiscale methods for optical remote-sensing image captioning","volume":"18","author":"Ma","year":"2021","journal-title":"IEEE Geosci. Remote Sens. Lett."},{"issue":"9","key":"10.1016\/j.neucom.2025.132385_bib142","doi-asserted-by":"crossref","first-page":"1477","DOI":"10.3390\/rs16091477","article-title":"RS-LLaVA: A large vision-language model for joint captioning and question answering in remote sensing imagery","volume":"16","author":"Bazi","year":"2024","journal-title":"Remote Sens."},{"key":"10.1016\/j.neucom.2025.132385_bib143","first-page":"1","article-title":"Metalantis: A Comprehensive Underwater Image Enhancement Framework","volume":"62","author":"Wang","year":"2024","journal-title":"IEEE Trans. Geosci. 
Remote Sens."},{"key":"10.1016\/j.neucom.2025.132385_bib144","series-title":"2009 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.neucom.2025.132385_bib145","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1492","article-title":"Aggregated residual transformations for deep neural networks","author":"Xie","year":"2017"},{"key":"10.1016\/j.neucom.2025.132385_bib146","unstructured":"L.H.Y.-CODE, UICM-SOFF. Available at: \u3008https:\/\/gitee.com\/LHY-CODE\/UICM-SOFF\u3009."},{"key":"10.1016\/j.neucom.2025.132385_bib147","first-page":"1","article-title":"Underwater image captioning with AquaSketch-enhanced cross-scale information fusion","volume":"63","author":"Li","year":"2025","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.neucom.2025.132385_bib148","doi-asserted-by":"crossref","unstructured":"Liu, Z. et al., 2021. Swin Transformer: hierarchical vision transformer using shifted windows. arXiv preprint arXiv:2103.14030.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.neucom.2025.132385_bib150","unstructured":"Li, C. et al., 2019. An underwater image enhancement benchmark dataset and beyond. arXiv preprint arXiv:1901.05495."},{"key":"10.1016\/j.neucom.2025.132385_bib151","article-title":"Deep semantic understanding of high-resolution remote sensing image","volume":"2016","author":"Qu","year":"2016"},{"key":"10.1016\/j.neucom.2025.132385_bib152","doi-asserted-by":"crossref","DOI":"10.1007\/s00530-025-01801-w","article-title":"BMFNet: Bidirectional Multimodal Fusion Network for image captioning","volume":"31","author":"Xue","year":"2025","journal-title":"Multimed. Syst."},{"key":"10.1016\/j.neucom.2025.132385_bib153","doi-asserted-by":"crossref","first-page":"3325","DOI":"10.3390\/electronics14163325","article-title":"Image Captioning Model Based on Multi-Step Cross-Attention Cross-Modal Alignment and External Commonsense Knowledge Augmentation","volume":"14","author":"Wang","year":"2025","journal-title":"Electronics"},{"key":"10.1016\/j.neucom.2025.132385_bib154","doi-asserted-by":"crossref","unstructured":"Wang, Q., Chan, A.B., 2019. Describing like humans: on diversity in image captioning. arXiv preprint arXiv:1903.12020. self -cider.","DOI":"10.1109\/CVPR.2019.00432"},{"key":"10.1016\/j.neucom.2025.132385_bib155","series-title":"Proceedings of ACL. conceptual captions","article-title":"Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning","author":"Sharma","year":"2018"},{"key":"10.1016\/j.neucom.2025.132385_bib156","first-page":"5122","article-title":"Scene parsing through ADE20K dataset","volume":"2017","author":"Zhou","year":"2017"},{"key":"10.1016\/j.neucom.2025.132385_bib157","first-page":"4995","article-title":"Visual7W: grounded question answering in images","volume":"2016","author":"Zhu","year":"2016"},{"key":"10.1016\/j.neucom.2025.132385_bib158","article-title":"Im2Text: describing images using 1 million captioned photographs","author":"Ordonez","year":"2011","journal-title":"Int. J. Comput. 
Vis."}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231225030577?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231225030577?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T04:26:50Z","timestamp":1773894410000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231225030577"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":157,"alternative-id":["S0925231225030577"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2025.132385","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Pixels to prose: A comprehensive survey of image captioning techniques with deep learning and generative artificial intelligence","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2025.132385","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"132385"}}