{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T06:39:05Z","timestamp":1776926345279,"version":"3.51.2"},"reference-count":52,"publisher":"Tech Science Press","issue":"3","license":[{"start":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T00:00:00Z","timestamp":1754179200000},"content-version":"vor","delay-in-days":214,"URL":"https:\/\/doi.org\/10.32604\/TSP-CROSSMARKPOLICY"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.065169","type":"journal-article","created":{"date-parts":[[2025,7,10]],"date-time":"2025-07-10T09:14:33Z","timestamp":1752138873000},"page":"5487-5508","update-policy":"https:\/\/doi.org\/10.32604\/tsp-crossmarkpolicy","source":"Crossref","is-referenced-by-count":0,"title":["LREGT: Local Relationship Enhanced Gated Transformer for Image Captioning"],"prefix":"10.32604","volume":"84","author":[{"given":"Yuting","family":"He","sequence":"first","affiliation":[]},{"given":"Zetao","family":"Jiang","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"119773","DOI":"10.1016\/j.eswa.2023.119773","article-title":"Evolution of visual data captioning methods, datasets, and evaluation metrics: a comprehensive survey","volume":"221","author":"Sharma","year":"2023","journal-title":"Expert Syst Appl"},{"key":"ref2","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TPAMI.2024.3522295","article-title":"A review of deep learning for video captioning","volume":"2024","author":"Abdar","year":"2024","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"3833","DOI":"10.1007\/s10462-021-10092-2","article-title":"Neural attention for image captioning: review of outstanding methods","volume":"55","author":"Zohourianshahzadi","year":"2022","journal-title":"Artif Intell Rev"},{"key":"ref4","doi-asserted-by":"crossref","first-page":"103264","DOI":"10.1016\/j.media.2024.103264","article-title":"From vision to text: a comprehensive review of natural image captioning in medical diagnosis and radiology report generation","volume":"97","author":"Reale-Nosei","year":"2024","journal-title":"Med Image Anal"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","article-title":"From show to tell: a survey on deep learning-based image captioning","volume":"45","author":"Stefanini","year":"2023","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref6","doi-asserted-by":"crossref","first-page":"635","DOI":"10.37394\/23203.2020.15.63","article-title":"A survey on different deep learning architectures for image captioning","volume":"15","author":"Nivedita","year":"2020","journal-title":"WSEAS Trans Syst Control"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"012053","DOI":"10.1088\/1742-6596\/1914\/1\/012053","article-title":"A survey on recent advances in image captioning","volume":"1914","author":"Chen","year":"2021","journal-title":"J Phys Conf Ser"},{"key":"ref8","doi-asserted-by":"crossref","first-page":"012118","DOI":"10.1088\/1757-899X\/1116\/1\/012118","article-title":"A survey on image encoders and language models for image captioning","volume":"1116","author":"Sharma","year":"2021","journal-title":"IOP Conf Ser Mater Sci Eng"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"3962","DOI":"10.1109\/TMM.2022.3169061","article-title":"Multi-branch distance-sensitive self-attention network for image captioning","volume":"25","author":"Ji","year":"2022","journal-title":"IEEE Trans Multimed"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"109420","DOI":"10.1016\/j.patcog.2023.109420","article-title":"Towards local visual modeling for image captioning","volume":"138","author":"Ma","year":"2023","journal-title":"Pattern Recognit"},{"key":"ref11","first-page":"1733","article-title":"A review on vision-language-based approaches: challenges and applications","volume":"82","author":"Pham","year":"2025","journal-title":"Comput Mater Contin"},{"key":"ref12","first-page":"595","article-title":"Multimodal neural language models","volume":"32","author":"Kiros","year":"2014","journal-title":"Proc Mach Learn Res"},{"key":"ref13","series-title":"Proceedings of the 31st Conference on Neural Information Processing Systems (NIPS 2017); 2017 Dec 4\u20139","article-title":"Attention is all you need","author":"Vaswani"},{"key":"ref14","doi-asserted-by":"crossref","first-page":"3260","DOI":"10.3390\/app9163260","article-title":"Boosted transformer for image captioning","volume":"9","author":"Li","year":"2019","journal-title":"Appl Sci"},{"key":"ref15","series-title":"Proceedings of the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2020 Jun 13\u201319","article-title":"X-linear attention networks for image captioning","author":"Pan"},{"key":"ref16","series-title":"Proceedings of the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2020 Jun 13\u201319","article-title":"Meshed-memory transformer for image captioning","author":"Cornia"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"812","DOI":"10.1016\/j.ins.2022.12.018","article-title":"Label-attention transformer with geometrically coherent objects for image captioning","volume":"623","author":"Dubey","year":"2023","journal-title":"Inf Sci"},{"key":"ref18","series-title":"Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2022 Jun 18\u201324","article-title":"Injecting semantic concepts into end-to-end image captioning","author":"Fang"},{"key":"ref19","series-title":"Proceedings of the 30th ACM International Conference on Multimedia; 2022 Oct 10\u201314","article-title":"Progressive tree-structured prototype network for end-to-end image captioning","author":"Zeng"},{"key":"ref20","doi-asserted-by":"crossref","first-page":"127651","DOI":"10.1016\/j.neucom.2024.127651","article-title":"Show, tell and rectify: boost image caption generation via an output rectifier","volume":"585","author":"Ge","year":"2024","journal-title":"Neurocomputing"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"9500","DOI":"10.1109\/TNNLS.2024.3440872","article-title":"Variational transformer: a framework beyond the tradeoff between accuracy and diversity for image captioning","volume":"36","author":"Yang","year":"2025","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"ref22","doi-asserted-by":"crossref","first-page":"1785","DOI":"10.1109\/TNNLS.2022.3185320","article-title":"Adaptive semantic-enhanced transformer for image captioning","volume":"35","author":"Zhang","year":"2024","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"ref23","first-page":"24261","article-title":"MLP-Mixer: an all-MLP architecture for vision","volume":"34","author":"Tolstikhin","year":"2021","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref24","first-page":"9204","article-title":"Pay attention to MLPs","volume":"34","author":"Liu","year":"2021","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref25","series-title":"Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2022 Jun 18\u201324","article-title":"Brain-inspired multilayer perceptron with spiking neurons","author":"Li"},{"key":"ref26","doi-asserted-by":"crossref","first-page":"127823","DOI":"10.1016\/j.neucom.2024.127823","article-title":"SAMT-generator: a second-attention for image captioning based on multi-stage transformer network","volume":"593","author":"Yang","year":"2024","journal-title":"Neurocomputing"},{"key":"ref27","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1162\/tacl_a_00177","article-title":"Grounded compositional semantics for finding and describing images with sentences","volume":"2","author":"Socher","year":"2014","journal-title":"Trans Assoc Comput Linguist"},{"key":"ref28","doi-asserted-by":"crossref","first-page":"109288","DOI":"10.1016\/j.engappai.2024.109288","article-title":"Image captioning by diffusion models: a survey","volume":"138","author":"Daneshfar","year":"2024","journal-title":"Eng Appl Artif Intell"},{"key":"ref29","series-title":"Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2017 Jul 21\u201326","article-title":"Self-critical sequence training for image captioning","author":"Rennie"},{"key":"ref30","series-title":"Proceedings of the 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition; 2018 Jun 18\u201323","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson"},{"key":"ref31","series-title":"Proceedings of the Workshop on Text Summarization Branches Out; 2004 Jul 25\u201326","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin"},{"key":"ref32","series-title":"Proceedings of the 40th Annual Meeting on Association for Computational Linguistics; 2002 Jul 7\u201312","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni"},{"key":"ref33","series-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization; 2005 Jun 29","article-title":"METEOR: an automatic metric for MT evaluation with improved correlation with human judgments","author":"Banerjee"},{"key":"ref34","series-title":"Proceedings of the 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2015 Jun 7\u201312","article-title":"CIDEr: consensus-based image description evaluation","author":"Vedantam"},{"key":"ref35","series-title":"Proceedings of the Computer Vision\u2014ECCV 2016; 2016 Oct 11\u201314","article-title":"SPICE: semantic propositional image caption evaluation","author":"Anderson"},{"key":"ref36","series-title":"Proceedings of the 2019 IEEE\/CVF International Conference on Computer Vision (ICCV); 2019 Oct 27\u2013Nov 2","article-title":"Attention on attention for image captioning","author":"Huang"},{"key":"ref37","series-title":"Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2022 Jun 18\u201324","article-title":"Beyond a pre-trained object detector: cross-modal textual and visual context for image captioning","author":"Kuo"},{"key":"ref38","series-title":"Proceedings of the 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2015 Jun 7\u201312","article-title":"Show and tell: a neural image caption generator","author":"Vinyals"},{"key":"ref39","first-page":"2048","article-title":"Show, attend and tell: neural image caption generation with visual attention","volume":"37","author":"Xu","year":"2015","journal-title":"Proc Mach Learn Res"},{"key":"ref40","series-title":"Proceedings of the European Conference on Computer Vision (ECCV); 2018 Sep 8\u201314","article-title":"Recurrent fusion network for image captioning","author":"Jiang"},{"key":"ref41","series-title":"Proceedings of the European Conference on Computer Vision (ECCV); 2018 Sep 8\u201314","article-title":"Exploring visual relationship for image captioning","author":"Yao"},{"key":"ref42","series-title":"Proceedings of the 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2019 Jun 15\u201320","article-title":"Auto-encoding scene graphs for image captioning","author":"Yang"},{"key":"ref43","series-title":"Proceedings of the Advances in Neural Information Processing Systems (NIPS); 2019 Dec 8\u201314","article-title":"Image captioning: transforming objects into words","author":"Herdade"},{"key":"ref44","series-title":"Proceedings of the 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2021 Jun 20\u201325","article-title":"RSTNet: captioning with adaptive attention on visual and non-visual words","author":"Zhang"},{"key":"ref45","doi-asserted-by":"crossref","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","article-title":"Dual global enhanced transformer for image captioning","volume":"148","author":"Xian","year":"2022","journal-title":"Neural Netw"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"117174","DOI":"10.1016\/j.eswa.2022.117174","article-title":"Geometry Attention Transformer with position-aware LSTMs for image captioning","volume":"201","author":"Wang","year":"2022","journal-title":"Expert Syst Appl"},{"key":"ref47","series-title":"Proceedings of the 2022 International Conference on Multimedia Retrieval; 2022 Jun 27\u201330","article-title":"Improving image captioning via enhancing dual-side context awareness","author":"Gao"},{"key":"ref48","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1016\/j.neucom.2022.11.045","article-title":"MAENet: a novel multi-head association attention enhancement network for completing intra-modal interaction in image captioning","volume":"519","author":"Hu","year":"2023","journal-title":"Neurocomputing"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"5514","DOI":"10.1109\/TIP.2018.2855406","article-title":"Attentive linear transformation for image captioning","volume":"27","author":"Ye","year":"2018","journal-title":"IEEE Trans Image Process"},{"key":"ref50","doi-asserted-by":"crossref","first-page":"107075","DOI":"10.1016\/j.patcog.2019.107075","article-title":"Learning visual relationship and context-aware attention for image captioning","volume":"98","author":"Wang","year":"2020","journal-title":"Pattern Recognit"},{"key":"ref51","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1016\/j.patrec.2020.12.020","article-title":"Image captioning with transformer and knowledge graph","volume":"143","author":"Zhang","year":"2021","journal-title":"Pattern Recognit Lett"},{"key":"ref52","doi-asserted-by":"crossref","first-page":"118474","DOI":"10.1016\/j.eswa.2022.118474","article-title":"Learning joint relationship attention network for image captioning","volume":"211","author":"Wang","year":"2023","journal-title":"Expert Syst Appl"}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-84-3\/TSP_CMC_65169\/TSP_CMC_65169.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T05:45:31Z","timestamp":1776923131000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v84n3\/63153"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":52,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.065169","relation":{},"ISSN":["1546-2226"],"issn-type":[{"value":"1546-2226","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"2025-03-05","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-06-11","order":1,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-07-30","order":2,"name":"published","label":"Published Online","group":{"name":"publication_history","label":"Publication History"}}]}}