{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:30:27Z","timestamp":1766068227394,"version":"3.45.0"},"reference-count":67,"publisher":"Tech Science Press","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.065872","type":"journal-article","created":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T03:32:37Z","timestamp":1749785557000},"page":"3407-3429","source":"Crossref","is-referenced-by-count":1,"title":["Optimizing Sentiment Integration in Image Captioning Using Transformer-Based Fusion Strategies"],"prefix":"10.32604","volume":"84","author":[{"given":"Komal Rani","family":"Narejo","sequence":"first","affiliation":[]},{"given":"Hongying","family":"Zan","sequence":"additional","affiliation":[]},{"given":"Kheem Parkash","family":"Dharmani","sequence":"additional","affiliation":[]},{"given":"Orken","family":"Mamyrbayev","sequence":"additional","affiliation":[]},{"given":"Ainur","family":"Akhmediyarova","sequence":"additional","affiliation":[]},{"given":"Zhibek","family":"Alibiyeva","sequence":"additional","affiliation":[]},{"given":"Janna","family":"Alimkulova","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"7068349","DOI":"10.1155\/2018\/7068349","article-title":"Deep learning for computer vision: a brief review","volume":"2018","author":"Voulodimos","year":"2018","journal-title":"Comput Intell Neurosci"},{"key":"ref2","series-title":"Proceedings of Computer Vision\u2014ECCV 2024","first-page":"70","article-title":"BRIDGE: bridging gaps in image captioning evaluation with stronger visual cues","author":"Sarto","year":"2024 Sep 29\u2013Oct 4"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"3335","DOI":"10.3390\/electronics11203335","article-title":"Deep learning reader for visually impaired","volume":"11","author":"Ganesan","year":"2022","journal-title":"Electronics"},{"key":"ref4","doi-asserted-by":"crossref","first-page":"1726","DOI":"10.3390\/electronics13091726","article-title":"Recent advances in synthesis and interaction of speech, text, and vision","volume":"13","author":"Orynbay","year":"2024","journal-title":"Electronics"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"803","DOI":"10.3390\/electronics13040803","article-title":"Perceptual image quality prediction: are contrastive language\u2013image pretraining (CLIP) visual features effective?","volume":"13","author":"Onuoha","year":"2024","journal-title":"Electronics"},{"key":"ref6","first-page":"3951","article-title":"Image captioning using multimodal deep learning approach","volume":"81","author":"Farkh","year":"2024","journal-title":"Comput Mater Contin"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"56","DOI":"10.1016\/j.neucom.2018.03.078","article-title":"Image captioning by incorporating affective concepts learned from both visual and textual components","volume":"328","author":"Yang","year":"2019","journal-title":"Neurocomputing"},{"key":"ref8","doi-asserted-by":"crossref","first-page":"11103","DOI":"10.3390\/app131911103","article-title":"A review of transformer-based approaches for image captioning","volume":"13","author":"Ondeng","year":"2023","journal-title":"Appl Sci"},{"key":"ref9","first-page":"4697","article-title":"Fine-grained features for 
image captioning","volume":"75","author":"Shao","year":"2023","journal-title":"Comput Mater Contin"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"661","DOI":"10.1613\/jair.1.12025","article-title":"Image captioning using facial expression and attention","volume":"68","author":"Nezami","year":"2020","journal-title":"J Artif Intell Res"},{"key":"ref11","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1186\/s40537-022-00571-w","article-title":"Image captioning model using attention and object features to mimic human image understanding","volume":"9","author":"Al-Malla","year":"2022","journal-title":"J Big Data"},{"key":"ref12","doi-asserted-by":"crossref","first-page":"9861","DOI":"10.3390\/app12199861","article-title":"Image-caption model based on fusion feature","volume":"12","author":"Geng","year":"2022","journal-title":"Appl Sci"},{"key":"ref13","doi-asserted-by":"crossref","first-page":"03029","DOI":"10.1051\/itmconf\/20224403029","article-title":"Sentiment analysis of images using machine learning techniques","volume":"44","author":"Gherkar","year":"2022","journal-title":"ITM Web Conf"},{"key":"ref14","series-title":"Proceedings of the 16th International Joint Conference on e-Business and Telecommunications; 2019 Jul 26\u201328","first-page":"290","article-title":"An overview on image sentiment analysis: methods, datasets and current challenges","author":"Ortis"},{"key":"ref15","doi-asserted-by":"crossref","unstructured":"Singh H, Sharma A, Pant M. Pixels to prose: understanding the art of image captioning. arXiv:2408.15714. 2024.","DOI":"10.2139\/ssrn.5351410"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1007\/s11063-024-11527-x","article-title":"Self-enhanced attention for image captioning","volume":"56","author":"Sun","year":"2024","journal-title":"Neural Process Lett"},{"key":"ref17","first-page":"2756396","article-title":"Caption generation based on emotions using CSPDenseNet and BiLSTM with self-attention","volume":"2022","author":"Kavi Priya","year":"2022","journal-title":"Appl Comput Intell Soft Comput"},{"key":"ref18","series-title":"Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition; 2016 Jun 27\u201330; Las Vegas, NV, USA","first-page":"2818","article-title":"Rethinking the inception architecture for computer vision","author":"Szegedy"},{"key":"ref19","series-title":"Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition; 2017 Jul 21\u201326","first-page":"1251","article-title":"Xception: deep learning with depthwise separable convolutions","author":"Chollet"},{"key":"ref20","series-title":"Proceedings of the 32nd International Conference on Machine Learning; 2015 Jul 6\u201311; Lille, France","first-page":"2048","article-title":"Show, attend and tell: neural image caption generation with visual attention","author":"Xu"},{"key":"ref21","series-title":"IEEE International Conference on Image Processing (ICIP)","first-page":"790","article-title":"Densenet for dense flow","author":"Zhu","year":"2017 Sep 17\u201320"},{"key":"ref22","series-title":"Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition; 2017 Jul 21\u201326; Honolulu, HI, USA","first-page":"4700","article-title":"Densely connected convolutional networks","author":"Huang"},{"key":"ref23","series-title":"International Conference on Machine Learning; 2019 Jun 9\u201315, Long Beach, CA, USA","first-page":"6105","article-title":"EfficientNet: rethinking model scaling for convolutional 
neural networks","author":"Tan"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"109","DOI":"10.1007\/978-1-4842-6168-2_10","author":"Koonce","year":"2021","journal-title":"Convolutional neural networks with swift for Tensorflow: image recognition and dataset categorization"},{"key":"ref25","unstructured":"Cordonnier J-B, Loukas A, Jaggi M. Multihead attention: collaborate instead of concatenate. arXiv:2006.16362. 2020."},{"key":"ref26","first-page":"6000","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref27","first-page":"856","article-title":"Multimodal sentiment analysis using deep learning fusion techniques and transformers","volume":"15","author":"Habib","year":"2024","journal-title":"Int J Adv Comput Sci Appl"},{"key":"ref28","series-title":"Proceedings of Pattern Recognition ICPR International Workshops and Challenges","first-page":"381","article-title":"Fusion models for improved image captioning","author":"Kalimuthu","year":"2021 Jan 10\u201315"},{"key":"ref29","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"21231","article-title":"It is okay to not be okay: overcoming emotional bias in affective image captioning by contrastive data collection","author":"Mohamed","year":"2022 Jun 18\u201324"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"60","DOI":"10.54097\/fcis.v6i1.12","article-title":"Image emotion analysis combining attention mechanism and multi-level correlation","volume":"6","author":"Ren","year":"2023","journal-title":"Front Comput Intell Syst"},{"key":"ref31","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9959","article-title":"Say as you wish: fine-grained control of image caption generation with abstract scene graphs","author":"Chen","year":"2020 Jun 13\u201319"},{"key":"ref32","doi-asserted-by":"crossref","first-page":"81857","DOI":"10.1007\/s11042-024-18680-4","article-title":"Fine-grained image emotion captioning based on generative adversarial networks","volume":"83","author":"Yang","year":"2024","journal-title":"Multimed Tools Appl"},{"key":"ref33","unstructured":"Aziz A, Chowdhury NK, Kabir MA, Chy AN, Siddique MJ. MMTF-DES: a fusion of multimodal transformer models for desire, emotion, and sentiment analysis of social media data. arXiv:2310.14143. 2023."},{"key":"ref34","doi-asserted-by":"crossref","first-page":"306","DOI":"10.1016\/j.inffus.2023.02.028","article-title":"Multimodal sentiment analysis based on fusion methods: a survey","volume":"95","author":"Zhu","year":"2023","journal-title":"Inf Fusion"},{"key":"ref35","article-title":"EmoAtCap: emotional attitude captioning dataset","author":"Kovenko","year":"2021","journal-title":"Mendeley Data"},{"key":"ref36","unstructured":"Liu Y. Roberta: a robustly optimized BERT pretraining approach. arXiv:1907.11692. 2019."},{"key":"ref37","unstructured":"Alexey D. An image is worth 16x16 words: transformers for image recognition at scale. arXiv:2010.11929. 2020."},{"key":"ref38","unstructured":"Wang J, Yang Z, Hu X, Li L, Lin K, Gan Z, et al. GIT: a generative image-to-text transformer for vision and language. arXiv:2205.14100. 
2022."},{"key":"ref39","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume":"162","author":"Li","year":"2022","journal-title":"Proc Mach Learn Res"},{"key":"ref40","doi-asserted-by":"crossref","first-page":"1417","DOI":"10.3390\/electronics8121417","article-title":"A modularized architecture of multi-branch convolutional neural network for image captioning","volume":"8","author":"He","year":"2019","journal-title":"Electronics"},{"key":"ref41","series-title":"Proceedings of the 2015 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3156","article-title":"Show and tell: a neural image caption generator","author":"Vinyals","year":"2015 Jun 7\u201312"},{"key":"ref42","series-title":"Proceedings of Computer Vision\u2013ECCV 2014","first-page":"13","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014 Sep 6\u201312"},{"key":"ref43","doi-asserted-by":"crossref","first-page":"11382","DOI":"10.1007\/s10489-021-02988-x","article-title":"Controllable image caption with an encoder-decoder optimization structure","volume":"52","author":"Shao","year":"2022","journal-title":"Appl Intell"},{"key":"ref44","doi-asserted-by":"crossref","first-page":"25557","DOI":"10.1007\/s11042-021-10632-6","article-title":"A novel automatic image caption generation using bidirectional long-short term memory framework","volume":"80","author":"Ye","year":"2021","journal-title":"Multimed Tools Appl"},{"key":"ref45","first-page":"11135","article-title":"Image captioning: transforming objects into words","volume":"32","author":"Herdade","year":"2019","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"1073","DOI":"10.1007\/s11263-023-01752-7","article-title":"Sentimental visual captioning using multimodal transformer","volume":"131","author":"Wu","year":"2023","journal-title":"Int J Comput Vis"},{"key":"ref47","series-title":"Proceedings of 2021 IEEE International Conference on Multimedia and Expo (ICME)","first-page":"1","article-title":"Image captioning with inherent sentiment","author":"Li","year":"2021 Jul 5\u20139"},{"key":"ref48","series-title":"Proceedings of the 2023 IEEE 5th International Conference on Civil Aviation Safety and Information Technology (ICCASIT)","first-page":"296","article-title":"Multimodal sentiment analysis based on image captioning and attention mechanism","author":"Sun","year":"2023 Oct 11\u201313"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"4257","DOI":"10.1109\/TCSVT.2023.3243725","article-title":"Cross on cross attention: deep fusion transformer for image captioning","volume":"33","author":"Zhang","year":"2023","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref50","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1109\/TCSVT.2021.3067449","article-title":"Task-adaptive attention for image captioning","volume":"32","author":"Yan","year":"2021","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref51","series-title":"Proceedings of the 2022 26th International Conference on Pattern Recognition (ICPR)","first-page":"4765","article-title":"Multi-scale adaptive task attention network for few-shot learning","author":"Chen","year":"2022 Aug 21\u201325"},{"key":"ref52","unstructured":"Nezami OM, Dras M, Wan S, Paris C. Senti-attend: image captioning using sentiment and attention. arXiv:1811.09789. 
2018."},{"key":"ref53","unstructured":"You Q, Jin H, Luo J. Image captioning at will: a versatile scheme for effectively injecting sentiments into image descriptions. arXiv:1801.10121. 2018."},{"key":"ref54","series-title":"Proceedings of CEUR Workshop Proceedings, CEUR-WS","article-title":"Leveraging clip for image emotion recognition","author":"Bondielli","year":"2021 Jun 23\u201326"},{"key":"ref55","doi-asserted-by":"crossref","unstructured":"Aggarwal S, Pandey A, Vishwakarma DK. Modelling visual semantics via image captioning to extract enhanced multi-level cross-modal semantic incongruity representation with attention for multimodal sarcasm detection. arXiv:2408.02595. 2024.","DOI":"10.1007\/s10489-025-06717-6"},{"key":"ref56","series-title":"Proceedings of AAAI Conference on Artificial Intelligence","first-page":"8957","article-title":"Hierarchical attention network for image captioning","author":"Wang","year":"2019 Jan 27\u2013Feb 1"},{"key":"ref57","doi-asserted-by":"crossref","first-page":"55706","DOI":"10.1109\/ACCESS.2023.3282444","article-title":"Switching text-based image encoders for captioning images with text","volume":"11","author":"Ueda","year":"2023","journal-title":"IEEE Access"},{"key":"ref58","doi-asserted-by":"crossref","first-page":"2403","DOI":"10.1007\/s11063-020-10201-2","article-title":"Visual sentiment prediction with attribute augmentation and multi-attention mechanism","volume":"51","author":"Wu","year":"2020","journal-title":"Neural Process Lett"},{"key":"ref59","series-title":"Proceedings of International Conference on Machine Learning","article-title":"Image captioning with vision\/text transformers","author":"Xin","year":"2021 Jul 18\u201324"},{"key":"ref60","article-title":"Deep learning approaches for image captioning: opportunities, challenges and future potential","author":"Jamil","year":"2024","journal-title":"IEEE Access"},{"key":"ref61","doi-asserted-by":"crossref","first-page":"2024","DOI":"10.3390\/app9102024","article-title":"A systematic literature review on image captioning","volume":"9","author":"Stani\u016bt\u0117","year":"2019","journal-title":"Appl Sci"},{"key":"ref62","series-title":"Proceedings of 2020 IEEE 6th International Conference on Computer and Communications (ICCC)","first-page":"1456","article-title":"Image emotion caption based on visual attention mechanisms","author":"Li","year":"2020 Dec 11\u201314"},{"key":"ref63","doi-asserted-by":"crossref","first-page":"86","DOI":"10.29099\/ijair.v4i2.152","article-title":"Random and synthetic over-sampling approach to resolve data imbalance in classification","volume":"4","author":"Hayaty","year":"2020","journal-title":"Int J Artif Intell Res"},{"key":"ref64","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002 Jul 7\u201312"},{"key":"ref65","series-title":"Proceedings of Text Summarization Branches Out","first-page":"74","article-title":"Rouge: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"ref66","series-title":"Proceedings of the 2015 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4566","article-title":"Cider: consensus-based image description evaluation","author":"Vedantam","year":"2015 Jun 7\u201312"},{"key":"ref67","unstructured":"Krotov A, Tebo A, Picart DK, Algave AD. 
Evaluating authenticity and quality of image captions via sentiment and semantic analyses. arXiv:2409.09560. 2024."}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-84-2\/TSP_CMC_65872\/TSP_CMC_65872.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T01:52:51Z","timestamp":1763344371000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v84n2\/62924"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":67,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.065872","relation":{},"ISSN":["1546-2226"],"issn-type":[{"type":"electronic","value":"1546-2226"}],"subject":[],"published":{"date-parts":[[2025]]}}}