{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T16:02:02Z","timestamp":1774540922258,"version":"3.50.1"},"reference-count":52,"publisher":"Tech Science Press","issue":"3","license":[{"start":{"date-parts":[[2024,12,29]],"date-time":"2024-12-29T00:00:00Z","timestamp":1735430400000},"content-version":"vor","delay-in-days":363,"URL":"https:\/\/doi.org\/10.32604\/TSP-CROSSMARKPOLICY"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2024]]},"DOI":"10.32604\/cmc.2024.053245","type":"journal-article","created":{"date-parts":[[2024,11,15]],"date-time":"2024-11-15T06:57:27Z","timestamp":1731653847000},"page":"3951-3968","update-policy":"https:\/\/doi.org\/10.32604\/tsp-crossmarkpolicy","source":"Crossref","is-referenced-by-count":5,"title":["Image Captioning Using Multimodal Deep Learning Approach"],"prefix":"10.32604","volume":"81","author":[{"given":"Rihem","family":"Farkh","sequence":"first","affiliation":[]},{"given":"Ghislain","family":"Oudinet","sequence":"additional","affiliation":[]},{"given":"Yasser","family":"Foued","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2024]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","article-title":"Multimodal machine learning: A survey and taxonomy","volume":"41","author":"Baltru\u0161aitis","year":"Feb. 1, 2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"ref2","doi-asserted-by":"crossref","first-page":"60436","DOI":"10.1109\/ACCESS.2024.3393558","article-title":"Explainable vision transformers for vein biometric recognition","volume":"12","author":"Albano","year":"2024","journal-title":"IEEE Access"},{"key":"ref3","series-title":"IEEE 4th Int. Conf. Pattern Recognit. Mach. Learn. (PRML)","first-page":"193","article-title":"Research on image tibetan caption generation method fusion attention mechanism","author":"Xia","year":"2023"},{"key":"ref4","series-title":"Int. Conf. Front. Artif. Intell. Mach. Learn. (FAIML)","first-page":"196","article-title":"Moving target detection algorithm based on SIFT feature matching","author":"Song","year":"2022"},{"key":"ref5","series-title":"IEEE 3rd Inf. Technol., Netw., Electronic Autom. Control Conf. (ITNEC)","first-page":"1536","article-title":"Feature extraction method based on improved linear LBP operator","author":"Sun","year":"2019"},{"key":"ref6","series-title":"2022 IEEE Int. Conf. Smart Internet Things (SmartIoT)","first-page":"180","article-title":"A SAR remote sensing image change detection method based on DR-UNet-CRF model","author":"Zhang","year":"2022"},{"key":"ref7","series-title":"Int. Conf. Eng. MIS (ICEMIS)","first-page":"1","article-title":"Image captioning techniques: A review","author":"Al-Jamal","year":"2022"},{"key":"ref8","series-title":"Int. Conf. Adv. Technol. (ICONAT)","first-page":"1","article-title":"Image caption generation using deep neural networks","author":"Sudhakar","year":"2022"},{"key":"ref9","series-title":"2nd Int. Conf. Disrupt. Technol. (ICDT)","first-page":"1428","article-title":"Image captioning using VGG-16 deep learning model","author":"Jayaswal","year":"2024"},{"key":"ref10","series-title":"Med. Middle-East Geosci. Remote Sens. Symp. 
(M2GARSS)","first-page":"1","article-title":"A new CNN-RNN framework for remote sensing image captioning","author":"Hoxha","year":"2020"},{"key":"ref11","series-title":"Int. Conf. Adv. Mechatron., Intell. Manuf. Ind. Autom. (ICAMIMIA)","first-page":"994","article-title":"Combination of DenseNet and BiLSTM model for Indonesian image captioning","author":"Navastara","year":"2023"},{"key":"ref12","series-title":"IEEE Silchar Subsection Conf. (SILCON)","first-page":"1","article-title":"Image caption generation using ResNET-50 and LSTM","author":"Satti","year":"2023"},{"key":"ref13","series-title":"12th Int. Conf. Cloud Comput., Data Sci. Eng. (Confluence)","first-page":"312","article-title":"Vision 360: Image caption generation using encoder-decoder model","author":"Kumari","year":"2022"},{"key":"ref14","doi-asserted-by":"crossref","first-page":"24852","DOI":"10.1109\/ACCESS.2022.3151874","article-title":"Using neural encoder-decoder models with continuous outputs for remote sensing image captioning","volume":"10","author":"Ramos","year":"2022","journal-title":"IEEE Access"},{"key":"ref15","series-title":"IEEE Automat. Speech Recognit. Underst. Workshop (ASRU)","first-page":"70","article-title":"Improved multi-stage training of online attention-based encoder-decoder models","author":"Garg","year":"2019"},{"key":"ref16","series-title":"Int. Conf. Doc. Anal. Recognit. (ICDAR)","first-page":"916","article-title":"A comparative study of attention-based encoder-decoder approaches to natural scene text recognition","author":"Cong","year":"2019"},{"key":"ref17","series-title":"Int. Conf. Culture-Oriented Sci. Technol. (ICCST)","first-page":"208","article-title":"Video summarization with self-attention based encoder-decoder framework","author":"Feng","year":"2020"},{"key":"ref18","first-page":"6000","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"ref19","series-title":"RIVF Int. Conf. Comput. Commun. Technol. (RIVF)","first-page":"306","article-title":"Deep vision transformer and T5-based for image captioning","author":"Lam","year":"2023"},{"key":"ref20","first-page":"1","article-title":"Remote sensing image change captioning with dual-branch transformers: A new method and a large scale dataset","volume":"60","author":"Liu","year":"2022, Art. no. 5633520","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"ref21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/LGRS.2022.3198234","article-title":"Exploring transformer and multilabel classification for remote sensing image captioning","volume":"19","author":"Kandala","year":"2022, Art. no. 6514905","journal-title":"IEEE Geosci. Remote Sens. Lett."},{"key":"ref22","series-title":"IEEE\/CVF Int. Conf. Comput. Vis. (ICCV)","first-page":"4633","article-title":"Attention on attention for image captioning","author":"Huang","year":"2019"},{"key":"ref23","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit., CVPR","first-page":"4651","article-title":"Image captioning with semantic attention","author":"Lu","year":"Jun. 27\u201330, 2016"},{"key":"ref24","article-title":"Improving robustness for vision transformer with a simple dynamic scanning augmentation","volume":"36","author":"Kotyan","year":"2024, Art. no. 127000","journal-title":"Neural Comput. 
Appl."},{"key":"ref25","doi-asserted-by":"crossref","first-page":"63373","DOI":"10.1109\/ACCESS.2019.2916887","article-title":"Deep multimodal representation learning: A survey","volume":"7","author":"Guo","year":"2019","journal-title":"IEEE Access"},{"key":"ref26","series-title":"IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR)","first-page":"17959","article-title":"Scaling up vision-language pretraining for image captioning","author":"Hu","year":"2022"},{"key":"ref27","series-title":"IEEE 32nd Int. Conf. Tools Artif. Intell. (ICTAI)","first-page":"748","article-title":"Enhanced soft attention mechanism with an inception-like module for image captioning","author":"Lian","year":"2020"},{"key":"ref28","series-title":"Int. Joint Conf. Neural Netw. (IJCNN)","first-page":"1","article-title":"Near-optimal glimpse sequences for improved hard attention neural network training","author":"Harvey","year":"2022"},{"key":"ref29","first-page":"2313","article-title":"Auto-encoding and distilling scene graphs for image captioning","volume":"44","author":"Yang","year":"May 1, 2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"ref30","first-page":"139","article-title":"A multimodal fusion approach for image captioning","volume":"338","author":"Zhao","year":"2019","journal-title":"Neurocomputing"},{"key":"ref31","series-title":"13th Workshop Hyperspectral Imaging Signal Process.: Evol. Remote Sens. (WHISPERS)","first-page":"1","article-title":"Multimodal fusion methods with vision transformers for remote sensing semantic segmentation","author":"Morelli","year":"2023"},{"key":"ref32","series-title":"Proc. 36th Int. Conf. Mach. Learn.","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"Tan","year":"2019"},{"key":"ref33","series-title":"5th Int. Conf. Control Robotics (ICCR)","first-page":"156","article-title":"GUI-based YOLOv8 license plate detection system design","author":"Quan","year":"2023"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"1680","DOI":"10.3390\/make5040083","article-title":"A comprehensive review of YOLO architectures in computer vision: From YOLOv1 to YOLOv8 and YOLO-NAS","volume":"5","author":"Terven","year":"2023","journal-title":"Mach. Learn. Knowl. Extraction"},{"key":"ref35","unstructured":"Kaggle, \u201cFlickr 8k dataset,\u201d Apr. 1, 2024. Accessed: Jul. 31, 2024. [Online]. Available: https:\/\/www.kaggle.com\/datasets\/adityajn105\/flickr8k"},{"key":"ref36","series-title":"Proc. IEEE Conf. Comput. Visi. Pattern Recognit.","first-page":"3156","article-title":"Show and tell: A neural image caption generator","author":"Vinyals","year":"2015"},{"key":"ref37","series-title":"Proc. 32nd Int. Conf. Mach. Learn. (ICML-15)","first-page":"2048","article-title":"Attend and tell: Neural image caption generation with visual attention","author":"Xu","year":"2015"},{"key":"ref38","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"ref39","unstructured":"B. Alexey et al., \u201cYOLOv4: Optimal speed and accuracy of object detection,\u201d 2020, arXiv:2004.10934."},{"key":"ref40","doi-asserted-by":"crossref","first-page":"80","DOI":"10.54254\/2755-2721\/41\/20230714","article-title":"Improved small-object detection using YOLOv8: A comparative study","volume":"41","author":"Huang","year":"2024","journal-title":"Appl. 
Comput. Eng."},{"key":"ref41","unstructured":"Keras, \u201cYOLOV8 backbones,\u201d Apr. 1, 2024. Accessed: Jul. 31, 2024. [Online]. Available: https:\/\/keras.io\/api\/keras_cv\/models\/backbones\/yolo_v8\/"},{"key":"ref42","series-title":"14th Int. Conf. Human Syst. Interact. (HSI)","first-page":"1","article-title":"Practical analysis on architecture of EfficientNet","author":"Hoang","year":"2021"},{"key":"ref43","series-title":"7th Int. Conf. Image Signal Process. Appl. (ISPA)","first-page":"1","article-title":"Recognizing the style of a fine-art painting with EfficientNet and transfer learning","author":"Menai","year":"2022"},{"key":"ref44","doi-asserted-by":"crossref","first-page":"212499","DOI":"10.1109\/ACCESS.2020.3040275","article-title":"Multi-label classification of fundus images with EfficientNet","volume":"8","author":"Wang","year":"2020","journal-title":"IEEE Access"},{"key":"ref45","doi-asserted-by":"crossref","first-page":"14078","DOI":"10.1109\/ACCESS.2021.3051085","article-title":"Classification of remote sensing images using EfficientNet-B3 CNN model with attention","volume":"9","author":"Alhichri","year":"2021","journal-title":"IEEE Access"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"3623","DOI":"10.1109\/TCBB.2023.3307419","article-title":"TransRNAm: Identifying twelve types of RNA modifications by an interpretable multi-label deep learning model based on transformer","volume":"20","author":"Chen","year":"Nov.\u2013Dec. 2023","journal-title":"IEEE\/ACM Trans. Comput. Biol. Bioinform."},{"key":"ref47","doi-asserted-by":"crossref","first-page":"2183","DOI":"10.1109\/LSP.2022.3214768","article-title":"Transformer-based feature compensation and aggregation for DeepFake detection","volume":"29","author":"Tan","year":"2022","journal-title":"IEEE Signal Process. Lett."},{"key":"ref48","article-title":"Comparison of various CNN encoders for image captioning","volume":"2335","author":"Veena","year":"2022","journal-title":"J. Phys.: Conf. Ser."},{"key":"ref49","series-title":"14th Int. Conf. Inf. Commun. Technol. Converg. (ICTC)","first-page":"430","article-title":"RBBA: ResNet-BERT-Bahdanau attention for image caption generator","author":"Hoang","year":"2023"},{"key":"ref50","article-title":"Image captioning using DenseNet network and adaptive attention","volume":"85","author":"Deng","year":"2020, Art. no. 115836","journal-title":"Signal Process.: Image Commun."},{"key":"ref51","series-title":"Proc. IEEE Int. Conf. Comput. Vis. (ICCV)","article-title":"Guiding the long-short term memory model for image caption generation","author":"Jia","year":"2015"},{"key":"ref52","series-title":"IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. 
(CVPR)","article-title":"X-linear attention networks for image captioning","author":"Pan","year":"2020"}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.techscience.com\/cmc\/v81n3\/59016\/pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,7]],"date-time":"2025-03-07T06:01:22Z","timestamp":1741327282000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v81n3\/59016"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":52,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2024]]},"published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2024.053245","relation":{},"ISSN":["1546-2226"],"issn-type":[{"value":"1546-2226","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"2024-04-28","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-08-01","order":1,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-12-19","order":2,"name":"published","label":"Published Online","group":{"name":"publication_history","label":"Publication History"}}]}}