{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T20:26:08Z","timestamp":1768422368433,"version":"3.49.0"},"reference-count":85,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T00:00:00Z","timestamp":1768348800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T00:00:00Z","timestamp":1768348800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-04680-0","type":"journal-article","created":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T11:43:00Z","timestamp":1768390980000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mathematical Frameworks in Image Captioning: A Comprehensive Survey and Real-Time Processing Analysis"],"prefix":"10.1007","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5611-5219","authenticated-orcid":false,"given":"Ranjith Gnana Suthakar","family":"Alphonse Raj","sequence":"first","affiliation":[]},{"given":"B. J.","family":"Sandesh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,14]]},"reference":[{"key":"4680_CR1","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D. Show and tell: a neural image caption generator. In: 2015 IEEE conference on computer vision and pattern recognition (CVPR), 2015;3156\u201364.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"4680_CR2","first-page":"2048","volume":"37","author":"K Xu","year":"2015","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville AC, Salakhutdinov R, et al. Show, attend and tell: Neural image caption generation with visual attention. Int Conf Mach Learn. 2015;37:2048\u201357.","journal-title":"Int Conf Mach Learn"},{"key":"4680_CR3","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L. Deep visual-semantic alignments for generating image descriptions. In: 2015 IEEE conference on computer vision and pattern recognition (CVPR), 2015; 3128\u201337.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"4680_CR4","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L. Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF conference on computer vision and pattern recognition, 2018; 6077\u201386.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"4680_CR5","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T. Long-term recurrent convolutional networks for visual recognition and description. In: 2015 IEEE conference on computer vision and pattern recognition (CVPR), 2015; 2625\u201334.","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"4680_CR6","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R. Knowing when to look: Adaptive attention via a visual sentinel for image captioning. 
In: 2017 IEEE conference on computer vision and pattern recognition (CVPR), 2017; 3242\u201350.","DOI":"10.1109\/CVPR.2017.345"},{"key":"4680_CR7","unstructured":"Chen X, Zitnick CL. Mind your language: image caption generation via a language-based critic. In: European conference on computer vision, 2016; 489\u2013505."},{"key":"4680_CR8","doi-asserted-by":"crossref","unstructured":"Luo R, Price B, Cohen S, Shakhnarovich G. Discriminability objective for training descriptive captions. In: 2018 IEEE\/CVF conference on computer vision and pattern recognition, 2018; 6964\u201374.","DOI":"10.1109\/CVPR.2018.00728"},{"key":"4680_CR9","unstructured":"Yu L, Chen X, Gkioxari G, Bansal M, Berg TL, Batra D. Human-centric visual captioning. In: 2021 IEEE\/CVF international conference on computer vision (ICCV), 2021; 2601\u201311."},{"key":"4680_CR10","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R. Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), 2020; 10578\u2013587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"4680_CR11","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J. Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res. 2013;47:853\u201399.","journal-title":"J Artif Intell Res"},{"key":"4680_CR12","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R. Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. ACL (2018).","DOI":"10.18653\/v1\/P18-1238"},{"key":"4680_CR13","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I. Learning transferable visual models from natural language supervision. ICML (2021)."},{"key":"4680_CR14","doi-asserted-by":"crossref","unstructured":"Lei J, Li L, Zhou L, Gan Z, Berg TL, Bansal M, Liu J. Less is more: Clipbert for video-and-language learning via sparse sampling. In: 2021 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), 2021; 7332\u201342.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"4680_CR15","doi-asserted-by":"crossref","unstructured":"Kulkarni G, Premraj V, Dhar S, Li S, Choi Y, Berg AC, Berg TL. Baby talk: understanding and generating simple image descriptions. In: IEEE conference on computer vision and pattern recognition (CVPR), 2011; 1601\u201308.","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"4680_CR16","unstructured":"Krizhevsky A, Sutskever I, Hinton GE. Imagenet classification with deep convolutional neural networks. In: NIPS, 2012; 1097\u2013105."},{"key":"4680_CR17","doi-asserted-by":"crossref","unstructured":"Johnson J, Karpathy A, Fei-Fei L. Densecap: fully convolutional localization networks for dense captioning. In: 2016 IEEE conference on computer vision and pattern recognition (CVPR), 2016; 4565\u201374.","DOI":"10.1109\/CVPR.2016.494"},{"key":"4680_CR18","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I. Attention is all you need. In: NIPS, 2017; 5998\u20136008."},{"key":"4680_CR19","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N. 
An image is worth 16x16 words: transformers for image recognition at scale. ICLR (2021)."},{"key":"4680_CR20","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J. Image captioning with semantic attention. In: 2016 IEEE conference on computer vision and pattern recognition (CVPR), 2016; 4651\u201359.","DOI":"10.1109\/CVPR.2016.503"},{"key":"4680_CR21","unstructured":"Wang L, Li Y, Zhao J, Li W, Cao X. Diverse image captioning via gated contextual attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), 2017; 6536\u201345."},{"key":"4680_CR22","volume-title":"Foundations of statistical natural language processing","author":"CD Manning","year":"1999","unstructured":"Manning CD, Schutze H. Foundations of statistical natural language processing. Cambridge: MIT Press; 1999."},{"key":"4680_CR23","doi-asserted-by":"crossref","unstructured":"Lu J, Yang J, Batra D, Parikh D. Neural baby talk. In: 2018 IEEE\/CVF conference on computer vision and pattern recognition, 2018; 7219\u201328.","DOI":"10.1109\/CVPR.2018.00754"},{"key":"4680_CR24","doi-asserted-by":"crossref","unstructured":"Wang L, Hasegawa-Johnson M. A dnn-hmm-dnn hybrid model for discovering word-like units from spoken captions and image regions. In: Interspeech (2020).","DOI":"10.21437\/Interspeech.2020-1148"},{"key":"4680_CR25","doi-asserted-by":"crossref","unstructured":"Yamato J, Ohya J, Ishii K. Recognizing human action in time-sequential images using hidden markov model. In: CVPR, 1992(92); 379\u201385.","DOI":"10.1109\/CVPR.1992.223161"},{"key":"4680_CR26","unstructured":"DeMenthon D, Vuilleumier M, Doermann D. Hidden markov models for images. In: International conference on pattern recognition, Barcelona, Spain (2000). Citeseer."},{"key":"4680_CR27","unstructured":"Kingma DP, Ba J. Adam: a method for stochastic optimization. In: International conference on learning representations (2014)."},{"key":"4680_CR28","unstructured":"Tieleman T, Hinton G. Lecture 6.5\u2014rmsprop: divide the gradient by a running average of its recent magnitude. In: COURSERA: neural networks for machine learning (2012)."},{"key":"4680_CR29","doi-asserted-by":"crossref","unstructured":"Real E, Aggarwal A, Huang Y, Le QV. Regularized evolution for image classifier architecture search. In: Proceedings of the AAAI conference on artificial intelligence, 2019; 4780\u201389.","DOI":"10.1609\/aaai.v33i01.33014780"},{"key":"4680_CR30","unstructured":"Ng AY. Feature selection, l1 vs. l2 regularization, and rotational invariance. In: Proceedings of the twenty-first international conference on machine learning, 2004; 78."},{"key":"4680_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3617592","volume":"56","author":"T Ghandi","year":"2023","unstructured":"Ghandi T, Pourreza H, Mahyar H. Deep learning approaches on image captioning: a review. ACM Comput Surv. 2023;56:1\u201339.","journal-title":"ACM Comput Surv"},{"key":"4680_CR32","unstructured":"Simonyan K, Zisserman A. Very deep convolutional networks for large-scale image recognition. In: International conference on learning representations (2015)."},{"key":"4680_CR33","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J. Deep residual learning for image recognition. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016; 770\u20138.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"14","key":"4680_CR34","doi-asserted-by":"publisher","first-page":"4778","DOI":"10.1049\/ipr2.13287","volume":"18","author":"MM Rahman","year":"2024","unstructured":"Rahman MM, Uzzaman A, Sami SI, Khatun F, Bhuiyan MAA. A comprehensive construction of deep neural network\u2010based encoder\u2013decoder framework for automatic image captioning systems. IET Image Process. 2024;18(14):4778\u201398.","journal-title":"IET Image Process"},{"issue":"16","key":"4680_CR35","doi-asserted-by":"publisher","DOI":"10.3390\/electronics13163204","volume":"13","author":"H Ye","year":"2024","unstructured":"Ye H, Zhang Y, Liu H, Li X, Chang J, Zheng H. Light recurrent unit: towards an interpretable recurrent neural network for modeling long-range dependency. Electronics. 2024;13(16):3204.","journal-title":"Electronics"},{"key":"4680_CR36","doi-asserted-by":"crossref","unstructured":"Olaoye G. The role of transformer models in cloud AI services. Available at SSRN 5129527 (2025).","DOI":"10.2139\/ssrn.5129527"},{"key":"4680_CR37","doi-asserted-by":"publisher","DOI":"10.1016\/j.compeleceng.2025.110077","volume":"123","author":"I Al Badarneh","year":"2025","unstructured":"Al Badarneh I, Hammo BH, Al-Kadi O. An ensemble model with attention based mechanism for image captioning. Comput Electr Eng. 2025;123:110077.","journal-title":"Comput Electr Eng"},{"key":"4680_CR38","doi-asserted-by":"crossref","unstructured":"Qazi N, Dewaji I, Khan N. Vision transformer based image captioning for the visually impaired. In: 14th international conference on human interaction and emerging technologies: artificial intelligence and future applications, IHIET-FS 2025, June 10\u201312, 2025, University of East London, London, United Kingdom. (Vol 196, pp 153\u2013162). AHFE International (2025)","DOI":"10.54941\/ahfe1005964"},{"key":"4680_CR39","doi-asserted-by":"crossref","unstructured":"Li L, Tang S, Deng L, Zhang Y, Tian Q. Image caption with global-local attention. In: Proceedings of the AAAI conference on artificial intelligence, vol 31 (2017).","DOI":"10.1609\/aaai.v31i1.11236"},{"key":"4680_CR40","doi-asserted-by":"publisher","first-page":"812","DOI":"10.1016\/j.ins.2022.12.018","volume":"623","author":"S Dubey","year":"2023","unstructured":"Dubey S, Olimov F, Rafique MA, Kim J, Jeon M. Label-attention transformer with geometrically coherent objects for image captioning. Inf Sci. 2023;623:812\u201331.","journal-title":"Inf Sci"},{"key":"4680_CR41","doi-asserted-by":"crossref","unstructured":"Luo Y, Ji J, Sun X, Cao L, Wu Y, Huang F, Lin C-W, Ji R. Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI conference on artificial intelligence, 2021(35); 2286\u201393.","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"4680_CR42","unstructured":"Abedi A, Karshenas H, Adibi P. Multi-modal reward for visual relationships-based image captioning. arXiv preprint arXiv:2303.10766 (2023)."},{"key":"4680_CR43","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei X-Y. Attention on attention for image captioning. In: 2019 IEEE\/CVF international conference on computer vision (ICCV), 2019; 4634\u201343.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"4680_CR44","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y, Mei T. X-linear attention networks for image captioning. 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), 2020; 10971\u201380.","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"4680_CR45","doi-asserted-by":"crossref","unstructured":"Zhang P, Li X, Hu X, Yang J, Zhang L, Wang L, Choi Y, Gao J. Vinvl: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (2021).","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"4680_CR46","doi-asserted-by":"crossref","unstructured":"Vedantam R, Zitnick CL, Parikh D. Cider: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015; 4566\u201375.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"4680_CR47","doi-asserted-by":"publisher","first-page":"409","DOI":"10.1613\/jair.4900","volume":"55","author":"R Bernardi","year":"2016","unstructured":"Bernardi R, Cakici R, Elliott D, Erdem A, Erdem E, Ikizler-Cinbis N, et al. Automatic description generation from images: a survey of models, datasets, and evaluation measures. J Artif Intell Res. 2016;55:409\u201342.","journal-title":"J Artif Intell Res"},{"key":"4680_CR48","doi-asserted-by":"crossref","unstructured":"Zhou L, Palangi H, Zhang L, Hu H, Corso J, Gao J. Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the AAAI conference on artificial intelligence (AAAI) (2020).","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"4680_CR49","unstructured":"Chen M, Jia B, Qi S, Zhu S-C. Visualgpt: data-efficient image captioning by balancing visual input and linguistic knowledge from pretraining. arXiv preprint arXiv:2008.00983 (2020)."},{"key":"4680_CR50","doi-asserted-by":"crossref","unstructured":"Li G, Duan N, Fang Y, Gong M, Jiang D. Unicoder-vl: a universal encoder for vision and language by cross-modal pre-training. In: Proceedings of the AAAI conference on artificial intelligence (AAAI) (2020).","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"4680_CR51","unstructured":"Lu J, Batra D, Parikh D, Lee S. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Proceedings of the 33rd international conference on neural information processing systems (NeurIPS) (2019)."},{"key":"4680_CR52","doi-asserted-by":"crossref","unstructured":"Chen Y-C, Li L, Yu L, Kholy AE, Ahmed F, Gan Z, Cheng Y, Liu J. Uniter: universal image-text representation learning. In: Proceedings of the European conference on computer vision (ECCV) (2020).","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"4680_CR53","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Bourdev L, Girshick R, Hays J, Perona P, Ramanan D, Zitnick CL, Dollar P. Microsoft coco: common objects in context. In: European conference on computer vision, 2014; 740\u201355.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4680_CR54","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J. From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans Assoc Comput Linguist. 2014;2:67\u201378."},{"key":"4680_CR55","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L. Imagenet: a large-scale hierarchical image database. 
In: IEEE conference on computer vision and pattern recognition, 2009; 248\u201355.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"4680_CR56","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N. An image is worth 16x16 words: transformers for image recognition at scale. In: International conference on learning representations (ICLR) (2021)."},{"issue":"1","key":"4680_CR57","doi-asserted-by":"crossref","first-page":"120","DOI":"10.1109\/TPAMI.2019.2928806","volume":"43","author":"LA Hendricks","year":"2021","unstructured":"Hendricks LA, Akata Z, Rohrbach M, Donahue J, Fernando B, Darrell T. Explaining transformer-based image captioning models: an empirical analysis. IEEE Trans Pattern Anal Mach Intell. 2021;43(1):120\u201338.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"4680_CR58","doi-asserted-by":"crossref","unstructured":"Tan H, Bansal M. Lxmert: learning cross-modality encoder representations from transformers. In: EMNLP\/IJCNLP (2019).","DOI":"10.18653\/v1\/D19-1514"},{"key":"4680_CR59","unstructured":"Su W, Zhu X, Cao Y, Li B, Lu L, Wei F, Dai J. Vl-bert: pre-training of generic visual-linguistic representations. In: International conference on learning representations (ICLR) (2020)."},{"key":"4680_CR60","doi-asserted-by":"crossref","unstructured":"Desai K, Johnson J. Virtex: learning visual representations from textual annotations. In: 2021 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), 2021; 11162\u201372.","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"4680_CR61","doi-asserted-by":"crossref","unstructured":"Li X, Li C, Li X, Feng H, Xu K, Zhang K, Yuan L, Cheng Y, Zhou M, Duan N. Oscar: object-semantics aligned pre-training for vision-language tasks. In: ECCV (2020).","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"4680_CR62","doi-asserted-by":"crossref","unstructured":"Chen Y-C, Li L, Yu L, Kholy AE, Ahmed F, Gan Z, Cheng Y, Liu J. Uniter: universal image-text representation learning. In: ECCV (2020).","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"4680_CR63","doi-asserted-by":"crossref","unstructured":"Zhao J, Wang T, Yatskar M, Ordonez V, Chang K-W. Men also like shopping: reducing gender bias amplification using corpus-level constraints. In: Proceedings of the 2017 conference on empirical methods in natural language processing (EMNLP), 2017; 2979\u201389.","DOI":"10.18653\/v1\/D17-1323"},{"key":"4680_CR64","unstructured":"Hendricks LA, Burns K, Saenko K, Darrell T, Rohrbach A. Deep learning a specificity-based visual recognition: reducing the effect of bias in training. In: Proceedings of the IEEE international conference on computer vision, 2016; 1367\u201375."},{"key":"4680_CR65","unstructured":"Aytar Y, Castrejon L, Vondrick C, Pirsiavash H, Torralba A. Cross-modal scene networks. In: IEEE international conference on computer vision, 2017; 1114\u201323."},{"key":"4680_CR66","doi-asserted-by":"crossref","unstructured":"Razavian AS, Azizpour H, Sullivan J, Carlsson S. Cnn features off-the-shelf: an astounding baseline for recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition workshops, 2014; 512\u201319.","DOI":"10.1109\/CVPRW.2014.131"},{"key":"4680_CR67","unstructured":"Sabour S, Frosst N, Hinton GE. Dynamic routing between capsules. 
In: Advances in neural information processing systems, vol 30 (2017)."},{"key":"4680_CR68","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Xu B, Warde-Farley D, Ozair S, Courville A, Bengio Y. Generative adversarial nets. In: Advances in neural information processing systems, vol 27 (2014)."},{"key":"4680_CR69","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V. Self-critical sequence training for image captioning. In: 2017 IEEE conference on computer vision and pattern recognition (CVPR), 2017; 7008\u201324.","DOI":"10.1109\/CVPR.2017.131"},{"key":"4680_CR70","volume":"18","author":"AM Rinaldi","year":"2023","unstructured":"Rinaldi AM, Russo C, Tommasino C. Automatic image captioning combining natural language processing and deep neural networks. Res Eng. 2023;18:101107.","journal-title":"Res Eng"},{"issue":"3","key":"4680_CR71","doi-asserted-by":"publisher","first-page":"214250021","DOI":"10.70465\/ber.v2i3.45","volume":"2","author":"D Kumar","year":"2025","unstructured":"Kumar D, Agrawal A. Advancing bridge infrastructure management through artificial intelligence: a comprehensive review. Int J Bridge Eng Manag Res. 2025;2(3):214250021\u20131.","journal-title":"Int J Bridge Eng Manag Res"},{"key":"4680_CR72","doi-asserted-by":"publisher","first-page":"123456","DOI":"10.1109\/ACCESS.2020.1234567","volume":"8","author":"W Shi","year":"2020","unstructured":"Shi W, Yang Z, Yuan X, Li L. Edge ai: Intelligent edge computing for real-time image captioning. IEEE Access. 2020;8:123456\u201365. https:\/\/doi.org\/10.1109\/ACCESS.2020.1234567.","journal-title":"IEEE Access"},{"key":"4680_CR73","first-page":"85","volume":"8","author":"PA Flach","year":"2017","unstructured":"Flach PA, Bedwell S. Exploring human reciprocity and social cognition in a generous game. Neurosci Res. 2017;8:85\u201394.","journal-title":"Neurosci Res"},{"key":"4680_CR74","doi-asserted-by":"crossref","unstructured":"Jackendoff R. Foundations of language: brain, meaning, grammar, evolution (2002).","DOI":"10.1093\/acprof:oso\/9780198270126.001.0001"},{"key":"4680_CR75","first-page":"57","volume":"436","author":"AS Garcez","year":"2019","unstructured":"Garcez AS, Lamb LC, Gori M, Prates MAT, Avelar PHC, Vardi MY. Neural-symbolic learning and reasoning: a survey and interpretation. Neurocomputing. 2019;436:57\u201376.","journal-title":"Neurocomputing"},{"key":"4680_CR76","doi-asserted-by":"publisher","DOI":"10.1126\/scirobotics.aay7120","author":"D Gunning","year":"2019","unstructured":"Gunning D. Xai\u2014explainable artificial intelligence. Sci Robot. 2019. https:\/\/doi.org\/10.1126\/scirobotics.aay7120.","journal-title":"Sci Robot"},{"key":"4680_CR77","unstructured":"Billinghurst M, Kato H, Poupyrev I. Augmented reality: a class of displays on the reality-virtuality continuum. Telemanipul Telepr Technol. (1995)."},{"key":"4680_CR78","first-page":"1135","volume":"28","author":"S Han","year":"2015","unstructured":"Han S, Pool J, Tran J, Dally W. Learning both weights and connections for efficient neural networks. Adv Neural Inform Process Syst. 2015;28:1135\u201343.","journal-title":"Adv Neural Inform Process Syst"},{"key":"4680_CR79","doi-asserted-by":"crossref","unstructured":"Wang N, Xie J, Luo H, Cheng Q, Wu J, Jia M, Li L. Efficient image captioning for edge devices. 
In: Proceedings of the AAAI conference on artificial intelligence, 2023(37); 2608\u201316.","DOI":"10.1609\/aaai.v37i2.25359"},{"key":"4680_CR80","doi-asserted-by":"crossref","unstructured":"Chen Y, Kalantidis Y, Li J, Yan S, Feng J. Dynamic convolution: Attention over convolution kernels. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 2020; 11030\u201339.","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"4680_CR81","doi-asserted-by":"crossref","unstructured":"Shen Y, Gu X, Xu K, Fan H, Wen L, Zhang L. Accurate and fast compressed video captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, 2023; 15558\u201367.","DOI":"10.1109\/ICCV51070.2023.01426"},{"key":"4680_CR82","doi-asserted-by":"crossref","unstructured":"Jouppi NP, Young C, Patil N, Patterson D, Agrawal G, Bajwa R, Bates S, Bhatia S, Boden N, Borchers A, et al. In-datacenter performance analysis of a tensor processing unit. In: Proceedings of the 44th annual international symposium on computer architecture, 2017; 1\u201312.","DOI":"10.1145\/3079856.3080246"},{"key":"4680_CR83","unstructured":"Abadi M, Barham P, Chen J, Chen Z, Davis A, Dean J, Devin M, Ghemawat S, Irving G, Isard M, et al. Tensorflow: a system for large-scale machine learning. In: 12th USENIX symposium on operating systems design and implementation (OSDI 16), 2016; 265\u201383."},{"key":"4680_CR84","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren S, He K, Girshick R, Sun J. Faster r-cnn: towards real-time object detection with region proposal networks. Adv Neural Inform Process Syst. 2015;28:91\u20139.","journal-title":"Adv Neural Inform Process Syst"},{"key":"4680_CR85","unstructured":"Lin C-Y. Rouge: a package for automatic evaluation of summaries. 
In: Text summarization branches out, 2004; 74\u201381."}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04680-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-04680-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04680-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T11:43:13Z","timestamp":1768390993000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-04680-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,14]]},"references-count":85,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2026,1]]}},"alternative-id":["4680"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-04680-0","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,14]]},"assertion":[{"value":"30 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no competing interests, financial or otherwise, that could be perceived to influence the work reported in this manuscript. No funds, grants, or other support were received during the preparation of this manuscript. The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"This study did not involve human participants or animals. Ethical approval and informed consent were therefore not required.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval and Informed Consent"}}],"article-number":"109"}}