{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T23:25:17Z","timestamp":1778196317586,"version":"3.51.4"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T00:00:00Z","timestamp":1743984000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T00:00:00Z","timestamp":1743984000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-03908-3","type":"journal-article","created":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T09:00:32Z","timestamp":1744016432000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["A Novel Technique for Image Captioning Based on Hierarchical Clustering and Deep Learning"],"prefix":"10.1007","volume":"6","author":[{"given":"Rizwan Ur","family":"Rahman","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5340-7777","authenticated-orcid":false,"given":"Pavan","family":"Kumar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aditya","family":"Mohan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rabia Musheer","family":"Aziz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Deepak Singh","family":"Tomar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,7]]},"reference":[{"key":"3908_CR1","doi-asserted-by":"crossref","unstructured":"Shuster K, Humeau S, Hu H, Bordes A, Weston J. Engaging image captioning via personality. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition 2019; 12516\u201312526.","DOI":"10.1109\/CVPR.2019.01280"},{"key":"3908_CR2","doi-asserted-by":"crossref","unstructured":"Oluwasammi A, Aftab MU, Qin Z, Ngo ST, Doan TV, Nguyen SB, Nguyen GH. Features to text: a comprehensive survey of deep learning on semantic segmentation and image captioning. Complexity 2021.","DOI":"10.1155\/2021\/5538927"},{"key":"3908_CR3","doi-asserted-by":"crossref","unstructured":"ur Rahman R, Tomar DS, Das S. Dynamic image based captcha. In 2012 International Conference on Communication Systems and Network Technologies 90\u201394. IEEE 2012, May.","DOI":"10.1109\/CSNT.2012.29"},{"issue":"2","key":"3908_CR4","doi-asserted-by":"publisher","first-page":"13","DOI":"10.3390\/data7020013","volume":"7","author":"GO Dos Santos","year":"2022","unstructured":"Dos Santos GO, Colombini EL, Avila S. # pracegover: A large dataset for image captioning in Portuguese. Data. 2022;7(2):13.","journal-title":"Data"},{"key":"3908_CR5","doi-asserted-by":"crossref","unstructured":"Stefanini M, Cornia M, Baraldi L, Cascianelli S, Fiameni G, Cucchiara R. From show to Tell: a survey on deep learning-based image captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence; 2022.","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"3908_CR6","doi-asserted-by":"crossref","unstructured":"Bleeker M, Rijke MD. Do lessons from metric learning generalize to image-caption retrieval? In European Conference on Information Retrieval 2022, April; 535\u2013551. Springer, Cham.","DOI":"10.1007\/978-3-030-99736-6_36"},{"issue":"7","key":"3908_CR7","doi-asserted-by":"publisher","first-page":"10051","DOI":"10.1007\/s11042-022-12042-8","volume":"81","author":"R Das","year":"2022","unstructured":"Das R, Singh TD. Assamese news image caption generation using attention mechanism. Multimedia Tools Appl. 2022;81(7):10051\u201369.","journal-title":"Multimedia Tools Appl"},{"key":"3908_CR8","doi-asserted-by":"publisher","first-page":"118669","DOI":"10.1016\/j.eswa.2022.118669","volume":"212","author":"A Salaberria","year":"2023","unstructured":"Salaberria A, Azkune G, de Lacalle OL, Soroa A, Agirre E. Image captioning for effective use of Language models in knowledge-based visual question answering. Expert Syst Appl. 2023;212:118669.","journal-title":"Expert Syst Appl"},{"key":"3908_CR9","doi-asserted-by":"publisher","first-page":"119305","DOI":"10.1016\/j.eswa.2022.119305","volume":"215","author":"C Balim","year":"2023","unstructured":"Balim C, \u00d6zkan K. Diagnosing fashion outfit compatibility with deep learning techniques. Expert Syst Appl. 2023;215:119305. https:\/\/doi.org\/10.1016\/j.eswa.2022.119305.","journal-title":"Expert Syst Appl"},{"key":"3908_CR10","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D. Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition 2015; 3156\u20133164.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"3908_CR11","unstructured":"Kiros R, Salakhutdinov R, &Zemel RS. Unifying visual-semantic embeddings with multimodal neural Language models. ArXiv Preprint 2014; arXiv:14112539. https:\/\/arxiv.org\/abs\/1411.2539."},{"key":"3908_CR12","doi-asserted-by":"crossref","unstructured":"Zhou L, Xu C, Koch P, Corso JJ. Watch what you just said: Image captioning with text-conditional attention. In Proceedings of the on Thematic Workshops of ACM Multimedia 2017, October; 305\u2013313.","DOI":"10.1145\/3126686.3126717"},{"key":"3908_CR13","doi-asserted-by":"crossref","unstructured":"Hubenthal M, Kumar S. Image-Text Pre-Training for Logo Recognition. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision 2023; 1145\u20131154.","DOI":"10.1109\/WACV56688.2023.00120"},{"key":"3908_CR14","doi-asserted-by":"publisher","first-page":"107928","DOI":"10.1016\/j.patcog.2021.107928","volume":"115","author":"J Ji","year":"2021","unstructured":"Ji J, Du Z, Zhang X. Divergent-convergent attention for image captioning. Pattern Recogn. 2021;115:107928.","journal-title":"Pattern Recogn"},{"issue":"13","key":"3908_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10489-022-03463-x","volume":"52","author":"J Prudviraj","year":"2022","unstructured":"Prudviraj J, Vishnu C, Mohan CK. M-FFN: multi-scale feature fusion network for image captioning. Appl Intell. 2022;52(13):1\u201313.","journal-title":"Appl Intell"},{"key":"3908_CR16","doi-asserted-by":"crossref","unstructured":"Wu TW, Huang JH, Lin J, Worring M. Expert-defined Keywords Improve Interpretability of Retinal Image Captioning. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision 2023; 1859\u20131868.","DOI":"10.1109\/WACV56688.2023.00190"},{"issue":"13814421","key":"3908_CR17","first-page":"67","volume":"22","author":"V Faber","year":"1994","unstructured":"Faber V. Clustering and the continuous k-means algorithm. Los Alamos Sci. 1994;22(13814421):67.","journal-title":"Los Alamos Sci"},{"key":"3908_CR18","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan, Zitnick CL. Microsoft coco: Common objects in context. In European conference on computer vision 2014, September; 740\u2013755. Springer, Cham.","DOI":"10.1007\/978-3-319-10602-1_48"},{"issue":"2","key":"3908_CR19","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1145\/276305.276312","volume":"27","author":"S Guha","year":"1998","unstructured":"Guha S, Rastogi R, Shim K. CURE: an efficient clustering algorithm for large databases. ACM Sigmod Record. 1998;27(2):73\u201384.","journal-title":"ACM Sigmod Record"},{"key":"3908_CR20","doi-asserted-by":"crossref","unstructured":"Urbanek J, Bordes F, Astolfi P, Williamson M, Sharma V, Romero-Soriano A. A picture is worth more than 77 text tokens: Evaluating clip-style models on dense captions. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition 2024; 26700\u201326709).","DOI":"10.1109\/CVPR52733.2024.02521"},{"key":"3908_CR21","doi-asserted-by":"publisher","unstructured":"Fan L, Krishnan D, Isola P, Katabi D, Tian Y. Improving clip training with Language rewrites. Adv Neural Inf Process Syst, 2024; 36. https:\/\/doi.org\/10.48550\/arXiv.2305.20088.","DOI":"10.48550\/arXiv.2305.20088"},{"key":"3908_CR22","unstructured":"Dzabraev M, Kunitsyn A, Ivaniuta A. VLRM: Vision-Language Models act as Reward Models for Image Captioning. arXiv preprint 2024; arXiv:2404.01911."},{"key":"3908_CR23","unstructured":"Zhang J, Zhang H, Wan X. Entity-Aware Multimodal Alignment Framework for News Image Captioning. arXiv preprint 2024; arXiv:2402.19404."},{"key":"3908_CR24","unstructured":"Simonyan K, Zisserman A. Very deep convolutional networks for large-scale image recognition. ArXiv Preprint 2014; arXiv:1409.1556. https:\/\/arxiv.org\/abs\/1409.1556."},{"key":"3908_CR25","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov, Bengio Y.  Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning 2015, June; 2048\u20132057."},{"key":"3908_CR26","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD. Glove: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP) 2014, October; 1532\u20131543.","DOI":"10.3115\/v1\/D14-1162"},{"key":"3908_CR27","unstructured":"Ioffe S, Szegedy C. Batch normalization: accelerating deep network training by reducing internal covariate shift. arXiv preprint 2015; arXiv:1502.03167. https:\/\/arxiv.org\/abs\/1409.1556."},{"issue":"8","key":"3908_CR28","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J. Long short-term memory. Neural Comput. 1997;9(8):1735\u201380.","journal-title":"Neural Comput"},{"issue":"1","key":"3908_CR29","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1162\/089120103321337458","volume":"29","author":"C Tillmann","year":"2003","unstructured":"Tillmann C, Ney H. Word reordering and a dynamic programming beam search algorithm for statistical machine translation. Comput Linguistics. 2003;29(1):97\u2013133.","journal-title":"Comput Linguistics"},{"key":"3908_CR30","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ. (2002, July). BLEU: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics (pp. 311\u2013318).","DOI":"10.3115\/1073083.1073135"},{"key":"3908_CR31","unstructured":"Banerjee, S., & Lavie, A. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization 2005, June; 65\u201372."},{"key":"3908_CR32","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L. Imagenet: A large-scale hierarchical image database. In 2009 IEEE conference on computer vision and pattern recognition 2009, June; 248\u2013255. IEEE.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"3908_CR33","unstructured":"Arthur D, Vassilvitskii S. k-means++: The advantages of careful seeding. Stanford 2006."}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03908-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-03908-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-03908-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T09:00:38Z","timestamp":1744016438000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-03908-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,7]]},"references-count":33,"journal-issue":{"issue":"4","published-online":{"date-parts":[[2025,4]]}},"alternative-id":["3908"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-03908-3","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,7]]},"assertion":[{"value":"11 May 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The article does not contain any studies with human participants or animals.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"There is no conflict of interest between the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"360"}}