{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T13:59:24Z","timestamp":1776693564269,"version":"3.51.2"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,9,4]],"date-time":"2023-09-04T00:00:00Z","timestamp":1693785600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,4]],"date-time":"2023-09-04T00:00:00Z","timestamp":1693785600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"DST WOS-A","award":["WOSA\/DST\/ET-6\/2021"],"award-info":[{"award-number":["WOSA\/DST\/ET-6\/2021"]}]},{"name":"DST WOS-A","award":["WOSA\/DST\/ET-6\/2021"],"award-info":[{"award-number":["WOSA\/DST\/ET-6\/2021"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1007\/s11760-023-02725-6","type":"journal-article","created":{"date-parts":[[2023,9,4]],"date-time":"2023-09-04T19:03:10Z","timestamp":1693854190000},"page":"265-274","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Automatic image captioning system based on augmentation and ranking mechanism"],"prefix":"10.1007","volume":"18","author":[{"given":"B. S.","family":"Revathi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"A. 
Meena","family":"Kowshalya","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,9,4]]},"reference":[{"issue":"10","key":"2725_CR1","doi-asserted-by":"publisher","first-page":"2024","DOI":"10.3390\/app9102024","volume":"9","author":"R Stani\u016bt\u0117","year":"2019","unstructured":"Stani\u016bt\u0117, R., \u0160e\u0161ok, D.: A systematic literature review on image captioning. Appl. Sci. 9(10), 2024 (2019)","journal-title":"Appl. Sci."},{"key":"2725_CR2","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural Machine Translation by Jointly Learning to Align and Translate. arXiv 2015, arXiv:1409.0473"},{"key":"2725_CR3","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Polosukhin, I.: Attention is all you need.\u00a0Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"2725_CR4","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: generating sentences from images. In:\u00a0Computer Vision\u2013ECCV 2010: 11th European Conference on Computer Vision, Heraklion, Crete, Greece, September 5\u201311, 2010, Proceedings, Part IV 11\u00a0(pp. 15\u201329). Springer, Berlin, Heidelberg (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"2725_CR5","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., Premraj, V., Ordonez, V., Dhar, S., Li, S., Choi, Y., Berg, T.L.: Babytalk: understanding and generating simple image descriptions.\u00a0IEEE Trans. Pattern Anal. Mach. Intell.,\u00a035(12), 2891\u20132903 (2013)","DOI":"10.1109\/TPAMI.2012.162"},{"key":"2725_CR6","unstructured":"Li, S., Kulkarni, G., Berg, T., Berg, A., Choi, Y.: Composing simple image descriptions using web-scale n-grams. In\u00a0Proceedings of the Fifteenth Conference on Computational Natural Language Learning\u00a0(pp. 
220\u2013228) (2011)"},{"key":"2725_CR7","unstructured":"Yang, Y., Teo, C., Daum\u00e9 III, H., & Aloimonos, Y. (2011, July). Corpus-guided sentence generation of natural images. In\u00a0Proceedings of the 2011 conference on empirical methods in natural language processing\u00a0(pp. 444\u2013454)."},{"key":"2725_CR8","unstructured":"Kuznetsova, P., Ordonez, V., Berg, A., Berg, T., Choi, Y.: Collective generation of natural image descriptions. In:\u00a0Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)\u00a0(pp. 359\u2013368) (2012)"},{"key":"2725_CR9","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1162\/tacl_a_00188","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova, P., Ordonez, V., Berg, T.L., Choi, Y.: Treetalk: Composition and compression of trees for image descriptions. Trans. Assoc. Comput. Linguistics 2, 351\u2013362 (2014)","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"2725_CR10","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In:\u00a0Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 3128\u20133137) (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"2725_CR11","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Yuille, A.L.: Explain images with multimodal recurrent neural networks.\u00a0arXiv preprint arXiv:1410.1090 (2014)"},{"key":"2725_CR12","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"2725_CR13","unstructured":"Jin, J., Fu, K., Cui, R., Sha, F., Zhang, C.: Aligning where to see and what to tell: image caption with region-based attention and scene factorization.\u00a0arXiv preprint arXiv:1506.06272 (2015)"},{"key":"2725_CR14","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.: Multimodal neural language models. In:\u00a0International Conference on Machine Learning\u00a0(pp. 595\u2013603). PMLR (2014)"},{"key":"2725_CR15","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In:\u00a0Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 2625\u20132634) (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"2725_CR16","doi-asserted-by":"crossref","unstructured":"Tomasi, C., Manduchi, R.: Bilateral filtering for gray and color images. In:\u00a0Sixth International Conference on Computer Vision (IEEE Cat. No. 98CH36271)\u00a0(pp. 839\u2013846). IEEE (1998)","DOI":"10.1109\/ICCV.1998.710815"},{"key":"2725_CR17","doi-asserted-by":"crossref","unstructured":"Felzenszwalb, P., McAllester, D., Ramanan, D.: A discriminatively trained, multiscale, deformable part model. In:\u00a02008 IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 1\u20138) (2008)","DOI":"10.1109\/CVPR.2008.4587597"},{"key":"2725_CR18","doi-asserted-by":"crossref","unstructured":"Divvala, S.K., Hoiem, D., Hays, J.H., Efros, A.A., Hebert, M.: An empirical study of context in object detection. In:\u00a02009 IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 
1271\u20131278) (2009)","DOI":"10.1109\/CVPRW.2009.5206532"},{"key":"2725_CR19","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1016\/S0079-6123(06)55002-2","volume":"155","author":"A Oliva","year":"2006","unstructured":"Oliva, A., Torralba, A.: Building the gist of a scene: the role of global image features in recognition. Prog. Brain Res. 155, 23\u201336 (2006)","journal-title":"Prog. Brain Res."},{"key":"2725_CR20","doi-asserted-by":"crossref","unstructured":"Curran, J.R., Clark, S., Bos, J.: Linguistically motivated large-scale NLP with C&C and Boxer. In:\u00a0Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions\u00a0(pp. 33\u201336) (2007)","DOI":"10.3115\/1557769.1557781"},{"key":"2725_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In:\u00a0Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 770\u2013778) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2725_CR22","unstructured":"Perez, L., Wang, J.: The effectiveness of data augmentation in image classification using deep learning.\u00a0arXiv preprint arXiv:1712.04621 (2017)"},{"issue":"17","key":"2725_CR23","doi-asserted-by":"publisher","first-page":"5978","DOI":"10.3390\/app10175978","volume":"10","author":"V Atliha","year":"2020","unstructured":"Atliha, V., \u0160e\u0161ok, D.: Text augmentation using BERT for image captioning. Appl. Sci. 10(17), 5978 (2020)","journal-title":"Appl. Sci."},{"key":"2725_CR24","doi-asserted-by":"crossref","unstructured":"Cui, Y., Yang, G., Veit, A., Huang, X., Belongie, S.: Learning to evaluate image captioning. In:\u00a0Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 
5804\u20135812) (2018)","DOI":"10.1109\/CVPR.2018.00608"},{"key":"2725_CR25","doi-asserted-by":"crossref","unstructured":"He, X., Wei, D., Lam, K.M., Li, J., Wang, L., Jia, W., Wu, Q.: Canny edge detection using bilateral filter on real hexagonal structure. In:\u00a0Advanced Concepts for Intelligent Vision Systems: 12th International Conference, ACIVS 2010, Sydney, Australia, December 13\u201316, 2010, Proceedings, Part I 12\u00a0(pp. 233\u2013244). Springer, Berlin, Heidelberg (2010)","DOI":"10.1007\/978-3-642-17688-3_23"},{"issue":"1","key":"2725_CR26","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1007\/s11063-018-09973-5","volume":"50","author":"P Cao","year":"2019","unstructured":"Cao, P., Yang, Z., Sun, L., Liang, Y., Yang, M.Q., Guan, R.: Image captioning with bidirectional semantic attention-based guiding of long short-term memory. Neural Process. Lett. 50(1), 103\u2013119 (2019)","journal-title":"Neural Process. Lett."},{"key":"2725_CR27","doi-asserted-by":"crossref","unstructured":"Chowdhary, K.: Natural language processing.\u00a0Fund. Artif. Intell., pp. 603\u2013649 (2020)","DOI":"10.1007\/978-81-322-3972-7_19"},{"key":"2725_CR28","doi-asserted-by":"crossref","unstructured":"Makav, B., K\u0131l\u0131\u00e7, V.: A new image captioning approach for visually impaired people. In:\u00a02019 11th International Conference on Electrical and Electronics Engineering (ELECO)\u00a0(pp. 945\u2013949). IEEE (2019)","DOI":"10.23919\/ELECO47770.2019.8990630"},{"issue":"8","key":"2725_CR29","doi-asserted-by":"publisher","first-page":"2811","DOI":"10.3390\/s21082811","volume":"21","author":"W Ullah","year":"2021","unstructured":"Ullah, W., Ullah, A., Hussain, T., Khan, Z.A., Baik, S.W.: An efficient anomaly recognition framework using an attention residual LSTM in surveillance videos. 
Sensors 21(8), 2811 (2021)","journal-title":"Sensors"},{"key":"2725_CR30","doi-asserted-by":"crossref","unstructured":"Ullah, W., Ullah, A., Hussain, T., Muhammad, K., Heidari, A.A., Del Ser, J., De Albuquerque, V.H.C.: Artificial Intelligence of Things-assisted two-stream neural network for anomaly detection in surveillance Big Video Data.\u00a0Future Generat. Comput. Syst.,129, 286\u2013297 (2022)","DOI":"10.1016\/j.future.2021.10.033"},{"key":"2725_CR31","doi-asserted-by":"publisher","first-page":"16979","DOI":"10.1007\/s11042-020-09406-3","volume":"80","author":"W Ullah","year":"2021","unstructured":"Ullah, W., Ullah, A., Haq, I.U., Muhammad, K., Sajjad, M., Baik, S.W.: CNN features with bi-directional LSTM for real-time anomaly detection in surveillance networks. Multimedia tools and applications 80, 16979\u201316995 (2021)","journal-title":"Multimedia tools and applications"},{"key":"2725_CR32","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109456","volume":"253","author":"W Ullah","year":"2022","unstructured":"Ullah, W., Hussain, T., Khan, Z.A., Haroon, U., Baik, S.W.: Intelligent dual stream CNN and echo state network for anomaly detection. Knowl.Based Syst. 253, 109456 (2022)","journal-title":"Knowl.Based Syst."},{"issue":"3","key":"2725_CR33","doi-asserted-by":"publisher","first-page":"445","DOI":"10.1007\/s00371-018-1566-y","volume":"35","author":"X Liu","year":"2019","unstructured":"Liu, X., Xu, Q., Wang, N.: A survey on deep neural network-based image captioning. Vis. Comput. 35(3), 445\u2013470 (2019)","journal-title":"Vis. Comput."},{"key":"2725_CR34","doi-asserted-by":"publisher","first-page":"9627","DOI":"10.1109\/TIP.2020.3028651","volume":"29","author":"M Yang","year":"2020","unstructured":"Yang, M., Liu, J., Shen, Y., Zhao, Z., Chen, X., Wu, Q., Li, C.: An ensemble of generation-and retrieval-based image captioning with dual generator generative adversarial network. IEEE Trans. Image Process. 
29, 9627\u20139640 (2020)","journal-title":"IEEE Trans. Image Process."},{"issue":"6","key":"2725_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain, M.Z., Sohel, F., Shiratuddin, M.F., Laga, H.: A comprehensive survey of deep learning for image captioning. ACM Comput. Surv. (CsUR) 51(6), 1\u201336 (2019)","journal-title":"ACM Comput. Surv. (CsUR)"},{"key":"2725_CR36","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In:\u00a0Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics\u00a0(pp. 311\u2013318) (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"2725_CR37","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In:\u00a0Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition\u00a0(pp. 4566\u20134575) (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2725_CR38","unstructured":"Banerjee, S., Lavie, A.: METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In:\u00a0Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization\u00a0(pp. 65\u201372) (2005)"},{"issue":"2013","key":"2725_CR39","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: Data, models and evaluation metrics. J. Artif. Intell. Res. 47(2013), 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"2725_CR40","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.L.: Im2text: Describing images using 1 million captioned photographs. Adv. Neural Inf. Process. 
Syst., pp. 1143\u20131151 (2011)"},{"issue":"8","key":"2725_CR41","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"2725_CR42","unstructured":"Lin, D.: An information-theoretic definition of similarity. In: ICML, 296\u2013304 (1998)"},{"key":"2725_CR43","unstructured":"Wu, J.: Introduction to convolutional neural networks.\u00a0National Key Lab for Novel Software Technology. Nanjing University. China,\u00a05(23), 495 (2017)"},{"key":"2725_CR44","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. In:\u00a0Proceedings of the IEEE International Conference on Computer Vision\u00a0(pp. 1026\u20131034) (2015)","DOI":"10.1109\/ICCV.2015.123"},{"issue":"1","key":"2725_CR45","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1145\/361237.361242","volume":"15","author":"RO Duda","year":"1972","unstructured":"Duda, R.O., Hart, P.E.: Use of the Hough transformation to detect lines and curves in pictures. Commun. ACM 15(1), 11\u201315 (1972)","journal-title":"Commun. ACM"},{"key":"2725_CR46","unstructured":"Bieder, F., Sandk\u00fchler, R., Cattin, P.C. Comparison of methods generalizing max-and average-pooling.\u00a0arXiv preprint arXiv:2103.01746 (2021)"},{"key":"2725_CR47","doi-asserted-by":"crossref","unstructured":"Wilt, C.M., Thayer, J.T., Ruml, W.: A comparison of greedy search algorithms. In:\u00a0Third Annual Symposium on Combinatorial Search (2010)","DOI":"10.1609\/socs.v1i1.18182"},{"key":"2725_CR48","doi-asserted-by":"crossref","unstructured":"Tomasi, C., Manduchi, R.: Bilateral filtering for gray and color images. In:\u00a0Sixth International Conference on Computer Vision (IEEE Cat. No. 
98CH36271)\u00a0(pp. 839\u2013846). IEEE (1998)","DOI":"10.1109\/ICCV.1998.710815"},{"key":"2725_CR49","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv 2015.\u00a0arXiv preprint arXiv:1502.03167 (2015)"},{"key":"2725_CR50","unstructured":"Vijayaraju, N.: Image retrieval using image captioning (2019)"},{"key":"2725_CR51","volume-title":"Document image analysis","author":"L O'Gorman","year":"1995","unstructured":"O\u2019Gorman, L., Kasturi, R.: Document image analysis, vol. 39. IEEE Computer Society Press, Los Alamitos (1995)"},{"key":"2725_CR52","doi-asserted-by":"crossref","unstructured":"Guo, K., Wu, Z., Wang, W., Ren, S., Zhou, X., Gadekallu, T.R., Liu, C.: GRTR: gradient rebalanced traffic sign recognition for autonomous vehicles.\u00a0IEEE Trans. Auto. Sci. Eng. (2023)","DOI":"10.1109\/TASE.2023.3270202"},{"key":"2725_CR53","doi-asserted-by":"crossref","unstructured":"Teng, L., Qiao, Y., Shafiq, M., Srivastava, G., Javed, A.R., Gadekallu, T.R., Yin, S.: FLPK-BiSeNet: Federated learning based on priori knowledge and bilateral segmentation network for image edge extraction.\u00a0IEEE Trans. Netw. Serv. Manag. (2023)","DOI":"10.1109\/TNSM.2023.3273991"},{"issue":"10","key":"2725_CR54","first-page":"571","volume":"10","author":"H Aldabbas","year":"2019","unstructured":"Aldabbas, H., Asad, M., Ryalat, M.H., Malik, K.R., Qureshi, M.Z.A.: Data augmentation to stabilize image caption generation models in deep learning. 
Int J Adv Comput Sci Appl 10(10), 571\u2013579 (2019)","journal-title":"Int J Adv Comput Sci Appl"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-023-02725-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-023-02725-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-023-02725-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T13:16:53Z","timestamp":1730035013000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-023-02725-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,4]]},"references-count":54,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,2]]}},"alternative-id":["2725"],"URL":"https:\/\/doi.org\/10.1007\/s11760-023-02725-6","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,9,4]]},"assertion":[{"value":"9 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 July 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 August 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}