{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T18:47:40Z","timestamp":1769021260433,"version":"3.49.0"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"28-29","license":[{"start":{"date-parts":[[2021,7,9]],"date-time":"2021-07-09T00:00:00Z","timestamp":1625788800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,7,9]],"date-time":"2021-07-09T00:00:00Z","timestamp":1625788800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,11]]},"DOI":"10.1007\/s11042-021-11106-5","type":"journal-article","created":{"date-parts":[[2021,7,9]],"date-time":"2021-07-09T04:03:13Z","timestamp":1625803393000},"page":"35721-35740","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":24,"title":["An encoder-decoder based framework for hindi image caption generation"],"prefix":"10.1007","volume":"80","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2683-0542","authenticated-orcid":false,"given":"Alok","family":"Singh","sequence":"first","affiliation":[]},{"given":"Thoudam Doren","family":"Singh","sequence":"additional","affiliation":[]},{"given":"Sivaji","family":"Bandyopadhyay","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,7,9]]},"reference":[{"key":"11106_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"11106_CR2","unstructured":"Banerjee S, Lavie A (2005) Meteor: an automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"11106_CR3","doi-asserted-by":"crossref","unstructured":"Dhir R, Mishra SK, Saha S, Bhattacharyya P (2019) A deep attention based framework for image caption generation in hindi language. Comput y Sist 23(3)","DOI":"10.13053\/cys-23-3-3269"},{"key":"11106_CR4","doi-asserted-by":"crossref","unstructured":"Farhadi A, Hejrati M, Sadeghi MA, Young P, Rashtchian C, Hockenmaier J, Forsyth D (2010) Every picture tells a story: generating sentences from images. In: European conference on computer vision, pp 15\u201329. Springer","DOI":"10.1007\/978-3-642-15561-1_2"},{"issue":"9","key":"11106_CR5","doi-asserted-by":"publisher","first-page":"1627","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"PF Felzenszwalb","year":"2010","unstructured":"Felzenszwalb PF, Girshick RB, McAllester D, Ramanan D (2010) Object detection with discriminatively trained part-based models. IEEE Transactions on Pattern Analysis and Machine Intelligence 32(9):1627\u20131645. https:\/\/doi.org\/10.1109\/TPAMI.2009.167","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"11106_CR6","doi-asserted-by":"crossref","unstructured":"Gong Y, Wang L, Hodosh M, Hockenmaier J, Lazebnik S (2014) Improving image-sentence embeddings using large weakly annotated photo collections. In: European conference on computer vision, pp 529\u2013545. Springer","DOI":"10.1007\/978-3-319-10593-2_35"},{"issue":"6","key":"11106_CR7","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1109\/MSP.2017.2741510","volume":"34","author":"X He","year":"2017","unstructured":"He X, Deng L (2017) Deep learning for image-to-text generation: a technical overview. IEEE Signal Proc Mag 34(6):109\u2013116","journal-title":"IEEE Signal Proc Mag"},{"key":"11106_CR8","unstructured":"Hironobu YM, Takahashi H, Oka R (1999) Image-to-word transformation based on dividing and vector quantizing images with words. In: Boltzmann machines, neural networks, pp 405409"},{"issue":"8","key":"11106_CR9","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Computation 9(8):1735\u20131780. https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Computation"},{"key":"11106_CR10","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899","journal-title":"J Artif Intell Res"},{"issue":"6","key":"11106_CR11","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1145\/3295748","volume":"51","author":"M Hossain","year":"2019","unstructured":"Hossain M, Sohel F, Shiratuddin MF, Laga H (2019) A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CSUR) 51(6):118","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"11106_CR12","unstructured":"Isozaki H, Hirao T, Duh K, Sudoh K, Tsukada H (2010) Automatic evaluation of translation quality for distant language pairs. In: Proceedings of the 2010 conference on empirical methods in natural language processing, pp 944\u2013952. Association for Computational Linguistics, Cambridge. https:\/\/www.aclweb.org\/anthology\/D10-1092"},{"key":"11106_CR13","doi-asserted-by":"crossref","unstructured":"Jaffe A (2017) Generating image descriptions using multilingual data. In: Proceedings of the second conference on machine translation, pp 458\u2013464","DOI":"10.18653\/v1\/W17-4750"},{"key":"11106_CR14","doi-asserted-by":"crossref","unstructured":"Jia X, Gavves E, Fernando B, Tuytelaars T (2015) Guiding the long-short term memory model for image caption generation. In: Proceedings of the IEEE international conference on computer vision, pp 2407\u20132415","DOI":"10.1109\/ICCV.2015.277"},{"key":"11106_CR15","unstructured":"Kiros R, Salakhutdinov R, Zemel R (2014) Multimodal neural language models. In: Proceedings of the 31st international conference on international conference on machine learning - vol 32, ICML\u201914, pp II\u2013595\u2013II\u2013603. JMLR.org. http:\/\/dl.acm.org\/citation.cfm?id=3044805.3044959"},{"key":"11106_CR16","doi-asserted-by":"publisher","unstructured":"Laskar SR, Singh RP, Pakray P, Bandyopadhyay S (2019) English to Hindi multi-modal neural machine translation and Hindi image captioning. In: Proceedings of the 6th workshop on asian translation. Association for Computational Linguistics, Hong Kong, pp 62\u201367. https:\/\/doi.org\/10.18653\/v1\/D19-5205. https:\/\/www.aclweb.org\/anthology\/D19-5205","DOI":"10.18653\/v1\/D19-5205"},{"key":"11106_CR17","doi-asserted-by":"crossref","unstructured":"Liu M, Hu H, Li L, Yu Y, Guan W (2020) Chinese image caption generation via visual attention and topic modeling. IEEE Transactions on Cybernetics","DOI":"10.1109\/TCYB.2020.2997034"},{"key":"11106_CR18","doi-asserted-by":"publisher","unstructured":"Ma S, Han Y (2016) Describing images by feeding lstm with structural words. In: 2016 IEEE international conference on multimedia and expo (ICME), pp 1\u20136. https:\/\/doi.org\/10.1109\/ICME.2016.7552883","DOI":"10.1109\/ICME.2016.7552883"},{"key":"11106_CR19","unstructured":"Ordonez V, Kulkarni G, Berg TL (2011) Im2text: describing images using 1 million captioned photographs. In: Advances in neural information processing systems, pp 1143\u20131151"},{"key":"11106_CR20","doi-asserted-by":"publisher","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics, ACL \u201902, pp 311\u2013318. Association for Computational Linguistics, Stroudsburg. https:\/\/doi.org\/10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135"},{"key":"11106_CR21","doi-asserted-by":"crossref","unstructured":"Parida S, Bojar O, Dash SR (2019) Hindi visual genome: a dataset for multimodal english-to-hindi machine translation. Computaci\u00f3n y Sistemas. In print. Presented at CICLing 2019, La Rochelle, France","DOI":"10.13053\/cys-23-4-3294"},{"key":"11106_CR22","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: towards real-time object detection with region proposal networks. In: Cortes C, Lawrence N, Lee D, Sugiyama M, Garnett R (eds) Advances in neural information processing systems, vol 28. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2015\/file\/14bfa6bb14875e45bba028a21ed38046-Paper.pdf"},{"key":"11106_CR23","doi-asserted-by":"crossref","unstructured":"Ren Z, Wang X, Zhang N, Lv X, Li LJ (2017) Deep reinforcement learning-based image captioning with embedding reward. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 290\u2013298","DOI":"10.1109\/CVPR.2017.128"},{"key":"11106_CR24","doi-asserted-by":"publisher","unstructured":"Sanayai Meetei L, Singh TD, Bandyopadhyay S (2019) WAT2019: English-Hindi translation on Hindi visual genome dataset. In: Proceedings of the 6th workshop on asian translation. https:\/\/doi.org\/10.18653\/v1\/D19-5224. https:\/\/www.aclweb.org\/anthology\/D19-5224. Association for Computational Linguistics, Hong Kong, pp 181\u2013188","DOI":"10.18653\/v1\/D19-5224"},{"key":"11106_CR25","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"11106_CR26","doi-asserted-by":"publisher","unstructured":"Singh A, Meetei LS, Singh TD, Bandyopadhyay S (2021) Generation and evaluation of hindi image captions of visual genome. In: Maji AK, Saha G, Das S, Basu S, Tavares JMRS (eds) Proceedings of the international conference on computing and communication systems. Springer Singapore, Singapore, pp 65\u201373. https:\/\/doi.org\/10.1007\/978-981-33-4084-8_7","DOI":"10.1007\/978-981-33-4084-8_7"},{"key":"11106_CR27","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, pp 3104\u20133112"},{"key":"11106_CR28","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: a neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"2s","key":"11106_CR29","doi-asserted-by":"publisher","first-page":"40:1","DOI":"10.1145\/3115432","volume":"14","author":"C Wang","year":"2018","unstructured":"Wang C, Yang H, Meinel C (2018) Image captioning with deep bidirectional lstms and multi-task learning. ACM Trans Multimedia Comput Commun Appl 14(2s):40:1\u201340:20. https:\/\/doi.org\/10.1145\/3115432","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"11106_CR30","doi-asserted-by":"crossref","unstructured":"Wang M, Li S, Yang X, Luo C (2016) A parallel-fusion rnn-lstm architecture for image caption generation. In: 2016 IEEE international conference on image processing (ICIP), pp 4448\u20134452","DOI":"10.1109\/ICIP.2016.7533201"},{"key":"11106_CR31","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention. In: International conference on machine learning, pp 2048\u20132057"},{"key":"11106_CR32","unstructured":"Yang Y, Teo CL, Daum\u00e9 H III, Aloimonos Y (2011) Corpus-guided sentence generation of natural images. In: Proceedings of the conference on empirical methods in natural language processing, pp 444\u2013454. Association for Computational Linguistics"},{"key":"11106_CR33","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J (2014) From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. TACL 2:67\u201378","journal-title":"TACL"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11106-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-021-11106-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11106-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,11,30]],"date-time":"2021-11-30T17:35:46Z","timestamp":1638293746000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-021-11106-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,9]]},"references-count":33,"journal-issue":{"issue":"28-29","published-print":{"date-parts":[[2021,11]]}},"alternative-id":["11106"],"URL":"https:\/\/doi.org\/10.1007\/s11042-021-11106-5","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,7,9]]},"assertion":[{"value":"16 March 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 May 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 May 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 July 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}