{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:31:31Z","timestamp":1767141091666,"version":"build-2238731810"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2021,6,17]],"date-time":"2021-06-17T00:00:00Z","timestamp":1623888000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,6,17]],"date-time":"2021-06-17T00:00:00Z","timestamp":1623888000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100019550","name":"Scheme for Promotion of Academic and Research Collaboration","doi-asserted-by":"crossref","award":["P995 SPARC\/2018-2019\/119\/SL(IN)"],"award-info":[{"award-number":["P995 SPARC\/2018-2019\/119\/SL(IN)"]}],"id":[{"id":"10.13039\/501100019550","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2022,2]]},"DOI":"10.1007\/s00530-021-00816-3","type":"journal-article","created":{"date-parts":[[2021,6,17]],"date-time":"2021-06-17T10:03:59Z","timestamp":1623924239000},"page":"195-207","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Attention based video captioning framework for Hindi"],"prefix":"10.1007","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2683-0542","authenticated-orcid":false,"given":"Alok","family":"Singh","sequence":"first","affiliation":[]},{"given":"Thoudam Doren","family":"Singh","sequence":"additional","affiliation":[]},{"given":"Sivaji","family":"Bandyopadhyay","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,6,17]]},"reference":[{"key":"816_CR1","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Venugopalan, S., Rohrbach, M., Mooney, R., Saenko, K., Darrell, T.: Deep compositional captioning: Describing novel object categories without paired training data. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.8"},{"issue":"12","key":"816_CR2","doi-asserted-by":"publisher","first-page":"833","DOI":"10.1016\/S0262-8856(01)00047-6","volume":"19","author":"D Ayers","year":"2001","unstructured":"Ayers, D., Shah, M.: Monitoring human behavior from video taken in an office environment. Image Vis. Comput. 19(12), 833\u2013846 (2001)","journal-title":"Image Vis. Comput."},{"key":"816_CR3","unstructured":"Brand, M.: \u201cThe inverse hollywood problem\u201d: From video to scripts and storyboards via causal analysis. In: AAAI\/IAAI, pp. 132\u2013137. Citeseer (1997)"},{"key":"816_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"816_CR5","first-page":"8167","volume":"33","author":"J Chen","year":"2019","unstructured":"Chen, J., Pan, Y., Li, Y., Yao, T., Chao, H., Mei, T.: Temporal deformable convolutional encoder\u2013decoder networks for video captioning. Proc. AAAI Conf. Artif. Intell. 33, 8167\u20138174 (2019)","journal-title":"Proc. AAAI Conf. Artif. 
Intell."},{"key":"816_CR6","doi-asserted-by":"crossref","unstructured":"Dhir, R., Mishra, S.K., Saha, S., Bhattacharyya, P.: A deep attention based framework for image caption generation in Hindi language. Computaci\u00f3n y Sistemas 23(3) (2019)","DOI":"10.13053\/cys-23-3-3269"},{"issue":"12","key":"816_CR7","doi-asserted-by":"publisher","first-page":"1703","DOI":"10.1007\/s00371-018-1591-x","volume":"35","author":"X Du","year":"2019","unstructured":"Du, X., Yuan, J., Hu, L., Dai, Y.: Description generation of open-domain videos incorporating multimodal features and bidirectional encoder. Vis. Comput. 35(12), 1703\u20131712 (2019)","journal-title":"Vis. Comput."},{"issue":"9","key":"816_CR8","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao, L., Guo, Z., Zhang, H., Xu, X., Shen, H.T.: Video captioning with attention-based LSTM and semantic consistency. IEEE Trans. Multimed. 19(9), 2045\u20132055 (2017)","journal-title":"IEEE Trans. Multimed."},{"key":"816_CR9","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., Saenko, K.: Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: Proceedings of the IEEE international conference on computer vision, pp. 2712\u20132719 (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"816_CR10","doi-asserted-by":"publisher","unstructured":"Jin, T., Huang, S., Chen, M., Li, Y., Zhang, Z.: Sbat: Video captioning with sparse boundary-aware transformer. In: Bessiere, C. (ed.) Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, IJCAI-20, pp. 630\u2013636. International Joint Conferences on Artificial Intelligence Organization (2020). https:\/\/doi.org\/10.24963\/ijcai.2020\/88. Main track","DOI":"10.24963\/ijcai.2020\/88"},{"key":"816_CR11","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"816_CR12","doi-asserted-by":"crossref","unstructured":"Kojima, A., Izumi, M., Tamura, T., Fukunaga, K.: Generating natural language description of human behavior from video images. In: Proceedings 15th International Conference on Pattern Recognition. ICPR-2000, vol.\u00a04, pp. 728\u2013731. IEEE (2000)","DOI":"10.1109\/ICPR.2000.903020"},{"issue":"2","key":"816_CR13","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. Int. J. Comput. Vision 50(2), 171\u2013184 (2002)","journal-title":"Int. J. Comput. Vision"},{"key":"816_CR14","doi-asserted-by":"crossref","unstructured":"Kollnig, H., Nagel, H.H., Otte, M.: Association of motion verbs with vehicle movements extracted from dense optical flow fields. In: European Conference on Computer Vision, pp. 338\u2013347. 
Springer (1994)","DOI":"10.1007\/BFb0028366"},{"key":"816_CR15","doi-asserted-by":"crossref","unstructured":"Li, Y., Yao, T., Pan, Y., Chao, H., Mei, T.: Jointly localizing and describing events for dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7492\u20137500 (2018)","DOI":"10.1109\/CVPR.2018.00782"},{"key":"816_CR16","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"816_CR17","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 375\u2013383 (2017)","DOI":"10.1109\/CVPR.2017.345"},{"issue":"5","key":"816_CR18","doi-asserted-by":"publisher","first-page":"339","DOI":"10.1007\/s00371-010-0423-4","volume":"26","author":"M Oshita","year":"2010","unstructured":"Oshita, M.: Generating animation from natural language texts and semantic analysis for motion search and scheduling. Vis. Comput. 26(5), 339\u2013352 (2010)","journal-title":"Vis. Comput."},{"key":"816_CR19","doi-asserted-by":"crossref","unstructured":"Pan, Y., Mei, T., Yao, T., Li, H., Rui, Y.: Jointly modeling embedding and translation to bridge video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4594\u20134602 (2016)","DOI":"10.1109\/CVPR.2016.497"},{"key":"816_CR20","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, H., Mei, T.: Video captioning with transferred semantic attributes. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 6504\u20136512 (2017)","DOI":"10.1109\/CVPR.2017.111"},{"key":"816_CR21","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics, pp. 311\u2013318. Association for Computational Linguistics (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"816_CR22","unstructured":"Pascanu, R., Gulcehre, C., Cho, K., Bengio, Y.: How to construct deep recurrent neural networks. arXiv:1312.6026 (2013)"},{"key":"816_CR23","doi-asserted-by":"crossref","unstructured":"Perez-Martin, J., Bustos, B., Perez, J.: Improving video captioning with temporal composition of a visual-syntactic embedding. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3039\u20133049 (2021)","DOI":"10.1109\/WACV48630.2021.00308"},{"issue":"6","key":"816_CR24","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"1","key":"816_CR25","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1007\/s10044-018-00770-3","volume":"23","author":"S Sah","year":"2020","unstructured":"Sah, S., Nguyen, T., Ptucha, R.: Understanding temporal structure for video captioning. Pattern Anal. Appl. 23(1), 147\u2013159 (2020)","journal-title":"Pattern Anal. 
Appl."},{"key":"816_CR26","doi-asserted-by":"publisher","unstructured":"Sanayai\u00a0Meetei, L., Singh, T.D., Bandyopadhyay, S.: WAT2019: English-Hindi translation on Hindi visual genome dataset. In: Proceedings of the 6th Workshop on Asian Translation, pp. 181\u2013188. Association for Computational Linguistics, Hong Kong, China (2019). https:\/\/doi.org\/10.18653\/v1\/D19-5224. https:\/\/www.aclweb.org\/anthology\/D19-5224","DOI":"10.18653\/v1\/D19-5224"},{"key":"816_CR27","unstructured":"Shetty, R., Laaksonen, J.: Video captioning with recurrent networks based on frame-and video-level features and visual content classification. arXiv:1512.02949 (2015)"},{"key":"816_CR28","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556 (2014)"},{"key":"816_CR29","doi-asserted-by":"publisher","unstructured":"Singh, A., Meetei, L.S., Singh, T.D., Bandyopadhyay, S.: Generation and evaluation of hindi image captions of visual genome. In: Proceedings of the International Conference on Computing and Communication Systems: I3CS 2020, NEHU, Shillong, India, vol. 170, p.\u00a065. Springer Nature (2021). https:\/\/doi.org\/10.1007\/978-981-33-4084-8_7","DOI":"10.1007\/978-981-33-4084-8_7"},{"key":"816_CR30","unstructured":"Singh, A., Singh, T.D., Bandyopadhyay, S.: Nits-vc system for vatex video captioning challenge 2020. arXiv:2006.04058 (2020)"},{"key":"816_CR31","doi-asserted-by":"crossref","unstructured":"Singh, A., Thounaojam, D.M., Chakraborty, S.: A novel automatic shot boundary detection algorithm: robust to illumination and motion effect. Signal, Image and Video Processing 1\u20139 (2019)","DOI":"10.1007\/s11760-019-01593-3"},{"key":"816_CR32","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V., Alemi, A.: Inception-v4, inception-resnet and the impact of residual connections on learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a031 (2017)","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"816_CR33","doi-asserted-by":"crossref","unstructured":"Tan, G., Liu, D., Wang, M., Zha, Z.J.: Learning to discretely compose reasoning module networks for video captioning. arXiv:2007.09049 (2020)","DOI":"10.24963\/ijcai.2020\/104"},{"key":"816_CR34","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"816_CR35","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. In: Advances in neural information processing systems, pp. 5998\u20136008 (2017)"},{"key":"816_CR36","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"816_CR37","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: Proceedings of the IEEE international conference on computer vision, pp. 
4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"816_CR38","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. arXiv:1412.4729 (2014)","DOI":"10.3115\/v1\/N15-1173"},{"key":"816_CR39","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"816_CR40","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Jiang, W., Wang, J., Liu, W.: Controllable video captioning with pos sequence guidance based on gated fusion network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2641\u20132650 (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"816_CR41","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"816_CR42","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: Msr-vtt: A large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"816_CR43","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning, pp. 2048\u20132057 (2015)"},{"key":"816_CR44","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE international conference on computer vision, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"816_CR45","doi-asserted-by":"crossref","unstructured":"Yu, H., Wang, J., Huang, Z., Yang, Y., Xu, W.: Video paragraph captioning using hierarchical recurrent neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4584\u20134593 (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"816_CR46","doi-asserted-by":"crossref","unstructured":"Zheng, Q., Wang, C., Tao, D.: Syntax-aware action targeting for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
13096\u201313105 (2020)","DOI":"10.1109\/CVPR42600.2020.01311"}],"updated-by":[{"DOI":"10.1007\/s00530-021-00834-1","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2021,7,17]],"date-time":"2021-07-17T00:00:00Z","timestamp":1626480000000}}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-021-00816-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-021-00816-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-021-00816-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T21:24:23Z","timestamp":1725225863000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-021-00816-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,17]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2022,2]]}},"alternative-id":["816"],"URL":"https:\/\/doi.org\/10.1007\/s00530-021-00816-3","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,6,17]]},"assertion":[{"value":"11 November 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 June 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 July 2021","order":4,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":5,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A Correction to this paper has been published:","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"https:\/\/doi.org\/10.1007\/s00530-021-00834-1","URL":"https:\/\/doi.org\/10.1007\/s00530-021-00834-1","order":7,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there are no conflicts of interest regarding the publication of this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
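The record above is a standard Crossref works response, so it can be re-fetched or post-processed programmatically. The minimal sketch below pulls the same work from Crossref's public REST API (GET https://api.crossref.org/works/{doi}) and prints a few of the fields shown in the record; the endpoint and all JSON keys match the record itself, while the mailto address in the User-Agent is a placeholder you should replace with your own contact, per Crossref's "polite pool" guidance.

```python
import json
from urllib.request import Request, urlopen

# Crossref's REST API returns the same envelope as the record above:
# {"status": "ok", "message-type": "work", "message": {...}}
DOI = "10.1007/s00530-021-00816-3"
url = f"https://api.crossref.org/works/{DOI}"

# A descriptive User-Agent with a mailto routes the request to Crossref's
# "polite pool"; the address below is a placeholder, not a real contact.
req = Request(url, headers={"User-Agent": "crossref-demo/0.1 (mailto:you@example.org)"})
with urlopen(req) as resp:
    work = json.load(resp)["message"]

print(work["title"][0])             # Attention based video captioning framework for Hindi
print(work["container-title"][0],   # Multimedia Systems
      work["volume"],               # 28
      work["page"])                 # 195-207
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
print("cited by:", work["is-referenced-by-count"],
      "| references:", work["references-count"])
```

Note that this record carries an "updated-by" entry of type "correction" pointing at https://doi.org/10.1007/s00530-021-00834-1, so downstream tooling may want to check that field before citing the version of record.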