{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T03:33:28Z","timestamp":1782358408031,"version":"3.54.5"},"reference-count":161,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2021,2,27]],"date-time":"2021-02-27T00:00:00Z","timestamp":1614384000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,27]],"date-time":"2021-02-27T00:00:00Z","timestamp":1614384000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"published-print":{"date-parts":[[2021,4]]},"DOI":"10.1007\/s42979-021-00487-x","type":"journal-article","created":{"date-parts":[[2021,2,27]],"date-time":"2021-02-27T13:02:45Z","timestamp":1614430965000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["Exploring Video Captioning Techniques: A Comprehensive Survey on Deep Learning Methods"],"prefix":"10.1007","volume":"2","author":[{"given":"Saiful","family":"Islam","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Aurpan","family":"Dash","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ashek","family":"Seum","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Amir Hossain","family":"Raj","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tonmoy","family":"Hossain","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Faisal Muhammad","family":"Shah","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2021,2,27]]},"reference":[{"key":"487_CR1","doi-asserted-by":"crossref","unstructured":"Aafaq N, Akhtar N, Liu W, Gilani SZ, Mian A. Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019a;12487\u201312496.","DOI":"10.1109\/CVPR.2019.01277"},{"key":"487_CR2","unstructured":"Aafaq N, Akhtar N, Liu W, Mian A. Empirical autopsy of deep video captioning frameworks. 2019b. arXiv preprint arXiv:191109345"},{"issue":"6","key":"487_CR3","doi-asserted-by":"publisher","first-page":"992","DOI":"10.3390\/sym12060992","volume":"12","author":"A Aggarwal","year":"2020","unstructured":"Aggarwal A, Chauhan A, Kumar D, Mittal M, Roy S, Kim Th. Video caption based searching using end-to-end dense captioning and sentence embeddings. Symmetry. 2020;12(6):992.","journal-title":"Symmetry"},{"key":"487_CR4","doi-asserted-by":"crossref","unstructured":"Agrawal P, Yadav R, Yadav V, De K, Roy PP. Caption-based region extraction in images. In: Proceedings of 3rd International Conference on Computer Vision and Image Processing, Springer, 2020;27\u201338.","DOI":"10.1007\/978-981-32-9291-8_3"},{"key":"487_CR5","unstructured":"Akbari H, Palangi H, Yang J, Rao S, Celikyilmaz A, Fernandez R, Smolensky P, Gao J, Chang SF. Neuro-symbolic representations for video captioning: A case for leveraging inductive biases for vision and language. 2020. arXiv preprint arXiv:201109530"},{"key":"487_CR6","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Lawrence\u00a0Zitnick C, Parikh D. Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision, 2015;2425\u20132433"},{"key":"487_CR7","unstructured":"Ba JL, Kiros JR, Hinton GE. Layer normalization. 2016. arXiv preprint arXiv:160706450"},{"key":"487_CR8","unstructured":"Ballas N, Yao L, Pal C, Courville A. Delving deeper into convolutional networks for learning video representations. 2015. arXiv preprint arXiv:151106432"},{"key":"487_CR9","unstructured":"Banerjee S, Lavie A. Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, 2005;65\u201372."},{"key":"487_CR10","doi-asserted-by":"crossref","unstructured":"Bantupalli K, Xie Y. American sign language recognition using deep learning and computer vision. In: 2018 IEEE International Conference on Big Data (Big Data), IEEE, 2018; 4896\u20134899","DOI":"10.1109\/BigData.2018.8622141"},{"key":"487_CR11","doi-asserted-by":"crossref","unstructured":"Baraldi L, Grana C, Cucchiara R. Hierarchical boundary-aware neural encoder for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017;1657\u20131666.","DOI":"10.1109\/CVPR.2017.339"},{"key":"487_CR12","doi-asserted-by":"crossref","unstructured":"Bin Y, Yang Y, Shen F, Xu X, Shen HT. Bidirectional long-short term memory for video description. In: Proceedings of the 24th ACM international conference on Multimedia, 2016;436\u2013440 .","DOI":"10.1145\/2964284.2967258"},{"key":"487_CR13","doi-asserted-by":"crossref","unstructured":"Blei DM, Jordan MI. Modeling annotated data. In: Proceedings of the 26th annual international ACM SIGIR conference on Research and development in informaion retrieval, 2003;127\u2013134.","DOI":"10.1145\/860435.860460"},{"key":"487_CR14","unstructured":"Chen DL, Dolan WB. Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1, Association for Computational Linguistics, 2011;190\u2013200."},{"key":"487_CR15","doi-asserted-by":"crossref","unstructured":"Chen H, Lin K, Maye A, Li J, Hu X. A semantics-assisted video captioning model trained with scheduled sampling. 2019a. arXiv preprint arXiv:190900121","DOI":"10.3389\/frobt.2020.475767"},{"key":"487_CR16","unstructured":"Chen H, Li J, Hu X. Delving deeper into the decoder for video captioning. 2020a. arXiv preprint arXiv:200105614"},{"key":"487_CR17","doi-asserted-by":"crossref","unstructured":"Chen J, Jin Q. Better captioning with sequence-level exploration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020;10890\u201310899.","DOI":"10.1109\/CVPR42600.2020.01090"},{"key":"487_CR18","doi-asserted-by":"publisher","first-page":"8167","DOI":"10.1609\/aaai.v33i01.33018167","volume":"33","author":"J Chen","year":"2019","unstructured":"Chen J, Pan Y, Li Y, Yao T, Chao H, Mei T. Temporal deformable convolutional encoder-decoder networks for video captioning. Proceedings of the AAAI Conference on Artificial Intelligence. 2019b;33:8167\u201374.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"487_CR19","doi-asserted-by":"publisher","first-page":"8191","DOI":"10.1609\/aaai.v33i01.33018191","volume":"33","author":"S Chen","year":"2019","unstructured":"Chen S, Jiang YG. Motion guided spatial attention for video captioning. Proceedings of the AAAI Conference on Artificial Intelligence. 2019;33:8191\u20138.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"487_CR20","doi-asserted-by":"publisher","first-page":"132","DOI":"10.1016\/j.patrec.2018.12.018","volume":"132","author":"X Chen","year":"2020","unstructured":"Chen X, Zhang M, Wang Z, Zuo L, Li B, Yang Y. Leveraging unpaired out-of-domain data for image captioning. Pattern Recognition Letters. 2020b;132:132\u201340.","journal-title":"Pattern Recognition Letters"},{"key":"487_CR21","doi-asserted-by":"crossref","unstructured":"Cho K, Van\u00a0Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y. Learning phrase representations using rnn encoder-decoder for statistical machine translation. 2014. arXiv preprint arXiv:14061078","DOI":"10.3115\/v1\/D14-1179"},{"key":"487_CR22","doi-asserted-by":"crossref","unstructured":"Das A, Kottur S, Gupta K, Singh A, Yadav D, Moura JM, Parikh D, Batra D. Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017;326\u2013335.","DOI":"10.1109\/CVPR.2017.121"},{"key":"487_CR23","unstructured":"Das P, Xu C, Doell RF, Corso JJ. A thousand frames in just a few words: Lingual description of videos through latent topics and sparse object stitching. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2013;2634\u20132641."},{"key":"487_CR24","unstructured":"Deng J, Krause J, Berg AC, Fei-Fei L. Hedging your bets: Optimizing accuracy-specificity trade-offs in large scale visual recognition. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, IEEE, 2012;3450\u20133457."},{"key":"487_CR25","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA. Sergio guadar rama. In: Marcus Rohrbach, Subhashini Venugopalan, Kate Saenko, and Trevor Darrell, \u201cLong-term recurrent con volutional networks for visual recognition and descrip tion,\u201d in Proceedings of the IEEE conference on com puter vision and pattern recognition, 2015;2625\u20132634.","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"487_CR26","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne\u00a0Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T. Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015;2625\u20132634.","DOI":"10.1109\/CVPR.2015.7298878"},{"issue":"2","key":"487_CR27","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham M, Van Gool L, Williams CK, Winn J, Zisserman A. The pascal visual object classes (voc) challenge. International journal of computer vision. 2010;88(2):303\u201338.","journal-title":"International journal of computer vision"},{"key":"487_CR28","doi-asserted-by":"crossref","unstructured":"Fang Z, Gokhale T, Banerjee P, Baral C, Yang Y. Video2commonsense: Generating commonsense descriptions to enrich video captioning. 2020. arXiv preprint arXiv:200305162","DOI":"10.18653\/v1\/2020.emnlp-main.61"},{"key":"487_CR29","doi-asserted-by":"crossref","unstructured":"Fawzy NK, Marey MA, Aref MM. Video captioning using attention based visual fusion with bi-temporal context and bi-modal semantic feature learning. In: International Conference on Advanced Intelligent Systems and Informatics, Springer, 2020; 65\u201378.","DOI":"10.1007\/978-3-030-58669-0_6"},{"key":"487_CR30","doi-asserted-by":"crossref","unstructured":"Felzenszwalb P, McAllester D, Ramanan D. A discriminatively trained, multiscale, deformable part model. In: 2008 IEEE conference on computer vision and pattern recognition, IEEE, 2008;1\u20138.","DOI":"10.1109\/CVPR.2008.4587597"},{"key":"487_CR31","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X, Pu Y, Tran K, Gao J, Carin L, Deng L. Semantic compositional networks for visual captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2017;5630\u20135639.","DOI":"10.1109\/CVPR.2017.127"},{"key":"487_CR32","unstructured":"Guadarrama S, Krishnamoorthy N, Malkarnenkar G, Venugopalan S, Mooney R, Darrell T, Saenko K. Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: Proceedings of the IEEE international conference on computer vision, 2013;2712\u20132719 ."},{"issue":"2","key":"487_CR33","doi-asserted-by":"publisher","first-page":"735","DOI":"10.1007\/s11280-018-0530-0","volume":"22","author":"Y Guo","year":"2019","unstructured":"Guo Y, Zhang J, Gao L. Exploiting long-term temporal dynamics for video captioning. World Wide Web. 2019;22(2):735\u201349.","journal-title":"World Wide Web"},{"key":"487_CR34","doi-asserted-by":"crossref","unstructured":"Guo Z, Gao L, Song J, Xu X, Shao J, Shen HT. Attention-based lstm with semantic consistency for videos captioning. In: Proceedings of the 24th ACM international conference on Multimedia, 2016;357\u2013361.","DOI":"10.1145\/2964284.2967242"},{"key":"487_CR35","doi-asserted-by":"crossref","unstructured":"Hao X, Zhou F, Li X. Scene-edge gru for video caption. In: 2020 IEEE 4th Information Technology, Networking, Electronic and Automation Control Conference (ITNEC), IEEE, vol\u00a01, 2020;1290\u20131295.","DOI":"10.1109\/ITNEC48623.2020.9084781"},{"key":"487_CR36","doi-asserted-by":"crossref","unstructured":"Hara K, Kataoka H, Satoh Y. Learning spatio-temporal features with 3d residual networks for action recognition. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, 2017;3154\u20133160.","DOI":"10.1109\/ICCVW.2017.373"},{"key":"487_CR37","doi-asserted-by":"crossref","unstructured":"Hemalatha M, Sekhar CC. Domain-specific semantics guided approach to video captioning. In: 2020 IEEE Winter Conference on Applications of Computer Vision (WACV), IEEE, 2020;1576\u20131585.","DOI":"10.1109\/WACV45572.2020.9093344"},{"issue":"8","key":"487_CR38","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J. Long short-term memory. Neural computation. 1997;9(8):1735\u201380.","journal-title":"Neural computation"},{"key":"487_CR39","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J. Framing image description as a ranking task: Data, models and evaluation metrics. Journal of Artificial Intelligence Research. 2013;47:853\u201399.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"487_CR40","doi-asserted-by":"crossref","unstructured":"Hori C, Hori T, Lee TY, Zhang Z, Harsham B, Hershey JR, Marks TK, Sumi K. Attention-based multimodal fusion for video description. In: Proceedings of the IEEE international conference on computer vision, 2017;4193\u20134202.","DOI":"10.1109\/ICCV.2017.450"},{"key":"487_CR41","doi-asserted-by":"crossref","unstructured":"Hou J, Wu X, Zhao W, Luo J, Jia Y. Joint syntax representation learning and visual cue translation for video captioning. In: Proceedings of the IEEE International Conference on Computer Vision, 2019;8918\u20138927.","DOI":"10.1109\/ICCV.2019.00901"},{"key":"487_CR42","unstructured":"Hou J, Jia Y, Qi Y, et\u00a0al. Video captioning using weak annotation. 2020 arXiv preprint arXiv:200901067"},{"key":"487_CR43","unstructured":"Huang G, Pang B, Zhu Z, Rivera C, Soricut R. Multimodal pretraining for dense video captioning. 2020a. arXiv preprint arXiv:201111760"},{"key":"487_CR44","doi-asserted-by":"publisher","first-page":"4013","DOI":"10.1109\/TIP.2020.2969330","volume":"29","author":"Y Huang","year":"2020","unstructured":"Huang Y, Chen J, Ouyang W, Wan W, Xue Y. Image captioning with end-to-end attribute detection and subsequent attributes prediction. IEEE Transactions on Image Processing. 2020b;29:4013\u201326.","journal-title":"IEEE Transactions on Image Processing"},{"key":"487_CR45","doi-asserted-by":"crossref","unstructured":"Iashin V, Rahtu E. A better use of audio-visual cues: Dense video captioning with bi-modal transformer. 2020. arXiv preprint arXiv:200508271","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"487_CR46","unstructured":"Jain BD, Thakur SM, Suresh K. Visual assistance for blind using image processing. In: 2018 International Conference on Communication and Signal Processing (ICCSP), IEEE, 2018;0499\u20130503."},{"key":"487_CR47","doi-asserted-by":"crossref","unstructured":"Jeong D, Kim BG. Dong SY. Deep joint spatiotemporal network (djstn) for efficient facial expression recognition. Sensors. 2020;20(7) 1936.","DOI":"10.3390\/s20071936"},{"key":"487_CR48","unstructured":"Jia Z, Li X. icap: Interactive image captioning with predictive text. In: Proceedings of the 2020 International Conference on Multimedia Retrieval, 2020;428\u2013435."},{"key":"487_CR49","doi-asserted-by":"crossref","unstructured":"Jin T, Huang S, Chen M, Li Y, Zhang Z. Sbat: Video captioning with sparse boundary-aware transformer. 2020. arXiv preprint arXiv:200711888","DOI":"10.24963\/ijcai.2020\/88"},{"issue":"15","key":"487_CR50","doi-asserted-by":"publisher","first-page":"1731","DOI":"10.1016\/j.patrec.2004.07.009","volume":"25","author":"BG Kim","year":"2004","unstructured":"Kim BG, Park DJ. Unsupervised video object segmentation and tracking based on new edge features. Pattern Recognition Letters. 2004;25(15):1731\u201342.","journal-title":"Pattern Recognition Letters"},{"issue":"7","key":"487_CR51","doi-asserted-by":"publisher","first-page":"1162","DOI":"10.3390\/electronics9071162","volume":"9","author":"J Kim","year":"2020","unstructured":"Kim J, Choi I, Lee M. Context aware video caption generation with consecutive differentiable neural computer. Electronics. 2020;9(7):1162.","journal-title":"Electronics"},{"key":"487_CR52","doi-asserted-by":"publisher","first-page":"41273","DOI":"10.1109\/ACCESS.2019.2907327","volume":"7","author":"JH Kim","year":"2019","unstructured":"Kim JH, Kim BG, Roy PP, Jeong DM. Efficient facial expression recognition algorithm based on hierarchical deep neural network structure. IEEE Access. 2019;7:41273\u201385.","journal-title":"IEEE Access"},{"key":"487_CR53","doi-asserted-by":"crossref","unstructured":"Koehn P. Statistical machine translation. Cambridge University Press; 2009.","DOI":"10.1017\/CBO9780511815829"},{"issue":"2","key":"487_CR54","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima A, Tamura T, Fukunaga K. Natural language description of human activities from video images based on concept hierarchy of actions. International Journal of Computer Vision. 2002;50(2):171\u201384.","journal-title":"International Journal of Computer Vision"},{"key":"487_CR55","unstructured":"Korbar B, Petroni F, Girdhar R, Torresani L. Video understanding as machine translation. 2020. arXiv preprint arXiv:200607203"},{"key":"487_CR56","doi-asserted-by":"crossref","unstructured":"Krishna R, Hata K, Ren F, Fei-Fei L, Niebles JC. Dense-captioning events in videos. In: International Conference on Computer Vision (ICCV). 2017.","DOI":"10.1109\/ICCV.2017.83"},{"key":"487_CR57","doi-asserted-by":"crossref","unstructured":"Krishnamoorthy N, Malkarnenkar G, Mooney R, Saenko K, Guadarrama S. Generating natural-language video descriptions using text-mined knowledge. In: Twenty-Seventh AAAI Conference on Artificial Intelligence 2013.","DOI":"10.1609\/aaai.v27i1.8679"},{"key":"487_CR58","doi-asserted-by":"crossref","unstructured":"Lan W, Li X, Dong J. Fluency-guided cross-lingual image captioning. In: Proceedings of the 25th ACM international conference on Multimedia, 2017;1549\u20131557.","DOI":"10.1145\/3123266.3123366"},{"key":"487_CR59","doi-asserted-by":"crossref","unstructured":"Laptev I, Marszalek M, Schmid C, Rozenfeld B. Learning realistic human actions from movies. In: 2008 IEEE Conference on Computer Vision and Pattern Recognition, IEEE, 2008;1\u20138.","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"487_CR60","doi-asserted-by":"crossref","unstructured":"Lei J, Wang L, Shen Y, Yu D, Berg TL, Bansal M. Mart: Memory-augmented recurrent transformer for coherent video paragraph captioning. 2020. arXiv preprint arXiv:200505402","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"487_CR61","doi-asserted-by":"crossref","unstructured":"Li L, Gong B. End-to-end video captioning with multitask reinforcement learning. In: 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), IEEE, 2019;339\u2013348","DOI":"10.1109\/WACV.2019.00042"},{"key":"487_CR62","unstructured":"Li LJ, Su H, Fei-Fei L, Xing EP. Object bank: A high-level image representation for scene classification & semantic feature sparsification. In: Advances in neural information processing systems, 2010;1378\u20131386"},{"issue":"2","key":"487_CR63","doi-asserted-by":"publisher","first-page":"621","DOI":"10.1007\/s11280-018-0531-z","volume":"22","author":"X Li","year":"2019","unstructured":"Li X, Zhou Z, Chen L, Gao L. Residual attention-based lstm for video captioning. World Wide Web. 2019;22(2):621\u201336.","journal-title":"World Wide Web"},{"key":"487_CR64","doi-asserted-by":"crossref","unstructured":"Li Y, Yao T, Pan Y, Chao H, Mei T. Jointly localizing and describing events for dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018;7492\u20137500.","DOI":"10.1109\/CVPR.2018.00782"},{"key":"487_CR65","unstructured":"Lin CY. Rouge: A packagefor automatic evaluation of summaries. In: ProceedingsofWorkshop on Text Summarization Branches Out, Post2Conference Workshop of ACL 2004."},{"key":"487_CR66","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL. Microsoft coco: Common objects in context. In: European conference on computer vision, Springer, 2014;740\u2013755."},{"key":"487_CR67","doi-asserted-by":"crossref","unstructured":"Liu F, Gao S, Gao X. Segmentation of mr image based on maximum a posterior. In: 2001 Conference Proceedings of the 23rd Annual International Conference of the IEEE Engineering in Medicine and Biology Society, IEEE, vol\u00a03, 2001;2681\u20132684.","DOI":"10.1109\/IEMBS.2001.1017335"},{"key":"487_CR68","doi-asserted-by":"crossref","unstructured":"Liu S, Ren Z, Yuan J. Sibnet: Sibling convolutional encoder for video captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence. 2020.","DOI":"10.1109\/TPAMI.2019.2940007"},{"key":"487_CR69","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1162\/tacl_a_00013","volume":"6","author":"X Long","year":"2018","unstructured":"Long X, Gan C, de Melo G. Video captioning with multi-faceted attention. Transactions of the Association for Computational Linguistics. 2018;6:173\u201384.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"487_CR70","unstructured":"Luo H, Ji L, Shi B, Huang H, Duan N, Li T, Chen X, Zhou M. Univilm: A unified video and language pre-training model for multimodal understanding and generation. 2020. arXiv preprint arXiv:200206353"},{"key":"487_CR71","unstructured":"Malinowski M, Fritz M. A multi-world approach to question answering about real-world scenes based on uncertain input. In: Advances in neural information processing systems, 2014;1682\u20131690."},{"key":"487_CR72","unstructured":"Miech A, Zhukov D, Alayrac JB, Tapaswi M, Laptev I, Sivic J. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE International Conference on Computer Vision, 2019;2630\u20132640."},{"key":"487_CR73","unstructured":"Motwani TS, Mooney RJ. Improving video activity recognition using object recognition and text mining. In: ECAI, 2012;vol\u00a01, p\u00a02 ."},{"issue":"4","key":"487_CR74","first-page":"225","volume":"4","author":"S Mukherjee","year":"2017","unstructured":"Mukherjee S, Saini R, Kumar P, Roy PP, Dogra DP, Kim BG, et al. Fight detection in hockey videos using deep network. Journal of Multimedia Information System. 2017;4(4):225\u201332.","journal-title":"Journal of Multimedia Information System"},{"key":"487_CR75","doi-asserted-by":"crossref","unstructured":"Mukherjee S, Ghosh S, Ghosh S, Kumar P, Roy PP. Predicting video-frames using encoder-convlstm combination. In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics. IEEE: Speech and Signal Processing (ICASSP); 2019. p. 2027\u201331.","DOI":"10.1109\/ICASSP.2019.8682158"},{"key":"487_CR76","doi-asserted-by":"crossref","unstructured":"Mun J, Yang L, Ren Z, Xu N, Han B. Streamlined dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019;6588\u20136597","DOI":"10.1109\/CVPR.2019.00675"},{"key":"487_CR77","doi-asserted-by":"crossref","unstructured":"Olivastri S, Singh G, Cuzzolin F. End-to-end video captioning. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, 2019;0\u20130.","DOI":"10.1109\/ICCVW.2019.00185"},{"issue":"3","key":"487_CR78","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1016\/S0031-3203(98)00041-7","volume":"32","author":"P Pala","year":"1999","unstructured":"Pala P, Santini S. Image retrieval by shape and texture. Pattern Recognition. 1999;32(3):517\u201327.","journal-title":"Pattern Recognition"},{"key":"487_CR79","doi-asserted-by":"crossref","unstructured":"Pan B, Cai H, Huang DA, Lee KH, Gaidon A, Adeli E, Niebles JC. Spatio-temporal graph for video captioning with knowledge distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020;10870\u201310879.","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"487_CR80","doi-asserted-by":"crossref","unstructured":"Pan P, Xu Z, Yang Y, Wu F, Zhuang Y. Hierarchical recurrent neural encoder for video representation with application to captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016a;1029\u20131038.","DOI":"10.1109\/CVPR.2016.117"},{"key":"487_CR81","doi-asserted-by":"crossref","unstructured":"Pan Y, Mei T, Yao T, Li H, Rui Y. Jointly modeling embedding and translation to bridge video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016b;4594\u20134602.","DOI":"10.1109\/CVPR.2016.497"},{"key":"487_CR82","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li H, Mei T. Video captioning with transferred semantic attributes. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2017;6504\u20136512.","DOI":"10.1109\/CVPR.2017.111"},{"key":"487_CR83","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ. Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics, Association for Computational Linguistics, 2002;311\u2013318."},{"key":"487_CR84","doi-asserted-by":"crossref","unstructured":"Park JS, Rohrbach M, Darrell T, Rohrbach A. Adversarial inference for multi-sentence video description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019;6598\u20136608.","DOI":"10.1109\/CVPR.2019.00676"},{"key":"487_CR85","doi-asserted-by":"crossref","unstructured":"Pasunuru R, Bansal M. Multi-task video captioning with video and entailment generation. 2017. arXiv preprint arXiv:170407489","DOI":"10.18653\/v1\/D17-1103"},{"issue":"2","key":"487_CR86","doi-asserted-by":"publisher","first-page":"571","DOI":"10.1007\/s11280-018-0582-1","volume":"22","author":"K Pawar","year":"2019","unstructured":"Pawar K, Attar V. Deep learning approaches for video-based anomalous activity detection. World Wide Web. 2019;22(2):571\u2013601.","journal-title":"World Wide Web"},{"key":"487_CR87","doi-asserted-by":"crossref","unstructured":"Pei W, Zhang J, Wang X, Ke L, Shen X, Tai YW. Memory-attended recurrent network for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019;8347\u20138356.","DOI":"10.1109\/CVPR.2019.00854"},{"key":"487_CR88","doi-asserted-by":"publisher","first-page":"446","DOI":"10.1016\/j.procs.2020.04.047","volume":"171","author":"S Pendurkar","year":"2020","unstructured":"Pendurkar S, Kolpekwar S, Dhoot S, Haribhakta YV, Banerjee B. Attention based multi-modal fusion architecture for open-ended video question answering systems. Procedia Computer Science. 2020;171:446\u201355.","journal-title":"Procedia Computer Science"},{"key":"487_CR89","unstructured":"Perez-Martin J, Bustos B, P\u00e9rez J. Attentive visual semantic specialized network for video captioning. In: International Conference on Computer Vision 2020."},{"key":"487_CR90","doi-asserted-by":"crossref","unstructured":"Perez-Martin J, Bustos B, Perez J. Improving video captioning with temporal composition of a visual-syntactic embedding. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2021;3039\u20133049.","DOI":"10.1109\/WACV48630.2021.00308"},{"key":"487_CR91","doi-asserted-by":"crossref","unstructured":"Peris \u00c1, Bola\u00f1os M, Radeva P, Casacuberta F. Video description using bidirectional recurrent neural networks. In: International Conference on Artificial Neural Networks, Springer, 2016;3\u201311.","DOI":"10.1007\/978-3-319-44781-0_1"},{"key":"487_CR92","unstructured":"Plummer BA, Wang L, Cervantes CM, Caicedo JC, Hockenmaier J, Lazebnik S. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision, 2015;2641\u20132649."},{"key":"487_CR93","unstructured":"Rahman M, Abedin T, Prottoy KS, Moshruba A, Siddiqui FH, et\u00a0al. Semantically sensible video captioning (ssvc). 2020. arXiv preprint arXiv:200907335"},{"key":"487_CR94","unstructured":"Rashtchian C, Young P, Hodosh M, Hockenmaier J. Collecting image annotations using amazon\u2019s mechanical turk. In: Proceedings of the NAACL HLT 2010 Workshop on Creating Speech and Language Data with Amazon\u2019s Mechanical Turk, 2010;139\u2013147"},{"key":"487_CR95","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1162\/tacl_a_00207","volume":"1","author":"M Regneri","year":"2013","unstructured":"Regneri M, Rohrbach M, Wetzel D, Thater S, Schiele B, Pinkal M. Grounding action descriptions in videos. Transactions of the Association for Computational Linguistics. 2013;1:25\u201336.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"487_CR96","unstructured":"Rimle P, Dogan P, Gross M. Enriching video captions with contextual text. 2020. arXiv preprint arXiv:200714682"},{"key":"487_CR97","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Qiu W, Friedrich A, Pinkal M, Schiele B. Coherent multi-sentence video description with variable level of detail. In: German conference on pattern recognition, Springer, 2014;184\u2013195.","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"487_CR98","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Schiele B. The long-short story of movie description. In: German conference on pattern recognition, Springer, 2015a;209\u2013221.","DOI":"10.1007\/978-3-319-24947-6_17"},{"key":"487_CR99","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Tandon N, Schiele B. A dataset for movie description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015b;3202\u20133212.","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"487_CR100","doi-asserted-by":"crossref","unstructured":"Rohrbach M, Qiu W, Titov I, Thater S, Pinkal M, Schiele B. Translating video content to natural language descriptions. In: Proceedings of the IEEE International Conference on Computer Vision, 2013;433\u2013440.","DOI":"10.1109\/ICCV.2013.61"},{"issue":"6","key":"487_CR101","doi-asserted-by":"publisher","first-page":"7767","DOI":"10.1007\/s11042-018-6484-5","volume":"78","author":"PP Roy","year":"2019","unstructured":"Roy PP, Bhunia AK, Bhattacharyya A, Pal U. Word searching in scene image and video frame in multi-script scenario using dynamic shape coding. Multimedia Tools and Applications. 2019;78(6):7767\u2013801.","journal-title":"Multimedia Tools and Applications"},{"issue":"1","key":"487_CR102","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1007\/s10044-018-00770-3","volume":"23","author":"S Sah","year":"2020","unstructured":"Sah S, Nguyen T, Ptucha R. Understanding temporal structure for video captioning. Pattern Analysis and Applications. 2020;23(1):147\u201359.","journal-title":"Pattern Analysis and Applications"},{"key":"487_CR103","unstructured":"Schuldt C, Laptev I, Caputo B. Recognizing human actions: a local svm approach. In: Proceedings of the 17th International Conference on Pattern Recognition, 2004. ICPR 2004., IEEE, 2004;3,32\u201336"},{"key":"487_CR104","doi-asserted-by":"crossref","unstructured":"Shen Z, Li J, Su Z, Li M, Chen Y, Jiang YG, Xue X. Weakly supervised dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017;1916\u20131924.","DOI":"10.1109\/CVPR.2017.548"},{"issue":"10","key":"487_CR105","doi-asserted-by":"publisher","first-page":"3047","DOI":"10.1109\/TNNLS.2018.2851077","volume":"30","author":"J Song","year":"2018","unstructured":"Song J, Guo Y, Gao L, Li X, Hanjalic A, Shen HT. From deterministic to generative: Multimodal stochastic rnns for video captioning. IEEE transactions on neural networks and learning systems. 2018;30(10):3047\u201358.","journal-title":"IEEE transactions on neural networks and learning systems"},{"key":"487_CR106","unstructured":"Song K, Tan X, Qin T, Lu J, Liu TY. Mass: Masked sequence to sequence pre-training for language generation. 2019. arXiv preprint arXiv:190502450"},{"key":"487_CR107","unstructured":"Soomro K, Zamir AR, Shah M. Ucf101: A dataset of 101 human actions classes from videos in the wild. 2012. arXiv preprint arXiv:12120402"},{"key":"487_CR108","unstructured":"Srivastava N, Mansimov E, Salakhudinov R. Unsupervised learning of video representations using lstms. In: International conference on machine learning, 2015;843\u2013852."},{"key":"487_CR109","doi-asserted-by":"crossref","unstructured":"Suin M, Rajagopalan A. An efficient framework for dense video captioning. In: AAAI, 2020;12039\u201312046.","DOI":"10.1609\/aaai.v34i07.6881"},{"key":"487_CR110","doi-asserted-by":"crossref","unstructured":"Sun C, Nevatia R. Semantic aware video transcription using random forest classifiers. In: European Conference on Computer Vision, Springer, 2014;772\u2013786.","DOI":"10.1007\/978-3-319-10590-1_50"},{"key":"487_CR111","unstructured":"Sur C. Sact: Self-aware multi-space feature composition transformer for multinomial attention for video captioning. 2020. arXiv preprint arXiv:200614262"},{"key":"487_CR112","doi-asserted-by":"crossref","unstructured":"Szegedy C, Ioffe S, Vanhoucke V, Alemi A. Inception-v4, inception-resnet and the impact of residual connections on learning. In: Proceedings of the AAAI Conference on Artificial Intelligence 2017.","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"487_CR113","doi-asserted-by":"crossref","unstructured":"Tan G, Liu D, Wang M, Zha ZJ. Learning to discretely compose reasoning module networks for video captioning. 2020. arXiv preprint arXiv:200709049","DOI":"10.24963\/ijcai.2020\/104"},{"key":"487_CR114","unstructured":"Tang J. Intelligent Mobile Projects with TensorFlow: Build 10+ Artificial Intelligence Apps Using TensorFlow Mobile and Lite for IOS, Android, and Raspberry Pi. Packt Publishing Ltd 2018."},{"key":"487_CR115","unstructured":"Thomason J, Venugopalan S, Guadarrama S, Saenko K, Mooney R. Integrating language and vision to generate natural language descriptions of videos in the wild. In: Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers, 2014;1218\u20131227."},{"key":"487_CR116","unstructured":"Torabi A, Pal C, Larochelle H, Courville A. Using descriptive video services to create a large data source for video annotation research. 2015. arXiv preprint arXiv:150301070"},{"key":"487_CR117","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M. Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, 2015;4489\u20134497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"487_CR118","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I. Attention is all you need. In: Advances in neural information processing systems, 2017;5998\u20136008."},{"key":"487_CR119","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D. Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015;4566\u20134575."},{"key":"487_CR120","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Xu H, Donahue J, Rohrbach M, Mooney R, Saenko K. Translating videos to natural language using deep recurrent neural networks. 2014. arXiv preprint arXiv:14124729","DOI":"10.3115\/v1\/N15-1173"},{"key":"487_CR121","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K. Sequence to sequence-video to text. In: Proceedings of the IEEE international conference on computer vision, 2015;4534\u20134542.","DOI":"10.1109\/ICCV.2015.515"},{"key":"487_CR122","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Hendricks LA, Mooney R, Saenko K. Improving lstm-based video description with linguistic knowledge mined from text. 2016. arXiv preprint arXiv:160401729","DOI":"10.18653\/v1\/D16-1204"},{"key":"487_CR123","doi-asserted-by":"crossref","unstructured":"Wang B, Ma L, Zhang W, Liu W. Reconstruction network for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018a;7622\u20137631.","DOI":"10.1109\/CVPR.2018.00795"},{"key":"487_CR124","doi-asserted-by":"crossref","unstructured":"Wang B, Ma L, Zhang W, Jiang W, Wang J, Liu W. Controllable video captioning with pos sequence guidance based on gated fusion network. In: Proceedings of the IEEE International Conference on Computer Vision, 2019a;2641\u20132650.","DOI":"10.1109\/ICCV.2019.00273"},{"key":"487_CR125","doi-asserted-by":"crossref","unstructured":"Wang H, Kl\u00e4ser A, Schmid C, Liu CL. Action recognition by dense trajectories. In: CVPR 2011, IEEE, 2011;3169\u20133176.","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"487_CR126","doi-asserted-by":"publisher","first-page":"327","DOI":"10.1016\/j.patrec.2018.07.024","volume":"130","author":"H Wang","year":"2020","unstructured":"Wang H, Gao C, Han Y. Sequence in sequence for video captioning. Pattern Recognition Letters. 2020a;130:327\u201334.","journal-title":"Pattern Recognition Letters"},{"key":"487_CR127","doi-asserted-by":"crossref","unstructured":"Wang J, Jiang W, Ma L, Liu W, Xu Y. Bidirectional attentive fusion with context gating for dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018b;7190\u20137198.","DOI":"10.1109\/CVPR.2018.00751"},{"key":"487_CR128","unstructured":"Wang J, Wang W, Huang Y, Wang L, Tan T. M3: Multimodal memory modelling for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018c;7512\u20137520."},{"key":"487_CR129","doi-asserted-by":"publisher","first-page":"107075","DOI":"10.1016\/j.patcog.2019.107075","volume":"98","author":"J Wang","year":"2020","unstructured":"Wang J, Wang W, Wang L, Wang Z, Feng DD, Tan T. Learning visual relationship and context-aware attention for image captioning. Pattern Recognition. 2020b;98:107075.","journal-title":"Pattern Recognition"},{"issue":"3","key":"487_CR130","doi-asserted-by":"publisher","first-page":"2013","DOI":"10.1007\/s11042-019-08209-5","volume":"79","author":"S Wang","year":"2020","unstructured":"Wang S, Lan L, Zhang X, Dong G, Luo Z. Object-aware semantics of attention for image captioning. Multimedia Tools and Applications. 2020c;79(3):2013\u201330.","journal-title":"Multimedia Tools and Applications"},{"key":"487_CR131","doi-asserted-by":"crossref","unstructured":"Wang T, Zheng H, Yu M, Tian Q, Hu H. Event-centric hierarchical representation for dense video captioning. IEEE Transactions on Circuits and Systems for Video Technology. 2020d.","DOI":"10.1109\/TCSVT.2020.3014606"},{"key":"487_CR132","doi-asserted-by":"crossref","unstructured":"Wang X, Chen W, Wu J, Wang YF, Yang\u00a0Wang W. Video captioning via hierarchical reinforcement learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018d;4213\u20134222.","DOI":"10.1109\/CVPR.2018.00443"},{"key":"487_CR133","doi-asserted-by":"publisher","first-page":"8965","DOI":"10.1609\/aaai.v33i01.33018965","volume":"33","author":"X Wang","year":"2019","unstructured":"Wang X, Wu J, Zhang D, Su Y, Wang WY. Learning to compose topic-aware mixture of experts for zero-shot video captioning. Proceedings of the AAAI Conference on Artificial Intelligence. 2019b;33:8965\u201372.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"487_CR134","unstructured":"Wu A, Han Y. Hierarchical memory decoding for video captioning. 2020. arXiv preprint arXiv:200211886"},{"key":"487_CR135","doi-asserted-by":"crossref","unstructured":"Wu X, Li G, Cao Q, Ji Q, Lin L. Interpretable video captioning via trajectory structured localization. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, 2018;6829\u20136837.","DOI":"10.1109\/CVPR.2018.00714"},{"key":"487_CR136","doi-asserted-by":"crossref","unstructured":"Xia Q, Huang H, Duan N, Zhang D, Ji L, Sui Z, Cui E, Bharti T, Zhou M. Xgpt: Cross-modal generative pre-training for image captioning. 2020. arXiv preprint arXiv:200301473","DOI":"10.1007\/978-3-030-88480-2_63"},{"key":"487_CR137","doi-asserted-by":"crossref","unstructured":"Xiao H, Shi J. Video captioning with text-based dynamic attention and step-by-step learning. Pattern Recognition Letters. 2020.","DOI":"10.1016\/j.patrec.2020.03.001"},{"key":"487_CR138","doi-asserted-by":"crossref","unstructured":"Xu H, Li B, Ramanishka V, Sigal L, Saenko K. Joint event detection and description in continuous video streams. In: 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), IEEE, 2019;396\u2013405.","DOI":"10.1109\/WACV.2019.00048"},{"key":"487_CR139","unstructured":"Xu J, Mei T, Yao T, Rui Y. Msr-vtt: A large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016;5288\u20135296."},{"key":"487_CR140","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y. Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning, 2015a;2048\u20132057."},{"issue":"8","key":"487_CR141","doi-asserted-by":"publisher","first-page":"2482","DOI":"10.1109\/TCSVT.2018.2867286","volume":"29","author":"N Xu","year":"2018","unstructured":"Xu N, Liu AA, Wong Y, Zhang Y, Nie W, Su Y, Kankanhalli M. Dual-stream recurrent neural network for video captioning. IEEE Transactions on Circuits and Systems for Video Technology. 2018;29(8):2482\u201393.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"487_CR142","doi-asserted-by":"crossref","unstructured":"Xu R, Xiong C, Chen W, Corso JJ. Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: Twenty-Ninth AAAI Conference on Artificial Intelligence 2015b.","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"487_CR143","doi-asserted-by":"crossref","unstructured":"Yan C, Tu Y, Wang X, Zhang Y, Hao X, Zhang Y, Dai Q. Stat: spatial-temporal attention mechanism for video captioning. IEEE transactions on multimedia 2019.","DOI":"10.1109\/TMM.2020.2966830"},{"issue":"11","key":"487_CR144","doi-asserted-by":"publisher","first-page":"5600","DOI":"10.1109\/TIP.2018.2855422","volume":"27","author":"Y Yang","year":"2018","unstructured":"Yang Y, Zhou J, Ai J, Bin Y, Hanjalic A, Shen HT, Ji Y. Video captioning by adversarial lstm. IEEE Transactions on Image Processing. 2018;27(11):5600\u201311.","journal-title":"IEEE Transactions on Image Processing"},{"key":"487_CR145","doi-asserted-by":"crossref","unstructured":"Yang Z, Han Y, Wang Z. Catching the temporal regions-of-interest for video captioning. In: Proceedings of the 25th ACM international conference on Multimedia, 2017;146\u2013153.","DOI":"10.1145\/3123266.3123327"},{"key":"487_CR146","doi-asserted-by":"crossref","unstructured":"Yao L, Torabi A, Cho K, Ballas N, Pal C, Larochelle H, Courville A. Describing videos by exploiting temporal structure. In: Proceedings of the IEEE international conference on computer vision, 2015;4507\u20134515.","DOI":"10.1109\/ICCV.2015.512"},{"key":"487_CR147","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J. From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics. 2014;2:67\u201378.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"487_CR148","doi-asserted-by":"crossref","unstructured":"Yu H, Wang J, Huang Z, Yang Y, Xu W. Video paragraph captioning using hierarchical recurrent neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016;4584\u20134593","DOI":"10.1109\/CVPR.2016.496"},{"key":"487_CR149","unstructured":"Zhang C, Tian Y. Automatic video description generation via lstm with joint two-stream encoding. In: 2016 23rd International Conference on Pattern Recognition (ICPR), IEEE, 2016;2924\u20132929."},{"key":"487_CR150","doi-asserted-by":"crossref","unstructured":"Zhang J, Peng Y. Hierarchical vision-language alignment for video captioning. In: International Conference on Multimedia Modeling, Springer, 2019a;42\u201354.","DOI":"10.1007\/978-3-030-05710-7_4"},{"key":"487_CR151","doi-asserted-by":"crossref","unstructured":"Zhang J, Peng Y. Object-aware aggregation with bidirectional temporal graph for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019b;8327\u20138336.","DOI":"10.1109\/CVPR.2019.00852"},{"key":"487_CR152","unstructured":"Zhang X, Gao K, Zhang Y, Zhang D, Li J, Tian Q. Task-driven dynamic fusion: Reducing ambiguity in video description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017;3713\u20133721."},{"key":"487_CR153","doi-asserted-by":"crossref","unstructured":"Zhang X, Liu C, Chang F. Guidance module network for video captioning. 2020a. arXiv preprint arXiv:201210930","DOI":"10.23919\/CCC52363.2021.9550288"},{"key":"487_CR154","doi-asserted-by":"crossref","unstructured":"Zhang Z, Xu D, Ouyang W, Tan C (2019) Show, tell and summarize: Dense video captioning using visual cue aided sentence summarization. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2019.2936526"},{"key":"487_CR155","doi-asserted-by":"crossref","unstructured":"Zhang Z, Shi Y, Yuan C, Li B, Wang P, Hu W, Zha ZJ. Object relational graph with teacher-recommended learning for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020b;13278\u201313288.","DOI":"10.1109\/CVPR42600.2020.01329"},{"issue":"11","key":"487_CR156","doi-asserted-by":"publisher","first-page":"5552","DOI":"10.1109\/TIP.2019.2916757","volume":"28","author":"B Zhao","year":"2019","unstructured":"Zhao B, Li X, Lu X. Cam-rnn: Co-attention model based rnn for video captioning. IEEE Transactions on Image Processing. 2019;28(11):5552\u201365.","journal-title":"IEEE Transactions on Image Processing"},{"key":"487_CR157","unstructured":"Zhao W, Wu X, Zhang X. Memcap: Memorizing style knowledge for image captioning. In: AAAI, 2020;12984\u201312992."},{"key":"487_CR158","doi-asserted-by":"crossref","unstructured":"Zheng Q, Wang C, Tao D. Syntax-aware action targeting for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020;13096\u201313105.","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"487_CR159","doi-asserted-by":"crossref","unstructured":"Zhu F, Hwang JN, Ma Z, Chen G, Guo J. Understanding objects in video: Object-oriented video captioning via structured trajectory and adversarial learning. IEEE Access 2020a.","DOI":"10.1109\/ACCESS.2020.3021857"},{"key":"487_CR160","unstructured":"Zhu F, Hwang JN, Ma Z, Jun G. Object-oriented video captioning with temporal graph and prior knowledge building. 2020b. arXiv preprint arXiv:200303715"},{"key":"487_CR161","unstructured":"Zhu L, Yang Y. Actbert: Learning global-local video-text representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020;8746\u20138755."}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-021-00487-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s42979-021-00487-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-021-00487-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T23:54:27Z","timestamp":1724543667000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s42979-021-00487-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,27]]},"references-count":161,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2021,4]]}},"alternative-id":["487"],"URL":"https:\/\/doi.org\/10.1007\/s42979-021-00487-x","relation":{},"ISSN":["2662-995X","2661-8907"],"issn-type":[{"value":"2662-995X","type":"print"},{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,2,27]]},"assertion":[{"value":"6 November 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 January 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 February 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with Ethical Standards"}},{"value":"All of the Authors declare that he\/she has no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"There are no execution of codes relating to this work as it is a survey paper.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}},{"value":"This article does not contain any studies with human participants or animals performed by any of the authors.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"120"}}