{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T10:37:03Z","timestamp":1770892623651,"version":"3.50.1"},"reference-count":93,"publisher":"Springer Science and Business Media LLC","issue":"18","license":[{"start":{"date-parts":[[2021,5,28]],"date-time":"2021-05-28T00:00:00Z","timestamp":1622160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,5,28]],"date-time":"2021-05-28T00:00:00Z","timestamp":1622160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,7]]},"DOI":"10.1007\/s11042-021-10964-3","type":"journal-article","created":{"date-parts":[[2021,5,28]],"date-time":"2021-05-28T14:15:28Z","timestamp":1622211328000},"page":"28015-28059","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Evolution of automatic visual description techniques-a methodological survey"],"prefix":"10.1007","volume":"80","author":[{"given":"Arka","family":"Bhowmik","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8951-5996","authenticated-orcid":false,"given":"Sanjay","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Neeraj","family":"Bhat","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,5,28]]},"reference":[{"key":"10964_CR1","doi-asserted-by":"crossref","unstructured":"Abbas Q, Ibrahim ME, Jaffar MA (2018) Video scene analysis: An overview and challenges on deep learning algorithms. 
Multimedia Tools and Applications 1\u201339","DOI":"10.1007\/s11042-017-5438-7"},{"key":"10964_CR2","doi-asserted-by":"crossref","unstructured":"Alzubi J, Nayyar A, Kumar A (2018) Machine learning from theory to algorithms: An overview, vol 1142, p 012012","DOI":"10.1088\/1742-6596\/1142\/1\/012012"},{"key":"10964_CR3","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: European conference on computer vision. Springer, Cham, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"10964_CR4","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10964_CR5","doi-asserted-by":"crossref","unstructured":"Andriluka M, Pishchulin L, Gehler P, Schiele B (2014) 2d human pose estimation: New benchmark and state of the art analysis. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3686\u20133693","DOI":"10.1109\/CVPR.2014.471"},{"key":"10964_CR6","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"10964_CR7","unstructured":"Barbu A, Bridge A, Burchill Z, Coroian D, Dickinson S, Fidler S, Schmidt L (2012) Video in sentences out. arXiv:1204.2742"},{"key":"10964_CR8","doi-asserted-by":"crossref","unstructured":"Bhatnagar BL, Singh S, Arora C, Jawahar CV, CVIT K (2017) Unsupervised learning of deep feature representation for clustering egocentric actions. 
In: IJCAI, pp 1447\u20131453","DOI":"10.24963\/ijcai.2017\/200"},{"key":"10964_CR9","unstructured":"Blei DM, Ng AY, Jordan MI (2003) Latent dirichlet allocation. Journal of Machine Learning Research 993\u20131022"},{"key":"10964_CR10","unstructured":"Borji A, Itti L (2015) Cat2000: A large scale fixation dataset for boosting saliency research. arXiv:1505.03581"},{"key":"10964_CR11","doi-asserted-by":"crossref","unstructured":"Brox T, Bruhn A, Papenberg N, Weickert J (2004) High accuracy optical flow estimation based on a theory for warping. In: European conference on computer vision. Springer, Berlin, pp 25\u201336","DOI":"10.1007\/978-3-540-24673-2_3"},{"key":"10964_CR12","doi-asserted-by":"crossref","unstructured":"Caba Heilbron F, Escorcia V, Ghanem B, Carlos Niebles J (2015) Activitynet: A large-scale video benchmark for human activity understanding. In: Proceedings of the ieee conference on computer vision and pattern recognition, pp 961\u2013970","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"10964_CR13","unstructured":"Chen DL, Dolan WB (2011) Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th annual meeting of the association for computational linguistics: human language technologies. Association for Computational Linguistics, vol 1, pp 190\u2013200"},{"issue":"7","key":"10964_CR14","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1016\/j.patrec.2011.12.004","volume":"33","author":"MJ Choi","year":"2012","unstructured":"Choi MJ, Torralba A, Willsky AS (2012) Context models and out-of-context objects. Pattern Recognit Lett 33(7):853\u201362","journal-title":"Pattern Recognit Lett"},{"issue":"2","key":"10964_CR15","first-page":"48","volume":"14","author":"M Cornia","year":"2018","unstructured":"Cornia M, Baraldi L, Serra G, Cucchiara R (2018) Paying more attention to saliency: Image captioning with saliency and context attention. 
ACM Trans Multimed Comput Commun Appl (TOMM) 14(2):48","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"issue":"10","key":"10964_CR16","doi-asserted-by":"publisher","first-page":"5142","DOI":"10.1109\/TIP.2018.2851672","volume":"27","author":"M Cornia","year":"2018","unstructured":"Cornia M, Baraldi L, Serra G, Cucchiara R (2018) Predicting human eye fixations via an lstm-based saliency attentive model. IEEE Trans Image Process 27(10):5142\u20135154","journal-title":"IEEE Trans Image Process"},{"key":"10964_CR17","doi-asserted-by":"crossref","unstructured":"Dai B, Fidler S, Urtasun R, Lin D (2017) Towards diverse and natural image descriptions via a conditional gan. In: Proceedings of the IEEE international conference on computer vision, pp 2970\u20132979","DOI":"10.1109\/ICCV.2017.323"},{"key":"10964_CR18","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: 2005 IEEE computer society conference on computer vision and pattern recognition CVPR\u201905, vol 1. IEEE, pp 886\u2013893","DOI":"10.1109\/CVPR.2005.177"},{"key":"10964_CR19","doi-asserted-by":"publisher","first-page":"764","DOI":"10.1016\/j.procs.2015.06.090","volume":"54","author":"N Dhanachandra","year":"2015","unstructured":"Dhanachandra N, Manglem K, Chanu YJ (2015) Image segmentation using K-means clustering algorithm and subtractive clustering algorithm. Procedia Comput Sci 54:764\u2013771","journal-title":"Procedia Comput Sci"},{"key":"10964_CR20","doi-asserted-by":"crossref","unstructured":"Escorcia V, Heilbron FC, Niebles JC, Ghanem B (2016) Daps: Deep action proposals for action understanding. In: European conference on computer vision. 
Springer, Cham, pp 768\u2013784","DOI":"10.1007\/978-3-319-46487-9_47"},{"key":"10964_CR21","unstructured":"Fei-Fei L (2010) ImageNet: crowdsourcing, benchmarking & other cool things, CMU VASC Seminar"},{"key":"10964_CR22","doi-asserted-by":"crossref","unstructured":"Ferraro F, Mostafazadeh N, Vanderwende L, Devlin J, Galley M, Mitchell M (2015) A survey of current datasets for vision and language research. arXiv:1506.06833","DOI":"10.18653\/v1\/D15-1021"},{"key":"10964_CR23","doi-asserted-by":"crossref","unstructured":"Freitag M, Al-Onaizan Y (2017) Beam search strategies for neural machine translation. arXiv:1702.01806","DOI":"10.18653\/v1\/W17-3207"},{"issue":"12","key":"10964_CR24","doi-asserted-by":"publisher","first-page":"2321","DOI":"10.1109\/TPAMI.2016.2642953","volume":"39","author":"K Fu","year":"2017","unstructured":"Fu K, Jin J, Cui R, Sha F, Zhang C (2017) Aligning where to see and what to tell: Image captioning with region-based attention and scene-specific contexts. IEEE Trans Pattern Anal Mach Intell 39(12):2321\u20132334","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10964_CR25","doi-asserted-by":"crossref","unstructured":"Fu K, Li J, Jin J, Zhang C (2018) Image-text surgery: Efficient concept learning in image captioning by generating pseudopairs. IEEE Trans Neural Netw Learn Syst (99):1\u201312","DOI":"10.1109\/TNNLS.2018.2813306"},{"issue":"9","key":"10964_CR26","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao L, Guo Z, Zhang H, Xu X, Shen HT (2017) Video captioning with attention-based LSTM and semantic consistency. IEEE Transactions on Multimedia 19(9):2045\u20132055","journal-title":"IEEE Transactions on Multimedia"},{"key":"10964_CR27","doi-asserted-by":"crossref","unstructured":"Girshick R, Donahue J, Darrell T, Malik J (2014) Rich feature hierarchies for accurate object detection and semantic segmentation. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 580\u2013587","DOI":"10.1109\/CVPR.2014.81"},{"key":"10964_CR28","unstructured":"Goldberg Y, Levy O (2014) word2vec Explained: deriving Mikolov et al.\u2019s negative-sampling word-embedding method. arXiv:1402.3722"},{"key":"10964_CR29","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Xu B, Warde-Farley D, Ozair S, Bengio Y (2014) Generative adversarial nets. In: Advances in neural information processing systems, pp 2672\u20132680"},{"key":"10964_CR30","unstructured":"Graves A (2013) Generating sequences with recurrent neural networks. arXiv:1308.0850"},{"key":"10964_CR31","unstructured":"Gregor K, Danihelka I, Graves A, Rezende DJ, Wierstra D (2015) Draw: A recurrent neural network for image generation. arXiv:1502.04623"},{"key":"10964_CR32","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"10964_CR33","unstructured":"Hermann KM, Kocisky T, Grefenstette E, Espeholt L, Kay W, Suleyman M, Blunsom P (2015) Teaching machines to read and comprehend. In: Advances in neural information processing systems, pp 1693\u20131701"},{"issue":"8","key":"10964_CR34","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Computat 9(8):1735\u201380","journal-title":"Neural Computat"},{"issue":"8","key":"10964_CR35","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. 
Neural computat 9(8):1735\u201380","journal-title":"Neural computat"},{"key":"10964_CR36","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: Data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899","journal-title":"J Artif Intell Res"},{"issue":"6","key":"10964_CR37","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1145\/3295748","volume":"51","author":"MD Hossain","year":"2019","unstructured":"Hossain MD, Sohel F, Shiratuddin MF, Laga H (2019) A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CSUR) 51(6):118","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"10964_CR38","unstructured":"Howard AG, Zhu M, Chen B, Kalenichenko D, Wang W, Weyand T, Adam H (2017) Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv:1704.04861"},{"key":"10964_CR39","doi-asserted-by":"crossref","unstructured":"Hui TW, Tang X, Change Loy C (2018) Liteflownet: A lightweight convolutional neural network for optical flow estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8981\u20138989","DOI":"10.1109\/CVPR.2018.00936"},{"issue":"11","key":"10964_CR40","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/34.730558","volume":"20","author":"L Itti","year":"1998","unstructured":"Itti L, Koch C, Niebur E (1998) A model of saliency-based visual attention for rapid scene analysis. IEEE Trans Pattern Anal Mach Intell 20(11):1254\u20139","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10964_CR41","doi-asserted-by":"crossref","unstructured":"Jiang M, Huang S, Duan J, Zhao Q (2015) Salicon: Saliency in context. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1072\u20131080","DOI":"10.1109\/CVPR.2015.7298710"},{"key":"10964_CR42","unstructured":"Judd T, Durand F, Torralba A (2012) A benchmark of computational models of saliency to predict human fixations"},{"key":"10964_CR43","doi-asserted-by":"crossref","unstructured":"Judd T, Ehinger K, Durand F, Torralba A (2009) Learning to predict where humans look. In: 2009 IEEE 12th international conference on computer vision. IEEE, pp 2106\u20132113","DOI":"10.1109\/ICCV.2009.5459462"},{"key":"10964_CR44","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"10964_CR45","doi-asserted-by":"crossref","unstructured":"Koch C, Ullman S (1987) Shifts in selective visual attention: towards the underlying neural circuitry. In: Matters of intelligence. Springer, Dordrecht, pp 115\u2013141","DOI":"10.1007\/978-94-009-3833-5_5"},{"key":"10964_CR46","doi-asserted-by":"crossref","unstructured":"Krishna R, Hata K, Ren F, Fei-Fei L, Carlos Niebles J (2017) Dense-captioning events in videos. In: Proceedings of the IEEE international conference on computer vision, pp 706\u2013715","DOI":"10.1109\/ICCV.2017.83"},{"key":"10964_CR47","first-page":"3319","volume":"7","author":"A Kumar","year":"2019","unstructured":"Kumar A, Sangwan SR, Arora A, Nayyar A, Abdel-Basset M (2019) Sarcasm detection using soft attention-based bidirectional long short-term memory model with convolution network. IEEE Access 7:3319\u201323328","journal-title":"IEEE Access"},{"key":"10964_CR48","unstructured":"Lin CY (2004) ROUGE: A package for automatic evaluation of summaries. 
In: Proceedings of workshop on text summarization branches out, post-conference workshop of ACL"},{"key":"10964_CR49","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: Common objects in context. In: European conference on computer vision. Springer, Cham, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10964_CR50","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3431\u20133440","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"10964_CR51","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"10964_CR52","doi-asserted-by":"crossref","unstructured":"Mathews AP, Xie L, He X (2016) Senticap: Generating image descriptions with sentiments. In: Thirtieth AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"10964_CR53","doi-asserted-by":"crossref","unstructured":"Mathews A, Xie L, He X (2018) Semstyle: Learning to generate stylised image captions using unaligned text. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8591\u20138600","DOI":"10.1109\/CVPR.2018.00896"},{"key":"10964_CR54","unstructured":"Melnyk I, Sercu T, Dognin PL, Ross J, Mroueh Y (2018) Improved image captioning with adversarial semantic alignment. arXiv:1805.00063"},{"key":"10964_CR55","unstructured":"Mnih V, Heess N, Graves A (2014) Recurrent models of visual attention. 
In: Advances in neural information processing systems, pp 2204\u20132212"},{"key":"10964_CR56","doi-asserted-by":"crossref","unstructured":"Pan P, Xu Z, Yang Y, Wu F, Zhuang Y (2016) Hierarchical recurrent neural encoder for video representation with application to captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1029\u20131038","DOI":"10.1109\/CVPR.2016.117"},{"key":"10964_CR57","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) BLEU: A method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics, Association for Computational Linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"10964_CR58","unstructured":"Pascanu R, Gulcehre C, Cho K, Bengio Y (2013) How to construct deep recurrent neural networks. arXiv:1312.6026"},{"key":"10964_CR59","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning C (2014) Glove: Global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"10964_CR60","doi-asserted-by":"crossref","unstructured":"Peris \u00c1, Bola\u00f1os M, Radeva P, Casacuberta F (2016) Video description using bidirectional recurrent neural networks. In: International conference on artificial neural networks. Springer, Cham, pp 3\u201311","DOI":"10.1007\/978-3-319-44781-0_1"},{"key":"10964_CR61","doi-asserted-by":"crossref","unstructured":"Pini S, Cornia M, Bolelli F, Baraldi L, Cucchiara R (2018) M-VAD names: A dataset for video captioning with naming. Multimedia Tools and Applications 1\u201321","DOI":"10.1007\/s11042-018-7040-z"},{"key":"10964_CR62","doi-asserted-by":"crossref","unstructured":"Redmon J, Divvala S, Girshick R, Farhadi A (2016) You only look once: Unified, real-time object detection. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 779\u2013788","DOI":"10.1109\/CVPR.2016.91"},{"key":"10964_CR63","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"10964_CR64","doi-asserted-by":"crossref","unstructured":"Rohrbach A, Rohrbach M, Qiu W, Friedrich A, Pinkal M, Schiele B (2014) Coherent multi-sentence video description with variable level of detail. In: German conference on pattern recognition. Springer, Cham, pp 184\u2013195","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"10964_CR65","doi-asserted-by":"crossref","unstructured":"Sandler M, Howard A, Zhu M, Zhmoginov A, Chen LC (2018) Mobilenetv2: Inverted residuals and linear bottlenecks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4510\u20134520","DOI":"10.1109\/CVPR.2018.00474"},{"key":"10964_CR66","doi-asserted-by":"crossref","unstructured":"Shetty R, Rohrbach M, Anne Hendricks L, Fritz M, Schiele B (2017) Speaking the same language: Matching machine to human captions by adversarial training. In: Proceedings of the IEEE international conference on computer vision, pp 4135\u20134144","DOI":"10.1109\/ICCV.2017.445"},{"key":"10964_CR67","unstructured":"Shi X, Cai J, Gu J, Joty S (2018) Video captioning with boundary-aware hierarchical language decoding and joint video prediction. arXiv:1807.03658"},{"key":"10964_CR68","doi-asserted-by":"crossref","unstructured":"Shi H, Li P, Wang B, Wang Z (2018) Image captioning based on deep reinforcement learning. In: Proceedings of the 10th international conference on internet multimedia computing and service. 
ACM, p 45","DOI":"10.1145\/3240876.3240900"},{"key":"10964_CR69","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"10964_CR70","doi-asserted-by":"crossref","unstructured":"Tanti M, Gatt A, Camilleri KP (2017) What is the role of recurrent neural networks (RNNs) in an image caption generator? arXiv:1708.02043","DOI":"10.18653\/v1\/W17-3506"},{"issue":"3","key":"10964_CR71","doi-asserted-by":"publisher","first-page":"467","DOI":"10.1017\/S1351324918000098","volume":"24","author":"M Tanti","year":"2018","unstructured":"Tanti M, Gatt A, Camilleri KP (2018) Where to put the image in an image caption generator. Nat Lang Eng 24(3):467\u2013489","journal-title":"Nat Lang Eng"},{"key":"10964_CR72","unstructured":"Torabi A, Pal C, Larochelle H, Courville A (2015) Using descriptive video services to create a large data source for video annotation research. arXiv:1503.01070"},{"key":"10964_CR73","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"issue":"2","key":"10964_CR74","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JR Uijlings","year":"2013","unstructured":"Uijlings JR, Van De Sande KE, Gevers T, Smeulders AW (2013) Selective search for object recognition. Int J Comput Vision 104(2):154\u201371","journal-title":"Int J Comput Vision"},{"key":"10964_CR75","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10964_CR76","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K (2015) Sequence to sequence-video to text. In: Proceedings of the IEEE international conference on computer vision, pp 4534\u20134542","DOI":"10.1109\/ICCV.2015.515"},{"key":"10964_CR77","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Xu H, Donahue J, Rohrbach M, Mooney R, Saenko K (2014) Translating videos to natural language using deep recurrent neural networks. arXiv:1412.4729","DOI":"10.3115\/v1\/N15-1173"},{"key":"10964_CR78","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"1","key":"10964_CR79","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"H Wang","year":"2013","unstructured":"Wang H, Kl\u00e4ser A, Schmid C, Liu CL (2013) Dense trajectories and motion boundary descriptors for action recognition. Int J Comput Vision 103(1):60\u201379","journal-title":"Int J Comput Vision"},{"key":"10964_CR80","doi-asserted-by":"crossref","unstructured":"Wang W, Wu Y, Liu H, Wang S, Cheng J (2018) Temporal action detection by joint Identification-Verification. In: 2018 24th international conference on pattern recognition (ICPR). IEEE, pp 2026\u20132031","DOI":"10.1109\/ICPR.2018.8545487"},{"key":"10964_CR81","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L (2016) Temporal segment networks: Towards good practices for deep action recognition. In: European conference on computer vision. 
Springer, Cham, pp 20\u201336","DOI":"10.1007\/978-3-319-46484-8_2"},{"issue":"10","key":"10964_CR82","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1145\/135239.135244","volume":"35","author":"S Wu","year":"1992","unstructured":"Wu S, Manber U (1992) Fast text searching allowing errors. Commun ACM 35(10):83\u201392","journal-title":"Commun ACM"},{"key":"10964_CR83","doi-asserted-by":"crossref","unstructured":"Wu Z, Yao T, Fu Y, Jiang YG (2016) Deep learning for video classification and captioning. arXiv:1609.06782","DOI":"10.1145\/3122865.3122867"},{"key":"10964_CR84","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Bengio Y (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning, pp 2048\u20132057"},{"key":"10964_CR85","doi-asserted-by":"crossref","unstructured":"Xu J, Mei T, Yao T, Rui Y (2016) Msr-vtt: A large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5288\u20135296","DOI":"10.1109\/CVPR.2016.571"},{"key":"10964_CR86","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1016\/j.cviu.2016.10.010","volume":"156","author":"D Xu","year":"2017","unstructured":"Xu D, Yan Y, Ricci E, Sebe N (2017) Detecting anomalous events in videos by learning deep representations of appearance and motion. Elsevier J Comput Vis Image Underst 156:117\u2013127","journal-title":"Elsevier J Comput Vis Image Underst"},{"key":"10964_CR87","doi-asserted-by":"crossref","unstructured":"Xu Z, Yang Y, Hauptmann AG (2015) A discriminative cnn video representation for event detection. In: IEEE conference on computer vision and pattern recognition, CVPR","DOI":"10.1109\/CVPR.2015.7298789"},{"key":"10964_CR88","doi-asserted-by":"crossref","unstructured":"Yao L, Torabi A, Cho K, Ballas N, Pal C, Larochelle H, Courville A (2015) Describing videos by exploiting temporal structure. 
In: Proceedings of the IEEE international conference on computer vision, pp 4507\u20134515","DOI":"10.1109\/ICCV.2015.512"},{"key":"10964_CR89","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J (2016) Image captioning with semantic attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4651\u20134659","DOI":"10.1109\/CVPR.2016.503"},{"key":"10964_CR90","doi-asserted-by":"crossref","unstructured":"Young P, Lai A, Hodosh M, Hockenmaier J (2014) From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics 67\u201378","DOI":"10.1162\/tacl_a_00166"},{"key":"10964_CR91","doi-asserted-by":"crossref","unstructured":"Yu H, Wang J, Huang Z, Yang Y, Xu W (2016) Video paragraph captioning using hierarchical recurrent neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4584\u20134593","DOI":"10.1109\/CVPR.2016.496"},{"key":"10964_CR92","doi-asserted-by":"crossref","unstructured":"Yue-Hei Ng J, Hausknecht M, Vijayanarasimhan S, Vinyals O, Monga R, Toderici G (2015) Beyond short snippets: Deep networks for video classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4694\u20134702","DOI":"10.1109\/CVPR.2015.7299101"},{"issue":"1","key":"10964_CR93","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/TIP.2018.2855415","volume":"28","author":"M Zhang","year":"2019","unstructured":"Zhang M, Yang Y, Zhang H, Ji Y, Shen HT, Chua TS (2019) More is better: Precise and detailed image captioning using online positive recall and missing concepts mining. 
IEEE Trans Image Process 28(1):32\u201344","journal-title":"IEEE Trans Image Process"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-10964-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-021-10964-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-10964-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,4]],"date-time":"2023-11-04T08:15:33Z","timestamp":1699085733000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-021-10964-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,5,28]]},"references-count":93,"journal-issue":{"issue":"18","published-print":{"date-parts":[[2021,7]]}},"alternative-id":["10964"],"URL":"https:\/\/doi.org\/10.1007\/s11042-021-10964-3","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,5,28]]},"assertion":[{"value":"1 June 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 October 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 April 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 May 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}