{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T16:14:56Z","timestamp":1769271296533,"version":"3.49.0"},"reference-count":68,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2018,10,1]],"date-time":"2018-10-01T00:00:00Z","timestamp":1538352000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1509206"],"award-info":[{"award-number":["U1509206"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61472276"],"award-info":[{"award-number":["61472276"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61472116"],"award-info":[{"award-number":["61472116"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61722204"],"award-info":[{"award-number":["61722204"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61429201"],"award-info":[{"award-number":["61429201"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006606","name":"Natural Science Foundation of Tianjin City","doi-asserted-by":"publisher","award":["15JCYBJC15400"],"award-info":[{"award-number":["15JCYBJC15400"]}],"id":[{"id":"10.13039\/501100006606","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000183","name":"Army Research Office","doi-asserted-by":"publisher","award":["W911NF-15-1-0290"],"award-info":[{"award-number":["W911NF-15-1-0290"]}],"id":[{"id":"10.13039\/100000183","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Faculty Research Gift Awards by the NEC Laboratories of America and Blippar"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Image Process."],"published-print":{"date-parts":[[2018,10]]},"DOI":"10.1109\/tip.2018.2846664","type":"journal-article","created":{"date-parts":[[2018,6,12]],"date-time":"2018-06-12T19:21:00Z","timestamp":1528831260000},"page":"4933-4944","source":"Crossref","is-referenced-by-count":95,"title":["Sequential Video VLAD: Training the Aggregation Locally and Temporally"],"prefix":"10.1109","volume":"27","author":[{"given":"Youjiang","family":"Xu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2768-1398","authenticated-orcid":false,"given":"Yahong","family":"Han","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5461-3986","authenticated-orcid":false,"given":"Richang","family":"Hong","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Tian","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","author":"chen","year":"2015","journal-title":"Microsoft COCO captions Data collection and evaluation server"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref33","first-page":"1494","article-title":"Translating videos to natural language using deep recurrent neural networks","author":"venugopalan","year":"2014","journal-title":"Proc NAACL HLT"},{"key":"ref32","author":"torabi","year":"2015","journal-title":"Using descriptive video services to create a large data source for video annotation research"},{"key":"ref31","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"chen","year":"2011","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref37","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","volume":"8","author":"lin","year":"2004","journal-title":"Proc Workshop Text Summarization Branches Out"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"ref35","first-page":"311","article-title":"Bleu: A method for automatic evaluation of machine translation","author":"papineni","year":"2002","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.339"},{"key":"ref60","first-page":"3468","article-title":"Spatiotemporal residual networks for video action recognition","author":"feichtenhofer","year":"2016","journal-title":"Proc NIPS"},{"key":"ref62","doi-asserted-by":"crossref","first-page":"109","DOI":"10.1016\/j.cviu.2016.03.013","article-title":"Bag of visual words and fusion methods for action recognition: Comprehensive study and good practice","volume":"150","author":"peng","year":"2016","journal-title":"Comput Vis Image Understand"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.83"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0859-0"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.147"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.278"},{"key":"ref27","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"wang","year":"2016","journal-title":"Proc ECCV"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.341"},{"key":"ref66","first-page":"697","article-title":"Sympathy for the details: Dense trajectories and hybrid classification architectures for action recognition","author":"de souza","year":"2016","journal-title":"Proc ECCV"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2666739"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref68","author":"kay","year":"2017","journal-title":"The kinetics human action video dataset"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.117"},{"key":"ref1","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","author":"simonyan","year":"2014","journal-title":"Proc NIPS"},{"key":"ref20","author":"li","year":"2016","journal-title":"Videolstm convolves attends and flows for action recognition"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.572"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298789"},{"key":"ref24","first-page":"1","article-title":"Visual categorization with bags of keypoints","author":"csurka","year":"2004","journal-title":"Workshop on Statistical Learning in Computer Vision (ECCV)"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.337"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.219"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref50","first-page":"448","article-title":"Batch normalization: accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref51","author":"wang","year":"2015","journal-title":"Towards good practices for very deep two-stream convnets"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.604"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2712608"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.291"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.522"},{"key":"ref53","author":"saxe","year":"2013","journal-title":"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks"},{"key":"ref52","first-page":"249","article-title":"Understanding the difficulty of training deep feedforward neural networks","author":"glorot","year":"2010","journal-title":"Proc 13th Int Conf Artif Intell Statist"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2749125"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2781424"},{"key":"ref40","first-page":"630","article-title":"Identity mappings in deep residual networks","author":"he","year":"2016","journal-title":"Proc ECCV"},{"key":"ref12","article-title":"A unified metric learning-based framework for co-saliency detection","author":"han","year":"0","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2659221"},{"key":"ref15","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","volume":"14","author":"graves","year":"2014","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref16","first-page":"843","article-title":"Unsupervised learning of video representations using lstms","author":"srivastava","year":"2015","journal-title":"Proceedings of the 32nd Intl Conf on Machine Learning"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref18","article-title":"Delving deeper into convolutional networks for learning video representations","author":"ballas","year":"2016","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref19","first-page":"802","article-title":"Convolutional LSTM network: A machine learning approach for precipitation nowcasting","author":"shi","year":"2015","journal-title":"Proc NIPS"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0636-x"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540039"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref49","article-title":"THUMOS challenge: Action recognition with a large number of classes","author":"jiang","year":"2013","journal-title":"Proc ICCV 1st Int Workshop Action Recognit Large Number Classes (THUMOS)"},{"key":"ref9","first-page":"487","article-title":"Learning deep features for scene recognition using places database","author":"zhou","year":"2014","journal-title":"Proc NIPS"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123327"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.111"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref47","author":"soomro","year":"2012","journal-title":"Ucf101 A Dataset of 101 Human Actions Classes from Videos in the Wild"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.497"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.127"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.496"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/83\/8396881\/08382330.pdf?arnumber=8382330","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T12:55:01Z","timestamp":1643201701000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8382330\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10]]},"references-count":68,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tip.2018.2846664","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,10]]}}}