{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T16:07:07Z","timestamp":1762445227406,"version":"3.37.3"},"reference-count":261,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/100006602","name":"United States Air Force Research Laboratory and the United States Air Force Artificial Intelligence Accelerator and was accomplished under Cooperative","doi-asserted-by":"publisher","award":["FA8750-19-2-1000"],"award-info":[{"award-number":["FA8750-19-2-1000"]}],"id":[{"id":"10.13039\/100006602","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/access.2021.3115476","type":"journal-article","created":{"date-parts":[[2021,9,24]],"date-time":"2021-09-24T20:50:05Z","timestamp":1632516605000},"page":"134611-134637","source":"Crossref","is-referenced-by-count":27,"title":["Video Action Understanding"],"prefix":"10.1109","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9226-0349","authenticated-orcid":false,"given":"Matthew S.","family":"Hutchinson","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4598-2808","authenticated-orcid":false,"given":"Vijay N.","family":"Gadepally","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","first-page":"8252","article-title":"Fixing the train-test resolution discrepancy","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Touvron","year":"2019"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00584"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_35"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3390\/computers2020088"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2013.01.013"},{"key":"ref9","article-title":"Advances in human action recognition: A survey","author":"Cheng","year":"2015","journal-title":"arXiv:1501.05964"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TSMCC.2012.2198883"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/1964897.1964918"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.02.010"},{"key":"ref13","article-title":"Review of action recognition and detection methods","author":"Min Kang","year":"2016","journal-title":"arXiv:1610.06906"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2016.06.007"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2014.04.018"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"ref17","article-title":"Multi-moments in time: Learning and interpreting models for multi-action video understanding","author":"Monfort","year":"2019","journal-title":"arXiv:1911.00232"},{"volume-title":"Action","year":"2020","key":"ref18"},{"volume-title":"Action","year":"2020","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2009.11.014"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2010.10.002"},{"key":"ref22","first-page":"1650","article-title":"Action dataset\u2014A survey","volume-title":"Proc. SICE Annu. Conf.","author":"Ahad"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.05.019"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.01.010"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1049\/iet-cvi.2016.0355"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2017.150"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.29007\/x163"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.3390\/s19051005"},{"key":"ref29","article-title":"Spatio-temporal action recognition: A survey","author":"Bhoi","year":"2019","journal-title":"arXiv:1901.09403"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-018-9651-1"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2986861"},{"key":"ref32","article-title":"Deep learning for vision-based prediction: A survey","author":"Rasouli","year":"2020","journal-title":"arXiv:2007.00095"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.106970"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383137"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383132"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.337"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.119"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.317"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00124"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_5"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00372"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2019.102886"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00719"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.617"},{"key":"ref46","article-title":"Spatio-temporal human action localisation and instance segmentation in temporally untrimmed videos","author":"Saha","year":"2017","journal-title":"arXiv:1707.07213"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_43"},{"key":"ref48","first-page":"141","volume-title":"Spatio-Temporal Action Instance Segmentation Localisation","author":"Saha","year":"2020"},{"key":"ref49","article-title":"ActivityNet challenge 2017 summary","author":"Ghanem","year":"2017","journal-title":"arXiv:1710.08011"},{"key":"ref50","article-title":"The ActivityNet large-scale activity recognition challenge 2018 summary","author":"Ghanem","year":"2018","journal-title":"arXiv:1808.03766"},{"article-title":"International challenge on activity recognition","volume-title":"Proc. Comput. Vis. Pattern Recognit. (CVPR) Conf.","author":"Snoek","key":"ref51"},{"volume-title":"International Challenge on Activity Recognition","year":"2020","key":"ref52"},{"key":"ref53","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics, Hum. Lang. Technol.","author":"Chen"},{"key":"ref54","article-title":"Using descriptive video services to create a large data source for video annotation research","author":"Torabi","year":"2015","journal-title":"arXiv:1503.01070"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2019.2892755"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_16"},{"key":"ref59","article-title":"You only watch once: A unified CNN architecture for real-time spatiotemporal action localization","author":"K\u00f6p\u00fckl\u00fc","year":"2019","journal-title":"arXiv:1911.06644"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3391743"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.28"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4409105"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206557"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995586"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref68","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012","journal-title":"arXiv:1212.0402"},{"article-title":"THUMOS challenge: Action recognition with a large number of classes","volume-title":"Proc. Int. Conf. Comput. Vis. (ICCV)","author":"Jiang","key":"ref69"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.396"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref74","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017","journal-title":"arXiv:1705.06950"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1013-y"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.29007\/h68j"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"ref79","article-title":"On the effectiveness of task granularity for transfer learning","author":"Mahdisoltani","year":"2018","journal-title":"arXiv:1804.09235"},{"key":"ref80","article-title":"Charades-ego: A large-scale dataset of paired third and first person videos","author":"Sigurdsson","year":"2018","journal-title":"arXiv:1804.09626"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00349"},{"key":"ref82","article-title":"A short note on the kinetics-700 human action dataset","author":"Carreira","year":"2019","journal-title":"arXiv:1907.06987"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00876"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"ref85","article-title":"Rescaling egocentric vision","author":"Damen","year":"2020","journal-title":"arXiv:2006.13256"},{"key":"ref86","article-title":"The AVA-kinetics localized human actions video dataset","author":"Li","year":"2020","journal-title":"arXiv:2005.00214"},{"key":"ref87","article-title":"AViD dataset: Anonymized videos from diverse countries","author":"Piergiovanni","year":"2020","journal-title":"arXiv:2007.05515"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"ref89","article-title":"HAA500: Human-centric atomic action dataset with curated videos","author":"Chung","year":"2020","journal-title":"arXiv:2009.05224"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref91","first-page":"1010","article-title":"NTU RGBD: A large scale dataset for 3D human activity analysis","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR)","author":"Shahroudy"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"article-title":"SLAC: A sparsely labeled dataset for action classification and localization","year":"2017","author":"Zhao","key":"ref93"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578775"},{"article-title":"THUMOS challenge: Action recognition with a large number of classes","volume-title":"Proc. Eur. Conf. Comput. Vis. (ECCV)","author":"Jiang","key":"ref95"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00524"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913478446"},{"key":"ref98","first-page":"720","article-title":"Scaling egocentric vision: The epic-kitchens dataset","volume-title":"Proc. Eur. Conf. Comput. Vis. (ECCV)","author":"Damen"},{"volume-title":"Lear-Inria Submission for the Thumos Workshop","year":"2013","author":"Wang","key":"ref99"},{"article-title":"University of Amsterdam at thumos challenge 2014","year":"2014","author":"Jain","key":"ref100"},{"volume-title":"The Lear Submission at Thumos 2014","year":"2014","author":"Oneata","key":"ref101"},{"article-title":"THUMOS challenge: Action recognition with a large number of classes","volume-title":"Proc. Comput. Vis. Pattern Recognit. Conf. (CVPR)","author":"Gorban","key":"ref102"},{"volume-title":"UTS-CMU at Thumos 2015","year":"2015","author":"Xu","key":"ref103"},{"volume-title":"ADSC Submission at Thumos Challenge 2015","year":"2015","author":"Yuan","key":"ref104"},{"key":"ref105","article-title":"CUHK & ETHZ & SIAT submission to ActivityNet challenge 2016","author":"Xiong","year":"2016","journal-title":"arXiv:1608.00797"},{"key":"ref106","article-title":"Revisiting the effectiveness of off-the-shelf temporal modeling approaches for large-scale video classification","author":"Bian","year":"2017","journal-title":"arXiv:1708.03805"},{"key":"ref107","article-title":"Temporal convolution based action proposal: Submission to ActivityNet 2017","author":"Lin","year":"2017","journal-title":"arXiv:1707.06750"},{"key":"ref108","article-title":"Exploiting spatial-temporal modelling and multi-modal fusion for human action recognition","author":"He","year":"2018","journal-title":"arXiv:1806.10319"},{"article-title":"Team deep-hri moments in time challenge 2018 technical report","year":"2018","author":"Li","key":"ref109"},{"author":"Guan","key":"ref110","article-title":"SYSU ISEE submission to moments in time challenge 2018"},{"article-title":"Human centric spatio-temporal action localization","volume-title":"Proc. ActivityNet Workshop (CVPR)","author":"Jiang","key":"ref111"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW50321.2020.9096926"},{"key":"ref113","article-title":"Trimmed action recognition, dense-captioning events in videos, and spatio-temporal action localization with focus on ActivityNet challenge 2019","author":"Qiu","year":"2019","journal-title":"arXiv:1906.07016"},{"key":"ref114","article-title":"Baidu-UTS submission to the EPIC-kitchens action recognition challenge 2019","author":"Wang","year":"2019","journal-title":"arXiv:1906.09383"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00635"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"volume-title":"Workshop on Multi-Modal Video Analysis and Moments in Time Challenge","year":"2019","key":"ref117"},{"key":"ref118","article-title":"Top-1 solution of multi-moments in time challenge 2019","author":"Zhang","year":"2020","journal-title":"arXiv:2003.05837"},{"key":"ref119","article-title":"Learning sparse 2D temporal adjacent networks for temporal action localization","author":"Zhang","year":"2019","journal-title":"arXiv:1912.03612"},{"key":"ref120","article-title":"CBR-Net: Cascade boundary refinement network for action detection: Submission to activitynet challenge 2020 (task 1)","author":"Wang","year":"2020","journal-title":"arXiv:2006.07526"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00110"},{"key":"ref122","article-title":"Temporal fusion network for temporal action localization: Submission to ActivityNet challenge 2020 (task E)","author":"Qing","year":"2020","journal-title":"arXiv:2006.07520"},{"author":"Gao","key":"ref123","article-title":"Multi-modal fusion network based on relation-aware pyramid network for temporal action localization"},{"key":"ref124","article-title":"1st place solution for AVA-kinetics crossover in AcitivityNet challenge 2020","author":"Chen","year":"2020","journal-title":"arXiv:2006.09116"},{"key":"ref125","article-title":"A short note on the Kinetics-700-2020 human action dataset","author":"Smaira","year":"2020","journal-title":"arXiv:2010.10864"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412541"},{"key":"ref127","article-title":"TinyAction challenge: Recognizing real-world low-resolution activities in videos","author":"Tirupattur","year":"2021","journal-title":"arXiv:2107.11494"},{"key":"ref128","article-title":"Proposal relation network for temporal action detection","author":"Wang","year":"2021","journal-title":"arXiv:2106.11812"},{"key":"ref129","article-title":"Exploring stronger feature for temporal action localization","author":"Qing","year":"2021","journal-title":"arXiv:2106.13014"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00508"},{"key":"ref131","article-title":"Feature combination meets attention: Baidu soccer embeddings and transformer based temporal detection","author":"Zhou","year":"2021","journal-title":"arXiv:2106.14447"},{"key":"ref132","article-title":"Relation modeling in spatio-temporal action localization","author":"Feng","year":"2021","journal-title":"arXiv:2106.08061"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/SSCI.2018.8628742"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1631\/jzus.C1400102"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.5244\/C.28.6"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-019-0197-0"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2868668"},{"key":"ref140","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","volume-title":"Advances in Neural Information Processing Systems 27","author":"Simonyan","year":"2014"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299065"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2014.131"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"ref146","doi-asserted-by":"crossref","first-page":"363","DOI":"10.1007\/3-540-45103-X_50","article-title":"Two-frame motion estimation based on polynomial expansion","volume-title":"Image Analysis","author":"Farneb\u00e4ck","year":"2003"},{"issue":"11","key":"ref147","first-page":"120","article-title":"The OpenCV Library","volume":"25","author":"Bradski","year":"2000","journal-title":"Dr. Dobb\u2019s J. Softw. Tools"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-11024-6_51"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1007\/s00348-005-0068-7"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1145\/2185520.2185561"},{"volume-title":"The Perception of the Visual World","year":"1950","author":"Gibson","key":"ref151"},{"key":"ref152","first-page":"121","article-title":"An iterative image registration technique with an application to stereo vision","volume-title":"Proc. Imag. Understand. Workshop","author":"Lucas"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1117\/12.965761"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74936-3_22"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1145\/212094.212141"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-12939-2_20"},{"key":"ref158","first-page":"1","article-title":"Deep learning tutorial","volume-title":"Proc. Tuts. Int. Conf. Mach. Learn. (ICML)","author":"LeCun"},{"key":"ref159","first-page":"1","article-title":"A tutorial on deep learning part 2: Autoencoders, convolutional neural networks and recurrent neural networks","volume":"20","author":"Le","year":"2015","journal-title":"Google Brain"},{"key":"ref160","first-page":"4898","article-title":"Understanding the effective receptive field in deep convolutional neural networks","volume-title":"Advances in Neural Information Processing Systems 29","author":"Luo","year":"2016"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1142\/S0218488598000094"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref166","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"37","author":"Ioffe"},{"key":"ref167","article-title":"Layer normalization","author":"Lei Ba","year":"2016","journal-title":"arXiv:1607.06450"},{"key":"ref168","article-title":"Instance normalization: The missing ingredient for fast stylization","author":"Ulyanov","year":"2016","journal-title":"arXiv:1607.08022"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01232"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00155"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_22"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00561"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2712608"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00034"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.107037"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref181","first-page":"2204","article-title":"Trajectory convolution for action recognition","volume-title":"Advances in Neural Information Processing Systems 31","author":"Zhao","year":"2018"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00118"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref186","article-title":"A gentle tutorial of recurrent neural network with error backpropagation","author":"Chen","year":"2016","journal-title":"arXiv:1610.02583"},{"article-title":"A tutorial on backward propagation through time (BPTT) in the gated recurrent unit (GRU) RNN","year":"2016","author":"Li","key":"ref187"},{"key":"ref188","article-title":"Understanding LSTM\u2014A tutorial into long short-term memory recurrent neural networks","author":"Staudemeyer","year":"2019","journal-title":"arXiv:1909.09586"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1016\/j.physd.2019.132306"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296484"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1249"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018401"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15567-3_11"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20893-6_23"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00807"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093274"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.787"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2017.04.004"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.3390\/s18071979"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12333"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_43"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref209","first-page":"4694","article-title":"Beyond short snippets: Deep networks for video classification","volume-title":"Proc. CVPR","author":"Ng"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806222"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1088\/1757-899X\/569\/3\/032035"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/ICAICTA.2019.8904245"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2018.09.003"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"ref215","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref216","article-title":"Is space-time attention all you need for video understanding?","author":"Bertasius","year":"2021","journal-title":"arXiv:2102.05095"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"ref218","article-title":"ViViT: A video vision transformer","author":"Arnab","year":"2021","journal-title":"arXiv:2103.15691"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.92"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.326"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794278"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00710"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803534"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2016.7487478"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.39"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20887-5_28"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00173"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803820"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_47"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.675"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.392"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2965196"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.155"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-36718-3_40"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123343"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.93"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00224"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00043"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/CRV50864.2020.00035"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053319"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01404"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2018.12.019"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_45"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.95"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.393"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_19"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.620"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.472"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00035"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01021"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103187"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2009.03.002"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1016\/j.aci.2018.08.003"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.4018\/jdwm.2007070101"},{"key":"ref261","first-page":"3780","article-title":"A unified view of multi-label performance measures","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","volume":"70","author":"Wu"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/9312710\/09548074.pdf?arnumber=9548074","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,24]],"date-time":"2024-01-24T01:08:01Z","timestamp":1706058481000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9548074\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":261,"URL":"https:\/\/doi.org\/10.1109\/access.2021.3115476","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2021]]}}}