{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T20:55:37Z","timestamp":1776200137431,"version":"3.50.1"},"reference-count":263,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2021ZD0110303"],"award-info":[{"award-number":["2021ZD0110303"]}]},{"name":"Humanities and Social Science Research of Ministry of Education of China","award":["20YJA890024"],"award-info":[{"award-number":["20YJA890024"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/tmm.2022.3232034","type":"journal-article","created":{"date-parts":[[2022,12,26]],"date-time":"2022-12-26T19:16:59Z","timestamp":1672082219000},"page":"7943-7966","source":"Crossref","is-referenced-by-count":108,"title":["A Survey on Video Action Recognition in Sports: Datasets, Methods and Applications"],"prefix":"10.1109","volume":"25","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2240-4829","authenticated-orcid":false,"given":"Fei","family":"Wu","sequence":"first","affiliation":[{"name":"Department of Physical Education, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1562-8098","authenticated-orcid":false,"given":"Qingzhong","family":"Wang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"given":"Jiang","family":"Bian","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"given":"Ning","family":"Ding","sequence":"additional","affiliation":[{"name":"Department of Physical Education, Peking University, Beijing, China"}]},{"given":"Feixiang","family":"Lu","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"given":"Jun","family":"Cheng","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2949-6874","authenticated-orcid":false,"given":"Dejing","family":"Dou","sequence":"additional","affiliation":[{"name":"Boston Consulting Group (Greater China), Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5451-3253","authenticated-orcid":false,"given":"Haoyi","family":"Xiong","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}]}],"member":"263","reference":[{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2013.102"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2605305"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2011.5711524"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.05.019"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_36"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-09396-3_9"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.14257\/ijmue.2014.9.10.29"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2010.5543273"},{"key":"ref203","first-page":"1","article-title":"Video-based table tennis tracking and trajectory prediction using spatial-temporal CNNs based on deep learning","volume":"30","author":"li","year":"0","journal-title":"Fractals"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2009.69"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1260\/174795408786238515"},{"key":"ref55","first-page":"332","article-title":"Violence detection in video using computer vision techniques","author":"nievas","year":"0","journal-title":"Proc Int Conf Comput Anal Images Patterns"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2017.7966210"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15552-9_29"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1155\/2022\/4247082"},{"key":"ref209","article-title":"A comparative survey of deep active learning","author":"zhan","year":"2022"},{"key":"ref210","article-title":"Multiple-criteria based active learning with fixed-size determinantal point processes","author":"zhan","year":"2021"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/634"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11554-016-0638-3"},{"key":"ref50","first-page":"1","article-title":"Distributed video acquisition and annotation for sport-event summarization","volume":"8","author":"vleeschouwer","year":"2008","journal-title":"NEM Summit"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.02.002"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_24"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1017\/S0269888919000225"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587735"},{"key":"ref48","article-title":"Cvbase 06 dataset: A dataset for development and testing of computer vision based methods in sport environments","author":"pers","year":"2005","journal-title":"SN Ljubljana"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093278"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2655624"},{"key":"ref217","first-page":"1","article-title":"Action recognition and detection by combining motion and appearance features","volume":"1","author":"wang","year":"2014","journal-title":"THUMOS Action Recognition challenge"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.11591\/ijeecs.v11.i3.pp987-993"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2508123"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3183112"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.17485\/ijst\/2016\/v9i5\/72065"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1155\/2018\/3426178"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2959977"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3054132"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2016.10.016"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.111"},{"key":"ref6","article-title":"Youtube-8 m: A large-scale video classification benchmark","author":"abu-el-haija","year":"2016"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01018"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/11744047_33"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.330"},{"key":"ref40","article-title":"A comprehensive study of deep video action recognition","author":"zhu","year":"2020"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-12939-2_20"},{"key":"ref35","article-title":"Visual analytics for team-based invasion sports with significant events and Markov reward process","author":"zhao","year":"2019"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00233"},{"key":"ref37","first-page":"249","article-title":"Difference between team and individual sports with respect to psychological skills, overall emotional intelligence and athletic success motivation in Shiraz city athletes","volume":"11","author":"kajbafnezhad","year":"2011","journal-title":"Physical Education and Sport"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCSE51940.2021.9569708"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.239"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3074831"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2794265"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.23919\/ICMU50196.2021.9638855"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1037\/a0030202"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1123\/jtpe.16.4.500"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00223"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_44"},{"key":"ref26","article-title":"Sports video: Fine-grained action detection and classification of table tennis strokes from videos for mediaeval 2021","author":"martin","year":"2021"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-08917-3"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.3390\/jfmk4020025"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.12928\/telkomnika.v18i4.14730"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.3390\/s21020654"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01328"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01314"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2894161"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01594-9"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref249","first-page":"13153","article-title":"Direct multi-view multi-person 3D pose estimation","volume":"34","author":"zhang","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_31"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1023\/A:1011139631724"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1145\/3293318"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.5244\/C.22.99"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00047"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-15-1480-7_79"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01367"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093481"},{"key":"ref93","article-title":"$p^{2}a$: A dataset and benchmark for dense action detection from table tennis match broadcasting videos","author":"bian","year":"2022"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508417"},{"key":"ref92","article-title":"The ava-kinetics localized human actions video dataset","author":"li","year":"2020"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"ref255","article-title":"Dive into big model training","author":"liu","year":"2022"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1013-y"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01232"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_14"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"ref253","article-title":"Seal: A large-scale video dataset of multi-grained spatio-temporally action localization","author":"chen","year":"2022"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01685-7"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_23"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00403"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20341"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW54805.2022.00022"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00315"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.119250"},{"key":"ref137","first-page":"12493","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","volume":"34","author":"patrick","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00515"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00508"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref256","article-title":"Ernie 3.0: Large-scale knowledge enhanced pre-training for language understanding and generation","author":"sun","year":"2021"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.3390\/app11104426"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01576"},{"key":"ref257","article-title":"A review of sparse expert models in deep learning","author":"fedus","year":"2022"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053928"},{"key":"ref144","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Proc North Amer Chapter Assoc Comput Linguistics Human Lang Technologies"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3422844.3423051"},{"key":"ref145","article-title":"Tfcnet: Temporal fully connected networks for static unbiased temporal reasoning","author":"zhang","year":"2022"},{"key":"ref84","first-page":"2163","article-title":"Temporal segmentation of fine-grained semantic action: A motion-centered figure skating dataset","volume":"35","author":"liu","year":"0","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref142","article-title":"Vimpac: Video pre-training via masked token prediction and contrastive learning","author":"tan","year":"2021"},{"key":"ref263","first-page":"4911","article-title":"Video action detection: Analysing limitations and challenges","author":"modi","year":"0","journal-title":"Proc IEEE Conf Comput Vis and Pattern Recog"},{"key":"ref83","first-page":"107","article-title":"Fineskating: A high-quality figure skating dataset and multi-task approach for sport action","volume":"1","author":"zhao","year":"2020","journal-title":"Peng Cheng Lab Commumications"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01432"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00151"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20062-5_39"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-09414-3"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00055"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539881"},{"key":"ref229","article-title":"Deep long-tailed learning: A survey","author":"zhang","year":"2021"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00450"},{"key":"ref109","article-title":"Will person detection help bag-of-features action recognition?","author":"klaser","year":"2010","journal-title":"Ph D dissertation INRIA"},{"key":"ref106","article-title":"Mosift: Recognizing human actions in surveillance videos","author":"chen","year":"2009"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.100"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-005-1838-7"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.578"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00311"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref225","first-page":"1129","article-title":"Multi-view human action recognition under occlusion based on fuzzy distances and neural networks","author":"iosifidis","year":"0","journal-title":"Proc Proc 20th Eur Signal Process Conf"},{"key":"ref74","article-title":"Spin: A high speed, high resolution vision dataset for tracking and action recognition in ping pong","author":"schwarcz","year":"2019"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412541"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00039"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2010.03.024"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15558-1_46"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00161"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-012-0594-8"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2944745"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR.2018.00090"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.5244\/C.23.124"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2017.8282246"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2015.11.095"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00039"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00786"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI.2018.8516488"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995624"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/SNPD.2018.8441034"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.01.001"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.106"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.173"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.21236\/ADA623249"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"ref69","article-title":"Classificazione di azioni cestistiche mediante tecniche di deep learning","author":"francia","year":"2018"},{"key":"ref118","first-page":"843","article-title":"Unsupervised learning of video representations using LSTMs","author":"srivastava","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00784"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.25"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383137"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2017.01.019"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.332"},{"key":"ref116","first-page":"4694","article-title":"Beyond short snippets: Deep networks for video classification","author":"yue-hei","year":"0","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref237","first-page":"5242","article-title":"Self-supervised learning of motion capture","volume":"30","author":"tung","year":"2017","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.16"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383332"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00631"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA.2017.8227494"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_28"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2015.7163105"},{"key":"ref122","first-page":"813","article-title":"Is space-time attention all you need for video understanding","volume":"2","author":"bertasius","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1145\/3530836"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7350760"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.217"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2868668"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2019.00068"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.217"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00817"},{"key":"ref242","first-page":"6010","article-title":"Rifle: Backpropagation in depth for deep transfer learning through re-initializing the fully-connected layer","author":"li","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref240","first-page":"1","article-title":"Delta: Deep learning transfer using feature map with attention for convolutional networks","author":"li","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.7557\/18.6282"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.113"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/844\/1\/012044"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.453"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.365"},{"key":"ref175","first-page":"69","article-title":"Team activity recognition in sports","author":"direko?lu","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref176","first-page":"1282","article-title":"What are they doing?: Collective activity classification using spatio-temporal relationship among people","author":"choi","year":"0","journal-title":"Proc IEEE 12th Int Conf Comput Vis Workshops ICCV Workshops"},{"key":"ref173","article-title":"Revisiting the effectiveness of off-the-shelf temporal modeling approaches for large-scale video classification","author":"bian","year":"2017"},{"key":"ref174","article-title":"Group activity recognition using wearable sensing devices","author":"gordon","year":"2014"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00446"},{"key":"ref172","article-title":"Audiovisual slowfast networks for video recognition","author":"xiao","year":"2020"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-022-01346-2"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00092"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00738"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.3390\/proceedings2020049095"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2010.2058795"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.3007173"},{"key":"ref187","article-title":"Estimating blink probability for highlight detection in figure skating videos","author":"nakano","year":"2020"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350609"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00458"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108360"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.17"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"ref155","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"hochreiter","year":"1997","journal-title":"Neural Comput"},{"key":"ref156","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"ref154","first-page":"1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413802"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3028207"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2017.8019447"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00132"},{"key":"ref167","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref165","first-page":"1","article-title":"Semi-supervised classification with graph convolutional networks","author":"kipf","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref162","first-page":"1","article-title":"Beit: Bert pre-training of image transformers","author":"bao","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref163","first-page":"8821","article-title":"Zero-shot text-to-image generation","author":"ramesh","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01433"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref10","article-title":"Two-stream convolutional networks for action recognition in videos","volume":"27","author":"simonyan","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1080\/02640414.2019.1684132"},{"key":"ref16","article-title":"The kinetics human action video dataset","author":"kay","year":"2017"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1186\/s40798-020-0237-5"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1080\/02640414.2018.1521769"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.06.108"},{"key":"ref1","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"soomro","year":"2012"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/ICIS.2017.7960043"},{"key":"ref192","first-page":"1","article-title":"Neighbours matter: Image captioning with similar images","author":"wang","year":"0","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref190","first-page":"242","article-title":"Template-free data-to-text generation of finnish sports news","author":"kanerva","year":"0","journal-title":"Proc Nordic Conf Comput Linguistics"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-020-09904-8"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/THMS.2014.2325871"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.01.010"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00157"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00971"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3013834"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3159811"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/10016790\/09999033.pdf?arnumber=9999033","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,13]],"date-time":"2023-12-13T19:45:37Z","timestamp":1702496737000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9999033\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":263,"URL":"https:\/\/doi.org\/10.1109\/tmm.2022.3232034","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]}}}