{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:12Z","timestamp":1778171052706,"version":"3.51.4"},"reference-count":91,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Plan of China","doi-asserted-by":"publisher","award":["2018AAA0102301"],"award-info":[{"award-number":["2018AAA0102301"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176025"],"award-info":[{"award-number":["62176025"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2045"],"award-info":[{"award-number":["U21B2045"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006227"],"award-info":[{"award-number":["62006227"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62022011"],"award-info":[{"award-number":["62022011"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100011347","name":"Research Program of State Key Laboratory of Software Development Environment","doi-asserted-by":"publisher","award":["SKLSDE-2021ZX-04"],"award-info":[{"award-number":["SKLSDE-2021ZX-04"]}],"id":[{"id":"10.13039\/501100011347","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1109\/tcsvt.2023.3278984","type":"journal-article","created":{"date-parts":[[2023,5,22]],"date-time":"2023-05-22T18:10:47Z","timestamp":1684779047000},"page":"7267-7281","source":"Crossref","is-referenced-by-count":4,"title":["Group Activity Representation Learning With Long-Short States Predictive Transformer"],"prefix":"10.1109","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9772-4579","authenticated-orcid":false,"given":"Longteng","family":"Kong","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0867-4117","authenticated-orcid":false,"given":"Wanting","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1584-5180","authenticated-orcid":false,"given":"Duoxuan","family":"Pei","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Software Development Environment, School of Computer Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3433-8435","authenticated-orcid":false,"given":"Zhaofeng","family":"He","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2412-9330","authenticated-orcid":false,"given":"Di","family":"Huang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Software Development Environment, School of Computer Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2930344"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2269780"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_7"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123321"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240572"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298872"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.217"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3156634"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.10.004"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2813971"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.453"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_44"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01020"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58598-3_13"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00092"},{"key":"ref16","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics","author":"Devlin"},{"key":"ref17","article-title":"Representation learning with contrastive predictive coding","author":"Van Den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref18","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_19"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2979190"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/204"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.502"},{"key":"ref27","article-title":"Self-supervised spatiotemporal feature learning via video rotation prediction","author":"Jing","year":"2018","journal-title":"arXiv:1811.11387"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.607"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.79"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/453"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00239"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_35"},{"key":"ref35","article-title":"Temporal fusion transformers for interpretable multi-horizon time series forecasting","author":"Lim","year":"2019","journal-title":"arXiv:1912.09363"},{"key":"ref36","article-title":"Deep transformer models for time series forecasting: The influenza prevalence case","author":"Wu","year":"2020","journal-title":"arXiv:2001.08317"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_30"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_28"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw.2009.5457461"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995707"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33765-9_16"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2012.6247821"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2893318"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009034"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.783"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461770"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.516"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00808"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00738"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3034233"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2978942"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2928540"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16437"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01341"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3114209"},{"key":"ref58","article-title":"Self-supervised visual learning by variable playback speeds prediction of a video","author":"Cho","year":"2020","journal-title":"arXiv:2003.02692"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_26"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3141051"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3169469"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref64","first-page":"7774","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Korbar"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref66","article-title":"Pretext-contrastive learning: Toward good practices in self-supervised video representation leaning","author":"Tao","year":"2020","journal-title":"arXiv:2010.15464"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10578-9_45"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018118"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2863279"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00710"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.18"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2882805"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00367"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00566"},{"key":"ref75","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref76","first-page":"1","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Dosovitskiy"},{"key":"ref77","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref79","article-title":"CrossFormer: A versatile vision transformer hinging on cross-scale attention","author":"Wang","year":"2021","journal-title":"arXiv:2108.00154"},{"key":"ref80","article-title":"CAT: Cross attention in vision transformer","author":"Lin","year":"2021","journal-title":"arXiv:2106.05786"},{"key":"ref81","first-page":"813","article-title":"Is space-time attention all you need for video understanding?","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bertasius"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref83","article-title":"Video Swin transformer","author":"Liu","year":"2021","journal-title":"arXiv:2106.13230"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"ref85","first-page":"19594","article-title":"Space-time mixing attention for video transformer","volume-title":"Proc. Adv. Neural Inf. Process. Syst., Annu. Conf. Neural Inf. Process. Syst.","author":"Bulat"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref87","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012","journal-title":"arXiv:1212.0402"},{"issue":"8","key":"ref88","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Alec","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref89","first-page":"2672","article-title":"Generative adversarial nets","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Goodfellow"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.365"},{"key":"ref91","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Kingma"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/10348119\/10130581.pdf?arnumber=10130581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,20]],"date-time":"2023-12-20T01:19:06Z","timestamp":1703035146000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10130581\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":91,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2023.3278984","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12]]}}}