{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:13Z","timestamp":1778171053089,"version":"3.51.4"},"reference-count":84,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Plan of China","award":["2016YFB1001002"],"award-info":[{"award-number":["2016YFB1001002"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61573045"],"award-info":[{"award-number":["61573045"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61421003"],"award-info":[{"award-number":["61421003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1813709"],"award-info":[{"award-number":["1813709"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1722847"],"award-info":[{"award-number":["1722847"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Futurewei"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2020,2]]},"DOI":"10.1109\/tcsvt.2019.2894161","type":"journal-article","created":{"date-parts":[[2019,1,23]],"date-time":"2019-01-23T02:04:51Z","timestamp":1548209091000},"page":"549-565","source":"Crossref","is-referenced-by-count":122,"title":["stagNet: An Attentive Semantic RNN for Group Activity and Individual Action Recognition"],"prefix":"10.1109","volume":"30","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6955-6635","authenticated-orcid":false,"given":"Mengshi","family":"Qi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8001-2703","authenticated-orcid":false,"given":"Yunhong","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0306-534X","authenticated-orcid":false,"given":"Jie","family":"Qin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Annan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4516-9729","authenticated-orcid":false,"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Luc","family":"Van Gool","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.171"},{"key":"ref72","article-title":"Neural networks for machine learning: Lecture 6a&#x2014;Overview of mini-batch gradient descent","author":"hinton","year":"2012"},{"key":"ref71","author":"abadi","year":"2016","journal-title":"Tensorflow Large-scale machine learning on heterogeneous distributed systems"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3265845.3265851"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/98"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_44"},{"key":"ref78","author":"kay","year":"2017","journal-title":"The kinetics human action video dataset"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.453"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1090\/conm\/001"},{"key":"ref31","first-page":"1354","article-title":"Social roles in hierarchical models for human activity recognition","author":"lan","year":"2012","journal-title":"Proc IEEE CVPR"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-010-0355-5"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00180"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.313"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.365"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.783"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.286"},{"key":"ref62","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc ICML"},{"key":"ref61","author":"bahdanau","year":"2014","journal-title":"Neural machine translation by jointly learning to align and translate"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_9"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.332"},{"key":"ref27","first-page":"379","article-title":"R-FCN: Object detection via region-based fully convolutional networks","author":"dai","year":"2016","journal-title":"Proc NIPS"},{"key":"ref65","first-page":"4331","article-title":"Using fast weights to attend to the recent past","author":"ba","year":"2016","journal-title":"Proc NIPS"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.346"},{"key":"ref67","first-page":"4263","article-title":"An end-to-end spatio-temporal attention model for human action recognition from skeleton data","author":"song","year":"2017","journal-title":"Proc AAAI"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1137\/0105003"},{"key":"ref69","author":"simonyan","year":"2014","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref2","doi-asserted-by":"crossref","first-page":"1704","DOI":"10.1109\/TPAMI.2012.242","article-title":"Learning to track and identify players from broadcast sports videos","volume":"35","author":"lu","year":"2013","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.217"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995707"},{"key":"ref23","first-page":"1282","article-title":"What are they doing?: Collective activity classification using spatio-temporal relationship among people","author":"choi","year":"2009","journal-title":"Proc ICCV Workshops"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33765-9_16"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_7"},{"key":"ref50","author":"chen","year":"2014","journal-title":"Semantic image segmentation with deep convolutional nets and fully connected crfs"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.179"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.338"},{"key":"ref58","first-page":"2204","article-title":"Recurrent models of visual attention","author":"mnih","year":"2014","journal-title":"Proc NIPS"},{"key":"ref57","first-page":"2409","article-title":"Action is in the eye of the beholder: Eye-gaze driven model for spatio-temporal action localization","author":"shapovalova","year":"2013","journal-title":"Proc NIPS"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/34.730558"},{"key":"ref55","first-page":"2014","article-title":"Learning convolutional neural networks for graphs","author":"niepert","year":"2016","journal-title":"Proc ICML"},{"key":"ref54","first-page":"3844","article-title":"Convolutional neural networks on graphs with fast localized spectral filtering","author":"defferrard","year":"2016","journal-title":"Proc NIPS"},{"key":"ref53","first-page":"2951","article-title":"Practical Bayesian optimization of machine learning algorithms","volume":"4","author":"snoek","year":"2012","journal-title":"Proc NIPS"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298621"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.712"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.516"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2572683"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.573"},{"key":"ref13","first-page":"428","article-title":"Human detection using oriented histograms of flow and appearance","author":"dalal","year":"2006","journal-title":"Proc ECCV"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298875"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref16","first-page":"937","article-title":"Globally trained handwritten word recognizer using spatial representation, convolutional neural networks, and hidden Markov models","author":"bengio","year":"1994","journal-title":"Proc NIPS"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref17","first-page":"109","article-title":"Efficient inference in fully connected CRFs with Gaussian edge potentials","author":"kr\u00e4henb\u00fchl","year":"2011","journal-title":"Proc NIPS"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0781-x"},{"key":"ref18","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc NIPS"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.147"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.221"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.228"},{"key":"ref6","first-page":"4576","article-title":"Joint inference of groups, events and human roles in aerial videos","author":"shu","year":"2015","journal-title":"Proc IEEE CVPR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_37"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.239"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33765-9_14"},{"key":"ref49","first-page":"1799","article-title":"Joint training of a convolutional network and a graphical model for human pose estimation","author":"tompson","year":"2014","journal-title":"Proc NIPS"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.117"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.162"},{"key":"ref45","first-page":"1785","article-title":"Learning deep structured models","author":"chen","year":"2014","journal-title":"Proc ICLR"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.326"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.212"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.86"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00046"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1006\/jmbi.2000.4315"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123311"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/76\/8984597\/8621027-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/8984597\/08621027.pdf?arnumber=8621027","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T14:39:58Z","timestamp":1651070398000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8621027\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,2]]},"references-count":84,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2019.2894161","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,2]]}}}