{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T13:17:14Z","timestamp":1770297434979,"version":"3.49.0"},"reference-count":69,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T00:00:00Z","timestamp":1659312000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T00:00:00Z","timestamp":1659312000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T00:00:00Z","timestamp":1659312000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Plan of China","doi-asserted-by":"publisher","award":["2020AAA0106200"],"award-info":[{"award-number":["2020AAA0106200"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62036012"],"award-info":[{"award-number":["62036012"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61721004"],"award-info":[{"award-number":["61721004"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102415"],"award-info":[{"award-number":["62102415"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072286"],"award-info":[{"award-number":["62072286"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61720106006"],"award-info":[{"award-number":["61720106006"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61832002"],"award-info":[{"award-number":["61832002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072455"],"award-info":[{"award-number":["62072455"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62002355"],"award-info":[{"award-number":["62002355"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1836220"],"award-info":[{"award-number":["U1836220"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1705262"],"award-info":[{"award-number":["U1705262"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002367","name":"Key Research Program of Frontier Sciences of the Chinese Academy of Sciences","doi-asserted-by":"publisher","award":["QYZDJSSW-JSC039"],"award-info":[{"award-number":["QYZDJSSW-JSC039"]}],"id":[{"id":"10.13039\/501100002367","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","award":["L201001"],"award-info":[{"award-number":["L201001"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2022,8]]},"DOI":"10.1109\/tcsvt.2021.3137023","type":"journal-article","created":{"date-parts":[[2021,12,20]],"date-time":"2021-12-20T21:51:16Z","timestamp":1640037076000},"page":"5213-5224","source":"Crossref","is-referenced-by-count":31,"title":["Learning Semantic-Aware Spatial-Temporal Attention for Interpretable Action Recognition"],"prefix":"10.1109","volume":"32","author":[{"given":"Jie","family":"Fu","sequence":"first","affiliation":[{"name":"School of Information Engineering, Zhengzhou University, Zhengzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8105-5497","authenticated-orcid":false,"given":"Junyu","family":"Gao","sequence":"additional","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-9665","authenticated-orcid":false,"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2019.105820"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-68238-5_48"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.107037"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00561"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_17"},{"key":"ref37","article-title":"Spatiotemporal multimodal learning with 3D CNNs for video action recognition","author":"wu","year":"2021","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref35","article-title":"Temporal cross-layer correlation mining for action recognition","author":"zhu","year":"2021","journal-title":"IEEE Trans Multimedia"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2749159"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00155"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018401"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_43"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3341105.3373906"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.787"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01212"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2666540"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"ref66","article-title":"Improved residual networks for image and video recognition","author":"cosmin duta","year":"2020","journal-title":"arXiv 2004 04989"},{"key":"ref29","article-title":"PAN: Towards fast action recognition via learning persistence of appearance","author":"zhang","year":"2020","journal-title":"arXiv 2008 03462"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_18"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00155"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.373"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.05.058"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"ref26","article-title":"Interaction relational network for mutual action recognition","author":"perez","year":"2021","journal-title":"IEEE Trans Multimedia"},{"key":"ref25","article-title":"Cross-modality compensation convolutional neural networks for RGB-D action recognition","author":"cheng","year":"2021","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref50","article-title":"FactorNet: Holistic actor, object and scene factorization for action recognition in videos","author":"nigam","year":"2021","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref51","article-title":"FEXNet: Foreground extraction network for human action recognition","author":"shen","year":"2021","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref59","article-title":"Is space-time attention all you need for video understanding?","author":"bertasius","year":"2021","journal-title":"arXiv 2102 05095"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00610"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_30"},{"key":"ref56","first-page":"2264","article-title":"More is less: Learning efficient video representations by big-little network and depthwise temporal aggregation","author":"fan","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00558"},{"key":"ref53","first-page":"803","article-title":"Temporal relational reasoning in videos","author":"zhou","year":"2018","journal-title":"Proc Eur Conf Comput Vis (ECCV)"},{"key":"ref52","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"wang","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref10","article-title":"SmoothGrad: Removing noise by adding noise","author":"smilkov","year":"2017","journal-title":"arXiv 1706 03825"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00097"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2984904"},{"key":"ref12","first-page":"8930","article-title":"This looks like that: Deep learning for interpretable image recognition","author":"chen","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3390\/rs13091772"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3007352"},{"key":"ref15","article-title":"Learning video moment retrieval without a single annotated video","author":"gao","year":"2021","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00478"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref19","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"soomro","year":"2012","journal-title":"arXiv 1212 0402"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00189"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.07.034"},{"key":"ref5","article-title":"Knowing what, where and when to look: Efficient video action modeling with attention","author":"perez-rua","year":"2020","journal-title":"arXiv 2004 01278"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01096"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.507"},{"key":"ref49","article-title":"Action recognition using visual attention","author":"sharma","year":"2015","journal-title":"arXiv 1511 04119"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11212"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2983355"},{"key":"ref48","first-page":"33","article-title":"Attentional pooling for action recognition","author":"girdhar","year":"2017","journal-title":"Proc 31st Int Conf Neural Inf Process Syst"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2896029"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-019-01733-3"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2992740"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3042986"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2020.102846"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/9849156\/09656725.pdf?arnumber=9656725","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,29]],"date-time":"2022-08-29T20:40:21Z","timestamp":1661805621000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9656725\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8]]},"references-count":69,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2021.3137023","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8]]}}}