{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:34:36Z","timestamp":1767339276885,"version":"3.37.3"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100002663","name":"Northwestern Polytechnical University","doi-asserted-by":"publisher","award":["N172608005","N172604004","N182612002"],"award-info":[{"award-number":["N172608005","N172604004","N182612002"]}],"id":[{"id":"10.13039\/501100002663","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005047","name":"Natural Science Foundation of Liaoning Province","doi-asserted-by":"publisher","award":["20180520007"],"award-info":[{"award-number":["20180520007"]}],"id":[{"id":"10.13039\/501100005047","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2019]]},"DOI":"10.1109\/access.2019.2953113","type":"journal-article","created":{"date-parts":[[2019,11,12]],"date-time":"2019-11-12T16:58:06Z","timestamp":1573577886000},"page":"164876-164886","source":"Crossref","is-referenced-by-count":8,"title":["SAST: Learning Semantic Action-Aware Spatial-Temporal Features for Efficient Action Recognition"],"prefix":"10.1109","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8296-8039","authenticated-orcid":false,"given":"Fei","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3861-1889","authenticated-orcid":false,"given":"Guorui","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0415-7473","authenticated-orcid":false,"given":"Yunwen","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1628-1404","authenticated-orcid":false,"given":"Hao","family":"Chu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.683"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2018.8451255"},{"key":"ref32","first-page":"12046","article-title":"Large-scale weakly-supervised pre-training for video action recognition","author":"ghadiyaram","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref31","article-title":"STM: SpatioTemporal and motion encoding for action recognition","author":"jiang","year":"2019","journal-title":"arXiv 1908 02486"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.373"},{"key":"ref37","article-title":"Multiple object recognition with visual attention","author":"ba","year":"2014","journal-title":"Arxiv 1412 7755"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1038\/nrn755"},{"key":"ref35","first-page":"1243","article-title":"Learning to combine foveal glimpses with a third-order Boltzmann machine","author":"larochelle","year":"2010","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00734"},{"key":"ref28","article-title":"Distinit: Learning video representations without a single labeled video","author":"girdhar","year":"2019","journal-title":"arXiv 1901 09244"},{"key":"ref27","article-title":"Video classification with channel-separated convolutional networks","author":"tran","year":"2019","journal-title":"arXiv 1904 02811"},{"key":"ref29","article-title":"Dynamonet: Dynamic action and motion network","author":"diba","year":"2019","journal-title":"arXiv 1904 11407"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/1291233.1291311"},{"article-title":"MoSIFT: Recognizing human actions in surveillance videos","year":"2009","author":"chen","key":"ref1"},{"key":"ref20","article-title":"Slowfast networks for video recognition","author":"feichtenhofer","year":"2018","journal-title":"arXiv 1812 03982"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref21","first-page":"7922","article-title":"PA3D: Pose-action 3D machine for video recognition","author":"yan","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref24","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014","journal-title":"arXiv 1409 1556"},{"key":"ref23","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref25","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"arXiv 1502 03167"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"ref51","article-title":"Action recognition with spatial-temporal discriminative filter banks","author":"martinez","year":"2019","journal-title":"arXiv 1908 07625"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.297"},{"key":"ref54","first-page":"2204","article-title":"Trajectory convolution for action recognition","author":"zhao","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_25"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"ref10","first-page":"1","article-title":"Action recognition with coarse-to-fine deep feature integration and asynchronous fusion","author":"lin","year":"2018","journal-title":"Proc 22nd AAAI Conf Artif Intell"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref40","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2014","journal-title":"arXiv 1409 0473"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref13","article-title":"The kinetics human action video dataset","author":"kay","year":"2017","journal-title":"arXiv 1705 06950"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2712608"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545710"},{"key":"ref16","article-title":"Temporal 3D convnets: New architecture and transfer learning for video classification","author":"diba","year":"2017","journal-title":"arXiv 1711 08200"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_43"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref19","article-title":"A dataset of 101 human action classes from videos in the wild","author":"soomro","year":"2012","journal-title":"Computer and Vision Research Center"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-012-0594-8"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref5","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","author":"simonyan","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref7","first-page":"4694","article-title":"Beyond short snippets: Deep networks for video classification","author":"ng","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.89"},{"key":"ref45","article-title":"Convnet architecture search for spatiotemporal feature learning","author":"tran","year":"2017","journal-title":"arXiv 1708 05038"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00155"},{"key":"ref47","first-page":"1","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc Int Conf Int Conf Mach Learn"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.604"},{"key":"ref41","article-title":"Action recognition using visual attention","author":"sharma","year":"2015","journal-title":"arXiv 1511 04119"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_22"},{"key":"ref43","article-title":"SCSampler: Sampling salient clips from video for efficient action recognition","author":"korbar","year":"2019","journal-title":"arXiv 1904 04289"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/8600701\/08896926.pdf?arnumber=8896926","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T11:32:13Z","timestamp":1641987133000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8896926\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/access.2019.2953113","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2019]]}}}