{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T01:41:45Z","timestamp":1768441305049,"version":"3.49.0"},"reference-count":50,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61836014"],"award-info":[{"award-number":["61836014"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006231"],"award-info":[{"award-number":["62006231"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072457"],"award-info":[{"award-number":["62072457"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2042"],"award-info":[{"award-number":["U21B2042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1016\/j.patcog.2022.108797","type":"journal-article","created":{"date-parts":[[2022,5,19]],"date-time":"2022-05-19T13:11:28Z","timestamp":1652965888000},"page":"108797","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":27,"special_numbering":"C","title":["Identifying the key frames: An attention-aware sampling method for action recognition"],"prefix":"10.1016","volume":"130","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8985-5707","authenticated-orcid":false,"given":"Wenkai","family":"Dong","sequence":"first","affiliation":[]},{"given":"Zhaoxiang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Chunfeng","family":"Song","sequence":"additional","affiliation":[]},{"given":"Tieniu","family":"Tan","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2022.108797_bib0001","series-title":"Proc. Adv. Neural Inf. Process. Syst.","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"Krizhevsky","year":"2012"},{"key":"10.1016\/j.patcog.2022.108797_bib0002","series-title":"Proc. Adv. Neural Inf. Process. Syst.","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.patcog.2022.108797_bib0003","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"20","article-title":"Temporal segment networks: towards good practices for deep action recognition","author":"Wang","year":"2016"},{"key":"10.1016\/j.patcog.2022.108797_bib0004","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"2329","article-title":"Deep temporal linear encoding networks","author":"Diba","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0005","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"4768","article-title":"Spatiotemporal multiplier networks for video action recognition","author":"Feichtenhofer","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0006","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"373","article-title":"Interaction-aware spatio-temporal pyramid attention networks for action classification","author":"Du","year":"2018"},{"key":"10.1016\/j.patcog.2022.108797_bib0007","article-title":"Temporal segment networks for action recognition in videos","author":"Wang","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2022.108797_bib0008","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1016\/j.patcog.2019.03.005","article-title":"Spatiotemporal distilled dense-connectivity network for video action recognition","volume":"92","author":"Hao","year":"2019","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2022.108797_bib0009","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"2625","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"Donahue","year":"2015"},{"key":"10.1016\/j.patcog.2022.108797_bib0010","series-title":"2020 6th International Conference on Web Research (ICWR)","first-page":"133","article-title":"Human action recognition in video using DB-LSTM and resnet","author":"Mihanpour","year":"2020"},{"key":"10.1016\/j.patcog.2022.108797_bib0011","doi-asserted-by":"crossref","first-page":"105820","DOI":"10.1016\/j.asoc.2019.105820","article-title":"Human action recognition using two-stream attention based LSTM networks","volume":"86","author":"Dai","year":"2020","journal-title":"Appl. Soft Comput."},{"issue":"10","key":"10.1016\/j.patcog.2022.108797_bib0012","doi-asserted-by":"crossref","first-page":"2481","DOI":"10.1109\/TMM.2019.2960588","article-title":"2-D skeleton-based action recognition via two-branch stacked LSTM-RNNs","volume":"22","author":"Avola","year":"2019","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.patcog.2022.108797_bib0013","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"1227","article-title":"An attention enhanced graph convolutional LSTM network for skeleton-based action recognition","author":"Si","year":"2019"},{"key":"10.1016\/j.patcog.2022.108797_bib0014","series-title":"Proc. IEEE Int. Conf. Comput. Vis.","first-page":"4489","article-title":"Learning spatiotemporal features with 3D convolutional networks","author":"Tran","year":"2015"},{"key":"10.1016\/j.patcog.2022.108797_bib0015","series-title":"Proc. IEEE Int. Conf. Comput. Vis.","first-page":"5533","article-title":"Learning spatio-temporal representation with pseudo-3D residual networks","author":"Qiu","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0016","doi-asserted-by":"crossref","first-page":"107037","DOI":"10.1016\/j.patcog.2019.107037","article-title":"Spatio-temporal deformable 3D ConvNets with attention for action recognition","volume":"98","author":"Li","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2022.108797_bib0017","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"4694","article-title":"Beyond short snippets: deep networks for video classification","author":"Yue-Hei Ng","year":"2015"},{"key":"10.1016\/j.patcog.2022.108797_bib0018","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"6299","article-title":"Quo vadis, action recognition? A new model and the kinetics dataset","author":"Carreira","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0019","series-title":"Proc. IEEE Int. Conf. Comput. Vis.","first-page":"2556","article-title":"HMDB: a large video database for human motion recognition","author":"Kuehne","year":"2011"},{"key":"10.1016\/j.patcog.2022.108797_bib0020","series-title":"2017\u00a0IEEE International Conference on Image Processing (ICIP)","first-page":"870","article-title":"Improving human action recognition by temporal attention","author":"Liu","year":"2017"},{"issue":"11","key":"10.1016\/j.patcog.2022.108797_bib0021","doi-asserted-by":"crossref","first-page":"2673","DOI":"10.1109\/78.650093","article-title":"Bidirectional recurrent neural networks","volume":"45","author":"Schuster","year":"1997","journal-title":"IEEE Trans. Signal Process."},{"key":"10.1016\/j.patcog.2022.108797_bib0022","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"695","article-title":"Seed, expand and constrain: three principles for weakly-supervised image segmentation","author":"Kolesnikov","year":"2016"},{"key":"10.1016\/j.patcog.2022.108797_bib0023","series-title":"Proc. IEEE Int. Conf. Comput. Vis.","first-page":"6232","article-title":"SCSampler: sampling salient clips from video for efficient action recognition","author":"Korbar","year":"2019"},{"issue":"4","key":"10.1016\/j.patcog.2022.108797_bib0024","doi-asserted-by":"crossref","first-page":"047204","DOI":"10.1117\/1.OE.51.4.047204","article-title":"Crowd density estimation based on statistical analysis of local intra-crowd motions for public area surveillance","volume":"51","author":"Zhang","year":"2012","journal-title":"Opt. Eng."},{"key":"10.1016\/j.patcog.2022.108797_bib0025","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"189","article-title":"Blended grammar network for human parsing","author":"Zhang","year":"2020"},{"key":"10.1016\/j.patcog.2022.108797_bib0026","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"592","article-title":"Smooth neighborhood structure mining on multiple affinity graphs with applications to context-sensitive similarity","author":"Bai","year":"2016"},{"issue":"4","key":"10.1016\/j.patcog.2022.108797_bib0027","doi-asserted-by":"crossref","first-page":"1107","DOI":"10.1109\/TSMCB.2012.2187051","article-title":"Combining tensor space analysis and active appearance models for aging effect simulation on face images","volume":"42","author":"Wang","year":"2012","journal-title":"IEEE Trans. Syst. Man Cybern.Part B"},{"key":"10.1016\/j.patcog.2022.108797_bib0028","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1016\/j.patcog.2019.03.002","article-title":"Order-aware convolutional pooling for video based action recognition","volume":"91","author":"Wang","year":"2019","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2022.108797_bib0029","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"1725","article-title":"Large-scale video classification with convolutional neural networks","author":"Karpathy","year":"2014"},{"key":"10.1016\/j.patcog.2022.108797_bib0030","series-title":"Proc. IEEE Int. Conf. Comput. Vis.","first-page":"3551","article-title":"Action recognition with improved trajectories","author":"Wang","year":"2013"},{"key":"10.1016\/j.patcog.2022.108797_bib0031","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"1933","article-title":"Convolutional two-stream network fusion for video action recognition","author":"Feichtenhofer","year":"2016"},{"key":"10.1016\/j.patcog.2022.108797_bib0032","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"1991","article-title":"A key volume mining deep framework for action recognition","author":"Zhu","year":"2016"},{"key":"10.1016\/j.patcog.2022.108797_bib0033","unstructured":"Y. Wang, S. Wang, J. Tang, N. O\u2019Hare, Y. Chang, B. Li, Hierarchical attention network for action recognition in videos, arXiv preprint arXiv:1607.06416(2016)."},{"key":"10.1016\/j.patcog.2022.108797_sbref0034","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit. workshops","article-title":"Interpretable spatio-temporal attention for video action recognition","author":"Meng","year":"2019"},{"key":"10.1016\/j.patcog.2022.108797_bib0035","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"2911","article-title":"SST: single-stream temporal action proposals","author":"Buch","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0036","series-title":"Proc. IEEE Int. Conf. Comput. Vis.","first-page":"3931","article-title":"Attention-aware deep reinforcement learning for video face recognition","author":"Rao","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0037","series-title":"Proc. AAAI","first-page":"7582","article-title":"Deep reinforcement learning for unsupervised video summarization with diversity-representativeness reward","author":"Zhou","year":"2018"},{"key":"10.1016\/j.patcog.2022.108797_bib0038","series-title":"International Conference on Machine Learning","first-page":"2048","article-title":"Show, attend and tell: neural image caption generation with visual attention","author":"Xu","year":"2015"},{"key":"10.1016\/j.patcog.2022.108797_bib0039","series-title":"Proc. AAAI","first-page":"8247","article-title":"Attention-aware sampling via deep reinforcement learning for action recognition","author":"Dong","year":"2019"},{"issue":"3\u20134","key":"10.1016\/j.patcog.2022.108797_bib0040","doi-asserted-by":"crossref","first-page":"229","DOI":"10.1023\/A:1022672621406","article-title":"Simple statistical gradient-following algorithms for connectionist reinforcement learning","volume":"8","author":"Williams","year":"1992","journal-title":"Mach. Learn."},{"key":"10.1016\/j.patcog.2022.108797_bib0041","unstructured":"K. Soomro, A.R. Zamir, M. Shah, UCF101: a dataset of 101 human actions classes from videos in the wild, arXiv preprint arXiv:1212.0402(2012)."},{"key":"10.1016\/j.patcog.2022.108797_bib0042","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"248","article-title":"ImageNet: a large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.patcog.2022.108797_bib0043","series-title":"Joint Pattern Recognition Symposium","first-page":"214","article-title":"A duality based approach for realtime TV-l 1 optical flow","author":"Zach","year":"2007"},{"key":"10.1016\/j.patcog.2022.108797_bib0044","unstructured":"D.P. Kingma, J. Ba, Adam: a method for stochastic optimization, arXiv preprint arXiv:1412.6980(2014)."},{"key":"10.1016\/j.patcog.2022.108797_bib0045","series-title":"International Conference on Machine Learning","first-page":"448","article-title":"Batch normalization: accelerating deep network training by reducing internal covariate shift","author":"Ioffe","year":"2015"},{"key":"10.1016\/j.patcog.2022.108797_bib0046","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"1529","article-title":"Spatiotemporal pyramid network for video action recognition","author":"Wang","year":"2017"},{"key":"10.1016\/j.patcog.2022.108797_bib0047","series-title":"Proc. Eur. Conf. Comput. Vis.","first-page":"352","article-title":"Multi-fiber networks for video recognition","author":"Chen","year":"2018"},{"key":"10.1016\/j.patcog.2022.108797_bib0048","series-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","first-page":"4305","article-title":"Action recognition with trajectory-pooled deep-convolutional descriptors","author":"Wang","year":"2015"},{"key":"10.1016\/j.patcog.2022.108797_bib50","series-title":"Proc. AAAI","first-page":"1984","article-title":"Group-Wise Semantic Mining for Weakly Supervised Semantic Segmentation","author":"X. Li","year":"2021"},{"key":"10.1016\/j.patcog.2022.108797_bib51","series-title":"Proc. Adv. Neural Inf. Process. Syst.","article-title":"Efficient neural architecture transformation search in channel-level for object detection","author":"J. Peng","year":"2019"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320322002783?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320322002783?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,9,21]],"date-time":"2025-09-21T17:04:25Z","timestamp":1758474265000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320322002783"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10]]},"references-count":50,"alternative-id":["S0031320322002783"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2022.108797","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2022,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Identifying the key frames: An attention-aware sampling method for action recognition","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2022.108797","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2022 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"108797"}}