{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T07:30:22Z","timestamp":1770017422845,"version":"3.49.0"},"reference-count":89,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-026-21330-6","type":"journal-article","created":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T21:09:29Z","timestamp":1769980169000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A systematic review on human action detection and classification architectures using deep learning methodology"],"prefix":"10.1007","volume":"85","author":[{"given":"Mandar","family":"Parale","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"C. H.","family":"Patil","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9341-949X","authenticated-orcid":false,"given":"S. M.","family":"Mali","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,1]]},"reference":[{"key":"21330_CR1","doi-asserted-by":"publisher","first-page":"1165","DOI":"10.2307\/41703503","volume":"36","author":"H Chen","year":"2012","unstructured":"Chen H, Chiang RHL, Storey VC (2012) Business intelligence and analytics: from big data to big impact. MIS Q 36:1165. https:\/\/doi.org\/10.2307\/41703503","journal-title":"MIS Q"},{"key":"21330_CR2","doi-asserted-by":"publisher","unstructured":"Diba A, Fayyaz M, Sharma V, Karami AH, Arzani MM, Yousefzadeh R, Van Gool L (2017) Temporal 3D ConvNets: New architecture and transfer learning for video classification. arXiv preprint arXiv:1711.08200. https:\/\/doi.org\/10.48550\/arXiv.1711.08200","DOI":"10.48550\/arXiv.1711.08200"},{"key":"21330_CR3","doi-asserted-by":"publisher","unstructured":"Gu C, Sun C, Ross DA, Vondrick C, Pantofaru C, Li Y, Vijayanarasimhan S, Toderici G, Ricco S, Sukthankar R, Schmid C, Malik J (2018) AVA: A video dataset of spatio-temporally localized atomic visual actions. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 6047\u20136056. https:\/\/doi.org\/10.1109\/cvpr.2018.00633","DOI":"10.1109\/cvpr.2018.00633"},{"key":"21330_CR4","doi-asserted-by":"publisher","first-page":"615","DOI":"10.1109\/tpami.2011.209","volume":"34","author":"O Kliper-Gross","year":"2012","unstructured":"Kliper-Gross O, Hassner T, Wolf L (2012) The action similarity labeling challenge. IEEE Trans Pattern Anal Mach Intell 34:615\u2013621. https:\/\/doi.org\/10.1109\/tpami.2011.209","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR5","doi-asserted-by":"publisher","unstructured":"Hou R, Chen C, Shah M (2017) Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos. 2017 IEEE International Conference on Computer Vision 5822\u20135832. https:\/\/doi.org\/10.1109\/ICCV.2017.620","DOI":"10.1109\/ICCV.2017.620"},{"key":"21330_CR6","doi-asserted-by":"publisher","unstructured":"Jhuang H, Gall J, Zuffi S, Schmid C, Black MJ (2013) Towards understanding action recognition. 2013 IEEE International Conference on Computer Vision 3192\u20133199. https:\/\/doi.org\/10.1109\/iccv.2013.396","DOI":"10.1109\/iccv.2013.396"},{"key":"21330_CR7","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1605.05197","author":"P Weinzaepfel","year":"2016","unstructured":"Weinzaepfel P, Martin X, Schmid C (2016) Human action localization with sparse Spatial supervision. ArXiv Preprint. https:\/\/doi.org\/10.48550\/arXiv.1605.05197. arXiv:1605.05197","journal-title":"ArXiv Preprint"},{"key":"21330_CR8","doi-asserted-by":"publisher","unstructured":"Marszalek M, Laptev I, Schmid C (2009) Actions in context. 2009 IEEE Conf Comput Vis Pattern Recognit 2929\u20132936. https:\/\/doi.org\/10.1109\/cvpr.2009.5206557","DOI":"10.1109\/cvpr.2009.5206557"},{"key":"21330_CR9","doi-asserted-by":"publisher","unstructured":"Rohrbach M, Amin S, Andriluka M, Schiele B (2012) A database for fine grained activity detection of cooking activities. 2012 IEEE Conference on Computer Vision and Pattern Recognition 1194\u20131201. https:\/\/doi.org\/10.1109\/cvpr.2012.6247801","DOI":"10.1109\/cvpr.2012.6247801"},{"key":"21330_CR10","doi-asserted-by":"publisher","DOI":"10.1136\/bmj.b2535","volume":"339","author":"D Moher","year":"2009","unstructured":"Moher D, Liberati A, Tetzlaff J, Altman DG (2009) Preferred reporting items for systematic reviews and meta-analyses: the PRISMA statement. BMJ 339:b2535. https:\/\/doi.org\/10.1136\/bmj.b2535","journal-title":"BMJ"},{"key":"21330_CR11","doi-asserted-by":"publisher","first-page":"6","DOI":"10.7326\/acpjc-1995-123-3-a12","volume":"123","author":"WS Richardson","year":"1995","unstructured":"Richardson WS, Wilson MC, Nishikawa J, Hayward RS (1995) The well-built clinical question: a key to evidence-based decisions. ACP J Club 123:6\u20138. https:\/\/doi.org\/10.7326\/acpjc-1995-123-3-a12","journal-title":"ACP J Club"},{"key":"21330_CR12","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2017) ImageNet classification with deep convolutional neural networks. Commun ACM 60:84\u201390. https:\/\/doi.org\/10.1145\/3065386","journal-title":"Commun ACM"},{"key":"21330_CR13","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/tpami.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji S, Xu W, Yang M, Yu K (2013) 3D convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35:221\u2013231. https:\/\/doi.org\/10.1109\/tpami.2012.59","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR14","doi-asserted-by":"publisher","unstructured":"Ferrari V, Marin-Jimenez M, Zisserman A (2008) Progressive search space reduction for human pose estimation. 2008 IEEE Conference on Computer Vision and Pattern Recognition 1\u20138. https:\/\/doi.org\/10.1109\/cvpr.2008.4587468","DOI":"10.1109\/cvpr.2008.4587468"},{"key":"21330_CR15","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Osindero S, Teh YW (2006) A fast learning algorithm for deep belief nets. Neural Comput 18:1527\u20131554. https:\/\/doi.org\/10.1162\/neco.2006.18.7.1527","journal-title":"Neural Comput"},{"key":"21330_CR16","unstructured":"Nair V, Hinton GE (2009) 3D object recognition with deep belief Nets. Neural Inform Process Syst 22. http:\/\/papers.nips.cc\/paper\/3872-3d-object-recognition-with-deep-belief-nets"},{"key":"21330_CR17","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1007\/978-3-642-15567-3_11","volume":"2010","author":"GW Taylor","year":"2010","unstructured":"Taylor GW, Fergus R, LeCun Y, Bregler C (2010) Convolutional learning of Spatio-temporal features. Comput Vis 11th Eur Conf Comput Vis 2010:140\u2013153. https:\/\/doi.org\/10.1007\/978-3-642-15567-3_11","journal-title":"Comput Vis 11th Eur Conf Comput Vis"},{"key":"21330_CR18","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1202.3748","author":"V Mnih","year":"2011","unstructured":"Mnih V, Larochelle H, Hinton GE (2011) Conditional restricted Boltzmann machines for structured output prediction. ArXiv Preprint. https:\/\/doi.org\/10.48550\/arXiv.1202.3748. arXiv:1202.3748","journal-title":"ArXiv Preprint"},{"key":"21330_CR19","doi-asserted-by":"publisher","unstructured":"Tang Y, Salakhutdinov R, Hinton G (2012) Robust boltzmann machines for recognition and denoising. 2012 IEEE Conference on Computer Vision and Pattern Recognition 2264\u20132271. https:\/\/doi.org\/10.1109\/CVPR.2012.6247936","DOI":"10.1109\/CVPR.2012.6247936"},{"key":"21330_CR20","doi-asserted-by":"publisher","first-page":"294","DOI":"10.1080\/02564602.2015.1015631","volume":"32","author":"B Liao","year":"2015","unstructured":"Liao B, Xu J, Lv J, Zhou S (2015) An image retrieval method for binary images based on DBN and softmax classifier. IETE Tech Rev 32:294\u2013303. https:\/\/doi.org\/10.1080\/02564602.2015.1015631","journal-title":"IETE Tech Rev"},{"key":"21330_CR21","doi-asserted-by":"publisher","first-page":"943","DOI":"10.22214\/ijraset.2022.47789","volume":"10","author":"A Saxena","year":"2022","unstructured":"Saxena A (2022) An introduction to convolutional neural networks. Int J Res Appl Sci Eng Technol 10:943\u2013947. https:\/\/doi.org\/10.22214\/ijraset.2022.47789","journal-title":"Int J Res Appl Sci Eng Technol"},{"key":"21330_CR22","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y Lecun","year":"1998","unstructured":"Lecun Y, Bottou L, Bengio Y, Haffner P (1998) Gradient-based learning applied to document recognition. Proc IEEE 86:2278\u20132324. https:\/\/doi.org\/10.1109\/5.726791","journal-title":"Proc IEEE"},{"key":"21330_CR23","doi-asserted-by":"publisher","unstructured":"Zeiler MD, Fergus R (2014) Visualizing and understanding convolutional networks. Computer Vision\u2013ECCV 2014 818\u2013833. https:\/\/doi.org\/10.1007\/978-3-319-10590-1_53","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"21330_CR24","doi-asserted-by":"publisher","unstructured":"Liu S, Deng W (2015) Very deep convolutional neural network based image classification using small training sample size. 2015 3rd IAPR Asian Conf Pattern Recognit 730\u2013734. https:\/\/doi.org\/10.1109\/ACPR.2015.7486599","DOI":"10.1109\/ACPR.2015.7486599"},{"key":"21330_CR25","doi-asserted-by":"publisher","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V (2015) Going deeper with convolutions. Proc IEEE Conf Comput Vis Pattern Recognit 2015(1\u20139). https:\/\/doi.org\/10.1109\/cvpr.2015.7298594","DOI":"10.1109\/cvpr.2015.7298594"},{"key":"21330_CR26","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition 770\u2013778. https:\/\/doi.org\/10.1109\/cvpr.2016.90","DOI":"10.1109\/cvpr.2016.90"},{"key":"21330_CR27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.physd.2019.132306","volume":"404","author":"A Sherstinsky","year":"2020","unstructured":"Sherstinsky A (2020) Fundamentals of recurrent neural network (RNN) and long Short-Term memory (LSTM) network. Physica D 404:1\u201343. https:\/\/doi.org\/10.1016\/j.physd.2019.132306","journal-title":"Physica D"},{"key":"21330_CR28","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long Short-Term memory. Neural Comput 9:1735\u20131780. https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput"},{"key":"21330_CR29","doi-asserted-by":"publisher","unstructured":"Baccouche M, Mamalet F, Wolf C, Garcia C, Baskurt A (2011) Sequential deep learning for human action recognition. Lect Notes Comput Sci 29\u201339. https:\/\/doi.org\/10.1007\/978-3-642-25446-8_4","DOI":"10.1007\/978-3-642-25446-8_4"},{"key":"21330_CR30","doi-asserted-by":"publisher","first-page":"843","DOI":"10.48550\/arXiv.1502.04681","volume":"37","author":"N Srivastava","year":"2015","unstructured":"Srivastava N, Mansimov E, Salakhudinov R (2015) Unsupervised learning of video representations using LSTMs. Proc 32nd Int Conf Mach Learn 37:843\u2013852. https:\/\/doi.org\/10.48550\/arXiv.1502.04681","journal-title":"Proc 32nd Int Conf Mach Learn"},{"key":"21330_CR31","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume":"2016","author":"L Wang","year":"2016","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L (2016) Temporal segment networks: towards good practices for deep action recognition. Comput Vision\u2013ECCV 2016:20\u201336. https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2","journal-title":"Comput Vision\u2013ECCV"},{"key":"21330_CR32","doi-asserted-by":"publisher","first-page":"352","DOI":"10.1109\/tpami.2017.2670560","volume":"40","author":"YG Jiang","year":"2018","unstructured":"Jiang YG, Wu Z, Wang J, Xue X, Chang SF (2018) Exploiting feature and class relationships in video categorization with regularized deep neural networks. Trans Pattern Anal Mach Intell 40:352\u2013364. https:\/\/doi.org\/10.1109\/tpami.2017.2670560","journal-title":"Trans Pattern Anal Mach Intell"},{"key":"21330_CR33","doi-asserted-by":"publisher","unstructured":"Liu W, Anguelov D, Erhan D, Szegedy C, Reed S, Fu CY, Berg AC (2016) SSD: single shot multibox detector. Comput Vision\u2013ECCV 2016 21\u201337. https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"21330_CR34","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/tpami.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39:1137\u20131149. https:\/\/doi.org\/10.1109\/tpami.2016.2577031","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR35","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JRR Uijlings","year":"2013","unstructured":"Uijlings JRR, Van De Sande KEA, Gevers T, Smeulders AWM (2013) Selective search for object recognition. Int J Comput Vis 104:154\u2013171. https:\/\/doi.org\/10.1007\/s11263-013-0620-5","journal-title":"Int J Comput Vis"},{"key":"21330_CR36","doi-asserted-by":"publisher","unstructured":"Girshick R, Donahue J, Darrell T, Malik J (2014) Rich feature hierarchies for accurate object detection and semantic segmentation. 2014 IEEE Conf Comput Vis Pattern Recognit 580\u2013587. https:\/\/doi.org\/10.1109\/cvpr.2014.81","DOI":"10.1109\/cvpr.2014.81"},{"key":"21330_CR37","doi-asserted-by":"publisher","first-page":"1448","DOI":"10.1109\/iccv.2015.169","volume":"1440","author":"R Girshick","year":"2015","unstructured":"Girshick R (2015) Fast R-CNN. 2015 IEEE Int Conf Comput Vis 1440:1448. https:\/\/doi.org\/10.1109\/iccv.2015.169","journal-title":"2015 IEEE Int Conf Comput Vis"},{"key":"21330_CR38","doi-asserted-by":"publisher","unstructured":"Lin TY, Doll\u00e1r P, Girshick R, He K, Hariharan B, Belongie S (2017) Feature pyramid networks for object detection. 2017 IEEE Conference on Computer Vision and Pattern Recognition 2117\u20132125. https:\/\/doi.org\/10.1109\/CVPR.2017.106","DOI":"10.1109\/CVPR.2017.106"},{"key":"21330_CR39","doi-asserted-by":"publisher","unstructured":"Liu S, Qi L, Qin H, Shi J, Jia J (2018) Instance segmentation path aggregation network for. 2018 IEEE Conference on Computer Vision and Pattern Recognition 8759\u20138768. https:\/\/doi.org\/10.1109\/CVPR.2018.00913","DOI":"10.1109\/CVPR.2018.00913"},{"key":"21330_CR40","doi-asserted-by":"publisher","unstructured":"Huang J, Rathod V, Sun C, Zhu M, Korattikara A, Fathi A, Fischer I, Wojna Z, Song Y, Guadarrama S, Murphy K (2017) Speed\/accuracy trade-offs for modern convolutional object detectors. 2017 IEEE Conference on Computer Vision and Pattern Recognition 7310\u20137311. https:\/\/doi.org\/10.1109\/cvpr.2017.351","DOI":"10.1109\/cvpr.2017.351"},{"key":"21330_CR41","doi-asserted-by":"publisher","unstructured":"Mettes P, Van Gemert JC, Snoek CG (2017) Spot on: Action localization from pointly-supervised proposals. 2017 IEEE Conference on Computer Vision and Pattern Recognition 437\u2013453. https:\/\/doi.org\/10.1007\/978-3-319-46454-1_27","DOI":"10.1007\/978-3-319-46454-1_27"},{"key":"21330_CR42","doi-asserted-by":"publisher","unstructured":"Huang G, Liu Z, Van Der Maaten L, Weinberger KQ (2017) Densely connected convolutional networks. 2017 IEEE Conference on Computer Vision and Pattern Recognition 2261\u20132269. https:\/\/doi.org\/10.1109\/CVPR.2017.243","DOI":"10.1109\/CVPR.2017.243"},{"key":"21330_CR43","doi-asserted-by":"publisher","unstructured":"Kalogeiton V, Weinzaepfel P, Ferrari V, Schmid C (2017) Action Tubelet detector for spatio-temporal action localization. 2017 IEEE International Conference on Computer Vision 4415\u20134423. https:\/\/doi.org\/10.1109\/ICCV.2017.472","DOI":"10.1109\/ICCV.2017.472"},{"key":"21330_CR44","doi-asserted-by":"publisher","unstructured":"Sermanet P, Chintala S, LeCun Y (2012) Convolutional neural networks applied to house numbers digit classification. arXiv preprint arXiv:1204.3968. https:\/\/doi.org\/10.48550\/arXiv.1204.3968","DOI":"10.48550\/arXiv.1204.3968"},{"key":"21330_CR45","doi-asserted-by":"publisher","first-page":"1915","DOI":"10.1109\/tpami.2012.231","volume":"35","author":"C Farabet","year":"2013","unstructured":"Farabet C, Couprie C, Najman L, LeCun Y (2013) Learning hierarchical features for scene labeling. IEEE Trans Pattern Anal Mach Intell 35:1915\u20131929. https:\/\/doi.org\/10.1109\/tpami.2012.231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR46","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1301.3572","author":"C Couprie","year":"2013","unstructured":"Couprie C, Farabet C, Najman L, LeCun Y (2013) Indoor semantic segmentation using depth information. ArXiv Preprint arXiv:1301 3572. https:\/\/doi.org\/10.48550\/arXiv.1301.3572","journal-title":"ArXiv Preprint arXiv:1301 3572"},{"key":"21330_CR47","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1312.6229","author":"P Sermanet","year":"2014","unstructured":"Sermanet P, Eigen D, Zhang X, Mathieu M, Fergus R, LeCun Y (2014) OverFeat: integrated recognition, localization and detection using convolutional networks. ArXiv Preprint arXiv 1312 6229. https:\/\/doi.org\/10.48550\/arXiv.1312.6229","journal-title":"ArXiv Preprint arXiv 1312 6229"},{"key":"21330_CR48","doi-asserted-by":"publisher","unstructured":"Sharif Razavian A, Azizpour H, Sullivan J, Carlsson S (2014) CNN features off-the-shelf: An astounding baseline for recognition. 2014 IEEE Conference on Computer Vision and Pattern Recognition Workshops 806\u2013813. https:\/\/doi.org\/10.1109\/CVPRW.2014.131","DOI":"10.1109\/CVPRW.2014.131"},{"key":"21330_CR49","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.2199","author":"K Simonyan","year":"2014","unstructured":"Simonyan K, Zisserman A (2014) Two-Stream convolutional networks for action recognition in videos. Adv Neural Inform Process Syst 27. https:\/\/doi.org\/10.48550\/arXiv.1406.2199","journal-title":"Adv Neural Inform Process Syst 27"},{"key":"21330_CR50","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2014.223","author":"A Karpathy","year":"2014","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. 2014 IEEE Conf Comput Vis Pattern Recognit. https:\/\/doi.org\/10.1109\/cvpr.2014.223","journal-title":"2014 IEEE Conf Comput Vis Pattern Recognit"},{"key":"21330_CR51","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional Two-Stream network fusion for video action recognition. 2016 IEEE Conf Comput Vis Pattern Recognit 1933\u20131941. https:\/\/doi.org\/10.1109\/cvpr.2016.213","DOI":"10.1109\/cvpr.2016.213"},{"key":"21330_CR52","doi-asserted-by":"publisher","unstructured":"Xie S, Girshick R, Doll\u00e1r P, Tu Z, He K (2017) Aggregated residual transformations for deep neural networks. 2017 IEEE Conf Comput Vis Pattern Recognit 1492\u20131500. https:\/\/doi.org\/10.1109\/CVPR.2017.634","DOI":"10.1109\/CVPR.2017.634"},{"key":"21330_CR53","doi-asserted-by":"publisher","unstructured":"Wang L, Qiao Y, Tang X (2015) Action recognition with trajectory-pooled deep-convolutional descriptors. 2015 IEEE Conf Comput Vis Pattern Recognit 4305\u20134314. https:\/\/doi.org\/10.1109\/CVPR.2015.7299059","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"21330_CR54","doi-asserted-by":"publisher","unstructured":"Li A, Vostrikov A, Ross DA, Zisserman A, Thotakuri M, Ross DA, Carreira J, Vostrikov A, Zisserman A (2020) The AVA-Kinetics Localized Human Actions Video Dataset. arXiv preprint arXiv:2005.00214. https:\/\/doi.org\/10.48550\/arXiv.2005.00214","DOI":"10.48550\/arXiv.2005.00214"},{"key":"21330_CR55","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2015.510","author":"D Tran","year":"2015","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3D convolutional networks. 2015 IEEE Int Conf Comput Vis. https:\/\/doi.org\/10.1109\/iccv.2015.510","journal-title":"2015 IEEE Int Conf Comput Vis"},{"key":"21330_CR56","doi-asserted-by":"publisher","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2021) An image is worth 16x16 words: Transformers for image recognition at scale. https:\/\/doi.org\/10.48550\/arXiv.2010.11929. arXiv preprint arXiv:2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"key":"21330_CR57","doi-asserted-by":"publisher","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lu\u010di\u0107 M, Schmid C (2021) ViViT: A video vision transformer. 2021 IEEE\/CVF International Conference on Computer Vision 6816\u20136826. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00676","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"21330_CR58","doi-asserted-by":"publisher","unstructured":"Fan H, Xiong B, Mangalam K, Li Y, Yan Z, Malik J, Feichtenhofer C (2021) Multiscale Vision Transformers. 2021 IEEE\/CVF International Conference on Computer Vision 6804\u20136815. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00675","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"21330_CR59","doi-asserted-by":"publisher","first-page":"10078","DOI":"10.48550\/arXiv.2203.12602","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong Z, Song Y, Wang J, Wang L (2022) VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv Neural Inf Process Syst 35:10078\u201310093. https:\/\/doi.org\/10.48550\/arXiv.2203.12602","journal-title":"Adv Neural Inf Process Syst"},{"key":"21330_CR60","doi-asserted-by":"publisher","DOI":"10.3390\/app13042058","author":"J Shi","year":"2023","unstructured":"Shi J, Zhang Y, Wang W, Xing B, Hu D, Chen L (2023) A novel two-stream transformer-based framework for multi-modality human action recognition. Appl Sci. https:\/\/doi.org\/10.3390\/app13042058","journal-title":"Appl Sci"},{"key":"21330_CR61","doi-asserted-by":"publisher","unstructured":"Li K, Wang Y, He Y, Li Y, Wang Y, Wang L, Qiao Y (2022) UniFormerV2: Spatiotemporal learning by arming image vits with video uniformer. https:\/\/doi.org\/10.48550\/arXiv.2211.09552. arXiv preprint arXiv:2211.09552","DOI":"10.48550\/arXiv.2211.09552"},{"key":"21330_CR62","doi-asserted-by":"publisher","first-page":"12581","DOI":"10.1109\/TPAMI.2023.3282631","volume":"45","author":"K Li","year":"2023","unstructured":"Li K, Wang Y, Zhang J, Gao P, Song G, Liu Y, Li H, Qiao Y (2023) UniFormer: unifying convolution and self-attention for visual recognition. IEEE Trans Pattern Anal Mach Intell 45:12581\u201312600. https:\/\/doi.org\/10.1109\/TPAMI.2023.3282631","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR63","doi-asserted-by":"publisher","unstructured":"Mehta S, Rastegari M (2022) Mobilevit: Light-weight, general-purpose, and mobile-friendly vision transformer. arXiv preprint arXiv:2110.02178. https:\/\/doi.org\/10.48550\/arXiv.2110.02178","DOI":"10.48550\/arXiv.2110.02178"},{"key":"21330_CR64","doi-asserted-by":"publisher","unstructured":"Zhu X, Toisoul A, Perez-Rua JM, Zhang L, Martinez B, Xiang T (2021) Few-shot action recognition with prototype-centered attentive learning. arXiv preprint arXiv:2101.08085. https:\/\/doi.org\/10.48550\/arXiv.2101.08085","DOI":"10.48550\/arXiv.2101.08085"},{"key":"21330_CR65","doi-asserted-by":"publisher","unstructured":"Guo H, Yu W, Que S, Du K, Yan Y, Wang H (2024) Video-to-task learning via motion-guided attention for few-shot action recognition. arXiv preprint. https:\/\/doi.org\/10.48550\/arXiv.2411.11335","DOI":"10.48550\/arXiv.2411.11335"},{"key":"21330_CR66","doi-asserted-by":"publisher","first-page":"386","DOI":"10.1109\/TPAMI.2018.2844175","volume":"42","author":"K He","year":"2020","unstructured":"He K, Gkioxari G, Doll\u00e1r P, Girshick R (2020) Mask R-CNN. IEEE Trans Pattern Anal Mach Intell 42:386\u2013397. https:\/\/doi.org\/10.1109\/TPAMI.2018.2844175","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR67","doi-asserted-by":"publisher","unstructured":"Shou Z, Wang D, Chang SF (2016) Temporal action localization in untrimmed videos via multi-stage CNNs. 2016 IEEE Conference on Computer Vision and Pattern Recognition 1049\u20131058. https:\/\/doi.org\/10.1109\/CVPR.2016.119","DOI":"10.1109\/CVPR.2016.119"},{"key":"21330_CR68","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/tpami.2017.2712608","volume":"40","author":"G Varol","year":"2018","unstructured":"Varol G, Laptev I, Schmid C (2018) Long-term temporal convolutions for action recognition. IEEE Trans Pattern Anal Mach Intell 40:1510\u20131517. https:\/\/doi.org\/10.1109\/tpami.2017.2712608","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"21330_CR69","doi-asserted-by":"publisher","unstructured":"Carreira J, Zisserman A Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. 2017 IEEE Conference on Computer Vision and, Recognition P (2017) https:\/\/doi.org\/10.1109\/cvpr.2017.502","DOI":"10.1109\/cvpr.2017.502"},{"key":"21330_CR70","doi-asserted-by":"publisher","unstructured":"Zeeshan Khan M, Hassan MA, Farooq A, Ghanni Khan MU Deep CNN Based Data-Driven Recognition of Cricket Batting Shots. 2018 International Conference on Applied and, Mathematics E (2018) https:\/\/doi.org\/10.1109\/icaem.2018.8536277","DOI":"10.1109\/icaem.2018.8536277"},{"key":"21330_CR71","doi-asserted-by":"publisher","unstructured":"Hara K, Kataoka H, Satoh Y (2017) Learning spatio-temporal features with 3D residual networks for action recognition. 2017 IEEE International Conference on Computer Vision Workshops. https:\/\/doi.org\/10.1109\/iccvw.2017.373","DOI":"10.1109\/iccvw.2017.373"},{"key":"21330_CR72","doi-asserted-by":"publisher","unstructured":"Qiu Z, Yao T, Mei T (2017) Learning spatio-temporal representation with pseudo-3D Residual networks. 2017 IEEE International Conference on Computer Vision 5533\u20135541. https:\/\/doi.org\/10.1109\/CVPR.2016.119","DOI":"10.1109\/CVPR.2016.119"},{"key":"21330_CR73","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume":"2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson GA, Varol G, Wang X, Farhadi A, Laptev I, Gupta A (2016) Hollywood in homes: crowdsourcing data collection for activity understanding. Comput Vision\u2013ECCV 2016:510\u2013526. https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31","journal-title":"Comput Vision\u2013ECCV"},{"key":"21330_CR74","doi-asserted-by":"publisher","first-page":"1","DOI":"10.7717\/peerj-cs.1804","volume":"10","author":"M Gao","year":"2024","unstructured":"Gao M, Ju B (2024) Attention-enhanced gated recurrent unit for action recognition in tennis. PeerJ Comput Sci 10:1\u201314. https:\/\/doi.org\/10.7717\/peerj-cs.1804","journal-title":"PeerJ Comput Sci"},{"key":"21330_CR75","doi-asserted-by":"publisher","unstructured":"Piergiovanni A, Angelova A, Toshev A, Ryoo M (2019) Evolving space-time neural architectures for videos. 2019 IEEE\/CVF International Conference on Computer Vision 1793\u20131802. https:\/\/doi.org\/10.1109\/ICCV.2019.00188","DOI":"10.1109\/ICCV.2019.00188"},{"key":"21330_CR76","doi-asserted-by":"publisher","first-page":"1545","DOI":"10.32604\/CMC.2020.09867","volume":"63","author":"S Zhou","year":"2020","unstructured":"Zhou S, Chen L, Sugumaran V (2020) Hidden two-stream collaborative learning network for action recognition. Comput Mater Continua 63:1545\u20131561. https:\/\/doi.org\/10.32604\/CMC.2020.09867","journal-title":"Comput Mater Continua"},{"key":"21330_CR77","doi-asserted-by":"publisher","unstructured":"Girdhar R, Joao Carreira J, Doersch C, Zisserman A (2019) Video action transformer network. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 244\u2013253. https:\/\/doi.org\/10.1109\/CVPR.2019.00033","DOI":"10.1109\/CVPR.2019.00033"},{"key":"21330_CR78","doi-asserted-by":"publisher","unstructured":"Jain M, Jegou H, Bouthemy P (2013) Better exploiting motion for better action recognition. 2013 IEEE Conference on Computer Vision and Pattern Recognition 2555\u20132562. https:\/\/doi.org\/10.1109\/cvpr.2013.330","DOI":"10.1109\/cvpr.2013.330"},{"key":"21330_CR79","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C (2020) X3D: Expanding Architectures for Efficient Video Recognition. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 200\u2013210. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00028","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"21330_CR80","doi-asserted-by":"publisher","unstructured":"Sigurdsson GA, Gupta A, Schmid C, Farhadi A, Alahari K (2018) Actor and Observer: Joint Modeling of First and Third-Person Videos. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 7396\u20137404. https:\/\/doi.org\/10.1109\/CVPR.2018.00772","DOI":"10.1109\/CVPR.2018.00772"},{"key":"21330_CR81","doi-asserted-by":"publisher","first-page":"30392","DOI":"10.48550\/arXiv.2106.14881","volume":"36","author":"T Xiao","year":"2021","unstructured":"Xiao T, Singh M, Mintun E, Darrell T, Doll\u00e1r P, Girshick R (2021) Early convolutions help transformers see better. Adv Neural Inf Process Syst 36:30392\u201330400. https:\/\/doi.org\/10.48550\/arXiv.2106.14881","journal-title":"Adv Neural Inf Process Syst"},{"key":"21330_CR82","doi-asserted-by":"publisher","unstructured":"Materzynska J, Berger G, Bax I, Memisevic R (2019) The Jester Dataset: A Large-Scale Video Dataset of Human Gestures. 2019 IEEE\/CVF International Conference on Computer Vision Workshop 2874\u20132882. https:\/\/doi.org\/10.1109\/ICCVW.2019.00349","DOI":"10.1109\/ICCVW.2019.00349"},{"key":"21330_CR83","doi-asserted-by":"publisher","unstructured":"Cao K, Ji J, Cao Z, Chang CY, Niebles JC (2020) Few-shot video classification via temporal alignment. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 10615\u201310624. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01063","DOI":"10.1109\/CVPR42600.2020.01063"},{"key":"21330_CR84","doi-asserted-by":"publisher","unstructured":"Li Y, Lin S, Zhang B, Liu J, Doermann D, Wu Y, Huang F, Ji R (2019) Exploiting kernel sparsity and entropy for interpretable CNN compression. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 2795\u20132804. https:\/\/doi.org\/10.1109\/CVPR.2019.00291","DOI":"10.1109\/CVPR.2019.00291"},{"key":"21330_CR85","doi-asserted-by":"publisher","first-page":"10096","DOI":"10.48550\/arXiv.2104.00298","volume":"139","author":"M Tan","year":"2021","unstructured":"Tan M, Le Q (2021) Efficientnetv2: smaller models and faster training. Proc Mach Learn Res 139:10096\u201310106. https:\/\/doi.org\/10.48550\/arXiv.2104.00298","journal-title":"Proc Mach Learn Res"},{"key":"21330_CR86","doi-asserted-by":"publisher","unstructured":"Derpanis KG, Lecce M, Daniilidis K, Wildes RP (2012) Dynamic scene understanding: The role of orientation features in space and time in scene classification. 2012 IEEE Conference on Computer Vision and Pattern Recognition 1306\u20131313. https:\/\/doi.org\/10.1109\/CVPR.2012.6247815","DOI":"10.1109\/CVPR.2012.6247815"},{"key":"21330_CR87","doi-asserted-by":"publisher","unstructured":"Ryoo MS, Aggarwal JK (2009) Spatio-temporal relationship match: Video structure comparison for recognition of complex human activities. 2009 IEEE 12th International Conference on Computer Vision 1593\u20131600. https:\/\/doi.org\/10.1109\/ICCV.2009.5459361","DOI":"10.1109\/ICCV.2009.5459361"},{"key":"21330_CR88","doi-asserted-by":"publisher","unstructured":"Shechtman E, Irani M (2005) Space-time behavior based correlation. 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition 1:405\u2013412. https:\/\/doi.org\/10.1109\/CVPR.2005.328","DOI":"10.1109\/CVPR.2005.328"},{"key":"21330_CR89","doi-asserted-by":"publisher","first-page":"314","DOI":"10.1007\/978-3-030-58548-8_19","volume":"12349","author":"S Pratt","year":"2020","unstructured":"Pratt S, Yatskar M, Weihs L, Farhadi A, Kembhavi A (2020) Grounded situation recognition. Lect Notes Comput Sci 12349:314\u2013332. https:\/\/doi.org\/10.1007\/978-3-030-58548-8_19","journal-title":"Lect Notes Comput Sci"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21330-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-026-21330-6","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-026-21330-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T21:09:34Z","timestamp":1769980174000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-026-21330-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,1]]},"references-count":89,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2026,2]]}},"alternative-id":["21330"],"URL":"https:\/\/doi.org\/10.1007\/s11042-026-21330-6","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,1]]},"assertion":[{"value":"17 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 July 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 September 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 February 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Not Applicable.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval\u00a0& informed consent"}},{"value":"None.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"77"}}