{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T21:04:11Z","timestamp":1774991051708,"version":"3.50.1"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["51405448"],"award-info":[{"award-number":["51405448"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1007\/s13735-024-00350-8","type":"journal-article","created":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T12:32:28Z","timestamp":1733315548000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["STCA: an action recognition network with spatio-temporal convolution and attention"],"prefix":"10.1007","volume":"14","author":[{"given":"Qiuhong","family":"Tian","sequence":"first","affiliation":[]},{"given":"Weilun","family":"Miao","sequence":"additional","affiliation":[]},{"given":"Lizao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ziyu","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Yanying","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Lan","family":"Yao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,4]]},"reference":[{"key":"350_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.compeleceng.2022.108319","volume":"103","author":"KB Sahay","year":"2022","unstructured":"Sahay KB, Balachander B, Jagadeesh B, Kumar GA, Kumar R, Parvathy LR (2022) A real time crime scene intelligent video surveillance systems in violence detection framework using deep learning techniques. Comput Electr Eng 103:108319","journal-title":"Comput Electr Eng"},{"key":"350_CR2","doi-asserted-by":"publisher","first-page":"37347","DOI":"10.2196\/37347","volume":"11","author":"PP Morita","year":"2023","unstructured":"Morita PP, Sahu KS, Oetomo A (2023) Health monitoring using smart home technologies: scoping review. JMIR Mhealth Uhealth 11:37347","journal-title":"JMIR Mhealth Uhealth"},{"issue":"3","key":"350_CR3","doi-asserted-by":"publisher","first-page":"1907","DOI":"10.32604\/iasc.2022.027233","volume":"34","author":"S Mekruksavanich","year":"2022","unstructured":"Mekruksavanich S, Jitpattanakul A (2022) Sport-related activity recognition from wearable sensors using bidirectional GRU network. Intell Autom Soft Comput 34(3):1907\u20131925","journal-title":"Intell Autom Soft Comput"},{"key":"350_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2022.103531","volume":"86","author":"P Khaire","year":"2022","unstructured":"Khaire P, Kumar P (2022) Deep learning and RGB-D based human action, human\u2013human and human\u2013object interaction recognition: a survey. J Vis Commun Image Represent 86:103531","journal-title":"J Vis Commun Image Represent"},{"issue":"3","key":"350_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550299","volume":"6","author":"H Haresamudram","year":"2022","unstructured":"Haresamudram H, Essa I, Pl\u00f6tz T (2022) Assessing the state of self-supervised human activity recognition using wearables. Proc ACM Interact Mob Wear Ubiquitous Technol 6(3):1\u201347","journal-title":"Proc ACM Interact Mob Wear Ubiquitous Technol"},{"key":"350_CR6","doi-asserted-by":"publisher","first-page":"241","DOI":"10.1016\/j.inffus.2021.11.006","volume":"80","author":"S Qiu","year":"2022","unstructured":"Qiu S, Zhao H, Jiang N, Wang Z, Liu L, An Y, Zhao H, Miao X, Liu R, Fortino G (2022) Multi-sensor information fusion based on machine learning for real applications in human activity recognition: state-of-the-art and research challenges. Inf Fusion 80:241\u2013265","journal-title":"Inf Fusion"},{"issue":"6","key":"350_CR7","doi-asserted-by":"publisher","first-page":"4755","DOI":"10.1007\/s10462-021-10116-x","volume":"55","author":"N Gupta","year":"2022","unstructured":"Gupta N, Gupta SK, Pathak RK, Jain V, Rashidi P, Suri JS (2022) Human activity recognition in artificial intelligence framework: a narrative review. Artif Intell Rev 55(6):4755\u20134808","journal-title":"Artif Intell Rev"},{"issue":"5","key":"350_CR8","doi-asserted-by":"publisher","first-page":"1005","DOI":"10.3390\/s19051005","volume":"19","author":"H-B Zhang","year":"2019","unstructured":"Zhang H-B, Zhang Y-X, Zhong B, Lei Q, Yang L, Du J-X, Chen D-S (2019) A comprehensive survey of vision-based human action recognition methods. Sensors 19(5):1005","journal-title":"Sensors"},{"issue":"5","key":"350_CR9","doi-asserted-by":"publisher","first-page":"1366","DOI":"10.1007\/s11263-022-01594-9","volume":"130","author":"Y Kong","year":"2022","unstructured":"Kong Y, Fu Y (2022) Human action recognition and prediction: a survey. Int J Comput Vis 130(5):1366\u20131401","journal-title":"Int J Comput Vis"},{"key":"350_CR10","first-page":"3200","volume":"45","author":"Z Sun","year":"2022","unstructured":"Sun Z, Ke Q, Rahmani H, Bennamoun M, Wang G, Liu J (2022) Human action recognition from various data modalities: a review. IEEE Trans Pattern Anal Mach Intell 45:3200\u20133225","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"1","key":"350_CR11","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1186\/s13640-020-00544-0","volume":"2021","author":"Z Weng","year":"2021","unstructured":"Weng Z, Li W, Jin Z (2021) Human activity prediction using saliency-aware motion enhancement and weighted LSTM network. EURASIP J Image Video Process 2021(1):3","journal-title":"EURASIP J Image Video Process"},{"key":"350_CR12","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Feiszli M (2019) Video classification with channel-separated convolutional networks. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 5552\u20135561","DOI":"10.1109\/ICCV.2019.00565"},{"key":"350_CR13","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"350_CR14","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"350_CR15","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van\u00a0Gool L (2016) Temporal segment networks: towards good practices for deep action recognition. In: European conference on computer vision, pp 20\u201336. Springer","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"350_CR16","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) TSM: temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"issue":"2","key":"350_CR17","first-page":"021004","volume":"28","author":"Z Weng","year":"2019","unstructured":"Weng Z, Guan Y (2019) Trajectory-aware three-stream CNN for video action recognition. J Electron Imaging 28(2):021004\u2013021004","journal-title":"J Electron Imaging"},{"key":"350_CR18","doi-asserted-by":"crossref","unstructured":"Zhou Y, Chen S, Wang Y, Huan W (2020) Review of research on lightweight convolutional neural networks. In: 2020 IEEE 5th information technology and mechatronics engineering conference (ITOEC), pp 1713\u20131720. IEEE","DOI":"10.1109\/ITOEC49072.2020.9141847"},{"key":"350_CR19","unstructured":"Tan M, Le Q (2019) Efficientnet: rethinking model scaling for convolutional neural networks. In: International conference on machine learning, pp 6105\u20136114. PMLR"},{"key":"350_CR20","unstructured":"Sou\u010dek T, Loko\u010d J (2020) Transnet v2: an effective deep network architecture for fast shot transition detection. arXiv preprint arXiv:2008.04838"},{"key":"350_CR21","doi-asserted-by":"crossref","unstructured":"Girdhar R, Carreira J, Doersch C, Zisserman A (2019) Video action transformer network. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 244\u2013253","DOI":"10.1109\/CVPR.2019.00033"},{"key":"350_CR22","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et al (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"350_CR23","doi-asserted-by":"crossref","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lu\u010di\u0107 M, Schmid C (2021) Vivit: a video vision transformer. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6836\u20136846","DOI":"10.1109\/ICCV48922.2021.00676"},{"issue":"10","key":"350_CR24","doi-asserted-by":"publisher","first-page":"5515","DOI":"10.1007\/s00521-023-09362-7","volume":"36","author":"G Pareek","year":"2024","unstructured":"Pareek G, Nigam S, Singh R (2024) Modeling transformer architecture with attention layer for human activity recognition. Neural Comput Appl 36(10):5515\u20135528","journal-title":"Neural Comput Appl"},{"key":"350_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127256","volume":"574","author":"W Sun","year":"2024","unstructured":"Sun W, Ma Y, Wang R (2024) k-nn attention-based video vision transformer for action recognition. Neurocomputing 574:127256","journal-title":"Neurocomputing"},{"key":"350_CR26","doi-asserted-by":"crossref","unstructured":"Wang X, Wu Z, Jiang B, Bao Z, Zhu L, Li G, Wang Y, Tian Y (2024) Hardvs: revisiting human activity recognition with dynamic vision sensors. In: Proceedings of the AAAI conference on artificial intelligence, vol 38, pp 5615\u20135623","DOI":"10.1609\/aaai.v38i6.28372"},{"key":"350_CR27","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"350_CR28","doi-asserted-by":"crossref","unstructured":"Sandler M, Howard A, Zhu M, Zhmoginov A, Chen L-C (2018) Mobilenetv2: inverted residuals and linear bottlenecks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4510\u20134520","DOI":"10.1109\/CVPR.2018.00474"},{"key":"350_CR29","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems, vol 30"},{"key":"350_CR30","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv preprint arXiv:1607.06450"},{"key":"350_CR31","unstructured":"Kay W, Carreira J, Simonyan K, Zhang B, Hillier C, Vijayanarasimhan S, Viola F, Green T, Back T, Natsev P, et al (2017) The kinetics human action video dataset. arXiv preprint arXiv:1705.06950"},{"key":"350_CR32","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"350_CR33","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) Hmdb: a large video database for human motion recognition. In: 2011 International conference on computer vision, pp 2556\u20132563. IEEE","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"350_CR34","doi-asserted-by":"crossref","unstructured":"Goyal R, Ebrahimi\u00a0Kahou S, Michalski V, Materzynska J, Westphal S, Kim H, Haenel V, Fruend I, Yianilos P, Mueller-Freitag M (2017) The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE international conference on computer vision, pp 5842\u20135850","DOI":"10.1109\/ICCV.2017.622"},{"key":"350_CR35","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C (2020) X3d: expanding architectures for efficient video recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 203\u2013213","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"350_CR36","unstructured":"Tran D, Ray J, Shou Z, Chang S-F, Paluri M (2017) Convnet architecture search for spatiotemporal feature learning. arXiv preprint arXiv:1708.05038"},{"key":"350_CR37","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2017) Rethinking spatiotemporal feature learning for video understanding, vol 1(2), p 5. arXiv preprint arXiv:1712.04851"},{"key":"350_CR38","doi-asserted-by":"crossref","unstructured":"Chen Y, Kalantidis Y, Li J, Yan S, Feng J (2018) Multi-fiber networks for video recognition. In: Proceedings of the European conference on computer vision (ECCV), pp 352\u2013367","DOI":"10.1007\/978-3-030-01246-5_22"},{"key":"350_CR39","unstructured":"Lin J, Gan C, Han S (2018) Temporal shift module for efficient video understanding. CoRR abs\/1811.08383 (1811)"},{"key":"350_CR40","doi-asserted-by":"crossref","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7794\u20137803","DOI":"10.1109\/CVPR.2018.00813"},{"key":"350_CR41","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M (2018) A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6450\u20136459","DOI":"10.1109\/CVPR.2018.00675"},{"key":"350_CR42","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"350_CR43","doi-asserted-by":"crossref","unstructured":"Luo C, Yuille AL (2019) Grouped spatial-temporal aggregation for efficient action recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 5512\u20135521","DOI":"10.1109\/ICCV.2019.00561"},{"key":"350_CR44","doi-asserted-by":"crossref","unstructured":"Kwon H, Kim M, Kwak S, Cho M (2020) Motionsqueeze: neural motion feature learning for video understanding. In: Computer vision\u2014ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, proceedings, part XVI 16, pp 345\u2013362. Springer","DOI":"10.1007\/978-3-030-58517-4_21"},{"key":"350_CR45","unstructured":"Li K, Li X, Wang Y, Wang J, Qiao Y (2021) Ct-net: channel tensorization network for video classification. arXiv preprint arXiv:2106.01603"},{"key":"350_CR46","doi-asserted-by":"crossref","unstructured":"Wang L, Tong Z, Ji B, Wu G (2021) TDN: temporal difference networks for efficient action recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1895\u20131904","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"350_CR47","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding? In: ICML, vol 2, p 4"},{"key":"350_CR48","doi-asserted-by":"crossref","unstructured":"Fan H, Xiong B, Mangalam K, Li Y, Yan Z, Malik J, Feichtenhofer C (2021) Multiscale vision transformers. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6824\u20136835","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"350_CR49","doi-asserted-by":"crossref","unstructured":"Kim M, Seo PH, Schmid C, Cho M (2024) Learning correlation structures for vision transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 18941\u201318951","DOI":"10.1109\/CVPR52733.2024.01792"},{"key":"350_CR50","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10586-024-04553-w","volume":"27","author":"L Xia","year":"2024","unstructured":"Xia L, Fu W (2024) Spatial-temporal multiscale feature optimization based two-stream convolutional neural network for action recognition. Clust Comput 27:1\u201316","journal-title":"Clust Comput"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-024-00350-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-024-00350-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-024-00350-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,17]],"date-time":"2025-03-17T07:32:47Z","timestamp":1742196767000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-024-00350-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,4]]},"references-count":50,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["350"],"URL":"https:\/\/doi.org\/10.1007\/s13735-024-00350-8","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,4]]},"assertion":[{"value":"21 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 November 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 November 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"1"}}