{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T02:55:56Z","timestamp":1769741756692,"version":"3.49.0"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2023,3,22]],"date-time":"2023-03-22T00:00:00Z","timestamp":1679443200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,3,22]],"date-time":"2023-03-22T00:00:00Z","timestamp":1679443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1007\/s13042-023-01820-x","type":"journal-article","created":{"date-parts":[[2023,3,22]],"date-time":"2023-03-22T03:03:11Z","timestamp":1679454191000},"page":"3059-3070","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["ESTI: an action recognition network with enhanced spatio-temporal information"],"prefix":"10.1007","volume":"14","author":[{"given":"ZhiYu","family":"Jiang","sequence":"first","affiliation":[]},{"given":"Yi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Shu","family":"Hu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,3,22]]},"reference":[{"key":"1820_CR1","unstructured":"Bertasius G, Feichtenhofer C, Tran D, Shi J, Torresani L (2018) Learning discriminative motion features through detection. arXiv preprint arXiv:1812.04172"},{"key":"1820_CR2","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"1820_CR3","doi-asserted-by":"crossref","unstructured":"Dinarevi\u0107 EC, Husi\u0107 JB, Barakovi\u0107 S (2019) Issues of human activity recognition in healthcare. In: 2019 18th International Symposium INFOTEH-JAHORINA (INFOTEH), IEEE. pp. 1\u20136","DOI":"10.1109\/INFOTEH.2019.8717749"},{"key":"1820_CR4","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"1820_CR5","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108043","volume":"118","author":"K Gedamu","year":"2021","unstructured":"Gedamu K, Ji Y, Yang Y, Gao L, Shen HT (2021) Arbitrary-view human action recognition via novel-view action generation. Pattern Recognition 118:108043","journal-title":"Pattern Recognition"},{"key":"1820_CR6","doi-asserted-by":"crossref","unstructured":"Goyal R, Ebrahimi\u00a0Kahou S, Michalski V, Materzynska J, Westphal S, Kim H, Haenel V, Fruend I, Yianilos P, Mueller-Freitag M, et\u00a0al (2017). The\u201c something something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE international conference on computer vision, pp. 5842\u20135850","DOI":"10.1109\/ICCV.2017.622"},{"key":"1820_CR7","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1016\/j.neucom.2020.05.118","volume":"444","author":"JY He","year":"2021","unstructured":"He JY, Wu X, Cheng ZQ, Yuan Z, Jiang YG (2021) Db-lstm: Densely-connected bi-directional lstm for human action recognition. Neurocomputing 444:319\u2013331","journal-title":"Neurocomputing"},{"key":"1820_CR8","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"1820_CR9","doi-asserted-by":"crossref","unstructured":"Host K, Iva\u0161i\u0107-Kos M (2022) An overview of human action recognition in sports based on computer vision. Heliyon , e09633","DOI":"10.1016\/j.heliyon.2022.e09633"},{"key":"1820_CR10","doi-asserted-by":"crossref","unstructured":"Hu H, Zhou W, Li X, Yan N, Li H (2020) Mv2flow: Learning motion representation for fast compressed video action recognition. ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM) 16, 1\u201319","DOI":"10.1145\/3422360"},{"key":"1820_CR11","doi-asserted-by":"crossref","unstructured":"Jiang B, Wang M, Gan W, Wu W, Yan J (2019) Stm: Spatiotemporal and motion encoding for action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2000\u20132009","DOI":"10.1109\/ICCV.2019.00209"},{"key":"1820_CR12","doi-asserted-by":"crossref","unstructured":"Kanojia G, Kumawat S, Raman S (2019) Attentive spatio-temporal representation learning for diving classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 0\u20130","DOI":"10.1109\/CVPRW.2019.00302"},{"key":"1820_CR13","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104244","volume":"114","author":"RC Li","year":"2021","unstructured":"Li RC, Wu XJ, Wu C, Xu TY, Kittler J (2021) Dynamic information enhancement for video classification. Image and Vision Computing 114:104244","journal-title":"Image and Vision Computing"},{"key":"1820_CR14","doi-asserted-by":"crossref","unstructured":"Li X, Wang Y, Zhou Z, Qiao Y (2020a) Smallbignet: Integrating core and contextual views for video classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1092\u20131101","DOI":"10.1109\/CVPR42600.2020.00117"},{"key":"1820_CR15","doi-asserted-by":"publisher","first-page":"1059","DOI":"10.1049\/iet-ipr.2019.0963","volume":"14","author":"X Li","year":"2020","unstructured":"Li X, Xie M, Zhang Y, Ding G, Tong W (2020) Dual attention convolutional network for action recognition. IET Image Processing 14:1059\u20131065","journal-title":"IET Image Processing"},{"key":"1820_CR16","doi-asserted-by":"crossref","unstructured":"Li Y, Ji B, Shi X, Zhang J, Kang B, Wang L (2020c) Tea: Temporal excitation and aggregation for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 909\u2013918","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"1820_CR17","doi-asserted-by":"crossref","unstructured":"Li Y, Li Y, Vasconcelos N (2018) Resound: Towards action recognition without representation bias. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 513\u2013528","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"1820_CR18","doi-asserted-by":"crossref","unstructured":"Li Z, Li D (2022) Action recognition of construction workers under occlusion. Journal of Building Engineering 45, 103352. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S2352710221012109, doi:https:\/\/doi.org\/10.1016\/j.jobe.2021.103352","DOI":"10.1016\/j.jobe.2021.103352"},{"key":"1820_CR19","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"1820_CR20","doi-asserted-by":"crossref","unstructured":"Liu Z, Luo D, Wang Y, Wang L, Tai Y, Wang C, Li J, Huang F, Lu T (2020) Teinet: Towards an efficient architecture for video recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 11669\u201311676","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"1820_CR21","doi-asserted-by":"crossref","unstructured":"Liu Z, Wang L, Wu W, Qian C, Lu T (2021) Tam: Temporal adaptive module for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13708\u201313718","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"1820_CR22","doi-asserted-by":"crossref","unstructured":"Luo C, Yuille AL (2019) Grouped spatial-temporal aggregation for efficient action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5512\u20135521","DOI":"10.1109\/ICCV.2019.00561"},{"key":"1820_CR23","unstructured":"Mahdisoltani F, Berger G, Gharbieh W, Fleet D, Memisevic R (2018) On the effectiveness of task granularity for transfer learning. arXiv preprint arXiv:1804.09235"},{"key":"1820_CR24","doi-asserted-by":"crossref","unstructured":"Materzynska J, Berger G, Bax I, Memisevic R (2019) The jester dataset: A large-scale video dataset of human gestures. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops, pp. 0\u20130","DOI":"10.1109\/ICCVW.2019.00349"},{"key":"1820_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108487","volume":"124","author":"V Mazzia","year":"2022","unstructured":"Mazzia V, Angarano S, Salvetti F, Angelini F, Chiaberge M (2022) Action transformer: A self-attention model for short-time pose-based human action recognition. Pattern Recognition 124:108487","journal-title":"Pattern Recognition"},{"key":"1820_CR26","doi-asserted-by":"crossref","unstructured":"Qiu Z, Yao T, Mei T (2017) Learning spatio-temporal representation with pseudo-3d residual networks. In: proceedings of the IEEE International Conference on Computer Vision, pp. 5533\u20135541","DOI":"10.1109\/ICCV.2017.590"},{"key":"1820_CR27","doi-asserted-by":"publisher","first-page":"155014771666552","DOI":"10.1177\/1550147716665520","volume":"12","author":"S Ranasinghe","year":"2016","unstructured":"Ranasinghe S, Al Machot F, Mayr HC (2016) A review on applications of activity recognition systems with regard to performance and evaluation. International Journal of Distributed Sensor Networks 12:1550147716665520","journal-title":"International Journal of Distributed Sensor Networks"},{"key":"1820_CR28","doi-asserted-by":"crossref","unstructured":"Selvaraju RR, Cogswell M, Das A, Vedantam R, Parikh D, Batra D (2017) Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE international conference on computer vision, pp. 618\u2013626","DOI":"10.1109\/ICCV.2017.74"},{"key":"1820_CR29","doi-asserted-by":"crossref","unstructured":"Shao H, Qian S, Liu Y (2020) Temporal interlacing network. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 11966\u201311973","DOI":"10.1609\/aaai.v34i07.6872"},{"key":"1820_CR30","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104122","volume":"109","author":"Z Shen","year":"2021","unstructured":"Shen Z, Wu XJ, Kittler J (2021) 2d progressive fusion module for action recognition. Image and Vision Computing 109:104122","journal-title":"Image and Vision Computing"},{"key":"1820_CR31","doi-asserted-by":"crossref","unstructured":"Shi Q, Zhang HB, Li Z, Du JX, Lei Q, Liu JH (2022) Shuffle-invariant network for action recognition in videos. ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM) 18, 1\u201318","DOI":"10.1145\/3485665"},{"key":"1820_CR32","unstructured":"Simonyan K, Zisserman A (2014a) Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems 27"},{"key":"1820_CR33","unstructured":"Simonyan K, Zisserman A (2014b) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"key":"1820_CR34","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"1820_CR35","unstructured":"Srivastava N, Mansimov E, Salakhudinov R (2015) Unsupervised learning of video representations using lstms. In: International conference on machine learning, PMLR. pp. 843\u2013852"},{"key":"1820_CR36","doi-asserted-by":"crossref","unstructured":"Sudhakaran S, Escalera S, Lanz O (2020) Gate-shift networks for video action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1102\u20131111","DOI":"10.1109\/CVPR42600.2020.00118"},{"key":"1820_CR37","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.118484","volume":"210","author":"KS Tan","year":"2022","unstructured":"Tan KS, Lim KM, Lee CP, Kwek LC (2022) Bidirectional long short-term memory with temporal dense sampling for human action recognition. Expert Systems with Applications 210:118484","journal-title":"Expert Systems with Applications"},{"key":"1820_CR38","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp. 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"1820_CR39","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M (2018) A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459","DOI":"10.1109\/CVPR.2018.00675"},{"key":"1820_CR40","doi-asserted-by":"crossref","unstructured":"Wang L, Tong Z, Ji B, Wu G (2021) Tdn: Temporal difference networks for efficient action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1895\u20131904","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"1820_CR41","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Gool LV (2016) Temporal segment networks: Towards good practices for deep action recognition. In: European conference on computer vision, Springer. pp. 20\u201336","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"1820_CR42","doi-asserted-by":"crossref","unstructured":"Wang X, Girshick R, Gupta A, He K (2018). Non-local neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 7794\u20137803","DOI":"10.1109\/CVPR.2018.00813"},{"key":"1820_CR43","doi-asserted-by":"crossref","unstructured":"Weng, J., Luo, D., Wang, Y., Tai, Y., Wang, C., Li, J., Huang, F., Jiang, X., Yuan, J., 2020. Temporal distinct representation learning for action recognition. In: European Conference on Computer Vision, Springer. pp. 363\u2013378","DOI":"10.1007\/978-3-030-58571-6_22"},{"key":"1820_CR44","doi-asserted-by":"crossref","unstructured":"Wu M, Jiang B, Luo D, Yan J, Wang Y, Tai Y, Wang C, Li J, Huang F, Yang X (2021) Learning comprehensive motion representation for action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 2934\u20132942","DOI":"10.1609\/aaai.v35i4.16400"},{"key":"1820_CR45","doi-asserted-by":"crossref","unstructured":"Xu H, Jin X, Wang Q, Hussain A, Huang K (2022) Exploiting attention-consistency loss for spatial-temporal stream action recognition. ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM)","DOI":"10.1145\/3538749"},{"key":"1820_CR46","unstructured":"Yu F, Koltun V (2015) Multi-scale context aggregation by dilated convolutions. arXiv preprint arXiv:1511.07122"},{"key":"1820_CR47","doi-asserted-by":"crossref","unstructured":"Zhou B, Andonian A, Oliva A, Torralba A (2018) Temporal relational reasoning in videos. In: Proceedings of the European conference on computer vision (ECCV), pp. 803\u2013818","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"1820_CR48","doi-asserted-by":"crossref","unstructured":"Zolfaghari M, Singh K, Brox T (2018) Eco: Efficient convolutional network for online video understanding. In: Proceedings of the European conference on computer vision (ECCV), pp. 695\u2013712","DOI":"10.1007\/978-3-030-01216-8_43"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-023-01820-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-023-01820-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-023-01820-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,21]],"date-time":"2023-07-21T04:28:02Z","timestamp":1689913682000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-023-01820-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,22]]},"references-count":48,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2023,9]]}},"alternative-id":["1820"],"URL":"https:\/\/doi.org\/10.1007\/s13042-023-01820-x","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,3,22]]},"assertion":[{"value":"14 September 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 March 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 March 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}