{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:05:39Z","timestamp":1776114339082,"version":"3.50.1"},"reference-count":169,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100002347","name":"Bundesministerium f\u00fcr Forschung, Technologie und Raumfahrt","doi-asserted-by":"publisher","award":["16SV9304"],"award-info":[{"award-number":["16SV9304"]}],"id":[{"id":"10.13039\/501100002347","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Engineering Applications of Artificial Intelligence"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.engappai.2026.114406","type":"journal-article","created":{"date-parts":[[2026,3,16]],"date-time":"2026-03-16T17:09:22Z","timestamp":1773680962000},"page":"114406","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Role of prior in human activity recognition: A survey"],"prefix":"10.1016","volume":"174","author":[{"given":"Yuanheng","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Artur","family":"Piet","sequence":"additional","affiliation":[]},{"given":"Sonja Dana","family":"Roelen","sequence":"additional","affiliation":[]},{"given":"Omair","family":"Ali","sequence":"additional","affiliation":[]},{"given":"Jarek","family":"Krajewski","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3210-3891","authenticated-orcid":false,"given":"Xinyu","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Marcin","family":"Grzegorzek","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.engappai.2026.114406_b1","doi-asserted-by":"crossref","first-page":"23716","DOI":"10.52202\/068431-1723","article-title":"Flamingo: A visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b2","doi-asserted-by":"crossref","unstructured":"Amir, A., Taba, B., Berg, D., Melano, T., McKinstry, J., Di Nolfo, C., Nayak, T., Andreopoulos, A., Garreau, G., Mendoza, M., Kusnitz, J., Debole, M., Esser, S., Delbruck, T., Flickner, M., Modha, D., 2017. A Low Power, Fully Event-Based Gesture Recognition System. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7243\u20137252.","DOI":"10.1109\/CVPR.2017.781"},{"key":"10.1016\/j.engappai.2026.114406_b3","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C., 2021. ViViT: A Video Vision Transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 6836\u20136846.","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"10.1016\/j.engappai.2026.114406_b4","series-title":"Delving deeper into convolutional networks for learning video representations","author":"Ballas","year":"2016"},{"issue":"3","key":"10.1016\/j.engappai.2026.114406_b5","doi-asserted-by":"crossref","first-page":"433","DOI":"10.1145\/212094.212141","article-title":"The computation of optical flow","volume":"27","author":"Beauchemin","year":"1995","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.engappai.2026.114406_b6","series-title":"Is space-time attention all you need for video understanding?","author":"Bertasius","year":"2021"},{"key":"10.1016\/j.engappai.2026.114406_b7","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3034","article-title":"Dynamic image networks for action recognition","author":"Bilen","year":"2016"},{"key":"10.1016\/j.engappai.2026.114406_b8","doi-asserted-by":"crossref","unstructured":"Cao, K., Ji, J., Cao, Z., Chang, C.-Y., Niebles, J.C., 2020. Few-Shot Video Classification via Temporal Alignment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10618\u201310627.","DOI":"10.1109\/CVPR42600.2020.01063"},{"key":"10.1016\/j.engappai.2026.114406_b9","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A., 2017. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6299\u20136308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"10.1016\/j.engappai.2026.114406_b10","series-title":"2018 IEEE Winter Conference on Applications of Computer Vision","first-page":"381","article-title":"Learning to detect human-object interactions","author":"Chao","year":"2018"},{"key":"10.1016\/j.engappai.2026.114406_b11","doi-asserted-by":"crossref","unstructured":"Chen, Y., Zhang, Z., Yuan, C., Li, B., Deng, Y., Hu, W., 2021. Channel-Wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 13359\u201313368.","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"10.1016\/j.engappai.2026.114406_b12","doi-asserted-by":"crossref","first-page":"17043","DOI":"10.52202\/068431-1240","article-title":"Two-stream network for sign language recognition and translation","volume":"35","author":"Chen","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b13","doi-asserted-by":"crossref","unstructured":"Cheng, K., Zhang, Y., He, X., Chen, W., Cheng, J., Lu, H., 2020. Skeleton-Based Action Recognition With Shift Graph Convolutional Network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 183\u2013192.","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"10.1016\/j.engappai.2026.114406_b14","doi-asserted-by":"crossref","unstructured":"Chi, H.-g., Ha, M.H., Chi, S., Lee, S.W., Huang, Q., Ramani, K., 2022. InfoGCN: Representation Learning for Human Skeleton-Based Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 20186\u201320196.","DOI":"10.1109\/CVPR52688.2022.01955"},{"key":"10.1016\/j.engappai.2026.114406_b15","first-page":"4479","article-title":"Fast Fourier convolution","volume":"vol. 33","author":"Chi","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b16","doi-asserted-by":"crossref","first-page":"39020","DOI":"10.52202\/068431-2828","article-title":"Enabling detailed action recognition evaluation through video dataset augmentation","volume":"35","author":"Chung","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.130911","article-title":"Machine learning models for wearable-based human activity recognition: A comparative study","volume":"650","author":"Ciortuz","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.engappai.2026.114406_b18","series-title":"2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"1","article-title":"EventTransAct: A video transformer-based framework for event-camera based action recognition","author":"de Blegiers","year":"2023"},{"key":"10.1016\/j.engappai.2026.114406_b19","series-title":"Cognitive Computing and Information Processing","first-page":"475","article-title":"Human action detection and recognition using SIFT and SVM","author":"Dhulavvagol","year":"2018"},{"key":"10.1016\/j.engappai.2026.114406_b20","doi-asserted-by":"crossref","unstructured":"Duan, H., Xu, M., Shuai, B., Modolo, D., Tu, Z., Tighe, J., Bergamo, A., 2023. SkeleTR: Towards Skeleton-based Action Recognition in the Wild. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 13634\u201313644.","DOI":"10.1109\/ICCV51070.2023.01254"},{"key":"10.1016\/j.engappai.2026.114406_b21","doi-asserted-by":"crossref","unstructured":"Duan, H., Zhao, Y., Chen, K., Lin, D., Dai, B., 2022. Revisiting Skeleton-Based Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2969\u20132978.","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"10.1016\/j.engappai.2026.114406_b22","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"670","article-title":"Omni-sourced webly-supervised learning for video recognition","author":"Duan","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b23","doi-asserted-by":"crossref","unstructured":"Fan, H., Xiong, B., Mangalam, K., Li, Y., Yan, Z., Malik, J., Feichtenhofer, C., 2021. Multiscale Vision Transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 6824\u20136835.","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"10.1016\/j.engappai.2026.114406_b24","doi-asserted-by":"crossref","unstructured":"Farha, Y.A., Gall, J., 2019. MS-TCN: Multi-Stage Temporal Convolutional Network for Action Segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3575\u20133584.","DOI":"10.1109\/CVPR.2019.00369"},{"key":"10.1016\/j.engappai.2026.114406_b25","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., 2020. X3D: Expanding Architectures for Efficient Video Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 203\u2013213.","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"10.1016\/j.engappai.2026.114406_b26","first-page":"35946","article-title":"Masked autoencoders as spatiotemporal learners","volume":"35","author":"Feichtenhofer","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b27","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K., 2019. SlowFast Networks for Video Recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 6202\u20136211.","DOI":"10.1109\/ICCV.2019.00630"},{"key":"10.1016\/j.engappai.2026.114406_b28","series-title":"Relation modeling in spatio-temporal action localization","author":"Feng","year":"2021"},{"issue":"1","key":"10.1016\/j.engappai.2026.114406_b29","doi-asserted-by":"crossref","first-page":"202","DOI":"10.1109\/TETC.2022.3230912","article-title":"Skeleton-based action segmentation with multi-stage spatial-temporal graph convolutional neural networks","volume":"12","author":"Filtjens","year":"2024","journal-title":"IEEE Trans. Emerg. Top. Comput."},{"key":"10.1016\/j.engappai.2026.114406_b30","series-title":"Proceedings of the 34th International Conference on Machine Learning","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","author":"Finn","year":"2017"},{"key":"10.1016\/j.engappai.2026.114406_b31","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"696","article-title":"DRG: Dual relation graph for human-object interaction detection","author":"Gao","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b32","series-title":"iCAN: Instance-centric attention network for human-object interaction detection","author":"Gao","year":"2018"},{"key":"10.1016\/j.engappai.2026.114406_b33","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., Zisserman, A., 2019. Video Action Transformer Network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 244\u2013253.","DOI":"10.1109\/CVPR.2019.00033"},{"key":"10.1016\/j.engappai.2026.114406_b34","doi-asserted-by":"crossref","unstructured":"Girish, D., Singh, V., Ralescu, A., 2020. Understanding Action Recognition in Still Images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 370\u2013371.","DOI":"10.1109\/CVPRW50498.2020.00193"},{"key":"10.1016\/j.engappai.2026.114406_b35","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Girshick, R., Doll\u00e1r, P., He, K., 2018. Detecting and Recognizing Human-Object Interactions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 8359\u20138367.","DOI":"10.1109\/CVPR.2018.00872"},{"key":"10.1016\/j.engappai.2026.114406_b36","doi-asserted-by":"crossref","unstructured":"Gowda, S., Gao, B., Gu, X., Jin, X., 2025. Is Temporal Prompting All We Need For Limited Labeled Action Recognition?. In: Proceedings of the Computer Vision and Pattern Recognition Conference. pp. 682\u2013692.","DOI":"10.1109\/CVPRW67362.2025.00073"},{"key":"10.1016\/j.engappai.2026.114406_b37","doi-asserted-by":"crossref","unstructured":"Goyal, R., Ebrahimi Kahou, S., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., Hoppe, F., Thurau, C., Bax, I., Memisevic, R., 2017. The \u201dSomething Something\u201d Video Database for Learning and Evaluating Visual Common Sense. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"10.1016\/j.engappai.2026.114406_b38","series-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2024"},{"key":"10.1016\/j.engappai.2026.114406_b39","doi-asserted-by":"crossref","unstructured":"Gu, C., Sun, C., Ross, D.A., Vondrick, C., Pantofaru, C., Li, Y., Vijayanarasimhan, S., Toderici, G., Ricco, S., Sukthankar, R., Schmid, C., Malik, J., 2018. AVA: A Video Dataset of Spatio-Temporally Localized Atomic Visual Actions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6047\u20136056.","DOI":"10.1109\/CVPR.2018.00633"},{"issue":"1","key":"10.1016\/j.engappai.2026.114406_b40","doi-asserted-by":"crossref","first-page":"185","DOI":"10.1016\/0004-3702(81)90024-2","article-title":"Determining optical flow","volume":"17","author":"Horn","year":"1981","journal-title":"Artificial Intelligence"},{"key":"10.1016\/j.engappai.2026.114406_b41","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"584","article-title":"Visual compositional learning for human-object interaction detection","author":"Hou","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b42","doi-asserted-by":"crossref","unstructured":"Hou, Z., Yu, B., Qiao, Y., Peng, X., Tao, D., 2021. Affordance Transfer Learning for Human-Object Interaction Detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 495\u2013504.","DOI":"10.1109\/CVPR46437.2021.00056"},{"key":"10.1016\/j.engappai.2026.114406_b43","first-page":"3833","article-title":"SeFAR: semi-supervised fine-grained action recognition with temporal perturbation and learning stabilization","volume":"vol. 39","author":"Huang","year":"2025"},{"key":"10.1016\/j.engappai.2026.114406_b44","doi-asserted-by":"crossref","unstructured":"Huang, B., Wang, X., Chen, H., Song, Z., Zhu, W., 2024. VTimeLLM: Empower LLM to Grasp Video Moments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14271\u201314280.","DOI":"10.1109\/CVPR52733.2024.01353"},{"issue":"7","key":"10.1016\/j.engappai.2026.114406_b45","doi-asserted-by":"crossref","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","article-title":"Human3.6M: Large scale datasets and predictive methods for 3D human sensing in natural environments","volume":"36","author":"Ionescu","year":"2014","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114406_b46","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"425","article-title":"Video representation learning by recognizing temporal transformations","author":"Jenni","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b47","series-title":"Computer Vision \u2013 ECCV 2024","first-page":"400","article-title":"Language-assisted skeleton action understanding for skeleton-based temporal action segmentation","author":"Ji","year":"2025"},{"key":"10.1016\/j.engappai.2026.114406_b48","doi-asserted-by":"crossref","unstructured":"Jiang, B., Wang, M., Gan, W., Wu, W., Yan, J., 2019. STM: SpatioTemporal and Motion Encoding for Action Recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2000\u20132009.","DOI":"10.1109\/ICCV.2019.00209"},{"key":"10.1016\/j.engappai.2026.114406_b49","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Black, M.J., Jacobs, D.W., Malik, J., 2018. End-to-End Recovery of Human Shape and Pose. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7122\u20137131.","DOI":"10.1109\/CVPR.2018.00744"},{"key":"10.1016\/j.engappai.2026.114406_b50","doi-asserted-by":"crossref","first-page":"36372","DOI":"10.1109\/ACCESS.2024.3373199","article-title":"Human action recognition systems: A review of the trends and state-of-the-art","volume":"12","author":"Karim","year":"2024","journal-title":"IEEE Access"},{"key":"10.1016\/j.engappai.2026.114406_b51","doi-asserted-by":"crossref","first-page":"42769","DOI":"10.1109\/ACCESS.2024.3378515","article-title":"HADE: Exploiting human action recognition through fine-tuned deep learning methods","volume":"12","author":"Karim","year":"2024","journal-title":"IEEE Access"},{"key":"10.1016\/j.engappai.2026.114406_b52","doi-asserted-by":"crossref","first-page":"135609","DOI":"10.1109\/ACCESS.2025.3590073","article-title":"Next generation human action recognition: A comprehensive review of state-of-the-art signal processing techniques","volume":"13","author":"Karim","year":"2025","journal-title":"IEEE Access"},{"key":"10.1016\/j.engappai.2026.114406_b53","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"498","article-title":"UnionDet: Union-level detector towards real-time human-object interaction detection","author":"Kim","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b54","doi-asserted-by":"crossref","unstructured":"Kim, B., Lee, J., Kang, J., Kim, E.-S., Kim, H.J., 2021. HOTR: End-to-End Human-Object Interaction Detection With Transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 74\u201383.","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"10.1016\/j.engappai.2026.114406_b55","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.108068","article-title":"Weakly-supervised temporal attention 3D network for human action recognition","volume":"119","author":"Kim","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.engappai.2026.114406_b56","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Athanasiou, N., Black, M.J., 2020. VIBE: Video Inference for Human Body Pose and Shape Estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 5253\u20135263.","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"10.1016\/j.engappai.2026.114406_b57","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Black, M.J., Daniilidis, K., 2019. Learning to Reconstruct 3D Human Pose and Shape via Model-Fitting in the Loop. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2252\u20132261.","DOI":"10.1109\/ICCV.2019.00234"},{"key":"10.1016\/j.engappai.2026.114406_b58","series-title":"2011 International Conference on Computer Vision","first-page":"2556","article-title":"HMDB: A large video database for human motion recognition","author":"Kuehne","year":"2011"},{"key":"10.1016\/j.engappai.2026.114406_b59","doi-asserted-by":"crossref","unstructured":"Li, M., Chen, S., Chen, X., Zhang, Y., Wang, Y., Tian, Q., 2019. Actional-Structural Graph Convolutional Networks for Skeleton-Based Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3595\u20133603.","DOI":"10.1109\/CVPR.2019.00371"},{"key":"10.1016\/j.engappai.2026.114406_b60","series-title":"A decoupled spatio-temporal framework for skeleton-based action segmentation","author":"Li","year":"2023"},{"key":"10.1016\/j.engappai.2026.114406_b61","first-page":"1404","article-title":"TA2n: two-stage action alignment network for few-shot action recognition","volume":"vol. 36","author":"Li","year":"2022"},{"key":"10.1016\/j.engappai.2026.114406_b62","doi-asserted-by":"crossref","unstructured":"Li, W., Liu, H., Tang, H., Wang, P., Van Gool, L., 2022. MHFormer: Multi-Hypothesis Transformer for 3D Human Pose Estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 13147\u201313156.","DOI":"10.1109\/CVPR52688.2022.01280"},{"key":"10.1016\/j.engappai.2026.114406_b63","first-page":"5011","article-title":"HOI analysis: integrating and decomposing human-object interaction","volume":"vol. 33","author":"Li","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b64","series-title":"The AVA-kinetics localized human actions video dataset","author":"Li","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b65","doi-asserted-by":"crossref","unstructured":"Li, L., Wang, M., Ni, B., Wang, H., Yang, J., Zhang, W., 2021. 3D Human Action Representation Learning via Cross-View Consistency Pursuit. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4741\u20134750.","DOI":"10.1109\/CVPR46437.2021.00471"},{"key":"10.1016\/j.engappai.2026.114406_b66","doi-asserted-by":"crossref","unstructured":"Li, Y.-L., Zhou, S., Huang, X., Xu, L., Ma, Z., Fang, H.-S., Wang, Y., Lu, C., 2019. Transferable Interactiveness Knowledge for Human-Object Interaction Detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3585\u20133594.","DOI":"10.1109\/CVPR.2019.00370"},{"key":"10.1016\/j.engappai.2026.114406_b67","doi-asserted-by":"crossref","unstructured":"Liao, Y., Liu, S., Wang, F., Chen, Y., Qian, C., Feng, J., 2020. PPDM: Parallel Point Detection and Matching for Real-Time Human-Object Interaction Detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 482\u2013490.","DOI":"10.1109\/CVPR42600.2020.00056"},{"key":"10.1016\/j.engappai.2026.114406_b68","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S., 2019. TSM: Temporal Shift Module for Efficient Video Understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 7083\u20137093.","DOI":"10.1109\/ICCV.2019.00718"},{"key":"10.1016\/j.engappai.2026.114406_b69","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"2490","article-title":"MS2l: Multi-Task Self-Supervised Learning for skeleton based action recognition","author":"Lin","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b70","doi-asserted-by":"crossref","unstructured":"Lin, K., Wang, L., Liu, Z., 2021. End-to-End Human Pose and Mesh Reconstruction with Transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1954\u20131963.","DOI":"10.1109\/CVPR46437.2021.00199"},{"key":"10.1016\/j.engappai.2026.114406_b71","first-page":"25268","article-title":"Motion-x: A large-scale 3D expressive whole-body human motion dataset","volume":"36","author":"Lin","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b72","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"248","article-title":"Amplifying key cues for human-object-interaction detection","author":"Liu","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b73","series-title":"PKU-mmd: A large scale benchmark for continuous multi-modal human action understanding","author":"Liu","year":"2017"},{"key":"10.1016\/j.engappai.2026.114406_b74","doi-asserted-by":"crossref","unstructured":"Liu, Z., Ning, J., Cao, Y., Wei, Y., Zhang, Z., Lin, S., Hu, H., 2022. Video Swin Transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3202\u20133211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"10.1016\/j.engappai.2026.114406_b75","doi-asserted-by":"crossref","unstructured":"Liu, X., Pintea, S.L., Nejadasl, F.K., Booij, O., van Gemert, J.C., 2021. No Frame Left Behind: Full Video Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14892\u201314901.","DOI":"10.1109\/CVPR46437.2021.01465"},{"issue":"10","key":"10.1016\/j.engappai.2026.114406_b76","doi-asserted-by":"crossref","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","article-title":"NTU RGB+d 120: a large-scale benchmark for 3D human activity understanding","volume":"42","author":"Liu","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.114406_b77","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, H., Chen, Z., Wang, Z., Ouyang, W., 2020. Disentangling and Unifying Graph Convolutions for Skeleton-Based Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 143\u2013152.","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"10.1016\/j.engappai.2026.114406_b78","first-page":"17939","article-title":"MOMA: Multi-object multi-actor activity parsing","volume":"vol 34","author":"Luo","year":"2021"},{"issue":"12","key":"10.1016\/j.engappai.2026.114406_b79","doi-asserted-by":"crossref","first-page":"13411","DOI":"10.1109\/TCYB.2021.3132016","article-title":"Fine-grained unsupervised temporal action segmentation and distributed representation for skeleton-based human motion analysis","volume":"52","author":"Ma","year":"2022","journal-title":"IEEE Trans. Cybern."},{"key":"10.1016\/j.engappai.2026.114406_b80","doi-asserted-by":"crossref","unstructured":"Martinez, B., Modolo, D., Xiong, Y., Tighe, J., 2019. Action Recognition With Spatial-Temporal Discriminative Filter Banks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 5482\u20135491.","DOI":"10.1109\/ICCV.2019.00558"},{"key":"10.1016\/j.engappai.2026.114406_b81","doi-asserted-by":"crossref","DOI":"10.3389\/fnbot.2019.00038","article-title":"Neuromorphic vision datasets for pedestrian detection, action recognition, and fall detection","volume":"13","author":"Miao","year":"2019","journal-title":"Front. Neurorobotics"},{"key":"10.1016\/j.engappai.2026.114406_b82","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"752","article-title":"I2L-MeshNet: Image-to-lixel prediction network for accurate 3D human pose and mesh estimation from a single RGB image","author":"Moon","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b83","doi-asserted-by":"crossref","unstructured":"Neimark, D., Bar, O., Zohar, M., Asselmann, D., 2021. Video Transformer Network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 3163\u20133172.","DOI":"10.1109\/ICCVW54120.2021.00355"},{"issue":"19","key":"10.1016\/j.engappai.2026.114406_b84","doi-asserted-by":"crossref","first-page":"8234","DOI":"10.3390\/s23198234","article-title":"A hierarchical multitask learning approach for the recognition of activities of daily living using data from wearable sensors","volume":"23","author":"Nisar","year":"2023","journal-title":"Sensors"},{"key":"10.1016\/j.engappai.2026.114406_b85","doi-asserted-by":"crossref","unstructured":"Noor, N., Jametoni, F., Kim, J., Hong, H., Park, I.K., 2024. Efficient Skeleton-Based Action Recognition for Real-Time Embedded Systems. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 5889\u20135897.","DOI":"10.1109\/CVPRW63382.2024.00596"},{"key":"10.1016\/j.engappai.2026.114406_b86","doi-asserted-by":"crossref","unstructured":"Pan, J., Chen, S., Shou, M.Z., Liu, Y., Shao, J., Li, H., 2021. Actor-Context-Actor Relation Network for Spatio-Temporal Action Localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 464\u2013474.","DOI":"10.1109\/CVPR46437.2021.00053"},{"issue":"4","key":"10.1016\/j.engappai.2026.114406_b87","doi-asserted-by":"crossref","first-page":"773","DOI":"10.1016\/j.sigpro.2010.08.010","article-title":"Efficient HOG human detection","volume":"91","author":"Pang","year":"2011","journal-title":"Signal Process."},{"key":"10.1016\/j.engappai.2026.114406_b88","first-page":"12493","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","volume":"vol. 34","author":"Patrick","year":"2021"},{"key":"10.1016\/j.engappai.2026.114406_b89","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Choutas, V., Ghorbani, N., Bolkart, T., Osman, A.A.A., Tzionas, D., Black, M.J., 2019. Expressive Body Capture: 3D Hands, Face, and Body From a Single Image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10975\u201310985.","DOI":"10.1109\/CVPR.2019.01123"},{"key":"10.1016\/j.engappai.2026.114406_b90","doi-asserted-by":"crossref","unstructured":"Pavllo, D., Feichtenhofer, C., Grangier, D., Auli, M., 2019. 3D Human Pose Estimation in Video With Temporal Convolutions and Semi-Supervised Training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 7753\u20137762.","DOI":"10.1109\/CVPR.2019.00794"},{"key":"10.1016\/j.engappai.2026.114406_b91","doi-asserted-by":"crossref","unstructured":"Perrett, T., Masullo, A., Burghardt, T., Mirmehdi, M., Damen, D., 2021. Temporal-Relational CrossTransformers for Few-Shot Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 475\u2013484.","DOI":"10.1109\/CVPR46437.2021.00054"},{"key":"10.1016\/j.engappai.2026.114406_b92","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J., 2021. BABEL: Bodies, Action and Behavior With English Labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 722\u2013731.","DOI":"10.1109\/CVPR46437.2021.00078"},{"key":"10.1016\/j.engappai.2026.114406_b93","first-page":"119336","article-title":"Streaming long video understanding with large language models","volume":"37","author":"Qian","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b94","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Ngo, C.-W., Tian, X., Mei, T., 2019. Learning Spatio-Temporal Representation With Local and Global Diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 12056\u201312065.","DOI":"10.1109\/CVPR.2019.01233"},{"key":"10.1016\/j.engappai.2026.114406_b95","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Shu, Y., Ngo, C.-W., Mei, T., 2021. Condensing a Sequence to One Informative Frame for Video Recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 16311\u201316320.","DOI":"10.1109\/ICCV48922.2021.01600"},{"key":"10.1016\/j.engappai.2026.114406_b96","doi-asserted-by":"crossref","unstructured":"Qu, H., Cai, Y., Liu, J., 2024. LLMs Are Good Action Recognizers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18395\u201318406.","DOI":"10.1109\/CVPR52733.2024.01741"},{"key":"10.1016\/j.engappai.2026.114406_b97","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.engappai.2026.114406_b98","doi-asserted-by":"crossref","unstructured":"Rajasegaran, J., Pavlakos, G., Kanazawa, A., Feichtenhofer, C., Malik, J., 2023. On the Benefits of 3D Pose and Tracking for Human Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 640\u2013649.","DOI":"10.1109\/CVPR52729.2023.00069"},{"key":"10.1016\/j.engappai.2026.114406_b99","doi-asserted-by":"crossref","unstructured":"Sabater, A., Montesano, L., Murillo, A.C., 2022. Event Transformer. A Sparse-Aware Solution for Efficient Event Data Processing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2677\u20132686.","DOI":"10.1109\/CVPRW56347.2022.00301"},{"key":"10.1016\/j.engappai.2026.114406_b100","doi-asserted-by":"crossref","unstructured":"Schmidtke, L., Vlontzos, A., Ellershaw, S., Lukens, A., Arichi, T., Kainz, B., 2021. Unsupervised Human Pose Estimation Through Transforming Shape Templates. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2484\u20132494.","DOI":"10.1109\/CVPR46437.2021.00251"},{"key":"10.1016\/j.engappai.2026.114406_b101","first-page":"32","article-title":"Recognizing human actions: A local SVM approach","volume":"vol. 3","author":"Schuldt","year":"2004"},{"key":"10.1016\/j.engappai.2026.114406_b102","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.-T., Wang, G., 2016. NTU RGB+D: A Large Scale Dataset for 3D Human Activity Analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 1010\u20131019.","DOI":"10.1109\/CVPR.2016.115"},{"key":"10.1016\/j.engappai.2026.114406_b103","series-title":"Computer Vision \u2013 ECCV 2022","first-page":"461","article-title":"P-STMO: Pre-trained spatial temporal many-to-one model for 3D human pose estimation","author":"Shan","year":"2022"},{"key":"10.1016\/j.engappai.2026.114406_b104","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., Lin, D., 2020. FineGym: A Hierarchical Video Dataset for Fine-Grained Action Understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2616\u20132625.","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"10.1016\/j.engappai.2026.114406_b105","series-title":"Convolutional LSTM network: aMachine learning approach for precipitation nowcasting","author":"Shi","year":"2015"},{"key":"10.1016\/j.engappai.2026.114406_b106","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H., 2019. Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 12026\u201312035.","DOI":"10.1109\/CVPR.2019.01230"},{"key":"10.1016\/j.engappai.2026.114406_b107","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H., 2020. Decoupled Spatial-Temporal Attention Network for Skeleton-Based Action-Gesture Recognition. In: Proceedings of the Asian Conference on Computer Vision.","DOI":"10.1007\/978-3-030-69541-5_3"},{"key":"10.1016\/j.engappai.2026.114406_b108","article-title":"Two-stream convolutional networks for action recognition in videos","volume":"vol. 27","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.engappai.2026.114406_b109","series-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"10.1016\/j.engappai.2026.114406_b110","doi-asserted-by":"crossref","unstructured":"Su, Y., Lin, G., Wu, Q., 2021. Self-Supervised 3D Skeleton Action Representation Learning With Motion Consistency and Continuity. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 13328\u201313338.","DOI":"10.1109\/ICCV48922.2021.01308"},{"key":"10.1016\/j.engappai.2026.114406_b111","doi-asserted-by":"crossref","unstructured":"Sun, P., Cao, J., Jiang, Y., Yuan, Z., Bai, S., Kitani, K., Luo, P., 2022. DanceTrack: Multi-Object Tracking in Uniform Appearance and Diverse Motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 20993\u201321002.","DOI":"10.1109\/CVPR52688.2022.02032"},{"key":"10.1016\/j.engappai.2026.114406_b112","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"318","article-title":"Actor-centric relation network","author":"Sun","year":"2018"},{"key":"10.1016\/j.engappai.2026.114406_b113","doi-asserted-by":"crossref","unstructured":"Tan, F., Tang, D., Dou, M., Guo, K., Pandey, R., Keskin, C., Du, R., Sun, D., Bouaziz, S., Fanello, S., Tan, P., Zhang, Y., 2021. HumanGPS: Geodesic PreServing Feature for Dense Human Correspondences. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1820\u20131830.","DOI":"10.1109\/CVPR46437.2021.00186"},{"key":"10.1016\/j.engappai.2026.114406_b114","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"71","article-title":"Asynchronous interaction aggregation for action detection","author":"Tang","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b115","series-title":"AVD: Adversarial video distillation","author":"Tavakolian","year":"2019"},{"key":"10.1016\/j.engappai.2026.114406_b116","doi-asserted-by":"crossref","unstructured":"Tavakolian, M., Tavakoli, H.R., Hadid, A., 2019b. AWSD: Adaptive Weighted Spatiotemporal Distillation for Video Representation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 8020\u20138029.","DOI":"10.1109\/ICCV.2019.00811"},{"key":"10.1016\/j.engappai.2026.114406_b117","series-title":"Chameleon: mixed-modal early-fusion foundation models","author":"Team","year":"2025"},{"key":"10.1016\/j.engappai.2026.114406_b118","doi-asserted-by":"crossref","unstructured":"Thatipelli, A., Narayan, S., Khan, S., Anwer, R.M., Khan, F.S., Ghanem, B., 2022. Spatio-Temporal Relation Modeling for Few-Shot Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19958\u201319967.","DOI":"10.1109\/CVPR52688.2022.01933"},{"key":"10.1016\/j.engappai.2026.114406_b119","first-page":"10078","article-title":"VideoMAE: Masked Autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b120","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Feiszli, M., 2019. Video Classification With Channel-Separated Convolutional Networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 5552\u20135561.","DOI":"10.1109\/ICCV.2019.00565"},{"key":"10.1016\/j.engappai.2026.114406_b121","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M., 2018. A Closer Look at Spatiotemporal Convolutions for Action Recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 6450\u20136459.","DOI":"10.1109\/CVPR.2018.00675"},{"key":"10.1016\/j.engappai.2026.114406_b122","doi-asserted-by":"crossref","unstructured":"Ulutan, O., Iftekhar, A.S.M., Manjunath, B.S., 2020. VSGNet: Spatial Attention Network for Detecting Human Object Interactions Using Graph Convolutions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 13617\u201313626.","DOI":"10.1109\/CVPR42600.2020.01363"},{"key":"10.1016\/j.engappai.2026.114406_b123","doi-asserted-by":"crossref","unstructured":"Vasu, P.K.A., Faghri, F., Li, C.-L., Koc, C., True, N., Antony, A., Santhanam, G., Gabriel, J., Grasch, P., Tuzel, O., Pouransari, H., 2025. FastVLM: Efficient Vision Encoding for Vision Language Models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19769\u201319780.","DOI":"10.1109\/CVPR52734.2025.01841"},{"key":"10.1016\/j.engappai.2026.114406_b124","article-title":"Matching networks for one shot learning","volume":"vol. 29","author":"Vinyals","year":"2016"},{"key":"10.1016\/j.engappai.2026.114406_b125","doi-asserted-by":"crossref","unstructured":"Wan, B., Zhou, D., Liu, Y., Li, R., He, X., 2019. Pose-Aware Multi-Level Feature Network for Human Object Interaction Detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 9469\u20139478.","DOI":"10.1109\/ICCV.2019.00956"},{"key":"10.1016\/j.engappai.2026.114406_b126","doi-asserted-by":"crossref","unstructured":"Wang, J., Cherian, A., Porikli, F., Gould, S., 2018. Video Representation Learning Using Discriminative Pooling. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 1149\u20131158.","DOI":"10.1109\/CVPR.2018.00126"},{"key":"10.1016\/j.engappai.2026.114406_b127","doi-asserted-by":"crossref","unstructured":"Wang, Z., Fang, Y., Cao, J., Zhang, Q., Wang, Z., Xu, R., 2023. Masked Spiking Transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1761\u20131771.","DOI":"10.1109\/ICCV51070.2023.00169"},{"key":"10.1016\/j.engappai.2026.114406_b128","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K., 2018. Non-Local Neural Networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7794\u20137803.","DOI":"10.1109\/CVPR.2018.00813"},{"issue":"9","key":"10.1016\/j.engappai.2026.114406_b129","doi-asserted-by":"crossref","first-page":"8522","DOI":"10.1109\/TCSVT.2024.3384875","article-title":"Few-shot action recognition via multi-view representation learning","volume":"34","author":"Wang","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.engappai.2026.114406_b130","first-page":"121475","article-title":"CogVLM: Visual expert for pretrained language models","volume":"37","author":"Wang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b131","doi-asserted-by":"crossref","unstructured":"Wang, L., Tong, Z., Ji, B., Wu, G., 2021. TDN: Temporal Difference Networks for Efficient Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1895\u20131904.","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"10.1016\/j.engappai.2026.114406_b132","series-title":"Computer Vision \u2013 ECCV 2016","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"Wang","year":"2016"},{"issue":"10","key":"10.1016\/j.engappai.2026.114406_b133","doi-asserted-by":"crossref","first-page":"5932","DOI":"10.1109\/TCSVT.2023.3262670","article-title":"Task-aware dual-representation network for few-shot action recognition","volume":"33","author":"Wang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"6","key":"10.1016\/j.engappai.2026.114406_b134","doi-asserted-by":"crossref","first-page":"1899","DOI":"10.1007\/s11263-023-01917-4","article-title":"CLIP-guided prototype modulating for few-shot action recognition","volume":"132","author":"Wang","year":"2024","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.engappai.2026.114406_b135","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, S., Qing, Z., Gao, C., Zhang, Y., Zhao, D., Sang, N., 2023b. MoLo: Motion-Augmented Long-Short Contrastive Learning for Few-Shot Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18011\u201318021.","DOI":"10.1109\/CVPR52729.2023.01727"},{"key":"10.1016\/j.engappai.2026.114406_b136","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, S., Qing, Z., Tang, M., Zuo, Z., Gao, C., Jin, R., Sang, N., 2022. Hybrid Relation Guided Set Matching for Few-Shot Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19948\u201319957.","DOI":"10.1109\/CVPR52688.2022.01932"},{"key":"10.1016\/j.engappai.2026.114406_b137","doi-asserted-by":"crossref","unstructured":"Wang, N., Zhu, G., Li, H., Zhang, L., Shah, S.A.A., Bennamoun, M., 2024. Language Model Guided Interpretable Video Action Reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18878\u201318887.","DOI":"10.1109\/CVPR52733.2024.01786"},{"key":"10.1016\/j.engappai.2026.114406_b138","doi-asserted-by":"crossref","unstructured":"Wei, C., Fan, H., Xie, S., Wu, C.-Y., Yuille, A., Feichtenhofer, C., 2022. Masked Feature Prediction for Self-Supervised Visual Pre-Training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14668\u201314678.","DOI":"10.1109\/CVPR52688.2022.01426"},{"issue":"3","key":"10.1016\/j.engappai.2026.114406_b139","doi-asserted-by":"crossref","first-page":"534","DOI":"10.26599\/BDMA.2024.9020076","article-title":"SVMFN-FSAR: semantic-guided video multimodal fusion network for few-shot action recognition","volume":"8","author":"Wei","year":"2025","journal-title":"Big Data Min. Anal."},{"key":"10.1016\/j.engappai.2026.114406_b140","series-title":"Janus: decoupling visual encoding for unified multimodal understanding and generation","author":"Wu","year":"2024"},{"key":"10.1016\/j.engappai.2026.114406_b141","doi-asserted-by":"crossref","unstructured":"Wu, C.-Y., Feichtenhofer, C., Fan, H., He, K., Krahenbuhl, P., Girshick, R., 2019. Long-Term Feature Banks for Detailed Video Understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 284\u2013293.","DOI":"10.1109\/CVPR.2019.00037"},{"key":"10.1016\/j.engappai.2026.114406_b142","doi-asserted-by":"crossref","unstructured":"Wu, C.-Y., Krahenbuhl, P., 2021. Towards Long-Form Video Understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1884\u20131894.","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"10.1016\/j.engappai.2026.114406_b143","first-page":"2847","article-title":"Revisiting classifier: transferring vision-language models for video recognition","volume":"vol. 37","author":"Wu","year":"2023"},{"key":"10.1016\/j.engappai.2026.114406_b144","doi-asserted-by":"crossref","unstructured":"Wu, J., Zhang, T., Zhang, Z., Wu, F., Zhang, Y., 2022. Motion-Modulated Temporal Fragment Alignment Network for Few-Shot Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9151\u20139160.","DOI":"10.1109\/CVPR52688.2022.00894"},{"key":"10.1016\/j.engappai.2026.114406_b145","series-title":"2012 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops","first-page":"20","article-title":"View invariant human action recognition using histograms of 3D joints","author":"Xia","year":"2012"},{"key":"10.1016\/j.engappai.2026.114406_b146","doi-asserted-by":"crossref","unstructured":"Xiang, W., Li, C., Zhou, Y., Wang, B., Zhang, L., 2023. Generative Action Description Prompts for Skeleton-based Action Recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10276\u201310285.","DOI":"10.1109\/ICCV51070.2023.00943"},{"key":"10.1016\/j.engappai.2026.114406_b147","doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K., 2018. Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video Classification. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 305\u2013321.","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"10.1016\/j.engappai.2026.114406_b148","first-page":"1","article-title":"Language knowledge-assisted representation learning for skeleton-based action recognition","author":"Xu","year":"2025","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.engappai.2026.114406_b149","series-title":"Proceedings 1992 IEEE Computer Society Conference on Computer Vision and Pattern Recognition","first-page":"379","article-title":"Recognizing human action in time-sequential images using hidden Markov model","author":"Yamato","year":"1992"},{"key":"10.1016\/j.engappai.2026.114406_b150","article-title":"Spatial temporal graph convolutional networks for skeleton-based action recognition","volume":"vol. 32","author":"Yan","year":"2018"},{"key":"10.1016\/j.engappai.2026.114406_b151","doi-asserted-by":"crossref","unstructured":"Yang, J., Dong, X., Liu, L., Zhang, C., Shen, J., Yu, D., 2022. Recurring the Transformer for Video Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14063\u201314073.","DOI":"10.1109\/CVPR52688.2022.01367"},{"key":"10.1016\/j.engappai.2026.114406_b152","series-title":"Computer Vision \u2013 ECCV 2024","first-page":"304","article-title":"Follow the rules: Reasoning for video anomaly detection with large language models","author":"Yang","year":"2025"},{"key":"10.1016\/j.engappai.2026.114406_b153","series-title":"2021 16th IEEE International Conference on Automatic Face and Gesture Recognition","first-page":"1","article-title":"Self-supervised video pose representation learning for occlusion- robust action recognition","author":"Yang","year":"2021"},{"issue":"1","key":"10.1016\/j.engappai.2026.114406_b154","doi-asserted-by":"crossref","first-page":"34","DOI":"10.1109\/3468.553220","article-title":"Human action learning via hidden Markov model","volume":"27","author":"Yang","year":"1997","journal-title":"IEEE Trans. Syst. Man Cybern. - Part A: Syst. Humans"},{"key":"10.1016\/j.engappai.2026.114406_b155","series-title":"ASFormer: Transformer for action segmentation","author":"Yi","year":"2021"},{"key":"10.1016\/j.engappai.2026.114406_b156","doi-asserted-by":"crossref","unstructured":"Zhang, H., Leong, M.C., Li, L., Lin, W., 2024. PeVL: Pose-Enhanced Vision-Language Model for Fine-Grained Human Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18857\u201318867.","DOI":"10.1109\/CVPR52733.2024.01784"},{"key":"10.1016\/j.engappai.2026.114406_b157","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tokmakov, P., Hebert, M., Schmid, C., 2019. A Structured Model for Action Detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9975\u20139984.","DOI":"10.1109\/CVPR.2019.01021"},{"key":"10.1016\/j.engappai.2026.114406_b158","doi-asserted-by":"crossref","unstructured":"Zhang, J., Tu, Z., Yang, J., Chen, Y., Yuan, J., 2022. MixSTE: Seq2seq Mixed Spatio-Temporal Encoder for 3D Human Pose Estimation in Video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 13232\u201313242.","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"10.1016\/j.engappai.2026.114406_b159","series-title":"Computer Vision \u2013 ECCV 2020","first-page":"525","article-title":"Few-shot action recognition with permutation-invariant attention","author":"Zhang","year":"2020"},{"key":"10.1016\/j.engappai.2026.114406_b160","doi-asserted-by":"crossref","unstructured":"Zhao, M., Yu, Y., Wang, X., Yang, L., Niu, D., 2023. Search-Map-Search: A Frame Selection Paradigm for Action Recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10627\u201310636.","DOI":"10.1109\/CVPR52729.2023.01024"},{"key":"10.1016\/j.engappai.2026.114406_b161","doi-asserted-by":"crossref","unstructured":"Zheng, C., Zhu, S., Mendieta, M., Yang, T., Chen, C., Ding, Z., 2021. 3D Human Pose Estimation With Spatial and Temporal Transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 11656\u201311665.","DOI":"10.1109\/ICCV48922.2021.01145"},{"key":"10.1016\/j.engappai.2026.114406_b162","first-page":"65069","article-title":"Learning human action recognition representations without real humans","volume":"36","author":"Zhong","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2026.114406_b163","doi-asserted-by":"crossref","unstructured":"Zhou, X., Arnab, A., Sun, C., Schmid, C., 2023. How Can Objects Help Action Recognition?. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2353\u20132362.","DOI":"10.1109\/CVPR52729.2023.00233"},{"key":"10.1016\/j.engappai.2026.114406_b164","doi-asserted-by":"crossref","unstructured":"Zhou, J., Zheng, X., Lyu, Y., Wang, L., 2024. ExACT: Language-guided Conceptual Reasoning and Uncertainty Estimation for Event-based Action Recognition and More. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18633\u201318643.","DOI":"10.1109\/CVPR52733.2024.01763"},{"key":"10.1016\/j.engappai.2026.114406_b165","doi-asserted-by":"crossref","unstructured":"Zhu, W., Ma, X., Liu, Z., Liu, L., Wu, W., Wang, Y., 2023. MotionBERT: A Unified Perspective on Learning Human Motion Representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 15085\u201315099.","DOI":"10.1109\/ICCV51070.2023.01385"},{"key":"10.1016\/j.engappai.2026.114406_b166","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Min, M.R., Kadav, A., Graf, H.P., 2020. S3VAE: Self-Supervised Sequential VAE for Representation Disentanglement and Data Generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6538\u20136547.","DOI":"10.1109\/CVPR42600.2020.00657"},{"key":"10.1016\/j.engappai.2026.114406_b167","unstructured":"Zhu, X., Tao, X., Shi, L., Chen, S., Yin, R., Ding, L., Obinata, Y., Yamamoto, T., Tan, Z., 0000. Multi-Scale Spatiotemporal Features for Action Localization."},{"key":"10.1016\/j.engappai.2026.114406_b168","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y., 2018. Compound Memory Networks for Few-shot Video Classification. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 751\u2013766.","DOI":"10.1007\/978-3-030-01234-2_46"},{"key":"10.1016\/j.engappai.2026.114406_b169","series-title":"2018 24th International Conference on Pattern Recognition","first-page":"645","article-title":"End-to-end video-level representation learning for action recognition","author":"Zhu","year":"2018"}],"container-title":["Engineering Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626006871?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626006871?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T19:47:28Z","timestamp":1776109648000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0952197626006871"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":169,"alternative-id":["S0952197626006871"],"URL":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114406","relation":{},"ISSN":["0952-1976"],"issn-type":[{"value":"0952-1976","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Role of prior in human activity recognition: A survey","name":"articletitle","label":"Article Title"},{"value":"Engineering Applications of Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.engappai.2026.114406","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114406"}}