{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T14:20:50Z","timestamp":1773843650574,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":326,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717746","type":"proceedings-article","created":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T14:10:32Z","timestamp":1750687832000},"page":"1869-1884","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["The Journey of Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5309-4340","authenticated-orcid":false,"given":"Xi","family":"Ding","sequence":"first","affiliation":[{"name":"Australian National University, Canberra, Australian Capital Territory, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8600-7099","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"Griffith University, Brisbane, Queensland, Australia and Australian National University, Canberra, Australian Capital Territory, Australia"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3086668"},{"key":"e_1_3_2_2_2_1","volume-title":"EZ-CLIP: Efficient Zeroshot Video Action Recognition. ArXiv","author":"Ahmad Shahzad","year":"2023","unstructured":"Shahzad Ahmad, Sukalpa Chanda, and Yogesh Singh Rawat. 2023. EZ-CLIP: Efficient Zeroshot Video Action Recognition. ArXiv, Vol. abs\/2312.08010 (2023). 
https:\/\/api.semanticscholar.org\/CorpusID:266191106"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cogsys.2018.04.002"},{"key":"e_1_3_2_2_4_1","volume-title":"Flamingo: a Visual Language Model for Few-Shot Learning. ArXiv","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikolaj Binkowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Karen Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. ArXiv, Vol. abs\/2204.14198 (2022). https:\/\/api.semanticscholar.org\/CorpusID:248476411"},{"key":"e_1_3_2_2_5_1","volume-title":"Lucas Smaira, Sander Dieleman, and Andrew Zisserman.","author":"Alayrac Jean-Baptiste","year":"2020","unstructured":"Jean-Baptiste Alayrac, Adria Recasens, Rosalia Schneider, Relja Arandjelovi\u0107, Jason Ramapuram, Jeffrey De Fauw, Lucas Smaira, Sander Dieleman, and Andrew Zisserman. 2020. Self-supervised multimodal versatile networks. Advances in neural information processing systems, Vol. 33 (2020), 25--37."},{"key":"e_1_3_2_2_6_1","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume":"33","author":"Alwassel Humam","year":"2020","unstructured":"Humam Alwassel, Dhruv Mahajan, Bruno Korbar, Lorenzo Torresani, Bernard Ghanem, and Du Tran. 2020. Self-supervised learning by cross-modal audio-video clustering. Advances in Neural Information Processing Systems, Vol. 
33 (2020), 9758--9770.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_2_8_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 1708--1718. https:\/\/api.semanticscholar.org\/CorpusID:232478955"},{"key":"e_1_3_2_2_9_1","volume-title":"Surf: Speeded up robust features. In Computer Vision--ECCV 2006: 9th European Conference on Computer Vision, Graz, Austria, May 7--13","author":"Bay Herbert","year":"2006","unstructured":"Herbert Bay, Tinne Tuytelaars, and Luc Van Gool. 2006. Surf: Speeded up robust features. In Computer Vision--ECCV 2006: 9th European Conference on Computer Vision, Graz, Austria, May 7--13, 2006. Proceedings, Part I 9. Springer, 404--417."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01888"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00089"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"e_1_3_2_2_13_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 
4.","journal-title":"ICML"},{"key":"e_1_3_2_2_14_1","volume-title":"Action recognition with dynamic image networks","author":"Bilen Hakan","year":"2017","unstructured":"Hakan Bilen, Basura Fernando, Efstratios Gavves, and Andrea Vedaldi. 2017. Action recognition with dynamic image networks. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 12 (2017), 2799--2813."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2012.6239175"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206779"},{"key":"e_1_3_2_2_17_1","volume-title":"Swathikiran Sudhakaran, Brais Martinez, and Georgios Tzimiropoulos.","author":"Bulat Adrian","year":"2021","unstructured":"Adrian Bulat, Juan Manuel Perez Rua, Swathikiran Sudhakaran, Brais Martinez, and Georgios Tzimiropoulos. 2021. Space-time mixing attention for video transformer. Advances in neural information processing systems, Vol. 34 (2021), 19594--19607."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_2_19_1","volume-title":"A short note about kinetics-600. arXiv preprint arXiv:1808.01340","author":"Carreira Joao","year":"2018","unstructured":"Joao Carreira, Eric Noland, Andras Banki-Horvath, Chloe Hillier, and Andrew Zisserman. 2018. A short note about kinetics-600. arXiv preprint arXiv:1808.01340 (2018)."},{"key":"e_1_3_2_2_20_1","volume-title":"A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987","author":"Carreira Joao","year":"2019","unstructured":"Joao Carreira, Eric Noland, Chloe Hillier, and Andrew Zisserman. 2019. A short note on the kinetics-700 human action dataset. 
arXiv preprint arXiv:1907.06987 (2019)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7350781"},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6165--6175","author":"Richard Chen Chun-Fu","year":"2021","unstructured":"Chun-Fu Richard Chen, Rameswar Panda, Kandan Ramakrishnan, Rogerio Feris, John Cohn, Aude Oliva, and Quanfu Fan. 2021a. Deep analysis of cnn-based spatio-temporal representations for action recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6165--6175."},{"key":"e_1_3_2_2_24_1","volume-title":"When spatial meets temporal in action recognition. arXiv preprint arXiv:2411.15284","author":"Chen Huilin","year":"2024","unstructured":"Huilin Chen, Lei Wang, Yifan Chen, Tom Gedeon, and Piotr Koniusz. 2024b. When spatial meets temporal in action recognition. arXiv preprint arXiv:2411.15284 (2024)."},{"key":"e_1_3_2_2_25_1","volume-title":"The 16th Asian Conference on Machine Learning (Conference Track).","author":"Chen Qixiang","unstructured":"Qixiang Chen, Lei Wang, Piotr Koniusz, and Tom Gedeon. [n.,d.]. Motion meets attention: Video motion prompts. In The 16th Asian Conference on Machine Learning (Conference Track)."},{"key":"e_1_3_2_2_26_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_2_27_1","volume-title":"Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition. 
2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Chen Yuxin","year":"2021","unstructured":"Yuxin Chen, Ziqi Zhang, Chunfen Yuan, Bing Li, Ying Deng, and Weiming Hu. 2021b. Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 13339--13348. https:\/\/api.semanticscholar.org\/CorpusID:236428765"},{"key":"e_1_3_2_2_28_1","volume-title":"KAN-HyperpointNet for Point Cloud Sequence-Based 3D Human Action Recognition. arXiv preprint arXiv:2409.09444","author":"Chen Zhaoyu","year":"2024","unstructured":"Zhaoyu Chen, Xing Li, Qian Huang, Qiang Geng, Tianjin Yang, and Shihao Han. 2024a. KAN-HyperpointNet for Point Cloud Sequence-Based 3D Human Action Recognition. arXiv preprint arXiv:2409.09444 (2024)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_29"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.123061"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.172"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.368"},{"key":"e_1_3_2_2_34_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20186--20196","author":"Ha Myoung Hoon","year":"2022","unstructured":"Hyung-gun Chi, Myoung Hoon Ha, Seunggeun Chi, Sang Wan Lee, Qixing Huang, and Karthik Ramani. 2022. InfoGCN: Representation Learning for Human Skeleton-Based Action Recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 
20186--20196."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.3390\/biology12071033"},{"key":"e_1_3_2_2_36_1","volume-title":"Enhancing Skeleton-Based Action Recognition in Real-World Scenarios Through Realistic Data Augmentation. 2024 IEEE\/CVF Winter Conference on Applications of Computer Vision Workshops (WACVW) (2024","author":"Cormier Mickael","year":"2024","unstructured":"Mickael Cormier, Yannik Schmid, and J\u00fcrgen Beyerer. 2024. Enhancing Skeleton-Based Action Recognition in Real-World Scenarios Through Realistic Data Augmentation. 2024 IEEE\/CVF Winter Conference on Applications of Computer Vision Workshops (WACVW) (2024), 300--309. https:\/\/api.semanticscholar.org\/CorpusID:269191024"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"e_1_3_2_2_38_1","volume-title":"Proceedings of the European conference on computer vision (ECCV). 720--736","author":"Damen Dima","year":"2018","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, Sanja Fidler, Antonino Furnari, Evangelos Kazakos, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al. 2018. Scaling egocentric vision: The epic-kitchens dataset. In Proceedings of the European conference on computer vision (ECCV). 720--736."},{"key":"e_1_3_2_2_39_1","volume-title":"Antonino Furnari, Evangelos Kazakos, Jian Ma, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al.","author":"Damen Dima","year":"2022","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, Antonino Furnari, Evangelos Kazakos, Jian Ma, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al. 2022. Rescaling egocentric vision: Collection, pipeline and challenges for epic-kitchens-100. 
International Journal of Computer Vision (2022), 1--23."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00092"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2024.3456670"},{"key":"e_1_3_2_2_42_1","volume-title":"Transfer Learning in Human Activity Recognition: A Survey. ArXiv","author":"Dhekane Sourish Gunesh","year":"2024","unstructured":"Sourish Gunesh Dhekane and Thomas Ploetz. 2024. Transfer Learning in Human Activity Recognition: A Survey. ArXiv, Vol. abs\/2401.10185 (2024). https:\/\/api.semanticscholar.org\/CorpusID:267034857"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2965299"},{"key":"e_1_3_2_2_44_1","volume-title":"Deep Temporal Linear Encoding Networks. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Diba Ali","year":"2016","unstructured":"Ali Diba, Vivek Sharma, and Luc Van Gool. 2016. Deep Temporal Linear Encoding Networks. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016), 1541--1550. https:\/\/api.semanticscholar.org\/CorpusID:6709077"},{"key":"e_1_3_2_2_45_1","volume-title":"Lego: Learnable expansion of graph operators for multi-modal feature fusion. arXiv preprint arXiv:2410.01506","author":"Ding Dexuan","year":"2024","unstructured":"Dexuan Ding, Lei Wang, Liyun Zhu, Tom Gedeon, and Piotr Koniusz. 2024. Lego: Learnable expansion of graph operators for multi-modal feature fusion. arXiv preprint arXiv:2410.01506 (2024)."},{"key":"e_1_3_2_2_46_1","volume-title":"Do Language Models Understand Time? arXiv preprint arXiv:2412.13845","author":"Ding Xi","year":"2024","unstructured":"Xi Ding and Lei Wang. 2024a. Do Language Models Understand Time? arXiv preprint arXiv:2412.13845 (2024)."},{"key":"e_1_3_2_2_47_1","volume-title":"Anomaly Detection? LLMs and VLMs in the Spotlight. arXiv preprint arXiv:2412.18298","author":"Ding Xi","year":"2024","unstructured":"Xi Ding and Lei Wang. 2024b. 
Quo Vadis, Anomaly Detection? LLMs and VLMs in the Spotlight. arXiv preprint arXiv:2412.18298 (2024)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/VSPETS.2005.1570899"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25127"},{"key":"e_1_3_2_2_51_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv, Vol. abs\/2010.11929 (2020). https:\/\/api.semanticscholar.org\/CorpusID:225039882"},{"key":"e_1_3_2_2_52_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 1110--1118","author":"Du Yong","year":"2015","unstructured":"Yong Du, Wei Wang, and Liang Wang. 2015. Hierarchical recurrent neural network for skeleton based action recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition. 1110--1118."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01398"},{"key":"e_1_3_2_2_56_1","unstructured":"Hehe Fan Xin Yu Yuhang Ding Yi Yang and M. Kankanhalli. 2022. PSTNet: Point Spatio-Temporal Convolution on Point Cloud Sequences. ArXiv Vol. abs\/2205.13713 (2022). 
https:\/\/api.semanticscholar.org\/CorpusID:235613642"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00470"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_2_2_59_1","volume-title":"Masked Autoencoders As Spatiotemporal Learners. ArXiv","author":"Feichtenhofer Christoph","year":"2022","unstructured":"Christoph Feichtenhofer, Haoqi Fan, Yanghao Li, and Kaiming He. 2022. Masked Autoencoders As Spatiotemporal Learners. ArXiv, Vol. abs\/2205.09113 (2022). https:\/\/api.semanticscholar.org\/CorpusID:248863181"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"e_1_3_2_2_63_1","volume-title":"Rank pooling for action recognition","author":"Fernando Basura","year":"2016","unstructured":"Basura Fernando, Efstratios Gavves, Jos\u00e9 Oramas, Amir Ghodrati, and Tinne Tuytelaars. 2016. Rank pooling for action recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 4 (2016), 773--787."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2207676.2208303"},{"key":"e_1_3_2_2_65_1","volume-title":"Hyperbolic self-paced learning for self-supervised skeleton-based action representations. arXiv preprint arXiv:2303.06242","author":"Franco Luca","year":"2023","unstructured":"Luca Franco, Paolo Mandica, Bharti Munjal, and Fabio Galasso. 2023. Hyperbolic self-paced learning for self-supervised skeleton-based action representations. arXiv preprint arXiv:2303.06242 (2023)."},{"key":"e_1_3_2_2_66_1","volume-title":"Need for Speed: A Benchmark for Higher Frame Rate Object Tracking. 
2017 IEEE International Conference on Computer Vision (ICCV)","author":"Galoogahi Hamed Kiani","year":"2017","unstructured":"Hamed Kiani Galoogahi, Ashton Fagg, Chen Huang, Deva Ramanan, and Simon Lucey. 2017. Need for Speed: A Benchmark for Higher Frame Rate Object Tracking. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 1134--1143. https:\/\/api.semanticscholar.org\/CorpusID:9857301"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.106"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.05.094"},{"key":"e_1_3_2_2_69_1","volume-title":"Large-Scale Weakly-Supervised Pre-Training for Video Action Recognition. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019","author":"Ghadiyaram Deepti","year":"2019","unstructured":"Deepti Ghadiyaram, Matt Feiszli, Du Tran, Xueting Yan, Heng Wang, and Dhruv Kumar Mahajan. 2019. Large-Scale Weakly-Supervised Pre-Training for Video Action Recognition. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019), 12038--12047. https:\/\/api.semanticscholar.org\/CorpusID:143423501"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01003"},{"key":"e_1_3_2_2_71_1","volume-title":"CATER: A diagnostic dataset for Compositional Actions and TEmporal Reasoning. arXiv preprint arXiv:1910.04744","author":"Girdhar Rohit","year":"2019","unstructured":"Rohit Girdhar and Deva Ramanan. 2019. CATER: A diagnostic dataset for Compositional Actions and TEmporal Reasoning. arXiv preprint arXiv:1910.04744 (2019)."},{"key":"e_1_3_2_2_72_1","volume-title":"ActionVLAD: Learning Spatio-Temporal Aggregation for Action Classification. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Girdhar Rohit","year":"2017","unstructured":"Rohit Girdhar, Deva Ramanan, Abhinav Kumar Gupta, Josef Sivic, and Bryan C. Russell. 2017. 
ActionVLAD: Learning Spatio-Temporal Aggregation for Action Classification. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017), 3165--3174. https:\/\/api.semanticscholar.org\/CorpusID:16091693"},{"key":"e_1_3_2_2_73_1","volume-title":"Watching the World Go By: Representation Learning from Unlabeled Videos. ArXiv","author":"Gordon Daniel","year":"2020","unstructured":"Daniel Gordon, Kiana Ehsani, Dieter Fox, and Ali Farhadi. 2020. Watching the World Go By: Representation Learning from Unlabeled Videos. ArXiv, Vol. abs\/2003.07990 (2020). https:\/\/api.semanticscholar.org\/CorpusID:212747934"},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70711"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19821-2_14"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20044-1_11"},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"e_1_3_2_2_80_1","first-page":"1","article-title":"A Survey on Deep Learning for Human Activity Recognition","volume":"54","author":"Gu Fuqiang","year":"2021","unstructured":"Fuqiang Gu, Mu-Huan Chung, Mark H. Chignell, Shahrokh Valaee, Baoding Zhou, and Xue Liu. 2021. A Survey on Deep Learning for Human Activity Recognition. ACM Computing Surveys (CSUR), Vol. 54 (2021), 1 -- 34. https:\/\/api.semanticscholar.org\/CorpusID:238260765","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"e_1_3_2_2_81_1","volume-title":"Self-supervised co-training for video representation learning. Advances in neural information processing systems","author":"Han Tengda","year":"2020","unstructured":"Tengda Han, Weidi Xie, and Andrew Zisserman. 2020. Self-supervised co-training for video representation learning. 
Advances in neural information processing systems, Vol. 33 (2020), 5679--5690."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.05.118"},{"key":"e_1_3_2_2_84_1","volume-title":"Masked Autoencoders Are Scalable Vision Learners. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"He Kaiming","year":"2021","unstructured":"Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Doll'ar, and Ross B. Girshick. 2021. Masked Autoencoders Are Scalable Vision Learners. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 15979--15988. https:\/\/api.semanticscholar.org\/CorpusID:243985980"},{"key":"e_1_3_2_2_85_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.01.010"},{"key":"e_1_3_2_2_86_1","volume-title":"Object-Region Video Transformers. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Herzig Roei","year":"2021","unstructured":"Roei Herzig, Elad Ben-Avraham, Karttikeya Mangalam, Amir Bar, Gal Chechik, Anna Rohrbach, Trevor Darrell, and Amir Globerson. 2021. Object-Region Video Transformers. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 3138--3149. https:\/\/api.semanticscholar.org\/CorpusID:238744000"},{"key":"e_1_3_2_2_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299172"},{"key":"e_1_3_2_2_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681062"},{"key":"e_1_3_2_2_89_1","volume-title":"Graph contrastive learning for skeleton-based action recognition. arXiv preprint arXiv:2301.10900","author":"Huang Xiaohu","year":"2023","unstructured":"Xiaohu Huang, Hao Zhou, Jian Wang, Haocheng Feng, Junyu Han, Errui Ding, Jingdong Wang, Xinggang Wang, Wenyu Liu, and Bin Feng. 2023. Graph contrastive learning for skeleton-based action recognition. 
arXiv preprint arXiv:2301.10900 (2023)."},{"key":"e_1_3_2_2_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.137"},{"key":"e_1_3_2_2_91_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"e_1_3_2_2_92_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.infrared.2019.103014"},{"key":"e_1_3_2_2_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.396"},{"key":"e_1_3_2_2_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"e_1_3_2_2_95_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIIBMS62405.2024.10792697"},{"key":"e_1_3_2_2_96_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.44"},{"key":"e_1_3_2_2_97_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICICSP54369.2021.9611970"},{"key":"e_1_3_2_2_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00115"},{"key":"e_1_3_2_2_99_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-68238-5_48"},{"key":"e_1_3_2_2_100_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3373199"},{"key":"e_1_3_2_2_101_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_2_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2017.8078497"},{"key":"e_1_3_2_2_103_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_104_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.486"},{"key":"e_1_3_2_2_105_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-021-10811-5"},{"key":"e_1_3_2_2_106_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72664-4_5"},{"key":"e_1_3_2_2_107_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.22.99"},{"key":"e_1_3_2_2_108_1","volume-title":"MoViNets: Mobile Video Networks for Efficient Video Recognition. 
2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Kondratyuk D. I.","year":"2021","unstructured":"D. I. Kondratyuk, Liangzhe Yuan, Yandong Li, Li Zhang, Mingxing Tan, Matthew A. Brown, and Boqing Gong. 2021. MoViNets: Mobile Video Networks for Efficient Video Recognition. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 16015--16025. https:\/\/api.semanticscholar.org\/CorpusID:232307534"},{"key":"e_1_3_2_2_109_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01594-9"},{"key":"e_1_3_2_2_110_1","volume-title":"Tensor Representations for Action Recognition","author":"Koniusz Piotr","unstructured":"Piotr Koniusz, Lei Wang, and Anoop Cherian. 2020. Tensor Representations for Action Recognition. In IEEE Transactions on Pattern Analysis and Machine Intelligence. IEEE."},{"key":"e_1_3_2_2_111_1","doi-asserted-by":"publisher","DOI":"10.1049\/iet-cvi.2016.0355"},{"key":"e_1_3_2_2_112_1","volume-title":"Learning Human Activities and Object Affordances from RGB-D Videos. IJRR (1","author":"Koppula Hema Swetha","year":"2013","unstructured":"Hema Swetha Koppula, Rudhir Gupta, and Ashutosh Saxena. 2013. Learning Human Activities and Object Affordances from RGB-D Videos. IJRR (1 2013)."},{"key":"e_1_3_2_2_113_1","volume-title":"Dense-Captioning Events in Videos. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Kenji Hata, Frederic Ren, Li Fei-Fei, and Juan Carlos Niebles. 2017. Dense-Captioning Events in Videos. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 706--715. https:\/\/api.semanticscholar.org\/CorpusID:1026139"},{"key":"e_1_3_2_2_114_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_2_115_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-17529-6"},{"key":"e_1_3_2_2_116_1","volume-title":"Deep Local Video Feature for Action Recognition. 
2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)","author":"Lan Zhenzhong","year":"2017","unstructured":"Zhenzhong Lan, Yi Zhu, Alexander G. Hauptmann, and S. Newsam. 2017. Deep Local Video Feature for Action Recognition. 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW) (2017), 1219--1225. https:\/\/api.semanticscholar.org\/CorpusID:11599090"},{"key":"e_1_3_2_2_117_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-005-1838-7"},{"key":"e_1_3_2_2_118_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"e_1_3_2_2_119_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.318"},{"key":"e_1_3_2_2_120_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Lee Dongho","year":"2024","unstructured":"Dongho Lee, Jongseo Lee, and Jinwoo Choi. 2024. CAST: cross-attention in space and time for video action recognition. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_121_1","volume-title":"Scale-Aware Graph Convolutional Network with Part-Level Refinement for Skeleton-Based Human Action Recognition","author":"Li Chang","year":"2023","unstructured":"Chang Li, Yingchi Mao, Qian Huang, Xiaowei Zhu, and Jie Wu. 2023a. Scale-Aware Graph Convolutional Network with Part-Level Refinement for Skeleton-Based Human Action Recognition. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_2_2_122_1","volume-title":"2017 IEEE International conference on multimedia & expo workshops (ICMEW). IEEE, 585--590","author":"Li Chuankun","year":"2017","unstructured":"Chuankun Li, Pichao Wang, Shuang Wang, Yonghong Hou, and Wanqing Li. 2017a. Skeleton-based action recognition using LSTM and CNN. In 2017 IEEE International conference on multimedia & expo workshops (ICMEW). 
IEEE, 585--590."},{"key":"e_1_3_2_2_123_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2017.8026285"},{"key":"e_1_3_2_2_124_1","volume-title":"Hoi","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C. H. Hoi. 2022a. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:246411402"},{"key":"e_1_3_2_2_125_1","volume-title":"Uniformer: Unified transformer for efficient spatiotemporal representation learning. arXiv preprint arXiv:2201.04676","author":"Li Kunchang","year":"2022","unstructured":"Kunchang Li, Yali Wang, Peng Gao, Guanglu Song, Yu Liu, Hongsheng Li, and Yu Qiao. 2022b. Uniformer: Unified transformer for efficient spatiotemporal representation learning. arXiv preprint arXiv:2201.04676 (2022)."},{"key":"e_1_3_2_2_126_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00157"},{"key":"e_1_3_2_2_127_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00471"},{"key":"e_1_3_2_2_128_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00371"},{"key":"e_1_3_2_2_129_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72920-1_21"},{"key":"e_1_3_2_2_130_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72640-8_25"},{"key":"e_1_3_2_2_131_1","volume-title":"Learning Spatiotemporal Features via Video and Text Pair Discrimination. ArXiv","author":"Li Tianhao","year":"2020","unstructured":"Tianhao Li and Limin Wang. 2020. Learning Spatiotemporal Features via Video and Text Pair Discrimination. ArXiv, Vol. abs\/2001.05691 (2020). 
https:\/\/api.semanticscholar.org\/CorpusID:210698572"},{"key":"e_1_3_2_2_132_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2010.5543273"},{"key":"e_1_3_2_2_133_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"e_1_3_2_2_134_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"e_1_3_2_2_135_1","volume-title":"Chen Change Loy, and Wei-Shi Zheng","author":"Lin Kun-Yu","year":"2024","unstructured":"Kun-Yu Lin, Henghui Ding, Jiaming Zhou, Yu-Ming Tang, Yi-Xing Peng, Zhilin Zhao, Chen Change Loy, and Wei-Shi Zheng. 2024. Rethinking clip-based video learners in cross-domain open-vocabulary action recognition. arXiv preprint arXiv:2403.01560 (2024)."},{"key":"e_1_3_2_2_136_1","doi-asserted-by":"publisher","DOI":"10.1145\/2393347.2396381"},{"key":"e_1_3_2_2_137_1","volume-title":"Pku-mmd: A large scale benchmark for continuous multi-modal human action understanding. arXiv preprint arXiv:1703.07475","author":"Liu Chunhui","year":"2017","unstructured":"Chunhui Liu, Yueyu Hu, Yanghao Li, Sijie Song, and Jiaying Liu. 2017b. Pku-mmd: A large scale benchmark for continuous multi-modal human action understanding. arXiv preprint arXiv:1703.07475 (2017)."},{"key":"e_1_3_2_2_138_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952393"},{"key":"e_1_3_2_2_139_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3240472"},{"key":"e_1_3_2_2_140_1","volume-title":"Revealing Key Details to See Differences: A Novel Prototypical Perspective for Skeleton-based Action Recognition. arXiv preprint arXiv:2411.18941","author":"Liu Hongda","year":"2024","unstructured":"Hongda Liu, Yunfan Liu, Min Ren, Hao Wang, Yunlong Wang, and Zhenan Sun. 2024. Revealing Key Details to See Differences: A Novel Prototypical Perspective for Skeleton-based Action Recognition. 
arXiv preprint arXiv:2411.18941 (2024)."},{"key":"e_1_3_2_2_141_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206744"},{"key":"e_1_3_2_2_142_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"e_1_3_2_2_143_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2771306"},{"key":"e_1_3_2_2_144_1","doi-asserted-by":"publisher","DOI":"10.1145\/3365212"},{"key":"e_1_3_2_2_145_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.02.030"},{"key":"e_1_3_2_2_146_1","volume-title":"FSD-10: a dataset for competitive sports content analysis. arXiv preprint arXiv:2002.03312","author":"Liu Shenlan","year":"2020","unstructured":"Shenlan Liu, Xiang Liu, Gao Huang, Lin Feng, Lianyu Hu, Dong Jiang, Aibin Zhang, Yang Liu, and Hong Qiao. 2020a. FSD-10: a dataset for competitive sports content analysis. arXiv preprint arXiv:2002.03312 (2020)."},{"key":"e_1_3_2_2_147_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00934"},{"key":"e_1_3_2_2_148_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00133"},{"key":"e_1_3_2_2_149_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2823910"},{"key":"e_1_3_2_2_150_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_2_151_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60639-8_40"},{"key":"e_1_3_2_2_152_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00817"},{"key":"e_1_3_2_2_153_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_11"},{"key":"e_1_3_2_2_154_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.10.095"},{"key":"e_1_3_2_2_155_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206557"},{"key":"e_1_3_2_2_156_1","volume-title":"Something-Else: Compositional Action Recognition With Spatial-Temporal Interaction Networks. 
2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019","author":"Materzynska Joanna","year":"2019","unstructured":"Joanna Materzynska, Tete Xiao, Roei Herzig, Huijuan Xu, Xiaolong Wang, and Trevor Darrell. 2019. Something-Else: Compositional Action Recognition With Spatial-Temporal Interaction Networks. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019), 1046--1056. https:\/\/api.semanticscholar.org\/CorpusID:209439709"},{"key":"e_1_3_2_2_157_1","first-page":"43","article-title":"A new technique based on 3D convolutional neural networks and filtering optical flow maps for action classification in infrared video","volume":"21","author":"Meglouli Hocine","year":"2019","unstructured":"Hocine Meglouli, Layachi Bentabet, Mohamed Airouche, et al. 2019. A new technique based on 3D convolutional neural networks and filtering optical flow maps for action classification in infrared video. Journal of Control Engineering and Applied Informatics, Vol. 
21, 4 (2019), 43--50.","journal-title":"Journal of Control Engineering and Applied Informatics"},{"key":"e_1_3_2_2_158_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412632"},{"key":"e_1_3_2_2_159_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459154"},{"key":"e_1_3_2_2_160_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459154"},{"key":"e_1_3_2_2_161_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_2_162_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_2_163_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2005.188"},{"key":"e_1_3_2_2_164_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00580"},{"key":"e_1_3_2_2_165_1","volume-title":"Tom Yan, Lisa Brown, Quanfu Fan, Dan Gutfreund, Carl Vondrick, et al.","author":"Monfort Mathew","year":"2019","unstructured":"Mathew Monfort, Alex Andonian, Bolei Zhou, Kandan Ramakrishnan, Sarah Adel Bargal, Tom Yan, Lisa Brown, Quanfu Fan, Dan Gutfreund, Carl Vondrick, et al. 2019. Moments in time dataset: one million videos for event understanding. IEEE transactions on pattern analysis and machine intelligence, Vol. 42, 2 (2019), 502--508."},{"key":"e_1_3_2_2_166_1","volume-title":"Audio-Visual Instance Discrimination with Cross-Modal Agreement. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Morgado Pedro","year":"2020","unstructured":"Pedro Morgado, Nuno Vasconcelos, and Ishan Misra. 2020. Audio-Visual Instance Discrimination with Cross-Modal Agreement. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 12470--12481. https:\/\/api.semanticscholar.org\/CorpusID:216553230"},{"key":"e_1_3_2_2_167_1","volume-title":"Switzerland)","author":"Morshed Md Golam","year":"2023","unstructured":"Md Golam Morshed, Tangina Sultana, Aftab Alam, and Young-Koo Lee. 2023. 
Human Action Recognition: A Taxonomy-Based Survey, Updates, and Opportunities. Sensors (Basel, Switzerland), Vol. 23 (2023). https:\/\/api.semanticscholar.org\/CorpusID:256936214"},{"key":"e_1_3_2_2_168_1","volume-title":"Mocap database hdm05","author":"M\u00fcller Meinard","year":"2007","unstructured":"Meinard M\u00fcller, Tido R\u00f6der, Michael Clausen, Bernhard Eberhardt, Bj\u00f6rn Kr\u00fcger, and Andreas Weber. 2007. Mocap database hdm05. Institut f\u00fcr Informatik II, Universit\u00e4t Bonn, Vol. 2, 7 (2007)."},{"key":"e_1_3_2_2_169_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3378886"},{"key":"e_1_3_2_2_170_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"e_1_3_2_2_171_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15552-9_29"},{"key":"e_1_3_2_2_172_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.03.091"},{"key":"e_1_3_2_2_173_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2013.6474999"},{"key":"e_1_3_2_2_174_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.98"},{"key":"e_1_3_2_2_175_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-020-09904-8"},{"key":"e_1_3_2_2_176_1","unstructured":"Mandela Patrick Yuki Asano Polina Kuznetsova Ruth Fong Joao F. Henriques Geoffrey Zweig and Andrea Vedaldi. 2021a. Multi-modal Self-Supervision from Generalized Data Transformations. https:\/\/openreview.net\/forum?id=mgVbI13p96"},{"key":"e_1_3_2_2_177_1","volume-title":"Keeping your eye on the ball: Trajectory attention in video transformers. Advances in neural information processing systems","author":"Patrick Mandela","year":"2021","unstructured":"Mandela Patrick, Dylan Campbell, Yuki Asano, Ishan Misra, Florian Metze, Christoph Feichtenhofer, Andrea Vedaldi, and Joao F Henriques. 2021b. Keeping your eye on the ball: Trajectory attention in video transformers. Advances in neural information processing systems, Vol. 
34 (2021), 12493--12506."},{"key":"e_1_3_2_2_178_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.03.013"},{"key":"e_1_3_2_2_179_1","volume-title":"Oxford TRECVID 2006-Notebook paper.. In TRECVID.","author":"Philbin James","year":"2006","unstructured":"James Philbin, Anna Bosch, Ondrej Chum, Jan-Mark Geusebroek, Josef Sivic, Andrew Zisserman, et al. 2006. Oxford TRECVID 2006-Notebook paper.. In TRECVID."},{"key":"e_1_3_2_2_180_1","volume-title":"Evolving Losses for Unsupervised Video Representation Learning. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Piergiovanni A. J.","year":"2020","unstructured":"A. J. Piergiovanni, Anelia Angelova, and Michael S. Ryoo. 2020. Evolving Losses for Unsupervised Video Representation Learning. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 130--139. https:\/\/api.semanticscholar.org\/CorpusID:211532320"},{"key":"e_1_3_2_2_181_1","volume-title":"Spatiotemporal Contrastive Video Representation Learning. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Qian Rui","year":"2020","unstructured":"Rui Qian, Tianjian Meng, Boqing Gong, Ming-Hsuan Yang, H. Wang, Serge J. Belongie, and Yin Cui. 2020. Spatiotemporal Contrastive Video Representation Learning. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 6960--6970. https:\/\/api.semanticscholar.org\/CorpusID:221090567"},{"key":"e_1_3_2_2_182_1","volume-title":"Fusing Higher-Order Features in Graph Neural Networks for Skeleton-based Action Recognition","author":"Qin Zhenyue","year":"2022","unstructured":"Zhenyue Qin, Yang Liu, Pan Ji, Dongwoo Kim, Lei Wang, Bob McKay, Saeed Anwar, and Tom Gedeon. 2022. Fusing Higher-Order Features in Graph Neural Networks for Skeleton-based Action Recognition. 
IEEE TNNLS (2022)."},{"key":"e_1_3_2_2_183_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"e_1_3_2_2_184_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445"},{"key":"e_1_3_2_2_185_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.621"},{"key":"e_1_3_2_2_186_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2533389"},{"key":"e_1_3_2_2_187_1","volume-title":"Proceedings, Part II 13","author":"Rahmani Hossein","year":"2014","unstructured":"Hossein Rahmani, Arif Mahmood, Du Q Huynh, and Ajmal Mian. 2014. HOPC: Histogram of oriented principal components of 3D pointclouds for action recognition. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part II 13. Springer, 742--757."},{"key":"e_1_3_2_2_188_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.167"},{"key":"e_1_3_2_2_189_1","volume-title":"DEAR: Depth-Enhanced Action Recognition. arXiv preprint arXiv:2408.15679","author":"Rahmaniboldaji Sadegh","year":"2024","unstructured":"Sadegh Rahmaniboldaji, Filip Rybansky, Quoc Vuong, Frank Guerin, and Andrew Gilbert. 2024. DEAR: Depth-Enhanced Action Recognition. arXiv preprint arXiv:2408.15679 (2024)."},{"key":"e_1_3_2_2_190_1","volume-title":"Proceedings, Part I 11","author":"Raptis Michalis","year":"2010","unstructured":"Michalis Raptis and Stefano Soatto. 2010. 
Tracklet descriptors for action modeling and video analysis. In Computer Vision--ECCV 2010: 11th European Conference on Computer Vision, Heraklion, Crete, Greece, September 5--11, 2010, Proceedings, Part I 11. Springer, 577--590."},{"key":"e_1_3_2_2_191_1","volume-title":"Recognizing 50 human action categories of web videos. Machine vision and applications","author":"Reddy Kishore K","year":"2013","unstructured":"Kishore K Reddy and Mubarak Shah. 2013. Recognizing 50 human action categories of web videos. Machine vision and applications, Vol. 24, 5 (2013), 971--981."},{"key":"e_1_3_2_2_192_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"e_1_3_2_2_193_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"e_1_3_2_2_194_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0851-8"},{"key":"e_1_3_2_2_195_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_2_196_1","volume-title":"International Conference on Machine Learning. PMLR, 29441--29454","author":"Ryali Chaitanya","year":"2023","unstructured":"Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, et al. 2023. Hiera: A hierarchical vision transformer without the bells-and-whistles. In International Conference on Machine Learning. PMLR, 29441--29454."},{"key":"e_1_3_2_2_197_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-12091-z"},{"key":"e_1_3_2_2_198_1","volume-title":"Exploiting the ConvLSTM: Human Action Recognition using Raw Depth Video-Based Recurrent Neural Networks. ArXiv","author":"S\u00e1nchez-Caballero Adri\u00e1n","year":"2020","unstructured":"Adri\u00e1n S\u00e1nchez-Caballero, David Fuentes-Jim\u00e9nez, and Cristina Losada-Guti\u00e9rrez. 2020b. Exploiting the ConvLSTM: Human Action Recognition using Raw Depth Video-Based Recurrent Neural Networks. ArXiv, Vol. abs\/2006.07744 (2020). 
https:\/\/api.semanticscholar.org\/CorpusID:219687182"},{"key":"e_1_3_2_2_199_1","doi-asserted-by":"publisher","DOI":"10.3390\/app7010110"},{"key":"e_1_3_2_2_200_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2024.3499995"},{"key":"e_1_3_2_2_201_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"e_1_3_2_2_202_1","doi-asserted-by":"publisher","DOI":"10.1145\/1291233.1291311"},{"key":"e_1_3_2_2_203_1","volume-title":"CVPR Workshop","author":"Seidenari L.","year":"2013","unstructured":"L. Seidenari, V. Varano, S. Berretti, A. Del Bimbo, and P. Pala. 2013. Recognizing actions from depth cameras as weakly aligned multi-part bag-of-poses. CVPR Workshop (2013)."},{"key":"e_1_3_2_2_204_1","volume-title":"Optics and Photonics for Information Processing XII","author":"Shah Anuj K","unstructured":"Anuj K Shah, Ripul Ghosh, and Aparna Akula. 2018. A spatio-temporal deep learning approach for human action recognition in infrared videos. In Optics and Photonics for Information Processing XII, Vol. 10751. SPIE, 249--257."},{"key":"e_1_3_2_2_205_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"e_1_3_2_2_206_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"e_1_3_2_2_207_1","volume-title":"An image is worth 16x16 words, what is a video worth? arXiv preprint arXiv:2103.13915","author":"Sharir Gilad","year":"2021","unstructured":"Gilad Sharir, Asaf Noy, and Lihi Zelnik-Manor. 2021. An image is worth 16x16 words, what is a video worth? 
arXiv preprint arXiv:2103.13915 (2021)."},{"key":"e_1_3_2_2_208_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01520"},{"key":"e_1_3_2_2_209_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01514"},{"key":"e_1_3_2_2_210_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00810"},{"key":"e_1_3_2_2_211_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01230"},{"key":"e_1_3_2_2_212_1","volume-title":"Proceedings of the Asian conference on computer vision.","author":"Shi Lei","year":"2020","unstructured":"Lei Shi, Yifan Zhang, Jian Cheng, and Hanqing Lu. 2020. Decoupled spatial-temporal attention network for skeleton-based action-gesture recognition. In Proceedings of the Asian conference on computer vision."},{"key":"e_1_3_2_2_213_1","volume-title":"Charades-ego: A large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626","author":"Sigurdsson Gunnar A","year":"2018","unstructured":"Gunnar A Sigurdsson, Abhinav Gupta, Cordelia Schmid, Ali Farhadi, and Karteek Alahari. 2018. Charades-ego: A large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626 (2018)."},{"key":"e_1_3_2_2_214_1","volume-title":"Proceedings, Part I 14","author":"Sigurdsson Gunnar A","year":"2016","unstructured":"Gunnar A Sigurdsson, G\u00fcl Varol, Xiaolong Wang, Ali Farhadi, Ivan Laptev, and Abhinav Gupta. 2016. Hollywood in homes: Crowdsourcing data collection for activity understanding. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part I 14. Springer, 510--526."},{"key":"e_1_3_2_2_215_1","volume-title":"Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. 
Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_2_216_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01025"},{"key":"e_1_3_2_2_217_1","volume-title":"Prototypical networks for few-shot learning. Advances in neural information processing systems","author":"Snell Jake","year":"2017","unstructured":"Jake Snell, Kevin Swersky, and Richard Zemel. 2017. Prototypical networks for few-shot learning. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_218_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11212"},{"key":"e_1_3_2_2_219_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413802"},{"key":"e_1_3_2_2_220_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3015051"},{"key":"e_1_3_2_2_221_1","volume-title":"UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402","author":"Soomro K","year":"2012","unstructured":"K Soomro. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_2_222_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-022-07366-3"},{"key":"e_1_3_2_2_223_1","volume-title":"International conference on machine learning. PMLR, 843--852","author":"Srivastava Nitish","year":"2015","unstructured":"Nitish Srivastava, Elman Mansimov, and Ruslan Salakhudinov. 2015. Unsupervised learning of video representations using lstms. In International conference on machine learning. 
PMLR, 843--852."},{"key":"e_1_3_2_2_224_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2010.5583046"},{"key":"e_1_3_2_2_225_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3183112"},{"key":"e_1_3_2_2_226_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00131"},{"key":"e_1_3_2_2_227_1","volume-title":"Workshops at the twenty-fifth AAAI conference on artificial intelligence.","author":"Sung Jaeyong","year":"2011","unstructured":"Jaeyong Sung, Colin Ponce, Bart Selman, and Ashutosh Saxena. 2011. Human activity detection from RGBD images. In Workshops at the twenty-fifth AAAI conference on artificial intelligence."},{"key":"e_1_3_2_2_228_1","volume-title":"VIMPAC: Video Pre-Training via Masked Token Prediction and Contrastive Learning. ArXiv","author":"Tan Hao","year":"2021","unstructured":"Hao Tan, Jie Lei, Thomas Wolf, and Mohit Bansal. 2021. VIMPAC: Video Pre-Training via Masked Token Prediction and Contrastive Learning. ArXiv, Vol. abs\/2106.11250 (2021). https:\/\/api.semanticscholar.org\/CorpusID:235489838"},{"key":"e_1_3_2_2_229_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00558"},{"key":"e_1_3_2_2_230_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, Vol. 35 (2022), 10078--10093."},{"key":"e_1_3_2_2_231_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_232_1","volume-title":"Human Activity Recognition with Metric Learning. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:14160859","author":"Tran Du","year":"2008","unstructured":"Du Tran and Alexander Sorokin. 2008. 
Human Activity Recognition with Metric Learning. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:14160859"},{"key":"e_1_3_2_2_233_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00565"},{"key":"e_1_3_2_2_234_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_2_235_1","volume-title":"OV Ramana Murthy, and Abhinav Dhall","author":"Vadivelu Somasundaram","year":"2017","unstructured":"Somasundaram Vadivelu, Sudakshin Ganesan, OV Ramana Murthy, and Abhinav Dhall. 2017. Thermal imaging based elderly fall detection. In Computer Vision--ACCV 2016 Workshops: ACCV 2016 International Workshops, Taipei, Taiwan, November 20--24, 2016, Revised Selected Papers, Part III 13. Springer, 541--553."},{"key":"e_1_3_2_2_236_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.484"},{"key":"e_1_3_2_2_237_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW63481.2024.10645477"},{"key":"e_1_3_2_2_238_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956634"},{"key":"e_1_3_2_2_239_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2020.3044719"},{"key":"e_1_3_2_2_240_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-012-0594-8"},{"key":"e_1_3_2_2_241_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"e_1_3_2_2_242_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_2_243_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.387"},{"key":"e_1_3_2_2_244_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00381"},{"key":"e_1_3_2_2_245_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01724"},{"key":"e_1_3_2_2_246_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247813"},{"key":"e_1_3_2_2_247_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.198"},{"key":"e_1_3_2_2_248_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.339"},{"key":"e_1_3_2_2_249_1","volume-title":"Analys
is and Evaluation of Kinect-based Action Recognition Algorithms. Master's thesis. School of the Computer Science and Software Engineering","author":"Wang Lei","unstructured":"Lei Wang. 2017. Analysis and Evaluation of Kinect-based Action Recognition Algorithms. Master's thesis. School of the Computer Science and Software Engineering, The University of Western Australia."},{"key":"e_1_3_2_2_250_1","volume-title":"Robust human action modelling. Ph.,D. Dissertation","author":"Wang Lei","unstructured":"Lei Wang. 2023. Robust human action modelling. Ph.,D. Dissertation. The Australian National University (Australia)."},{"key":"e_1_3_2_2_251_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"e_1_3_2_2_252_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2925285"},{"key":"e_1_3_2_2_253_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475572"},{"key":"e_1_3_2_2_254_1","volume-title":"Proceedings of the Asian Conference on Computer Vision (ACCV). 4176--4193","author":"Wang Lei","year":"2022","unstructured":"Lei Wang and Piotr Koniusz. 2022a. Temporal-Viewpoint Transportation Plan for Skeletal Few-shot Action Recognition. In Proceedings of the Asian Conference on Computer Vision (ACCV). 4176--4193."},{"key":"e_1_3_2_2_255_1","volume-title":"Proceedings, Part XXI. Springer, 176--195","author":"Wang Lei","year":"2022","unstructured":"Lei Wang and Piotr Koniusz. 2022b. Uncertainty-DTW for time series and sequences. In Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXI. Springer, 176--195."},{"key":"e_1_3_2_2_256_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00544"},{"key":"e_1_3_2_2_257_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446223"},{"key":"e_1_3_2_2_258_1","volume-title":"Huynh","author":"Wang Lei","year":"2019","unstructured":"Lei Wang, Piotr Koniusz, and Du Q. Huynh. 2019b. 
Hallucinating IDT Descriptors and I3D Optical Flow Features for Action Recognition With CNNs. In ICCV."},{"key":"e_1_3_2_2_259_1","volume-title":"3D Skeleton-based Few-shot Action Recognition with JEANIE is not so Na\u00efve. arXiv preprint arXiv:2112.12668","author":"Wang Lei","year":"2021","unstructured":"Lei Wang, Jun Liu, and Piotr Koniusz. 2021b. 3D Skeleton-based Few-shot Action Recognition with JEANIE is not so Na\u00efve. arXiv preprint arXiv:2112.12668 (2021)."},{"key":"e_1_3_2_2_260_1","volume-title":"Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via Temporal-Viewpoint Alignment. International Journal of Computer Vision","author":"Wang Lei","year":"2024","unstructured":"Lei Wang, Jun Liu, Liang Zheng, Tom Gedeon, and Piotr Koniusz. 2024c. Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via Temporal-Viewpoint Alignment. International Journal of Computer Vision (2024), 1--32."},{"key":"e_1_3_2_2_261_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"e_1_3_2_2_262_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446900"},{"key":"e_1_3_2_2_263_1","volume-title":"TDN: Temporal Difference Networks for Efficient Action Recognition. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","author":"Wang Limin","year":"2020","unstructured":"Limin Wang, Zhan Tong, Bin Ji, and Gangshan Wu. 2020b. TDN: Temporal Difference Networks for Efficient Action Recognition. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020), 1895--1904. https:\/\/api.semanticscholar.org\/CorpusID:229331798"},{"key":"e_1_3_2_2_264_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_2_265_1","volume-title":"Temporal segment networks for action recognition in videos","author":"Wang Limin","year":"2018","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2018b. 
Temporal segment networks for action recognition in videos. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 11 (2018), 2740--2755."},{"key":"e_1_3_2_2_266_1","volume-title":"Taylor Videos for Action Recognition. In Forty-first International Conference on Machine Learning.","author":"Wang Lei","unstructured":"Lei Wang, Xiuyuan Yuan, Tom Gedeon, and Liang Zheng. [n.,d.]. Taylor Videos for Action Recognition. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_2_267_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2818329"},{"key":"e_1_3_2_2_268_1","doi-asserted-by":"publisher","DOI":"10.1109\/THMS.2015.2504550"},{"key":"e_1_3_2_2_269_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.123"},{"key":"e_1_3_2_2_270_1","volume-title":"BEVT: BERT Pretraining of Video Transformers. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Wang Rui","year":"2021","unstructured":"Rui Wang, Dongdong Chen, Zuxuan Wu, Yinpeng Chen, Xiyang Dai, Mengchen Liu, Yu-Gang Jiang, Luowei Zhou, and Lu Yuan. 2021a. BEVT: BERT Pretraining of Video Transformers. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 14713--14723. https:\/\/api.semanticscholar.org\/CorpusID:244799265"},{"key":"e_1_3_2_2_271_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"e_1_3_2_2_272_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"e_1_3_2_2_273_1","doi-asserted-by":"crossref","unstructured":"Yi Wang Kunchang Li Xinhao Li Jiashuo Yu Yinan He Guo Chen Baoqi Pei Rongkun Zheng Jilan Xu Zun Wang et al. 2024b. Internvideo2: Scaling video foundation models for multimodal video understanding. Arxiv e-prints (2024) arXiv--2403.","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"e_1_3_2_2_274_1","volume-title":"InternVideo: General Video Foundation Models via Generative and Discriminative Learning. 
arXiv preprint arXiv:2212.03191","author":"Wang Yi","year":"2022","unstructured":"Yi Wang, Kunchang Li, Yizhuo Li, Yinan He, Bingkun Huang, Zhiyu Zhao, Hongjie Zhang, Jilan Xu, Yi Liu, Zun Wang, Sen Xing, Guo Chen, Junting Pan, Jiashuo Yu, Yali Wang, Limin Wang, and Yu Qiao. 2022. InternVideo: General Video Foundation Models via Generative and Discriminative Learning. arXiv preprint arXiv:2212.03191 (2022)."},{"key":"e_1_3_2_2_275_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00059"},{"key":"e_1_3_2_2_276_1","doi-asserted-by":"crossref","unstructured":"Zihan Wang Yang Yang Zhi Liu and Y. Zheng. 2023c. Deep Neural Networks in Video Human Action Recognition: A Review. ArXiv Vol. abs\/2305.15692 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258887932","DOI":"10.36227\/techrxiv.22146914"},{"key":"e_1_3_2_2_277_1","volume-title":"A Comprehensive Review of Few-shot Action Recognition. ArXiv","author":"Wanyan Yuyang","year":"2024","unstructured":"Yuyang Wanyan, Xiaoshan Yang, Weiming Dong, and Changsheng Xu. 2024. A Comprehensive Review of Few-shot Action Recognition. ArXiv, Vol. abs\/2407.14744 (2024). https:\/\/api.semanticscholar.org\/CorpusID:271329302"},{"key":"e_1_3_2_2_278_1","volume-title":"Masked Feature Prediction for Self-Supervised Visual Pre-Training. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021","author":"Wei Chen","year":"2021","unstructured":"Chen Wei, Haoqi Fan, Saining Xie, Chaoxia Wu, Alan Loddon Yuille, and Christoph Feichtenhofer. 2021a. Masked Feature Prediction for Self-Supervised Visual Pre-Training. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 14648--14658. https:\/\/api.semanticscholar.org\/CorpusID:245218767"},{"key":"e_1_3_2_2_279_1","volume-title":"2022 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","author":"Wei Yimin","year":"2021","unstructured":"Yimin Wei, Hao Liu, Tingting Xie, Qiuhong Ke, and Yulan Guo. 2021b. 
Spatial-Temporal Transformer for 3D Point Cloud Sequences. 2022 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV) (2021), 657--666. https:\/\/api.semanticscholar.org\/CorpusID:239024595"},{"key":"e_1_3_2_2_280_1","volume-title":"Free viewpoint action recognition using motion history volumes. Computer vision and image understanding","author":"Weinland Daniel","year":"2006","unstructured":"Daniel Weinland, Remi Ronfard, and Edmond Boyer. 2006. Free viewpoint action recognition using motion history volumes. Computer vision and image understanding, Vol. 104, 2--3 (2006), 249--257."},{"key":"e_1_3_2_2_281_1","volume-title":"USDRL: Unified Skeleton-Based Dense Representation Learning with Multi-Grained Feature Decorrelation. arXiv preprint arXiv:2412.09220","author":"Weng Wanjiang","year":"2024","unstructured":"Wanjiang Weng, Hongsong Wang, Junbo He, Lei He, and Guosen Xie. 2024. USDRL: Unified Skeleton-Based Dense Representation Learning with Multi-Grained Feature Decorrelation. arXiv preprint arXiv:2412.09220 (2024)."},{"key":"e_1_3_2_2_282_1","volume-title":"Proceedings, Part II 10","author":"Willems Geert","year":"2008","unstructured":"Geert Willems, Tinne Tuytelaars, and Luc Van Gool. 2008. An efficient dense and scale-invariant spatio-temporal interest point detector. In Computer Vision--ECCV 2008: 10th European Conference on Computer Vision, Marseille, France, October 12--18, 2008, Proceedings, Part II 10. Springer, 650--663."},{"key":"e_1_3_2_2_283_1","volume-title":"Transformers: State-of-the-Art Natural Language Processing. arXiv preprint arXiv:1910.03771","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf. 2020. Transformers: State-of-the-Art Natural Language Processing. 
arXiv preprint arXiv:1910.03771 (2020)."},{"key":"e_1_3_2_2_284_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4408923"},{"key":"e_1_3_2_2_285_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2012.6239233"},{"key":"e_1_3_2_2_286_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00325"},{"key":"e_1_3_2_2_287_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2018.12.050"},{"key":"e_1_3_2_2_288_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00169"},{"key":"e_1_3_2_2_289_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"e_1_3_2_2_290_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00167"},{"key":"e_1_3_2_2_291_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00712"},{"key":"e_1_3_2_2_292_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"e_1_3_2_2_293_1","volume-title":"Action-OOD: An End-to-End Skeleton-Based Model for Robust Out-of-Distribution Human Action Detection. arXiv preprint arXiv:2405.20633","author":"Xu Jing","year":"2024","unstructured":"Jing Xu, Anqi Zhu, Jingyu Lin, Qiuhong Ke, and Cunjian Chen. 2024. Action-OOD: An End-to-End Skeleton-Based Model for Robust Out-of-Distribution Human Action Detection. arXiv preprint arXiv:2405.20633 (2024)."},{"key":"e_1_3_2_2_294_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"e_1_3_2_2_295_1","doi-asserted-by":"crossref","unstructured":"Sijie Yan Yuanjun Xiong and Dahua Lin. 2018. Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition. In AAAI.","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"e_1_3_2_2_296_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_2_297_1","volume-title":"Video Representation Learning with Visual Tempo Consistency. ArXiv","author":"Yang Ceyuan","year":"2020","unstructured":"Ceyuan Yang, Yinghao Xu, Bo Dai, and Bolei Zhou. 2020a. 
Video Representation Learning with Visual Tempo Consistency. ArXiv, Vol. abs\/2006.15489 (2020). https:\/\/api.semanticscholar.org\/CorpusID:220250229"},{"key":"e_1_3_2_2_298_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"e_1_3_2_2_299_1","volume-title":"Unik: A unified framework for real-world skeleton-based action recognition. arXiv preprint arXiv:2107.08580","author":"Yang Di","year":"2021","unstructured":"Di Yang, Yaohui Wang, Antitza Dantcheva, Lorenzo Garattoni, Gianpiero Francesca, and Fran\u00e7ois Br\u00e9mond. 2021b. Unik: A unified framework for real-world skeleton-based action recognition. arXiv preprint arXiv:2107.08580 (2021)."},{"key":"e_1_3_2_2_300_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01967--8"},{"key":"e_1_3_2_2_301_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3129117"},{"key":"e_1_3_2_2_302_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00748"},{"key":"e_1_3_2_2_303_1","unstructured":"Yuheng Yang. 2024. Skeleton-based Action Recognition with Non-linear Dependency Modeling and Hilbert-Schmidt Independence Criterion. arxiv: 2412.18780 [cs.CV] https:\/\/arxiv.org\/abs\/2412.18780"},{"key":"e_1_3_2_2_304_1","volume-title":"Action recognition with multi-stream motion modeling and mutual information maximization. arXiv preprint arXiv:2306.07576","author":"Yang Yuheng","year":"2023","unstructured":"Yuheng Yang, Haipeng Chen, Zhenguang Liu, Yingda Lyu, Beibei Zhang, Shuang Wu, Zhibo Wang, and Kui Ren. 2023a. Action recognition with multi-stream motion modeling and mutual information maximization. arXiv preprint arXiv:2306.07576 (2023)."},{"key":"e_1_3_2_2_305_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.05.018"},{"key":"e_1_3_2_2_306_1","volume-title":"Side4video: Spatial-temporal side network for memory-efficient image-to-video transfer learning. 
arXiv preprint arXiv:2311.15769","author":"Yao Huanjin","year":"2023","unstructured":"Huanjin Yao, Wenhao Wu, and Zhiheng Li. 2023. Side4video: Spatial-temporal side network for memory-efficient image-to-video transfer learning. arXiv preprint arXiv:2311.15769 (2023)."},{"key":"e_1_3_2_2_307_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2020.106713"},{"key":"e_1_3_2_2_308_1","volume-title":"SeCo: Exploring Sequence Supervision for Unsupervised Representation Learning. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:220935968","author":"Yao Ting","year":"2020","unstructured":"Ting Yao, Yiheng Zhang, Zhaofan Qiu, Yingwei Pan, and Tao Mei. 2020b. SeCo: Exploring Sequence Supervision for Unsupervised Representation Learning. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:220935968"},{"key":"e_1_3_2_2_309_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459201"},{"key":"e_1_3_2_2_310_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612380"},{"key":"e_1_3_2_2_311_1","volume-title":"Building a Multi-modal Spatiotemporal Expert for Zero-shot Action Recognition with CLIP. arXiv preprint arXiv:2412.09895","author":"Yu Yating","year":"2024","unstructured":"Yating Yu, Congqi Cao, Yueran Zhang, Qinyi Lv, Lingtong Min, and Yanning Zhang. 2024. Building a Multi-modal Spatiotemporal Expert for Zero-shot Action Recognition with CLIP. arXiv preprint arXiv:2412.09895 (2024)."},{"key":"e_1_3_2_2_312_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"e_1_3_2_2_313_1","doi-asserted-by":"crossref","unstructured":"Hang Zhang Xin Li and Lidong Bing. 2023a. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In Conference on Empirical Methods in Natural Language Processing. 
https:\/\/api.semanticscholar.org\/CorpusID:259075356","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_2_314_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611774"},{"key":"e_1_3_2_2_315_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.233"},{"key":"e_1_3_2_2_316_1","volume-title":"Tfcnet: Temporal fully connected networks for static unbiased temporal reasoning. arXiv preprint arXiv:2203.05928","author":"Zhang Shiwen","year":"2022","unstructured":"Shiwen Zhang. 2022. Tfcnet: Temporal fully connected networks for static unbiased temporal reasoning. arXiv preprint arXiv:2203.05928 (2022)."},{"key":"e_1_3_2_2_317_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.280"},{"key":"e_1_3_2_2_318_1","volume-title":"Generalized cross entropy loss for training deep neural networks with noisy labels. Advances in neural information processing systems","author":"Zhang Zhilu","year":"2018","unstructured":"Zhilu Zhang and Mert Sabuncu. 2018. Generalized cross entropy loss for training deep neural networks with noisy labels. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_2_319_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00876"},{"key":"e_1_3_2_2_320_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-00961--3"},{"key":"e_1_3_2_2_321_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"e_1_3_2_2_322_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01022"},{"key":"e_1_3_2_2_323_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00200"},{"key":"e_1_3_2_2_324_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2016.06.007"},{"key":"e_1_3_2_2_325_1","volume-title":"The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Zhu Liyun","unstructured":"Liyun Zhu, Lei Wang, Arjun Raj, Tom Gedeon, and Chen Chen. [n.,d.]. 
Advancing Video Anomaly Detection: A Concise Review and a New Dataset. In The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_2_326_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104108"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717746","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T18:23:16Z","timestamp":1759861396000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717746"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":326,"alternative-id":["10.1145\/3701716.3717746","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717746","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}