{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T11:04:48Z","timestamp":1773486288596,"version":"3.50.1"},"reference-count":68,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T00:00:00Z","timestamp":1770595200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T00:00:00Z","timestamp":1770595200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62173248, 62333017"],"award-info":[{"award-number":["62173248, 62333017"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Pattern Anal Applic"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s10044-026-01628-9","type":"journal-article","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T17:31:49Z","timestamp":1770658309000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CCT-HAR: Cross-modal contrastive sample mining with temporal alignment for self-supervised human action recognition"],"prefix":"10.1007","volume":"29","author":[{"given":"Xun","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Jiayao","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Chengju","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Qijun","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,9]]},"reference":[{"issue":"3","key":"1628_CR1","doi-asserted-by":"publisher","first-page":"3200","DOI":"10.1109\/TPAMI.2022.3183112","volume":"45","author":"Z Sun","year":"2023","unstructured":"Sun Z, Ke Q, Rahmani H et al (2023) Human action recognition from various data modalities: a review. IEEE Trans Pattern Anal Mach Intell 45(3):3200\u20133225. https:\/\/doi.org\/10.1109\/TPAMI.2022.3183112","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1628_CR2","doi-asserted-by":"publisher","unstructured":"Yang T, Zhu Y, Xie Y, et\u00a0al (2023) Aim: adapting image models for efficient video action recognition. arXiv preprint. arXiv:2302.03024, https:\/\/doi.org\/10.48550\/arXiv.2302.03024","DOI":"10.48550\/arXiv.2302.03024"},{"key":"1628_CR3","doi-asserted-by":"publisher","unstructured":"Duan H, Zhao Y, Chen K, et\u00a0al (2022) Revisiting skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2969\u20132978.\u00a0https:\/\/doi.org\/10.1109\/cvpr52688.2022.00298","DOI":"10.1109\/cvpr52688.2022.00298"},{"key":"1628_CR4","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1016\/j.ins.2023.03.058","volume":"633","author":"S Qiu","year":"2023","unstructured":"Qiu S, Fan T, Jiang J, Wang Z, Wang Y, Xu J, Sun T, Jiang N (2023) A novel two-level interactive action recognition model based on inertial data fusion. Inform Sci 633:264\u2013279. 
https:\/\/doi.org\/10.1016\/j.ins.2023.03.058","journal-title":"Inform Sci"},{"issue":"3","key":"1628_CR5","doi-asserted-by":"publisher","first-page":"3522","DOI":"10.1109\/TPAMI.2022.3177813","volume":"45","author":"BX Yu","year":"2023","unstructured":"Yu BX, Liu Y, Zhang X et al (2023) MMNet: a model-based multimodal network for human action recognition in RGB-D videos. IEEE Trans Pattern Anal Mach Intell 45(3):3522\u20133538. https:\/\/doi.org\/10.1109\/TPAMI.2022.3177813","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"3","key":"1628_CR6","doi-asserted-by":"publisher","first-page":"1250","DOI":"10.1109\/TCSVT.2021.3077512","volume":"32","author":"H Wu","year":"2022","unstructured":"Wu H, Ma X, Li Y (2022) Spatiotemporal multimodal learning with 3D CNNs for video action recognition. IEEE Trans Circuits Syst Video Technol 32(3):1250\u20131261. https:\/\/doi.org\/10.1109\/TCSVT.2021.3077512","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"1628_CR7","doi-asserted-by":"publisher","unstructured":"Chen T, Kornblith S, Norouzi M, et\u00a0al (2020) A simple framework for contrastive learning of visual representations. In: Proceedings of the 37th International Conference on Machine Learning. PMLR, Virtual, pp 1597\u20131607.\u00a0https:\/\/doi.org\/10.5555\/3524938.3525087","DOI":"10.5555\/3524938.3525087"},{"key":"1628_CR8","doi-asserted-by":"publisher","unstructured":"Tian Y, Krishnan D, Isola P (2020) Contrastive multiview coding. In: Vedaldi A, Bischof H, Brox T, et\u00a0al (Eds) Computer vision \u2013 ECCV 2020. Springer International Publishing, Cham, Lecture Notes in Computer Science, pp 776\u2013794.\u00a0https:\/\/doi.org\/10.1007\/978-3-030-58621-8_45","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"1628_CR9","doi-asserted-by":"publisher","unstructured":"Radford A, Kim JW, Hallacy C, et\u00a0al (2021) Learning transferable visual models from natural language supervision. https:\/\/doi.org\/10.48550\/arXiv.2103.00020, arXiv:2103.00020","DOI":"10.48550\/arXiv.2103.00020"},{"key":"1628_CR10","doi-asserted-by":"publisher","unstructured":"Li J, Li D, Xiong C, et\u00a0al (2022) BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. https:\/\/doi.org\/10.48550\/arXiv.2201.12086, arXiv:2201.12086","DOI":"10.48550\/arXiv.2201.12086"},{"key":"1628_CR11","doi-asserted-by":"publisher","unstructured":"Li J, Li D, Savarese S, et\u00a0al (2023) BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. https:\/\/doi.org\/10.48550\/arXiv.2301.12597, arXiv:2301.12597","DOI":"10.48550\/arXiv.2301.12597"},{"issue":"5","key":"1628_CR12","doi-asserted-by":"publisher","first-page":"1366","DOI":"10.1007\/s11263-022-01594-9","volume":"130","author":"Y Kong","year":"2022","unstructured":"Kong Y, Fu Y (2022) Human action recognition and prediction: a survey. Int J Comput Vis 130(5):1366\u20131401. https:\/\/doi.org\/10.1007\/s11263-022-01594-9","journal-title":"Int J Comput Vis"},{"key":"1628_CR13","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128645","author":"H Hu","year":"2024","unstructured":"Hu H, Wang X, Zhang Y et al (2024) A comprehensive survey on contrastive learning. Neurocomputing. https:\/\/doi.org\/10.1016\/j.neucom.2024.128645","journal-title":"Neurocomputing"},{"key":"1628_CR14","doi-asserted-by":"publisher","unstructured":"M\u00fcller M (2007) Dynamic time warping. In: M\u00fcller M (Ed) information retrieval for music and motion. 
Springer, Berlin, p 69\u201384.\u00a0https:\/\/doi.org\/10.1007\/978-3-540-74048-3_4","DOI":"10.1007\/978-3-540-74048-3_4"},{"key":"1628_CR15","doi-asserted-by":"publisher","unstructured":"Cuturi M, Blondel M (2017) Soft-DTW: a differentiable loss function for time-series. In: Proceedings of the 34th International Conference on Machine Learning - Volume 70. JMLR.org, Sydney, NSW, Australia, ICML\u201917, pp 894\u2013903.\u00a0https:\/\/doi.org\/10.5555\/3305381.3305474","DOI":"10.5555\/3305381.3305474"},{"key":"1628_CR16","doi-asserted-by":"publisher","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 1, NIPS\u201914, vol\u00a01. MIT Press, Cambridge, MA, USA, pp 568\u2013576. https:\/\/doi.org\/10.5555\/2968826.2968890","DOI":"10.5555\/2968826.2968890"},{"key":"1628_CR17","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Fan H, Malik J, et\u00a0al (2019) SlowFast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 6202\u20136211.\u00a0https:\/\/doi.org\/10.1109\/ICCV.2019.00630","DOI":"10.1109\/ICCV.2019.00630"},{"issue":"1","key":"1628_CR18","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji S, Xu W, Yang M et al (2013) 3D convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231. https:\/\/doi.org\/10.1109\/TPAMI.2012.59","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1628_CR19","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, et\u00a0al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"1628_CR20","doi-asserted-by":"publisher","unstructured":"Hara K, Kataoka H, Satoh Y (2017) Learning spatio-temporal features with 3D residual networks for action recognition. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp 3154\u20133160.\u00a0https:\/\/doi.org\/10.1109\/ICCVW.2017.373","DOI":"10.1109\/ICCVW.2017.373"},{"key":"1628_CR21","doi-asserted-by":"publisher","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et\u00a0al (2020) An image is worth 16x16 words: transformers for image recognition at scale. https:\/\/doi.org\/10.48550\/arXiv.2010.11929, arXiv:2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"key":"1628_CR22","doi-asserted-by":"publisher","unstructured":"Liu Z, Hu H, Lin Y, et\u00a0al (2022) Swin transformer V2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12009\u201312019.\u00a0https:\/\/doi.org\/10.1109\/CVPR52688.2022.01170","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"1628_CR23","doi-asserted-by":"publisher","unstructured":"Yang J, Dong X, Liu L, et\u00a0al (2022) Recurring the transformer for video action recognition. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14063\u201314073.\u00a0https:\/\/doi.org\/10.1109\/CVPR52688.2022.01367","DOI":"10.1109\/CVPR52688.2022.01367"},{"issue":"1","key":"1628_CR24","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-024-01407-4","volume":"28","author":"L Xia","year":"2025","unstructured":"Xia L, Fu W (2025) ST-HViT: spatial-temporal hierarchical vision transformer for action recognition. Pattern Anal Appl 28(1):22. https:\/\/doi.org\/10.1007\/s10044-024-01407-4","journal-title":"Pattern Anal Appl"},{"issue":"10","key":"1628_CR25","doi-asserted-by":"publisher","first-page":"12581","DOI":"10.1109\/TPAMI.2023.3282631","volume":"45","author":"K Li","year":"2023","unstructured":"Li K, Wang Y, Zhang J et al (2023) UniFormer: unifying convolution and self-attention for visual recognition. IEEE Trans Pattern Anal Mach Intell 45(10):12581\u201312600. https:\/\/doi.org\/10.1109\/TPAMI.2023.3282631","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1628_CR26","unstructured":"Li K, Wang Y, He Y, et\u00a0al (2022) UniFormerV2: spatiotemporal learning by arming image ViTs with video uniformer. arXiv:2211.09552"},{"key":"1628_CR27","doi-asserted-by":"publisher","unstructured":"Lee I, Kim D, Kang S, et\u00a0al (2017) Ensemble deep learning for skeleton-based action recognition using temporal sliding LSTM networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp 1012\u20131020. https:\/\/doi.org\/10.1109\/ICCV.2017.115","DOI":"10.1109\/ICCV.2017.115"},{"key":"1628_CR28","doi-asserted-by":"publisher","unstructured":"Graves A (2012) Long short-term memory. In: supervised sequence labelling with recurrent neural networks. Springer Berlin Heidelberg, Berlin, Heidelberg, p 37\u201345, https:\/\/doi.org\/10.1007\/978-3-642-24797-2_4","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"1628_CR29","doi-asserted-by":"publisher","unstructured":"Kipf TN, Welling M (2017) Semi-supervised classification with graph convolutional networks. https:\/\/doi.org\/10.48550\/arXiv.1609.02907, arXiv:1609.02907","DOI":"10.48550\/arXiv.1609.02907"},{"key":"1628_CR30","doi-asserted-by":"publisher","unstructured":"Yan S, Xiong Y, Lin D (2018) Spatial temporal graph convolutional networks for skeleton-based action recognition. Proceedings of the AAAI Conference on Artificial Intelligence 32(1). https:\/\/doi.org\/10.1609\/aaai.v32i1.12328","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"1628_CR31","doi-asserted-by":"publisher","unstructured":"Duan H, Wang J, Chen K, et\u00a0al (2022) PYSKL: towards good practices for skeleton action recognition. https:\/\/doi.org\/10.48550\/arXiv.2205.09443, arXiv:2205.09443","DOI":"10.48550\/arXiv.2205.09443"},{"key":"1628_CR32","doi-asserted-by":"publisher","unstructured":"Chen Y, Zhang Z, Yuan C, et\u00a0al (2021) Channel-wise topology refinement graph convolution for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 13359\u201313368.\u00a0https:\/\/doi.org\/10.1109\/ICCV48922.2021.01311","DOI":"10.1109\/ICCV48922.2021.01311"},{"issue":"4","key":"1628_CR33","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-024-01319-3","volume":"27","author":"H Deng","year":"2024","unstructured":"Deng H, Lin G, Li C et al (2024) Research on decoupled adaptive graph convolution networks based on skeleton data for action recognition. Pattern Anal Appl 27(4):118. 
https:\/\/doi.org\/10.1007\/s10044-024-01319-3","journal-title":"Pattern Anal Appl"},{"key":"1628_CR34","doi-asserted-by":"publisher","unstructured":"Zhou Y, Yan X, Cheng ZQ, et\u00a0al (2024) BlockGCN: redefine topology awareness for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2049\u20132058.\u00a0https:\/\/doi.org\/10.1109\/CVPR52733.2024.00200","DOI":"10.1109\/CVPR52733.2024.00200"},{"key":"1628_CR35","doi-asserted-by":"crossref","unstructured":"Liu H, Liu Y, Ren M, et\u00a0al (2025) Revealing key details to see differences: a novel prototypical perspective for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 29248\u201329257","DOI":"10.1109\/CVPR52734.2025.02723"},{"issue":"3","key":"1628_CR36","doi-asserted-by":"publisher","first-page":"1303","DOI":"10.1007\/s10044-023-01156-w","volume":"26","author":"Q Cheng","year":"2023","unstructured":"Cheng Q, Cheng J, Ren Z et al (2023) Multi-scale spatial-temporal convolutional neural network for skeleton-based action recognition. Pattern Anal Appl 26(3):1303\u20131315. https:\/\/doi.org\/10.1007\/s10044-023-01156-w","journal-title":"Pattern Anal Appl"},{"issue":"9","key":"1628_CR37","doi-asserted-by":"publisher","first-page":"10978","DOI":"10.1109\/JSEN.2021.3062261","volume":"21","author":"Z Ahmad","year":"2021","unstructured":"Ahmad Z, Khan N (2021) Inertial sensor data to image encoding for human action recognition. IEEE Sens J 21(9):10978\u201310988. https:\/\/doi.org\/10.1109\/JSEN.2021.3062261","journal-title":"IEEE Sens J"},{"issue":"7","key":"1628_CR38","doi-asserted-by":"publisher","first-page":"5165","DOI":"10.1007\/s00521-022-07911-0","volume":"35","author":"A Sarkar","year":"2023","unstructured":"Sarkar A, Hossain SKS, Sarkar R (2023) Human activity recognition from sensor data using spatial attention-aided CNN with genetic algorithm. Neural Comput Appl 35(7):5165\u20135191. https:\/\/doi.org\/10.1007\/s00521-022-07911-0","journal-title":"Neural Comput Appl"},{"key":"1628_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2025.113373","volume":"319","author":"Z Huang","year":"2025","unstructured":"Huang Z, Deng J, Wang S, Tang C, Xiao S (2025) TFC: Time\u2013frequency contrasting network for wearable-based human activity recognition. Knowl Based Syst 319:113373. https:\/\/doi.org\/10.1016\/j.knosys.2025.113373","journal-title":"Knowl Based Syst"},{"issue":"6","key":"1628_CR40","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-021-01249-8","volume":"32","author":"SY Boulahia","year":"2021","unstructured":"Boulahia SY, Amamra A, Madi MR et al (2021) Early, intermediate and late fusion strategies for robust deep learning-based multimodal action recognition. Mach Vis Appl 32(6):121. https:\/\/doi.org\/10.1007\/s00138-021-01249-8","journal-title":"Mach Vis Appl"},{"key":"1628_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107356","volume":"104","author":"J Li","year":"2020","unstructured":"Li J, Xie X, Pan Q et al (2020) SGM-Net: skeleton-guided multimodal network for action recognition. Pattern Recognit 104:107356. https:\/\/doi.org\/10.1016\/j.patcog.2020.107356","journal-title":"Pattern Recognit"},{"key":"1628_CR42","doi-asserted-by":"publisher","unstructured":"Tran D, Wang H, Torresani L, et\u00a0al (2018) A closer look at spatiotemporal convolutions for action recognition. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6450\u20136459.\u00a0https:\/\/doi.org\/10.1109\/CVPR.2018.00675","DOI":"10.1109\/CVPR.2018.00675"},{"key":"1628_CR43","doi-asserted-by":"publisher","unstructured":"Vaezi\u00a0Joze HR, Shaban A, Iuzzolino ML, et\u00a0al (2020) MMTM: multimodal transfer module for CNN fusion. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 13286\u201313296.\u00a0https:\/\/doi.org\/10.1109\/CVPR42600.2020.01330","DOI":"10.1109\/CVPR42600.2020.01330"},{"issue":"4","key":"1628_CR44","doi-asserted-by":"publisher","first-page":"1307","DOI":"10.1007\/s10044-018-0727-y","volume":"22","author":"M Farrajota","year":"2019","unstructured":"Farrajota M, Rodrigues JMF, du Buf JMH (2019) Human action recognition in videos with articulated pose information by deep networks. Pattern Anal Appl 22(4):1307\u20131318. https:\/\/doi.org\/10.1007\/s10044-018-0727-y","journal-title":"Pattern Anal Appl"},{"key":"1628_CR45","doi-asserted-by":"publisher","unstructured":"Duhme M, Memmesheimer R, Paulus D (2021) Fusion-GCN: multimodal action recognition using graph convolutional networks. In: DAGM German Conference on Pattern Recognition. Springer, Bonn, Germany, pp 265\u2013281.\u00a0https:\/\/doi.org\/10.1007\/978-3-030-92659-5_17","DOI":"10.1007\/978-3-030-92659-5_17"},{"key":"1628_CR46","doi-asserted-by":"publisher","unstructured":"Liu S, Wang X, Xiong R, et\u00a0al (2024) GCN-based multi-modality fusion network for action recognition. IEEE Transactions on Multimedia pp 1\u201313. https:\/\/doi.org\/10.1109\/TMM.2024.3521749","DOI":"10.1109\/TMM.2024.3521749"},{"key":"1628_CR47","doi-asserted-by":"publisher","first-page":"2573","DOI":"10.1109\/TMM.2022.3148588","volume":"25","author":"S Liu","year":"2023","unstructured":"Liu S, Ma X (2023) Attention-driven appearance-motion fusion network for action recognition. IEEE Trans Multimedia 25:2573\u20132584. https:\/\/doi.org\/10.1109\/TMM.2022.3148588","journal-title":"IEEE Trans Multimedia"},{"key":"1628_CR48","doi-asserted-by":"publisher","unstructured":"Ahn D, Kim S, Hong H, et\u00a0al (2023) STAR-transformer: a spatio-temporal cross attention transformer for human action recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 3330\u20133339.\u00a0https:\/\/doi.org\/10.1109\/WACV56688.2023.00333","DOI":"10.1109\/WACV56688.2023.00333"},{"issue":"23","key":"1628_CR49","doi-asserted-by":"publisher","first-page":"28446","DOI":"10.1007\/s10489-023-04978-7","volume":"53","author":"D Ahn","year":"2023","unstructured":"Ahn D, Kim S, Ko BC (2023) STAR++: rethinking spatio-temporal cross attention transformer for video action recognition. Appl Intell 53(23):28446\u201328459. https:\/\/doi.org\/10.1007\/s10489-023-04978-7","journal-title":"Appl Intell"},{"key":"1628_CR50","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TIM.2025.3588988","volume":"74","author":"X Liu","year":"2025","unstructured":"Liu X, Yuan G, Zhang Y et al (2025) Enhancing multimodal action recognition with semantic-guided feature disentangled and multichannel adaptive feature fusion. IEEE Trans Instrum Meas 74:1\u201317. https:\/\/doi.org\/10.1109\/TIM.2025.3588988","journal-title":"IEEE Trans Instrum Meas"},{"key":"1628_CR51","doi-asserted-by":"publisher","unstructured":"He K, Fan H, Wu Y, et\u00a0al (2020) Momentum contrast for unsupervised visual representation learning. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9729\u20139738.\u00a0https:\/\/doi.org\/10.1109\/CVPR42600.2020.00975","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"1628_CR52","doi-asserted-by":"publisher","unstructured":"Cui J, Zhong Z, Liu S, et\u00a0al (2021) Parametric contrastive learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 715\u2013724.\u00a0https:\/\/doi.org\/10.1109\/ICCV48922.2021.00075","DOI":"10.1109\/ICCV48922.2021.00075"},{"key":"1628_CR53","doi-asserted-by":"publisher","unstructured":"Yeh CH, Hong CY, Hsu YC, et\u00a0al (2022) Decoupled contrastive learning. In: Avidan S, Brostow G, Ciss\u00e9 M, et\u00a0al (Eds) Computer Vision \u2013 ECCV 2022. Springer Nature Switzerland, Cham, Lecture Notes in Computer Science, pp 668\u2013684.\u00a0https:\/\/doi.org\/10.1007\/978-3-031-19809-0_38","DOI":"10.1007\/978-3-031-19809-0_38"},{"key":"1628_CR54","doi-asserted-by":"publisher","unstructured":"Cui J, Zhong Z, Tian Z, et\u00a0al (2023) Generalized parametric contrastive learning. IEEE transactions on pattern analysis and machine intelligence pp 1\u201312. https:\/\/doi.org\/10.1109\/TPAMI.2023.3278694","DOI":"10.1109\/TPAMI.2023.3278694"},{"key":"1628_CR55","doi-asserted-by":"publisher","first-page":"395","DOI":"10.1109\/TIP.2023.3338410","volume":"33","author":"S Guan","year":"2024","unstructured":"Guan S, Yu X, Huang W et al (2024) DMMG: dual min-max games for self-supervised skeleton-based action recognition. IEEE Trans Image Process 33:395\u2013407. https:\/\/doi.org\/10.1109\/TIP.2023.3338410","journal-title":"IEEE Trans Image Process"},{"key":"1628_CR56","doi-asserted-by":"publisher","unstructured":"Jin Z, Wang Y, Wang Q, et\u00a0al (2023) SSRL: self-supervised spatial-temporal representation learning for 3D action recognition. IEEE Transactions on Circuits and Systems for Video Technology pp 1\u20131. https:\/\/doi.org\/10.1109\/TCSVT.2023.3284493","DOI":"10.1109\/TCSVT.2023.3284493"},{"key":"1628_CR57","doi-asserted-by":"publisher","first-page":"624","DOI":"10.1109\/TMM.2021.3129616","volume":"25","author":"S Xu","year":"2023","unstructured":"Xu S, Rao H, Hu X et al (2023) Prototypical contrast and reverse prediction: unsupervised skeleton based action recognition. IEEE Trans Multimedia 25:624\u2013634. https:\/\/doi.org\/10.1109\/TMM.2021.3129616","journal-title":"IEEE Trans Multimedia"},{"issue":"2","key":"1628_CR58","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-025-01472-3","volume":"28","author":"Z Ren","year":"2025","unstructured":"Ren Z, Liu R, Qin Y et al (2025) Masked cosine similarity prediction for self-supervised skeleton-based action representation learning. Pattern Anal Appl 28(2):86. https:\/\/doi.org\/10.1007\/s10044-025-01472-3","journal-title":"Pattern Anal Appl"},{"key":"1628_CR59","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111377","volume":"162","author":"R Liu","year":"2025","unstructured":"Liu R, Liu Y, Wu M, Xin W, Miao Q, Liu X, Li L (2025) SG-CLR: Semantic representation-guided contrastive learning for self-supervised skeleton-based action recognition. Pattern Recognition 162:111377. https:\/\/doi.org\/10.1016\/j.patcog.2025.111377","journal-title":"Pattern Recognition"},{"key":"1628_CR60","doi-asserted-by":"publisher","unstructured":"Haresh S, Kumar S, Coskun H, et\u00a0al (2021) Learning by aligning videos in time. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5548\u20135558.\u00a0https:\/\/doi.org\/10.1109\/CVPR46437.2021.00550","DOI":"10.1109\/CVPR46437.2021.00550"},{"key":"1628_CR61","doi-asserted-by":"publisher","unstructured":"Ni X, Liu Y, Wen H, et\u00a0al (2024) Multimodal prototype-enhanced network for few-shot action recognition. In: Proceedings of the 2024 International Conference on Multimedia Retrieval. Association for Computing Machinery, New York, NY, USA, ICMR \u201924, pp 1\u201310.\u00a0https:\/\/doi.org\/10.1145\/3652583.3658044","DOI":"10.1145\/3652583.3658044"},{"issue":"5555\/3157096","key":"1628_CR62","first-page":"3157304","volume":"10","author":"K Sohn","year":"2016","unstructured":"Sohn K (2016) Improved deep metric learning with multi-class n-pair loss objective. Adv Neur Inf Process Syst doi 10(5555\/3157096):3157304","journal-title":"Adv Neur Inf Process Syst doi"},{"key":"1628_CR63","doi-asserted-by":"publisher","unstructured":"Yuan J, Liu C, Liu C, et\u00a0al (2022) Real-time human falling recognition via spatial and temporal self-attention augmented graph convolutional network. In: 2022 IEEE International Conference on Real-Time Computing and Robotics (RCAR). IEEE, Guiyang, China, pp 438\u2013443.\u00a0https:\/\/doi.org\/10.1109\/RCAR54675.2022.9872276","DOI":"10.1109\/RCAR54675.2022.9872276"},{"key":"1628_CR64","doi-asserted-by":"publisher","unstructured":"Chen C, Jafari R, Kehtarnavaz N (2015) UTD-MHAD: a multimodal dataset for human action recognition utilizing a depth camera and a wearable inertial sensor. In: 2015 IEEE International Conference on Image Processing (ICIP). IEEE, pp 168\u2013172.\u00a0https:\/\/doi.org\/10.1109\/ICIP.2015.7350781","DOI":"10.1109\/ICIP.2015.7350781"},{"key":"1628_CR65","doi-asserted-by":"publisher","unstructured":"Kong Q, Wu Z, Deng Z, et\u00a0al (2019) MMAct: a large-scale dataset for cross modal human action understanding. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 8657\u20138666.\u00a0https:\/\/doi.org\/10.1109\/ICCV.2019.00875","DOI":"10.1109\/ICCV.2019.00875"},{"key":"1628_CR66","doi-asserted-by":"publisher","unstructured":"Cao Z, Simon T, Wei SE, et\u00a0al (2017) Realtime multi-person 2d pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 7291\u20137299.\u00a0https:\/\/doi.org\/10.1109\/CVPR.2017.143","DOI":"10.1109\/CVPR.2017.143"},{"issue":"11","key":"1628_CR67","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten L, Hinton G (2008) Visualizing data using t-SNE. J Mach Learn Res 9(11):2579\u2013605","journal-title":"J Mach Learn Res"},{"key":"1628_CR68","doi-asserted-by":"publisher","unstructured":"Ni X, Liu Y, Wen H, et\u00a0al (2024) Multimodal prototype-enhanced network for few-shot action recognition. In: Proceedings of the 2024 International Conference on Multimedia Retrieval. Association for Computing Machinery, New York, NY, USA, ICMR \u201924, pp 1\u201310. 
https:\/\/doi.org\/10.1145\/3652583.3658044","DOI":"10.1145\/3652583.3658044"}],"container-title":["Pattern Analysis and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-026-01628-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10044-026-01628-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-026-01628-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T10:38:34Z","timestamp":1773484714000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10044-026-01628-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,9]]},"references-count":68,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["1628"],"URL":"https:\/\/doi.org\/10.1007\/s10044-026-01628-9","relation":{},"ISSN":["1433-7541","1433-755X"],"issn-type":[{"value":"1433-7541","type":"print"},{"value":"1433-755X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,9]]},"assertion":[{"value":"21 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"44"}}
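The record above is a Crossref REST API work response (message-type "work"). A minimal sketch, assuming Python 3 with network access to the public api.crossref.org endpoint (the /works/{doi} route is the standard Crossref query for a record like this), showing how it can be retrieved and a few of the fields shown above read back:

import json
import urllib.request

# DOI taken from the record above; the endpoint pattern is the
# public Crossref REST API (no authentication required).
DOI = "10.1007/s10044-026-01628-9"
url = "https://api.crossref.org/works/" + DOI

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)  # top level: status / message-type / message

msg = record["message"]  # the work metadata object, as in the record above
print(msg["title"][0])             # article title
print(msg["container-title"][0])   # journal: Pattern Analysis and Applications
print(msg["DOI"], msg["type"])     # identifier and work type
print(len(msg.get("reference", [])), "references deposited")  # expect 68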