{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T17:40:05Z","timestamp":1749318005225,"version":"3.41.0"},"reference-count":84,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T00:00:00Z","timestamp":1743033600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T00:00:00Z","timestamp":1743033600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02415-5","type":"journal-article","created":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T16:15:16Z","timestamp":1743351316000},"page":"4944-4961","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["$$\\hbox {I}^2$$MD: 3D Action Representation Learning with Inter- and Intra-Modal Mutual Distillation"],"prefix":"10.1007","volume":"133","author":[{"given":"Yunyao","family":"Mao","sequence":"first","affiliation":[]},{"given":"Jiajun","family":"Deng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Zhenbo","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[]},{"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,27]]},"reference":[{"key":"2415_CR1","first-page":"12980","volume":"33","author":"S Abbasi Koohpayegani","year":"2020","unstructured":"Abbasi Koohpayegani, S., Tejankar, A., & Pirsiavash, H. (2020). CompRess: Self-supervised learning by compressing representations. Proceedings of the Advances in Neural Information Processing Systems (NeurIPS), 33, 12980\u201312992.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)"},{"issue":"10","key":"2415_CR2","doi-asserted-by":"publisher","first-page":"2481","DOI":"10.1109\/TMM.2019.2960588","volume":"22","author":"D Avola","year":"2020","unstructured":"Avola, D., Cascio, M., Cinque, L., Foresti, G. L., Massaroni, C., & Rodol\u00e0, E. (2020). 2-D skeleton-based action recognition via two-branch stacked LSTM-RNNs. IEEE Transactions on Multimedia (TMM), 22(10), 2481\u20132496. https:\/\/doi.org\/10.1109\/TMM.2019.2960588","journal-title":"IEEE Transactions on Multimedia (TMM)"},{"key":"2415_CR3","first-page":"279","volume":"647","author":"DH Ballard","year":"1987","unstructured":"Ballard, D. H. (1987). Modular learning in neural networks. Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), 647, 279\u2013284.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)"},{"key":"2415_CR4","doi-asserted-by":"crossref","unstructured":"Caetano, C., Bremond, F., & Schwartz, W. (2019). Skeleton image representation for 3d action recognition based on tree structure and reference joints. In: SIBGRAPI Conference on Graphics, Patterns and Images (SIBGRAPI), pp 16\u201323.","DOI":"10.1109\/SIBGRAPI.2019.00011"},{"issue":"01","key":"2415_CR5","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/TPAMI.2019.2929257","volume":"43","author":"Z Cao","year":"2021","unstructured":"Cao, Z., Hidalgo, G., Simon, T., Wei, S., & Sheikh, Y. (2021). OpenPose: Realtime multi-person 2D pose estimation using part affinity fields. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 43(01), 172\u2013186.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"2415_CR6","unstructured":"Chen, X., Fan, H., Girshick, R., & He, K. (2020b). Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297"},{"key":"2415_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020a). A simple framework for contrastive learning of visual representations. In: Proceedings of the International Conference on Machine Learning (ICML), pp 1597\u20131607."},{"key":"2415_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., & He, K. (2021a). An empirical study of training self-supervised vision transformers. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 9640\u20139649.","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"2415_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., Zhang, Z., Yuan, C., Li, B., Deng, Y., & Hu, W. (2021b). Channel-wise topology refinement graph convolution for skeleton-based action recognition. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 13359\u201313368.","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"2415_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, K., Zhang, Y., He, X., Chen, W., Cheng, J., & Lu, H. (2020). Skeleton-based action recognition with shift graph convolutional network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 183\u2013192.","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"2415_CR11","unstructured":"Chunhui, L., Yueyu, H., Yanghao, L., Sijie, S., & Jiaying, L. (2017). PKU-MMD: A large scale benchmark for continuous multi-modal human action understanding. arXiv preprint arXiv:1703.07475"},{"key":"2415_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-Fei, L. (2009). ImageNet: A large-scale hierarchical image database. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 248\u2013255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2415_CR13","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Liu, D., Chen, T., Zhou, W., Zhang, Y., Li, H., & Ouyang, W. (2022). TransVG++: End-to-end visual grounding with language conditioned vision transformer. arXiv preprint arXiv:2206.06619","DOI":"10.1109\/TPAMI.2023.3296823"},{"key":"2415_CR14","unstructured":"Dinh, L., Krueger, D., & Bengio, Y. (2014). NICE: Non-linear independent components estimation. arXiv preprint arXiv:1410.8516"},{"key":"2415_CR15","unstructured":"Dinh, L., Sohl-Dickstein, J., & Bengio, S. (2016). Density estimation using real NVP. arXiv preprint arXiv:1605.08803"},{"key":"2415_CR16","doi-asserted-by":"crossref","unstructured":"Du, Y., Wang, W., & Wang, L. (2015). Hierarchical recurrent neural network for skeleton based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 1110\u20131118.","DOI":"10.1109\/CVPR.2015.7298714"},{"key":"2415_CR17","unstructured":"Fang, Z., Wang, J., Wang, L., Zhang, L., Yang, Y., & Liu, Z. (2021). SEED: Self-supervised distillation for visual representation. In: Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"2415_CR18","doi-asserted-by":"crossref","unstructured":"Fang, H.S., Xie, S., Tai, Y.W., & Lu, C. (2017). RMPE: Regional multi-person pose estimation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 2334\u20132343.","DOI":"10.1109\/ICCV.2017.256"},{"key":"2415_CR19","unstructured":"Franco, L., Mandica, P., Munjal, B., & Galasso, F. (2023). Hyperbolic self-paced learning for self-supervised skeleton-based action representations. arXiv preprint arXiv:2303.06242"},{"key":"2415_CR20","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1109\/TMM.2021.3127040","volume":"25","author":"X Gao","year":"2023","unstructured":"Gao, X., Yang, Y., Zhang, Y., Li, M., Yu, J. G., & Du, S. (2023). Efficient spatio-temporal contrastive learning for skeleton-based 3-d action recognition. IEEE Transactions on Multimedia (TMM), 25, 405\u2013417. https:\/\/doi.org\/10.1109\/TMM.2021.3127040","journal-title":"IEEE Transactions on Multimedia (TMM)"},{"issue":"7","key":"2415_CR21","doi-asserted-by":"publisher","first-page":"2097","DOI":"10.1007\/s11263-021-01470-y","volume":"129","author":"P Gupta","year":"2021","unstructured":"Gupta, P., Thatipelli, A., Aggarwal, A., Maheshwari, S., Trivedi, N., Das, S., & Sarvadevabhatla, R. K. (2021). Quo vadis, skeleton action recognition? International Journal of Computer Vision (IJCV), 129(7), 2097\u20132112.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"2415_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R. (2022). Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2415_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R. (2020). Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 9729\u20139738.","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2415_CR24","unstructured":"Hinton, G., Vinyals, O., Dean, J., et al. (2015). Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531"},{"key":"2415_CR25","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch Normalization: Accelerating deep network training by reducing internal covariate shift. In: Proceedings of the International Conference on Machine Learning (ICML), pp 448\u2013456."},{"key":"2415_CR26","doi-asserted-by":"crossref","unstructured":"Ke, Q., Bennamoun, M., An, S., Sohel, F., & Boussaid, F. (2017). A new representation of skeleton sequences for 3D action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 3288\u20133297.","DOI":"10.1109\/CVPR.2017.486"},{"key":"2415_CR27","doi-asserted-by":"crossref","unstructured":"Kim, B., Chang, H.J., Kim, J., & Choi. J.Y. (2022). Global-local motion transformer for unsupervised skeleton-based action learning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 209\u2013225.","DOI":"10.1007\/978-3-031-19772-7_13"},{"key":"2415_CR28","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., & Serre, T. (2011). HMDB: A large video database for human motion recognition. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 2556\u20132563.","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"2415_CR29","doi-asserted-by":"crossref","unstructured":"Li, M., Chen, S., Chen, X., Zhang, Y., Wang, Y., & Tian, Q. (2019). Actional-structural graph convolutional networks for skeleton-based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 3595\u20133603.","DOI":"10.1109\/CVPR.2019.00371"},{"key":"2415_CR30","doi-asserted-by":"crossref","unstructured":"Li, T., Ke, Q., Rahmani, H., Ho, R.E., Ding, H., & Liu, J. (2021c). Else-Net: Elastic semantic network for continual action recognition from skeleton data. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 13434\u201313443.","DOI":"10.1109\/ICCV48922.2021.01318"},{"key":"2415_CR31","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., Joty, S., Xiong, C., & Hoi, S. (2021a). Align before fuse: Vision and language representation learning with momentum distillation. In: Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"2415_CR32","doi-asserted-by":"crossref","unstructured":"Li, L., Wang, M., Ni, B., Wang, H., Yang, J., & Zhang, W. (2021b). 3D human action representation learning via cross-view consistency pursuit. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 4741\u20134750.","DOI":"10.1109\/CVPR46437.2021.00471"},{"key":"2415_CR33","doi-asserted-by":"crossref","unstructured":"Liang, D., Fan, G., Lin, G., Chen, W., Pan, X., & Zhu, H. (2019). Three-stream convolutional neural network with multi-task and ensemble learning for 3D action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), pp 934\u2013940.","DOI":"10.1109\/CVPRW.2019.00123"},{"key":"2415_CR34","doi-asserted-by":"crossref","unstructured":"Lin, L., Song, S., Yang, W., & Liu, J. (2020). MS2L: Multi-task self-supervised learning for skeleton based action recognition. In: Proceedings of the 28th ACM International Conference on Multimedia (ACM MM), pp 2490\u20132498.","DOI":"10.1145\/3394171.3413548"},{"key":"2415_CR35","doi-asserted-by":"crossref","unstructured":"Lin, L., Zhang, J., & Liu, J. (2023). Actionlet-dependent contrastive learning for unsupervised skeleton-based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 2363\u20132372","DOI":"10.1109\/CVPR52729.2023.00234"},{"key":"2415_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, H., Chen, Z., Wang, Z., & Ouyang, W. (2020b). Disentangling and unifying graph convolutions for skeleton-based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 143\u2013152.","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"2415_CR37","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhang, F., Hou, Z., Mian, L., Wang, Z., Zhang, J., & Tang, J. (2021). Self-supervised learning: Generative or contrastive. IEEE Transactions on Knowledge and Data Engineering (TKDE).","DOI":"10.1109\/TKDE.2021.3090866"},{"issue":"10","key":"2415_CR38","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2020","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L. Y., & Kot, A. C. (2020). NTU RGB+D 120: A large-scale benchmark for 3d human activity understanding. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 42(10), 2684\u20132701.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"2415_CR39","doi-asserted-by":"crossref","unstructured":"Mao, Y., Zhou, W., Lu, Z., Deng, J., & Li, H. (2022). CMD: Self-supervised 3d action representation learning with cross-modal mutual distillation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 734\u2013752.","DOI":"10.1007\/978-3-031-20062-5_42"},{"key":"2415_CR40","doi-asserted-by":"crossref","unstructured":"Misra, I., Zitnick, C.L., & Hebert, M. (2016). Shuffle and learn: Unsupervised learning using temporal order verification. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 527\u2013544.","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"2415_CR41","doi-asserted-by":"crossref","unstructured":"Nie, Q., Liu, Z., & Liu, Y. (2020). Unsupervised 3D human pose representation with viewpoint and pose disentanglement. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 102\u2013118.","DOI":"10.1007\/978-3-030-58529-7_7"},{"issue":"1","key":"2415_CR42","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11263-020-01354-7","volume":"129","author":"Q Nie","year":"2021","unstructured":"Nie, Q., & Liu, Y. (2021). View transfer on human skeleton pose: Automatically disentangle the view-variant and view-invariant information for pose representation learning. International Journal of Computer Vision (IJCV), 129(1), 1\u201322.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"2415_CR43","doi-asserted-by":"crossref","unstructured":"Noroozi, M., & Favaro, P. (2016). Unsupervised learning of visual representations by solving jigsaw puzzles. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 69\u201384.","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"2415_CR44","unstructured":"Ouyang, J., Wu, H., Wang, M., Zhou, W., & Li, H. (2021). Contextual similarity aggregation with self-attention for visual re-ranking. In: Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"2415_CR45","doi-asserted-by":"crossref","unstructured":"Park, W., Kim, D., Lu, Y., & Cho, M. (2019). Relational knowledge distillation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 3967\u20133976.","DOI":"10.1109\/CVPR.2019.00409"},{"key":"2415_CR46","doi-asserted-by":"crossref","unstructured":"Passalis, N., & Tefas, A. (2018). Learning deep representations with probabilistic knowledge transfer. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 268\u2013284.","DOI":"10.1007\/978-3-030-01252-6_17"},{"key":"2415_CR47","doi-asserted-by":"crossref","unstructured":"Peng, B., Jin, X., Liu, J., Li, D., Wu, Y., Liu, Y., Zhou, S., & Zhang, Z. (2019). Correlation congruence for knowledge distillation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 5007\u20135016.","DOI":"10.1109\/ICCV.2019.00511"},{"key":"2415_CR48","doi-asserted-by":"crossref","unstructured":"Rao, H., Leung, C., & Miao, C. (2023). Hierarchical skeleton meta-prototype contrastive learning with hard skeleton mining for unsupervised person re-identification. International Journal of Computer Vision (IJCV) pp 1\u201323.","DOI":"10.1007\/s11263-023-01864-0"},{"key":"2415_CR49","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1016\/j.ins.2021.04.023","volume":"569","author":"H Rao","year":"2021","unstructured":"Rao, H., Xu, S., Hu, X., Cheng, J., & Hu, B. (2021). Augmented skeleton based contrastive action learning with momentum LSTM for unsupervised action recognition. Information Sciences, 569, 90\u2013109.","journal-title":"Information Sciences"},{"key":"2415_CR50","unstructured":"Romero, A., Ballas, N., Kahou, S.E., Chassang, A., Gatta, C., & Bengio, Y. (2015). FitNets: Hints for thin deep nets. In: Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"2415_CR51","doi-asserted-by":"crossref","unstructured":"Shah, A., Roy, A., Shah, K., Mishra, S., Jacobs, D., Cherian, A., & Chellappa, R. (2023). Halp: Hallucinating latent positives for skeleton-based self-supervised learning of actions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 18846\u201318856.","DOI":"10.1109\/CVPR52729.2023.01807"},{"key":"2415_CR52","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., & Wang, G. (2016). NTU RGB+D: A large scale dataset for 3d human activity analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 1010\u20131019.","DOI":"10.1109\/CVPR.2016.115"},{"key":"2415_CR53","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., & Lu, H. (2019a). Skeleton-based action recognition with directed graph neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 7912\u20137921.","DOI":"10.1109\/CVPR.2019.00810"},{"key":"2415_CR54","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., & Lu, H. (2019b). Two-stream adaptive graph convolutional networks for skeleton-based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 12026\u201312035.","DOI":"10.1109\/CVPR.2019.01230"},{"key":"2415_CR55","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., & Lu, H. (2021). AdaSGN: Adapting joint number and model size for efficient skeleton-based action recognition. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 13413\u201313422.","DOI":"10.1109\/ICCV48922.2021.01316"},{"key":"2415_CR56","doi-asserted-by":"crossref","unstructured":"Si, C., Chen, W., Wang, W., Wang, L., & Tan, T. (2019). An attention enhanced graph convolutional LSTM network for skeleton-based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 1227\u20131236.","DOI":"10.1109\/CVPR.2019.00132"},{"key":"2415_CR57","doi-asserted-by":"crossref","unstructured":"Si, C., Nie, X., Wang, W., Wang, L., Tan, T., & Feng, J. (2020). Adversarial self-supervised learning for semi-supervised 3D action recognition. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 35\u201351.","DOI":"10.1007\/978-3-030-58571-6_3"},{"key":"2415_CR58","unstructured":"Soomro, K., Zamir, A.R., & Shah, M. (2012). UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"2415_CR59","doi-asserted-by":"crossref","unstructured":"Su, K., Liu, X., & Shlizerman, E. (2020). PREDICT & CLUSTER: Unsupervised skeleton based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 9631\u20139640.","DOI":"10.1109\/CVPR42600.2020.00965"},{"key":"2415_CR60","doi-asserted-by":"crossref","unstructured":"Tejankar, A., Koohpayegani, S.A., Pillai, V., Favaro, P., & Pirsiavash, H. (2021). ISD: Self-supervised learning by iterative similarity distillation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 9609\u20139618.","DOI":"10.1109\/ICCV48922.2021.00947"},{"key":"2415_CR61","doi-asserted-by":"crossref","unstructured":"Thoker, F.M., Doughty, H., & Snoek, C.G. (2021). Skeleton-contrastive 3D action representation learning. In: Proceedings of the 29th ACM International Conference on Multimedia (ACM MM), pp 1655\u20131663.","DOI":"10.1145\/3474085.3475307"},{"key":"2415_CR62","unstructured":"Tianyu, G., Hong, L., Zhan, C., Mengyuan, L., Tao, W., & Runwei, D. (2022). Contrastive learning from extremely augmented skeleton sequences for self-supervised action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)."},{"key":"2415_CR63","doi-asserted-by":"crossref","unstructured":"Tung, F., & Mori, G. (2019). Similarity-preserving knowledge distillation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 1365\u20131374.","DOI":"10.1109\/ICCV.2019.00145"},{"key":"2415_CR64","unstructured":"Van Den\u00a0Oord, A., Kalchbrenner, N., & Kavukcuoglu, K. (2016). Pixel recurrent neural networks. In: Proceedings of the International conference on machine learning, pp 1747\u20131756."},{"issue":"11","key":"2415_CR65","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. Journal of Machine Learning Research (JMLR), 9(11), 2579\u20132605.","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"2415_CR66","unstructured":"Van\u00a0den Oord, A., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. arXiv preprint arXiv:1809.03327"},{"key":"2415_CR67","unstructured":"van\u00a0den Oord, A., Vinyals, O., & kavukcuoglu, k. (2017). Neural discrete representation learning. In: Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"2415_CR68","unstructured":"Wang, M., Ni, B., & Yang, X. (2020). Learning multi-view interactional skeleton graph for action recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)."},{"key":"2415_CR69","doi-asserted-by":"crossref","unstructured":"Wang, N., Zhou, W., & Li, H. (2021). Contrastive transformation for self-supervised correspondence learning. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp 10174\u201310182.","DOI":"10.1609\/aaai.v35i11.17220"},{"key":"2415_CR70","doi-asserted-by":"crossref","unstructured":"Wu, H., Wang, M., Zhou, W., Li, H., & Tian, Q. (2022). Contextual similarity distillation for asymmetric image retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 9489\u20139498.","DOI":"10.1109\/CVPR52688.2022.00927"},{"key":"2415_CR71","doi-asserted-by":"crossref","unstructured":"Xu, J., Yu, Z., Ni, B., Yang, J., Yang, X., & Zhang, W. (2020). Deep kinematics analysis for monocular 3D human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 899\u2013908.","DOI":"10.1109\/CVPR42600.2020.00098"},{"key":"2415_CR72","doi-asserted-by":"publisher","first-page":"624","DOI":"10.1109\/TMM.2021.3129616","volume":"25","author":"S Xu","year":"2023","unstructured":"Xu, S., Rao, H., Hu, X., Cheng, J., & Hu, B. (2023). Prototypical contrast and reverse prediction: Unsupervised skeleton based action recognition. IEEE Transactions on Multimedia (TMM), 25, 624\u2013634. https:\/\/doi.org\/10.1109\/TMM.2021.3129616","journal-title":"IEEE Transactions on Multimedia (TMM)"},{"key":"2415_CR73","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., & Lin, D. (2018). Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp 7444\u20137452.","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"2415_CR74","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., & Le, Q.V. (2019). XLNet: Generalized autoregressive pretraining for language understanding. In: Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"2415_CR75","doi-asserted-by":"crossref","unstructured":"Yang, S., Liu, J., Lu, S., Er, M.H., & Kot, A.C. (2021b). Skeleton cloud colorization for unsupervised 3D action representation learning. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 13423\u201313433.","DOI":"10.1109\/ICCV48922.2021.01317"},{"key":"2415_CR76","doi-asserted-by":"crossref","unstructured":"Yang, S., Liu, J., Lu, S., Hwa, E.M., Hu, Y., & Kot, A.C. (2023). Self-supervised 3d action representation learning with skeleton cloud colorization. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2023.3325463"},{"key":"2415_CR77","doi-asserted-by":"publisher","first-page":"883","DOI":"10.1109\/TMM.2020.2990082","volume":"23","author":"J Yang","year":"2021","unstructured":"Yang, J., Liu, W., Yuan, J., & Mei, T. (2021). Hierarchical soft quantization for skeleton-based human action recognition. IEEE Transactions on Multimedia (TMM), 23, 883\u2013898. https:\/\/doi.org\/10.1109\/TMM.2020.2990082","journal-title":"IEEE Transactions on Multimedia (TMM)"},{"key":"2415_CR78","doi-asserted-by":"crossref","unstructured":"Zhang, H., Hou, Y., Zhang, W., & Li, W. (2022). Contrastive positive mining for unsupervised 3D action representation learning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 36\u201351.","DOI":"10.1007\/978-3-031-19772-7_3"},{"key":"2415_CR79","doi-asserted-by":"crossref","unstructured":"Zhang, P., Lan, C., Zeng, W., Xing, J., Xue, J., & Zheng, N. (2020a). Semantics-guided neural networks for efficient skeleton-based human action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 1112\u20131121.","DOI":"10.1109\/CVPR42600.2020.00119"},{"key":"2415_CR80","doi-asserted-by":"crossref","unstructured":"Zhang, J., Lin, L., & Liu, J. (2023a). Hierarchical consistent contrastive learning for skeleton-based action recognition with growing augmentations. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp 3427\u20133435.","DOI":"10.1609\/aaai.v37i3.25451"},{"key":"2415_CR81","doi-asserted-by":"crossref","unstructured":"Zhang, X., Xu, C., & Tao, D. (2020b). Context aware graph convolution for skeleton-based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 14333\u201314342.","DOI":"10.1109\/CVPR42600.2020.01434"},{"issue":"8","key":"2415_CR82","doi-asserted-by":"publisher","first-page":"1963","DOI":"10.1109\/TPAMI.2019.2896631","volume":"41","author":"P Zhang","year":"2019","unstructured":"Zhang, P., Lan, C., Xing, J., Zeng, W., Xue, J., & Zheng, N. (2019). View adaptive neural networks for high performance skeleton-based human action recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 41(8), 1963\u20131978.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"2415_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, S., Wang, C., Nie, L., Yao, H., Huang, Q., & Tian, Q. (2023). Learning enriched hop-aware correlation for robust 3d human pose estimation. International Journal of Computer Vision (IJCV), 131(6), 1566\u20131583.","DOI":"10.1007\/s11263-023-01770-5"},{"key":"2415_CR84","doi-asserted-by":"crossref","unstructured":"Zheng, N., Wen, J., Liu, R., Long, L., Dai, J., & Gong, Z. (2018). Unsupervised representation learning with long-term dynamics for skeleton based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp 2644\u20132651.","DOI":"10.1609\/aaai.v32i1.11853"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02415-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02415-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02415-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T17:02:12Z","timestamp":1749315732000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02415-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,27]]},"references-count":84,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2415"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02415-5","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,3,27]]},"assertion":[{"value":"23 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 April 2025","order":4,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":5,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The repeated text in all tables has been removed.","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}]}}