{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T17:45:20Z","timestamp":1772300720753,"version":"3.50.1"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,8,17]],"date-time":"2024-08-17T00:00:00Z","timestamp":1723852800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,17]],"date-time":"2024-08-17T00:00:00Z","timestamp":1723852800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["No. 51678075"],"award-info":[{"award-number":["No. 51678075"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Science and Technology Project of Hunan","award":["No. 2017GK2271"],"award-info":[{"award-number":["No. 
2017GK2271"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s00138-024-01598-0","type":"journal-article","created":{"date-parts":[[2024,8,17]],"date-time":"2024-08-17T20:10:07Z","timestamp":1723925407000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Cmf-transformer: cross-modal fusion transformer for human action recognition"],"prefix":"10.1007","volume":"35","author":[{"given":"Jun","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Limin","family":"Xia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Wen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,17]]},"reference":[{"key":"1598_CR1","doi-asserted-by":"publisher","first-page":"690","DOI":"10.1007\/s10489-020-01823-z","volume":"51","author":"O Elharrouss","year":"2020","unstructured":"Elharrouss, O., Almaadeed, N., Al-Maadeed, S.A., Bouridane, A., Beghdadi, A.: A combined multiple action recognition and summarization for surveillance video sequences. Appl. Intell. 51, 690\u2013712 (2020)","journal-title":"Appl. Intell."},{"key":"1598_CR2","doi-asserted-by":"publisher","unstructured":"Baradel, F., Wolf, C., Mille, J., Taylor, G.W.: Glimpse clouds: Human activity recognition from unstructured feature points. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 469\u2013478 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00056","DOI":"10.1109\/CVPR.2018.00056"},{"key":"1598_CR3","doi-asserted-by":"publisher","unstructured":"Wang, Z., She, Q., Smolic, A.: Action-net: Multipath excitation for action recognition. 
In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13209\u201313218 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01301","DOI":"10.1109\/CVPR46437.2021.01301"},{"key":"1598_CR4","doi-asserted-by":"publisher","unstructured":"Liu, X., Pintea, S.L., Nejadasl, F.K., Booij, O., van Gemert, J.C.: No frame left behind: Full video action recognition. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14887\u201314896 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01465","DOI":"10.1109\/CVPR46437.2021.01465"},{"key":"1598_CR5","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108487","volume":"124","author":"V Mazzia","year":"2022","unstructured":"Mazzia, V., Angarano, S., Salvetti, F., Angelini, F., Chiaberge, M.: Action transformer: a self-attention model for short-time pose-based human action recognition. Pattern Recogn. 124, 108487 (2022). https:\/\/doi.org\/10.1016\/j.patcog.2021.108487","journal-title":"Pattern Recogn."},{"key":"1598_CR6","doi-asserted-by":"publisher","unstructured":"Chen, Y., Zhang, Z., Yuan, C., Li, B., Deng, Y., Hu, W.: Channel-wise topology refinement graph convolution for skeleton-based action recognition. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13339\u201313348 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.01311","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"1598_CR7","doi-asserted-by":"publisher","unstructured":"Cheng, K., Zhang, Y., He, X., Chen, W., Cheng, J., Lu, H.: Skeleton-based action recognition with shift graph convolutional network. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 180\u2013189 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00026","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"1598_CR8","doi-asserted-by":"crossref","unstructured":"Duan, H., Zhao, Y., Chen, K., Lin, D., Dai, B.: Revisiting skeleton-based action recognition. 
In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18-24, 2022, pp. 2959\u20132968 (2022). doi:10.1109\/CVPR52688.2022.00298","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"1598_CR9","doi-asserted-by":"publisher","unstructured":"Feichtenhofer, C.: X3d: expanding architectures for efficient video recognition. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 200\u2013210 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00028","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"1598_CR10","doi-asserted-by":"publisher","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00675","DOI":"10.1109\/CVPR.2018.00675"},{"key":"1598_CR11","doi-asserted-by":"publisher","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lucic, M., Schmid, C.: Vivit: A video vision transformer. In: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021, pp. 6816\u20136826 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00676","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"1598_CR12","doi-asserted-by":"crossref","unstructured":"Willems, G., Tuytelaars, T., Van\u00a0Gool, L.: An efficient dense and scale-invariant spatio-temporal interest point detector. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) Computer Vision\u2013ECCV 2008, pp. 650\u2013663 (2008)","DOI":"10.1007\/978-3-540-88688-4_48"},{"key":"1598_CR13","doi-asserted-by":"publisher","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: 2013 IEEE International Conference on Computer Vision, pp. 3551\u20133558 (2013). 
https:\/\/doi.org\/10.1109\/ICCV.2013.441","DOI":"10.1109\/ICCV.2013.441"},{"key":"1598_CR14","unstructured":"Peng, X., Wang, L., Cai, Z., Qiao, Y., Peng, Q.: Hybrid super vector with improved dense trajectories for action recognition. (2013)"},{"key":"1598_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2019.115640","volume":"80","author":"Y Yi","year":"2020","unstructured":"Yi, Y., Li, A., Zhou, X.: Human action recognition based on action relevance weighted encoding. Signal Process.: Image Commun. 80, 115640 (2020). https:\/\/doi.org\/10.1016\/j.image.2019.115640","journal-title":"Signal Process.: Image Commun."},{"key":"1598_CR16","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. ArXiv arXiv:abs\/1406.2199 (2014)"},{"key":"1598_CR17","doi-asserted-by":"publisher","first-page":"57267","DOI":"10.1109\/ACCESS.2019.2910604","volume":"7","author":"E Chen","year":"2019","unstructured":"Chen, E., Bai, X., Gao, L., Tinega, H.C., Ding, Y.: A spatiotemporal heterogeneous two-stream network for action recognition. IEEE Access 7, 57267\u201357275 (2019). https:\/\/doi.org\/10.1109\/ACCESS.2019.2910604","journal-title":"IEEE Access"},{"key":"1598_CR18","doi-asserted-by":"publisher","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3d residual networks. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 5534\u20135542 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.590","DOI":"10.1109\/ICCV.2017.590"},{"issue":"1","key":"1598_CR19","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3d convolutional neural networks for human action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35(1), 221\u2013231 (2013). https:\/\/doi.org\/10.1109\/TPAMI.2012.59","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"1598_CR20","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108797","volume":"130","author":"W Dong","year":"2022","unstructured":"Dong, W., Zhang, Z., Song, C., Tan, T.: Identifying the key frames: an attention-aware sampling method for action recognition. Pattern Recogn. 130, 108797 (2022). https:\/\/doi.org\/10.1016\/j.patcog.2022.108797","journal-title":"Pattern Recogn."},{"issue":"3","key":"1598_CR21","doi-asserted-by":"publisher","first-page":"1445","DOI":"10.2298\/CSIS210322042W","volume":"19","author":"Y Wang","year":"2022","unstructured":"Wang, Y., Feng, T., Zheng, Y.: Human action recognition using a depth sequence key-frames based on discriminative collaborative representation classifier for healthcare analytics. Comput. Sci. Inf. Syst. 19(3), 1445\u20131462 (2022). https:\/\/doi.org\/10.2298\/CSIS210322042W","journal-title":"Comput. Sci. Inf. Syst."},{"key":"1598_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.118484","volume":"210","author":"KS Tan","year":"2022","unstructured":"Tan, K.S., Lim, K.M., Lee, C.P., Kwek, L.C.: Bidirectional long short-term memory with temporal dense sampling for human action recognition. Expert Syst. Appl. 210, 118484 (2022). https:\/\/doi.org\/10.1016\/j.eswa.2022.118484","journal-title":"Expert Syst. Appl."},{"key":"1598_CR23","doi-asserted-by":"publisher","unstructured":"Korbar, B., Tran, D., Torresani, L.: Scsampler: Sampling salient clips from video for efficient action recognition. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6231\u20136241 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00633","DOI":"10.1109\/ICCV.2019.00633"},{"issue":"8","key":"1598_CR24","doi-asserted-by":"publisher","first-page":"3729","DOI":"10.1007\/S12652-022-03848-3","volume":"13","author":"SA Khowaja","year":"2022","unstructured":"Khowaja, S.A., Lee, S.: Skeleton-based human action recognition with sequential convolutional-lstm networks and fusion strategies. J. 
Ambient. Intell. Humaniz. Comput. 13(8), 3729\u20133746 (2022). https:\/\/doi.org\/10.1007\/S12652-022-03848-3","journal-title":"J. Ambient. Intell. Humaniz. Comput."},{"key":"1598_CR25","doi-asserted-by":"publisher","first-page":"16868","DOI":"10.1109\/ACCESS.2024.3359234","volume":"12","author":"H Hu","year":"2024","unstructured":"Hu, H., Fang, Y., Han, M., Qi, X.: Multi-scale adaptive graph convolution network for skeleton-based action recognition. IEEE Access 12, 16868\u201316880 (2024). https:\/\/doi.org\/10.1109\/ACCESS.2024.3359234","journal-title":"IEEE Access"},{"issue":"1","key":"1598_CR26","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/TCSVT.2023.3236430","volume":"34","author":"C Wu","year":"2024","unstructured":"Wu, C., Wu, X.-J., Xu, T., Shen, Z., Kittler, J.: Motion complement and temporal multifocusing for skeleton-based action recognition. IEEE Trans. Circuits Syst. Video Technol. 34(1), 34\u201345 (2024). https:\/\/doi.org\/10.1109\/TCSVT.2023.3236430","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1598_CR27","doi-asserted-by":"publisher","unstructured":"Huang, Y.-B., Lin, Y.-X., Aqil, A.F., Chen, Y.-Y., Hua, K.-L.: Graph involutional networks with dynamic feature fusion for skeleton-based action recognition. In: 2024 IEEE International Conference on Consumer Electronics (ICCE), pp. 1\u20136 (2024). https:\/\/doi.org\/10.1109\/ICCE59016.2024.10444342","DOI":"10.1109\/ICCE59016.2024.10444342"},{"key":"1598_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2023.107210","volume":"127","author":"Y Xia","year":"2024","unstructured":"Xia, Y., Gao, Q., Wu, W., Cao, Y.: Skeleton-based action recognition based on multidimensional adaptive dynamic temporal graph convolutional network. Eng. Appl. Artif. Intell. 127, 107210 (2024). https:\/\/doi.org\/10.1016\/j.engappai.2023.107210","journal-title":"Eng. Appl. Artif. 
Intell."},{"key":"1598_CR29","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110188","volume":"148","author":"H Qiu","year":"2024","unstructured":"Qiu, H., Hou, B.: Multi-grained clip focus for skeleton-based action recognition. Pattern Recogn. 148, 110188 (2024). https:\/\/doi.org\/10.1016\/j.patcog.2023.110188","journal-title":"Pattern Recogn."},{"key":"1598_CR30","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126903","volume":"563","author":"Z Zhao","year":"2024","unstructured":"Zhao, Z., Chen, Z., Li, J., Xie, X., Chen, K., Wang, X., Shi, G.: Stdm-transformer: space-time dual multi-scale transformer network for skeleton-based action recognition. Neurocomputing 563, 126903 (2024). https:\/\/doi.org\/10.1016\/j.neucom.2023.126903","journal-title":"Neurocomputing"},{"key":"1598_CR31","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. In: 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021 (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"1598_CR32","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127027","volume":"569","author":"Y Ma","year":"2024","unstructured":"Ma, Y., Wang, R., Zong, M., Ji, W., Wang, Y., Ye, B.: Convolutional transformer network for fine-grained action recognition. Neurocomputing 569, 127027 (2024). https:\/\/doi.org\/10.1016\/j.neucom.2023.127027","journal-title":"Neurocomputing"},{"key":"1598_CR33","doi-asserted-by":"publisher","unstructured":"Yan, S., Xiong, X., Arnab, A., Lu, Z., Zhang, M., Sun, C., Schmid, C.: Multiview transformers for video recognition. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3323\u20133333 (2022). 
https:\/\/doi.org\/10.1109\/CVPR52688.2022.00333","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"1598_CR34","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103219","volume":"208\u2013209","author":"C Plizzari","year":"2021","unstructured":"Plizzari, C., Cannici, M., Matteucci, M.: Skeleton-based action recognition via spatial and temporal transformer networks. Comput. Vis. Image Underst. 208\u2013209, 103219 (2021). https:\/\/doi.org\/10.1016\/j.cviu.2021.103219","journal-title":"Comput. Vis. Image Underst."},{"issue":"8","key":"1598_CR35","doi-asserted-by":"publisher","first-page":"4137","DOI":"10.1109\/TCSVT.2023.3240472","volume":"33","author":"H Liu","year":"2023","unstructured":"Liu, H., Liu, Y., Chen, Y., Yuan, C., Li, B., Hu, W.: Transkeleton: hierarchical spatial-temporal transformer for skeleton-based action recognition. IEEE Trans. Circuits Syst. Video Technol. 33(8), 4137\u20134148 (2023). https:\/\/doi.org\/10.1109\/TCSVT.2023.3240472","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"12","key":"1598_CR36","doi-asserted-by":"publisher","first-page":"15390","DOI":"10.1007\/S10489-022-04302-9","volume":"53","author":"S Chen","year":"2023","unstructured":"Chen, S., Xu, K., Zhu, B., Jiang, X., Sun, T.: Deformable graph convolutional transformer for skeleton-based action recognition. Appl. Intell. 53(12), 15390\u201315406 (2023). https:\/\/doi.org\/10.1007\/S10489-022-04302-9","journal-title":"Appl. Intell."},{"issue":"1","key":"1598_CR37","doi-asserted-by":"publisher","first-page":"116","DOI":"10.1007\/S44196-023-00292-9","volume":"16","author":"Y Sun","year":"2023","unstructured":"Sun, Y., Xu, W., Yu, X., Gao, J., Xia, T.: Integrating vision transformer-based bilinear pooling and attention network fusion of RGB and skeleton features for human action recognition. Int. J. Comput. Intell. Syst. 16(1), 116 (2023). https:\/\/doi.org\/10.1007\/S44196-023-00292-9","journal-title":"Int. J. Comput. Intell. 
Syst."},{"key":"1598_CR38","doi-asserted-by":"publisher","unstructured":"Shahroudy, A., Liu, J., Ng, T.-T., Wang, G.: Ntu rgb+d: a large scale dataset for 3d human activity analysis. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1010\u20131019 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.115","DOI":"10.1109\/CVPR.2016.115"},{"issue":"10","key":"1598_CR39","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2020","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L.-Y., Kot, A.C.: Ntu rgb+d 120: a large-scale benchmark for 3d human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2684\u20132701 (2020). https:\/\/doi.org\/10.1109\/TPAMI.2019.2916873","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1598_CR40","unstructured":"Qin, Z., Liu, Y., Perera, M., Anwar, S., Gedeon, T., Ji, P., Kim, D.: Anubis: review and benchmark skeleton-based action recognition methods with a new dataset. (2022). https:\/\/api.semanticscholar.org\/CorpusID:248512580"},{"key":"1598_CR41","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, P., Suleyman, M., Zisserman, A.: The kinetics human action video dataset. CoRR arXiv:abs\/1705.06950 (2017)"},{"issue":"2","key":"1598_CR42","doi-asserted-by":"publisher","first-page":"1474","DOI":"10.1109\/TPAMI.2022.3157033","volume":"45","author":"Y-F Song","year":"2023","unstructured":"Song, Y.-F., Zhang, Z., Shan, C., Wang, L.: Constructing stronger and faster baselines for skeleton-based action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 45(2), 1474\u20131488 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2022.3157033","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"1598_CR43","doi-asserted-by":"publisher","unstructured":"Chi, H., Ha, M.H., Chi, S., Lee, S.W., Huang, Q., Ramani, K.: Infogcn: Representation learning for human skeleton-based action recognition. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18-24, 2022, pp. 20154\u201320164 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01955","DOI":"10.1109\/CVPR52688.2022.01955"},{"key":"1598_CR44","doi-asserted-by":"publisher","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H.: Decoupled spatial-temporal attention network for skeleton-based action-gesture recognition. In: Ishikawa, H., Liu, C., Pajdla, T., Shi, J. (eds.) Computer Vision - ACCV 2020 - 15th Asian Conference on Computer Vision, Kyoto, Japan, November 30 - December 4, 2020, Revised Selected Papers, Part V. Lecture Notes in Computer Science, vol. 12626, pp. 38\u201353 (2020). https:\/\/doi.org\/10.1007\/978-3-030-69541-5_3","DOI":"10.1007\/978-3-030-69541-5_3"},{"key":"1598_CR45","doi-asserted-by":"publisher","unstructured":"Vaezi\u00a0Joze, H.R., Shaban, A., Iuzzolino, M.L., Koishida, K.: Mmtm: Multimodal transfer module for cnn fusion. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13286\u201313296 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.01330","DOI":"10.1109\/CVPR42600.2020.01330"},{"key":"1598_CR46","doi-asserted-by":"publisher","unstructured":"Ahn, D., Kim, S., Hong, H., Ko, B.: Star-transformer: A spatio-temporal cross attention transformer for human action recognition. In: IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV 2023, Waikoloa, HI, USA, January 2-7, 2023, pp. 3319\u20133328 (2023). 
https:\/\/doi.org\/10.1109\/WACV56688.2023.00333","DOI":"10.1109\/WACV56688.2023.00333"},{"key":"1598_CR47","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107356","volume":"104","author":"J Li","year":"2020","unstructured":"Li, J., Xie, X., Pan, Q., Cao, Y., Zhao, Z., Shi, G.: Sgm-net: skeleton-guided multimodal network for action recognition. Pattern Recogn. 104, 107356 (2020). https:\/\/doi.org\/10.1016\/j.patcog.2020.107356","journal-title":"Pattern Recogn."},{"issue":"3","key":"1598_CR48","doi-asserted-by":"publisher","first-page":"1250","DOI":"10.1109\/TCSVT.2021.3077512","volume":"32","author":"H Wu","year":"2022","unstructured":"Wu, H., Ma, X., Li, Y.: Spatiotemporal multimodal learning with 3d cnns for video action recognition. IEEE Trans. Circuits Syst. Video Technol. 32(3), 1250\u20131261 (2022). https:\/\/doi.org\/10.1109\/TCSVT.2021.3077512","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1598_CR49","doi-asserted-by":"publisher","unstructured":"Das, S., Sharma, S., Dai, R., Br\u00e9mond, F., Thonnat, M.: VPN: learning video-pose embedding for activities of daily living. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J. (eds.) Computer Vision - ECCV 2020 - 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part IX. Lecture Notes in Computer Science, vol. 12354, pp. 72\u201390 (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_5","DOI":"10.1007\/978-3-030-58545-7_5"},{"key":"1598_CR50","doi-asserted-by":"publisher","unstructured":"Yu, B.X.B., Liu, Y., Chan, K.C.C.: Multimodal fusion via teacher-student network for indoor action recognition. In: Thirty-Fifth AAAI Conference on Artificial Intelligence, AAAI 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, IAAI 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, EAAI 2021, Virtual Event, February 2-9, 2021, pp. 3199\u20133207 (2021). 
https:\/\/doi.org\/10.1609\/AAAI.V35I4.16430","DOI":"10.1609\/AAAI.V35I4.16430"},{"issue":"3","key":"1598_CR51","doi-asserted-by":"publisher","first-page":"3522","DOI":"10.1109\/TPAMI.2022.3177813","volume":"45","author":"BXB Yu","year":"2023","unstructured":"Yu, B.X.B., Liu, Y., Zhang, X., Zhong, S., Chan, K.C.C.: Mmnet: a model-based multimodal network for human action recognition in RGB-D videos. IEEE Trans. Pattern Anal. Mach. Intell. 45(3), 3522\u20133538 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2022.3177813","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1598_CR52","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122314","volume":"239","author":"Z Hu","year":"2024","unstructured":"Hu, Z., Xiao, J., Li, L., Liu, C., Ji, G.: Human-centric multimodal fusion network for robust action recognition. Expert Syst. Appl. 239, 122314 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2023.122314","journal-title":"Expert Syst. Appl."},{"key":"1598_CR53","doi-asserted-by":"crossref","unstructured":"Cheng, K., Zhang, Y., Cao, C., Shi, L., Cheng, J., Lu, H.: Decoupling gcn with dropgraph module for skeleton-based action recognition. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision\u2014ECCV 2020, Cham, pp. 536\u2013553 (2020)","DOI":"10.1007\/978-3-030-58586-0_32"},{"key":"1598_CR54","doi-asserted-by":"publisher","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H.: Two-stream adaptive graph convolutional networks for skeleton-based action recognition. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12018\u201312027 (2019). 
https:\/\/doi.org\/10.1109\/CVPR.2019.01230","DOI":"10.1109\/CVPR.2019.01230"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-024-01598-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-024-01598-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-024-01598-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T04:05:59Z","timestamp":1726027559000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-024-01598-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,17]]},"references-count":54,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["1598"],"URL":"https:\/\/doi.org\/10.1007\/s00138-024-01598-0","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,17]]},"assertion":[{"value":"15 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 July 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 August 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 August 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}}],"article-number":"114"}}