{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:20:25Z","timestamp":1771950025936,"version":"3.50.1"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2020,3,2]],"date-time":"2020-03-02T00:00:00Z","timestamp":1583107200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,3,2]],"date-time":"2020-03-02T00:00:00Z","timestamp":1583107200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,5]]},"DOI":"10.1007\/s11042-019-08576-z","type":"journal-article","created":{"date-parts":[[2020,3,2]],"date-time":"2020-03-02T14:08:59Z","timestamp":1583158139000},"page":"16185-16203","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":40,"title":["Multi-modality learning for human action recognition"],"prefix":"10.1007","volume":"80","author":[{"given":"Ziliang","family":"Ren","sequence":"first","affiliation":[]},{"given":"Qieshi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xiangyang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Pengyi","family":"Hao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3131-3275","authenticated-orcid":false,"given":"Jun","family":"Cheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,3,2]]},"reference":[{"issue":"11","key":"8576_CR1","doi-asserted-by":"publisher","first-page":"14115","DOI":"10.1007\/s11042-017-5017-y","volume":"77","author":"M Asadi-Aghbolaghi","year":"2018","unstructured":"Asadi-Aghbolaghi M, Kasaei S (2018) Supervised spatio-temporal kernel descriptor for human action recognition from RGB-depth videos. Multimed Tools Appl 77(11):14115\u201314135","journal-title":"Multimed Tools Appl"},{"key":"8576_CR2","unstructured":"Baradel F, Wolf C, Mille J (2018) Human activity recognition with pose-driven attention to RGB. In: British machine vision conference (BMVC), pp 1\u201314"},{"key":"8576_CR3","doi-asserted-by":"crossref","unstructured":"Bilen H, Fernando B, Gavves E, Vedaldi A, Gould S (2016) Dynamic image networks for action recognition. In: IEEE conference on computer vision and pattern recognition (CVPR), pp 3034\u20133042","DOI":"10.1109\/CVPR.2016.331"},{"key":"8576_CR4","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L, Li K, Li F (2009) Imagenet: a large-scale hierarchical image database. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"issue":"4","key":"8576_CR5","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1109\/TPAMI.2016.2599174","volume":"39","author":"J Donahue","year":"2017","unstructured":"Donahue J, Hendricks LA, Rohrbach M, Venugopalan S, Guadarrama S, Saenko K, Darrell T (2017) Long-term recurrent convolutional networks for visual recognition and description. IEEE Trans Pattern Anal Mach Intell (TPAMI) 39 (4):677\u2013691","journal-title":"IEEE Trans Pattern Anal Mach Intell (TPAMI)"},{"issue":"4","key":"8576_CR6","doi-asserted-by":"publisher","first-page":"773","DOI":"10.1109\/TPAMI.2016.2558148","volume":"39","author":"B Fernando","year":"2017","unstructured":"Fernando B, Gavves E, Oramas MJ, Ghodrati A, Tuytelaars T (2017) Rank pooling for action recognition. IEEE Trans Pattern Anal Mach Intell (TPAMI) 39(4):773\u2013787","journal-title":"IEEE Trans Pattern Anal Mach Intell (TPAMI)"},{"key":"8576_CR7","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"11","key":"8576_CR8","doi-asserted-by":"publisher","first-page":"2186","DOI":"10.1109\/TPAMI.2016.2640292","volume":"39","author":"J Hu","year":"2017","unstructured":"Hu J, Zheng W, Lai J, Zhang J (2017) Jointly learning heterogeneous features for RGB-D activity recognition. IEEE Trans Pattern Anal Mach Intellgence (TPAMI) 39(11):2186\u20132200","journal-title":"IEEE Trans Pattern Anal Mach Intellgence (TPAMI)"},{"key":"8576_CR9","doi-asserted-by":"publisher","first-page":"504","DOI":"10.1016\/j.patcog.2017.07.013","volume":"72","author":"EP Ijjina","year":"2017","unstructured":"Ijjina EP, Chalavadi KM (2017) Human action recognition in RGB-D videos using motion sequence information and deep learning. Pattern Recogn 72:504\u2013516","journal-title":"Pattern Recogn"},{"key":"8576_CR10","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate SHIFT. In: 32nd International conference on machine learning (ICML), vol 1, pp 448\u2013 456"},{"key":"8576_CR11","doi-asserted-by":"crossref","unstructured":"Ji Y, Ye G, Cheng H (2014) Interactive body part contrast mining for human interaction recognition. In: IEEE International conference on multimedia and expo workshops (ICMEW), pp 1\u20136","DOI":"10.1109\/ICMEW.2014.6890714"},{"key":"8576_CR12","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1016\/j.knosys.2017.01.035","volume":"122","author":"X Ji","year":"2017","unstructured":"Ji X, Cheng J, Tao D, Wu X, Feng W (2017) The spatial Laplacian and temporal energy pyramid representation for human action recognition using depth sequences. Knowl-Based Syst 122:64\u201374","journal-title":"Knowl-Based Syst"},{"key":"8576_CR13","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1016\/j.sigpro.2017.08.016","volume":"143","author":"X Ji","year":"2018","unstructured":"Ji X, Cheng J, Feng W, Tao D (2018) Skeleton embedded motion body partition for human action recognition using depth sequences. Signal Process 143:56\u201368","journal-title":"Signal Process"},{"issue":"11","key":"8576_CR14","doi-asserted-by":"publisher","first-page":"3781","DOI":"10.1109\/TIP.2015.2456412","volume":"24","author":"Y Jiang","year":"2015","unstructured":"Jiang Y, Dai Q, Liu W, Xue X, Ngo C (2015) Human action recognition in unconstrained videos by explicit motion modeling. IEEE Trans Image Process (TIP) 24(11):3781\u20133795","journal-title":"IEEE Trans Image Process (TIP)"},{"key":"8576_CR15","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"8576_CR16","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1016\/j.patrec.2018.04.035","volume":"115","author":"P Khaire","year":"2018","unstructured":"Khaire P, Kumar P, Imran J (2018) Combining CNN streams of RGB-D and skeletal data for human activity recognition. Pattern Recogn Lett 115:107\u2013116","journal-title":"Pattern Recogn Lett"},{"key":"8576_CR17","doi-asserted-by":"crossref","unstructured":"Kong Y, Fu Y (2015) Bilinear heterogeneous information machine for RGB-D action recognition. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 1054\u20131062","DOI":"10.1109\/CVPR.2015.7298708"},{"key":"8576_CR18","doi-asserted-by":"crossref","unstructured":"Li C, Zhong Q, Xie D, Pu S (2018) Co-occurrence feature learning from skeleton data for action recognition and detection with hierarchical aggregation. In: International joint conference on artificial intelligence (IJCAI), pp 786\u2013792","DOI":"10.24963\/ijcai.2018\/109"},{"key":"8576_CR19","doi-asserted-by":"crossref","unstructured":"Liu J, Shahroudy A, Xu D, Wang G (2016) Spatio-temporal LSTM with trust gates for 3D human action recognition. In: European conference on computer vision (ECCV), vol 9907, pp 816\u2013833","DOI":"10.1007\/978-3-319-46487-9_50"},{"key":"8576_CR20","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1016\/j.imavis.2016.04.004","volume":"55","author":"Z Liu","year":"2016","unstructured":"Liu Z, Zhang C, Tian Y (2016) 3D-based deep convolutional neural network for action recognition with depth sequences. Image Vis Comput 55:93\u2013100","journal-title":"Image Vis Comput"},{"key":"8576_CR21","doi-asserted-by":"crossref","unstructured":"Liu J, Wang G, Hu P, Duan L, Kot AC (2017) Global context-aware attention LSTM networks for 3D action recognition. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 3671\u20133680","DOI":"10.1109\/CVPR.2017.391"},{"issue":"2","key":"8576_CR22","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TASE.2013.2262940","volume":"11","author":"Z Moghaddam","year":"2014","unstructured":"Moghaddam Z, Piccardi M (2014) Training initialization of hidden Markov models in human action recognition. IEEE Trans Autom Sci Eng (TASE) 11(2):394\u2013408","journal-title":"IEEE Trans Autom Sci Eng (TASE)"},{"key":"8576_CR23","doi-asserted-by":"crossref","unstructured":"Rahmani H, Mian A (2016) 3D action recognition from novel viewpoints. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 1506\u20131515","DOI":"10.1109\/CVPR.2016.167"},{"issue":"12","key":"8576_CR24","doi-asserted-by":"publisher","first-page":"2430","DOI":"10.1109\/TPAMI.2016.2533389","volume":"38","author":"H Rahmani","year":"2016","unstructured":"Rahmani H, Mahmood A, Huynh D, Mian A (2016) Histogram of oriented principal components for cross-view action recognition. IEEE Trans Pattern Anal Mach Intell (TPAMI) 38(12):2430\u20132443","journal-title":"IEEE Trans Pattern Anal Mach Intell (TPAMI)"},{"key":"8576_CR25","doi-asserted-by":"crossref","unstructured":"Sempena S, Maulidevi N, Aryan P (2011) Human action recognition using dynamic time warping. In: International conference on electrical engineering and informatics (ICEEI), pp 1\u20135","DOI":"10.1109\/ICEEI.2011.6021605"},{"key":"8576_CR26","doi-asserted-by":"crossref","unstructured":"Shahroudy A, Liu J, Ng T, Wang G (2016) NTU RGB+D: a large scale dataset for 3D human activity analysis. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 1010\u20131019","DOI":"10.1109\/CVPR.2016.115"},{"issue":"5","key":"8576_CR27","doi-asserted-by":"publisher","first-page":"1045","DOI":"10.1109\/TPAMI.2017.2691321","volume":"40","author":"A Shahroudy","year":"2018","unstructured":"Shahroudy A, Ng T, Gong Y, Wang G (2018) Deep multimodal feature analysis for action recognition in RGB+D videos. IEEE Trans Pattern Anal Mach Intell (TPAMI) 40(5):1045\u20131058","journal-title":"IEEE Trans Pattern Anal Mach Intell (TPAMI)"},{"key":"8576_CR28","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems (NIPS), vol 1, pp 568\u2013576"},{"issue":"3","key":"8576_CR29","doi-asserted-by":"publisher","first-page":"199","DOI":"10.1023\/B:STCO.0000035301.49549.88","volume":"14","author":"AJ Smola","year":"2004","unstructured":"Smola AJ, Sch\u00f6lkopf B (2004) A tutorial on support vector regression. Stat Comput 14(3):199\u2013222","journal-title":"Stat Comput"},{"key":"8576_CR30","doi-asserted-by":"crossref","unstructured":"Sun L, Jia K, Yeung D, Shi BE (2015) Human action recognition using factorized spatio-temporal convolutional networks. In: IEEE International conference on computer vision (ICCV), pp 4597\u20134605","DOI":"10.1109\/ICCV.2015.522"},{"key":"8576_CR31","doi-asserted-by":"crossref","unstructured":"Szegedy C, Vanhoucke V, Ioffe S, Shlens J, Wojna Z (2016) Rethinking the inception architecture for computer vision. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 2818\u20132826","DOI":"10.1109\/CVPR.2016.308"},{"key":"8576_CR32","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3D convolutional networks. In: IEEE International conference on computer vision (ICCV), pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"8576_CR33","doi-asserted-by":"crossref","unstructured":"Veeriah V, Zhuang N, Qi G (2015) Differential recurrent neural networks for action recognition. In: IEEE International conference on computer vision (ICCV), pp 4041\u20134049","DOI":"10.1109\/ICCV.2015.460"},{"issue":"5","key":"8576_CR34","doi-asserted-by":"publisher","first-page":"914","DOI":"10.1109\/TPAMI.2013.198","volume":"36","author":"J Wang","year":"2014","unstructured":"Wang J, Liu Z, Wu Y, Yuan J (2014) Learning actionlet ensemble for 3D human action recognition. IEEE Trans Pattern Anal Mach Intell (TPAMI) 36 (5):914\u2013927","journal-title":"IEEE Trans Pattern Anal Mach Intell (TPAMI)"},{"key":"8576_CR35","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, van Gool L (2016) Temporal segment networks: towards good practices for deep action recognition. In: European Conference on computer vision (ECCV), vol 9912, pp 20\u201336","DOI":"10.1007\/978-3-319-46484-8_2"},{"issue":"4","key":"8576_CR36","doi-asserted-by":"publisher","first-page":"498","DOI":"10.1109\/THMS.2015.2504550","volume":"46","author":"P Wang","year":"2016","unstructured":"Wang P, Li W, Gao Z, Zhang J, Tang C, Ogunbona P (2016) Action recognition from depth maps using deep convolutional neural networks. IEEE Trans Human-Mach Syst (THMS) 46(4):498\u2013509","journal-title":"IEEE Trans Human-Mach Syst (THMS)"},{"key":"8576_CR37","doi-asserted-by":"crossref","unstructured":"Wang P, Li W, Gao Z, Zhang Y, Tang C, Ogunbona P (2017) Scene flow to action map: a new representation for RGB-D based action recognition with convolutional neural networks. In: IEEE Conference on computer vision and pattern recognition (CVPR), pp 416\u2013425","DOI":"10.1109\/CVPR.2017.52"},{"key":"8576_CR38","doi-asserted-by":"crossref","unstructured":"Wang P, Li W, Wan J, Ogunbona P, Liu X (2018) Cooperative training of deep aggregation networks for RGB-D action recognition. In: 32nd AAAI Conference on artificial intelligence (AAAI), pp 7404\u20137411","DOI":"10.1609\/aaai.v32i1.12228"},{"issue":"5","key":"8576_CR39","doi-asserted-by":"publisher","first-page":"1051","DOI":"10.1109\/TMM.2018.2818329","volume":"20","author":"P Wang","year":"2018","unstructured":"Wang P, Li W, Gao Z, Tang C, Ogunbona P (2018) Depth pooling based large-scale 3-D action recognition with convolutional neural networks. IEEE Trans Multimed (TMM) 20(5):1051\u20131061","journal-title":"IEEE Trans Multimed (TMM)"},{"key":"8576_CR40","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1016\/j.ins.2018.12.050","volume":"480","author":"Y Xiao","year":"2019","unstructured":"Xiao Y, Chen J, Wang Y, Cao Z, Zhou JT, Bai X (2019) Action recognition for depth video using multi-view dynamic images. Inform Sci 480:287\u2013304","journal-title":"Inform Sci"},{"issue":"13","key":"8576_CR41","doi-asserted-by":"publisher","first-page":"16053","DOI":"10.1007\/s11042-017-5179-7","volume":"77","author":"K Zhang","year":"2018","unstructured":"Zhang K, Zhang L (2018) Extracting hierarchical spatial and temporal features for human action recognition. Multimed Tools Appl 77(13):16053\u201316068","journal-title":"Multimed Tools Appl"},{"key":"8576_CR42","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1016\/j.patcog.2016.05.019","volume":"60","author":"J Zhang","year":"2016","unstructured":"Zhang J, Li W, Ogunbona P, Wang P, Tang C (2016) RGB-D-based action recognition datasets: a survey. Pattern Recogn 60:86\u2013105","journal-title":"Pattern Recogn"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-019-08576-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-019-08576-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-019-08576-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,17]],"date-time":"2022-10-17T04:58:12Z","timestamp":1665982692000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-019-08576-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3,2]]},"references-count":42,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2021,5]]}},"alternative-id":["8576"],"URL":"https:\/\/doi.org\/10.1007\/s11042-019-08576-z","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3,2]]},"assertion":[{"value":"10 June 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 October 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 December 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 March 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}