{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T07:01:22Z","timestamp":1779174082611,"version":"3.51.4"},"reference-count":43,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"7","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2024,7,1]]},"DOI":"10.1587\/transinf.2023edp7223","type":"journal-article","created":{"date-parts":[[2024,6,30]],"date-time":"2024-06-30T22:22:42Z","timestamp":1719786162000},"page":"869-877","source":"Crossref","is-referenced-by-count":2,"title":["2D Human Skeleton Action Recognition Based on Depth Estimation"],"prefix":"10.1587","volume":"E107.D","author":[{"given":"Lei","family":"WANG","sequence":"first","affiliation":[{"name":"Sichuan University"},{"name":"Chengdu Aeronautic Polytechnic"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shanmin","family":"YANG","sequence":"additional","affiliation":[{"name":"Chengdu University of Information Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianwei","family":"ZHANG","sequence":"additional","affiliation":[{"name":"Sichuan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"GU","sequence":"additional","affiliation":[{"name":"Chengdu Aeronautic Polytechnic"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"publisher","unstructured":"[1] S. Gu, L. Wang, L. He, X. He, and J. Wang, \u201cGaze estimation via a differential eyes&apos; appearances network with a reference grid,\u201d Engineering, vol.7, no.6, pp.777-786, 2021. 10.1016\/j.eng.2020.08.027","DOI":"10.1016\/j.eng.2020.08.027"},{"key":"2","doi-asserted-by":"publisher","unstructured":"[2] G. Saleem, U.I. Bajwa, and R.H. Raza, \u201cToward human activity recognition: A survey,\u201d Neural Computing and Applications, vol.35, no.5, pp.4145-4182, 2023. 10.1007\/s00521-022-07937-4","DOI":"10.1007\/s00521-022-07937-4"},{"key":"3","doi-asserted-by":"publisher","unstructured":"[3] C.N. Phyo, T.T. Zin, and P. Tin, \u201cDeep learning for recognizing human activities using motions of skeletal joints,\u201d IEEE Trans. Consum. Electron., vol.65, no.2, pp.243-252, 2019. 10.1109\/tce.2019.2908986","DOI":"10.1109\/TCE.2019.2908986"},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] P.F. Felzenszwalb and D.P. Huttenlocher, \u201cPictorial structures for object recognition,\u201d International Journal of Computer Vision, vol.61, no.1, pp.55-79, 2005. 10.1023\/b:visi.0000042934.15159.49","DOI":"10.1023\/B:VISI.0000042934.15159.49"},{"key":"5","unstructured":"[5] D. Eigen, C. Puhrsch, and R. Fergus, \u201cDepth map prediction from a single image using a multi-scale deep network,\u201d Advances in Neural Information Processing Systems, vol.27, 2014."},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] I. Laina, C. Rupprecht, V. Belagiannis, F. Tombari, and N. Navab, \u201cDeeper depth prediction with fully convolutional residual networks,\u201d 2016 Fourth International Conference on 3D Vision (3DV), pp.239-248, IEEE, 2016. 10.1109\/3dv.2016.32","DOI":"10.1109\/3DV.2016.32"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] C. Godard, O.M. Aodha, and G.J. Brostow, \u201cUnsupervised monocular depth estimation with left-right consistency,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.270-279, 2017. 10.1109\/CVPR.2017.699","DOI":"10.1109\/CVPR.2017.699"},{"key":"8","doi-asserted-by":"publisher","unstructured":"[8] K. Wang, L. Lin, C. Jiang, C. Qian, and P. Wei, \u201c3D human pose machines with self-supervised learning,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.42, no.5, pp.1069-1082, 2019. 10.1109\/tpami.2019.2892452","DOI":"10.1109\/TPAMI.2019.2892452"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] X. Sun, J. Shang, S. Liang, and Y. Wei, \u201cCompositional human pose regression,\u201d Proc. IEEE International Conference on Computer Vision, pp.2602-2611, 2017. 10.1109\/ICCV.2017.284","DOI":"10.1109\/ICCV.2017.284"},{"key":"10","unstructured":"[10] F. Yu and V. Koltun, \u201cMulti-scale context aggregation by dilated convolutions,\u201d arXiv preprint, arXiv:1511.07122, 2015. 10.48550\/arXiv.1511.07122"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] R. Ranftl, A. Bochkovskiy, and V. Koltun, \u201cVision transformers for dense prediction,\u201d Proc. IEEE\/CVF International Conference on Computer Vision, pp.12179-12188, 2021. 10.1109\/ICCV48922.2021.01196","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] L. Wang, J. Zhang, Y. Wang, H. Lu, and X. Ruan, \u201cCLIFFNet for monocular depth estimation with hierarchical embedding loss,\u201d European Conference on Computer Vision, pp.316-331, Springer, 2020. 10.1007\/978-3-030-58558-7_19","DOI":"10.1007\/978-3-030-58558-7_19"},{"key":"13","unstructured":"[13] F. Iandola, M. Moskewicz, S. Karayev, R. Girshick, T. Darrell, and K. Keutzer, \u201cDenseNet: Implementing efficient ConvNet descriptor pyramids,\u201d arXiv preprint, arXiv:1404.1869, 2014. 10.48550\/arXiv.1404.1869"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] H. Jung, Y. Kim, D. Min, C. Oh, and K. Sohn, \u201cDepth prediction from a single image with conditional adversarial networks,\u201d 2017 IEEE International Conference on Image Processing (ICIP), pp.1717-1721, IEEE, 2017. 10.1109\/icip.2017.8296575","DOI":"10.1109\/ICIP.2017.8296575"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] J. Jiao, Y. Cao, Y. Song, and R. Lau, \u201cLook deeper into depth: Monocular depth estimation with semantic booster and attention-driven loss,\u201d Proc. European Conference on Computer Vision (ECCV), pp.55-71, 2018. 10.1007\/978-3-030-01267-0_4","DOI":"10.1007\/978-3-030-01267-0_4"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] C.N. Phyo, T.T. Zin, and P. Tin, \u201cSkeleton motion history based human action recognition using deep learning,\u201d 2017 IEEE 6th Global Conference on Consumer Electronics (GCCE), pp.1-2, IEEE, 2017. 10.1109\/gcce.2017.8229448","DOI":"10.1109\/GCCE.2017.8229448"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] H. Wang and L. Wang, \u201cModeling temporal dynamics and spatial configurations of actions using two-stream recurrent neural networks,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.3633-3642, 2017. 10.1109\/CVPR.2017.387","DOI":"10.1109\/CVPR.2017.387"},{"key":"18","doi-asserted-by":"publisher","unstructured":"[18] C. Si, Y. Jing, W. Wang, L. Wang, and T. Tan, \u201cSkeleton-based action recognition with hierarchical spatial reasoning and temporal stack learning network,\u201d Pattern Recognition, vol.107, 107511, 2020. 10.1016\/j.patcog.2020.107511","DOI":"10.1016\/j.patcog.2020.107511"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] Y. Liu, H. Zhang, Y. Li, K. He, and D. Xu, \u201cSkeleton-based human action recognition via large-kernel attention graph convolutional network,\u201d IEEE Trans. Vis. Comput. Graph., vol.29, no.5, pp.2575-2585, 2023. 10.1109\/tvcg.2023.3247075","DOI":"10.1109\/TVCG.2023.3247075"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] J. Zhang, Z. Tu, J. Yang, Y. Chen, and J. Yuan, \u201cMixSTE: Seq2seq mixed spatio-temporal encoder for 3D human pose estimation in video,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.13232-13242, 2022. 10.1109\/CVPR52688.2022.01288","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"21","doi-asserted-by":"publisher","unstructured":"[21] J. Zhang, G. Ye, Z. Tu, Y. Qin, Q. Qin, J. Zhang, and J. Liu, \u201cA spatial attentive and temporal dilated (SATD) GCN for skeleton-based action recognition,\u201d CAAI Trans. Intelligence Technology, vol.7, no.1, pp.46-55, 2021. 10.1049\/cit2.12012","DOI":"10.1049\/cit2.12012"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] S. Cho, M.H. Maqbool, F. Liu, and H. Foroosh, \u201cSelf-attention network for skeleton-based human action recognition,\u201d Proc. IEEE\/CVF Winter Conference on Applications of Computer Vision, pp.635-644, 2020. 10.1109\/WACV45572.2020.9093639","DOI":"10.1109\/WACV45572.2020.9093639"},{"key":"23","doi-asserted-by":"publisher","unstructured":"[23] Z. Tu, J. Zhang, H. Li, Y. Chen, and J. Yuan, \u201cJoint-bone fusion graph convolutional network for semi-supervised skeleton action recognition,\u201d IEEE Trans. Multimed., vol.25, pp.1819-1831, 2022. 10.1109\/tmm.2022.3168137","DOI":"10.1109\/TMM.2022.3168137"},{"key":"24","doi-asserted-by":"publisher","unstructured":"[24] B. Xu, X. Shu, and Y. Song, \u201cX-invariant contrastive augmentation and representation learning for semi-supervised skeleton-based action recognition,\u201d IEEE Trans. Image Process., vol.31, pp.3852-3867, 2022. 10.1109\/tip.2022.3175605","DOI":"10.1109\/TIP.2022.3175605"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] K. Su, X. Liu, and E. Shlizerman, \u201cPredict &amp; cluster: Unsupervised skeleton based action recognition,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.9631-9640, 2020.","DOI":"10.1109\/CVPR42600.2020.00965"},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] S. Chaudhary and S. Murala, \u201cDepth-based end-to-end deep network for human action recognition,\u201d IET Computer Vision, vol.13, no.1, pp.15-22, 2019. 10.1049\/iet-cvi.2018.5020","DOI":"10.1049\/iet-cvi.2018.5020"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] Z. Cao, T. Simon, S.-E. Wei, and Y. Sheikh, \u201cRealtime multi-person 2D pose estimation using part affinity fields,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.7291-7299, 2017. 10.1109\/CVPR.2017.143","DOI":"10.1109\/CVPR.2017.143"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] M. Li, S. Chen, X. Chen, Y. Zhang, Y. Wang, and Q. Tian, \u201cActional-structural graph convolutional networks for skeleton-based action recognition,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.3595-3603, 2019.","DOI":"10.1109\/CVPR.2019.00371"},{"key":"29","unstructured":"[29] T. Kipf, E. Fetaya, K.C. Wang, M. Welling, and R. Zemel, \u201cNeural relational inference for interacting systems,\u201d International Conference on Machine Learning, pp.2688-2697, PMLR, 2018."},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] G. Huang, Z. Liu, L. Van Der Maaten, and K.Q. Weinberger, \u201cDensely connected convolutional networks,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.2261-2269, 2017. 10.1109\/CVPR.2017.243","DOI":"10.1109\/CVPR.2017.243"},{"key":"31","doi-asserted-by":"publisher","unstructured":"[31] C. Szegedy, S. Ioffe, V. Vanhoucke, and A. Alemi, \u201cInception-v4, Inception-ResNet and the impact of residual connections on learning,\u201d Proc. AAAI Conference on Artificial Intelligence, vol.31, no.1, 2017. 10.1609\/aaai.v31i1.11231","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] J. Carreira and A. Zisserman, \u201cQuo vadis, action recognition? A new model and the kinetics dataset,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.6299-6308, 2017. 10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"key":"33","doi-asserted-by":"publisher","unstructured":"[33] J. Liu, A. Shahroudy, M. Perez, G. Wang, L.-Y. Duan, and A.C. Kot, \u201cNTU RGB+D 120: A large-scale benchmark for 3D human activity understanding,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.42, no.10, pp.2684-2701, 2019. 10.1109\/tpami.2019.2916873","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"34","doi-asserted-by":"publisher","unstructured":"[34] W. Yang, J. Zhang, J. Cai, and Z. Xu, \u201cHybridNet: Integrating GCN and CNN for skeleton-based action recognition,\u201d Applied Intelligence, vol.53, no.1, pp.574-585, 2023. 10.1007\/s10489-022-03436-0","DOI":"10.1007\/s10489-022-03436-0"},{"key":"35","doi-asserted-by":"publisher","unstructured":"[35] K. Hu, J. Jin, C. Shen, M. Xia, and L. Weng, \u201cAttentional weighting strategy-based dynamic GCN for skeleton-based action recognition,\u201d Multimedia Systems, vol.29, pp.1941-1954, 2023. 10.1007\/s00530-023-01082-1","DOI":"10.1007\/s00530-023-01082-1"},{"key":"36","doi-asserted-by":"crossref","unstructured":"[36] C. Wang, Q. Zhang, C. Huang, W. Liu, and X. Wang, \u201cMancs: A multi-task attentional network with curriculum sampling for person re-identification,\u201d Proc. European Conference on Computer Vision (ECCV), pp.365-381, 2018.","DOI":"10.1007\/978-3-030-01225-0_23"},{"key":"37","doi-asserted-by":"publisher","unstructured":"[37] S. Yan, Y. Xiong, and D. Lin, \u201cSpatial temporal graph convolutional networks for skeleton-based action recognition,\u201d Proc. AAAI Conference on Artificial Intelligence, vol.32, no.1, 2018. 10.1609\/aaai.v32i1.12328","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"38","doi-asserted-by":"crossref","unstructured":"[38] L. Shi, Y. Zhang, J. Cheng, and H. Lu, \u201cTwo-stream adaptive graph convolutional networks for skeleton-based action recognition,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.12026-12035, 2019.","DOI":"10.1109\/CVPR.2019.01230"},{"key":"39","doi-asserted-by":"crossref","unstructured":"[39] L. Shi, Y. Zhang, J. Cheng, and H. Lu, \u201cSkeleton-based action recognition with directed graph neural networks,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.7912-7921, 2019. 10.1109\/CVPR.2019.00810","DOI":"10.1109\/CVPR.2019.00810"},{"key":"40","doi-asserted-by":"publisher","unstructured":"[40] L. Shi, Y. Zhang, J. Cheng, and H. Lu, \u201cSkeleton-based action recognition with multi-stream adaptive graph convolutional networks,\u201d IEEE Trans. Image Process., vol.29, pp.9532-9545, 2020. 10.1109\/tip.2020.3028207","DOI":"10.1109\/TIP.2020.3028207"},{"key":"41","doi-asserted-by":"publisher","unstructured":"[41] N. Sun, L. Leng, J. Liu, and G. Han, \u201cMulti-stream slowfast graph convolutional networks for skeleton-based action recognition,\u201d Image and Vision Computing, vol.109, 104141, 2021. 10.1016\/j.imavis.2021.104141","DOI":"10.1016\/j.imavis.2021.104141"},{"key":"42","doi-asserted-by":"publisher","unstructured":"[42] J. Xie, Q. Miao, R. Liu, W. Xin, L. Tang, S. Zhong, and X. Gao, \u201cAttention adjacency matrix based graph convolutional networks for skeleton-based action recognition,\u201d Neurocomputing, vol.440, pp.230-239, 2021. 10.1016\/j.neucom.2021.02.001","DOI":"10.1016\/j.neucom.2021.02.001"},{"key":"43","doi-asserted-by":"publisher","unstructured":"[43] L. Wu, C. Zhang, and Y. Zou, \u201cSpatiotemporal focus for skeleton-based action recognition,\u201d Pattern Recognition, vol.136, 109231, 2023. 10.1016\/j.patcog.2022.109231","DOI":"10.1016\/j.patcog.2022.109231"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/7\/E107.D_2023EDP7223\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,6]],"date-time":"2024-07-06T04:15:14Z","timestamp":1720239314000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/7\/E107.D_2023EDP7223\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,1]]},"references-count":43,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2023edp7223","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7,1]]},"article-number":"2023EDP7223"}}