{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T14:16:47Z","timestamp":1778249807829,"version":"3.51.4"},"reference-count":47,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Displays"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.displa.2026.103429","type":"journal-article","created":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T22:56:12Z","timestamp":1773615372000},"page":"103429","update-policy":"https:\/\/doi
.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":4,"special_numbering":"C","title":["Dual-stream spatio-temporal GCN-transformer network for 3D human pose estimation"],"prefix":"10.1016","volume":"93","author":[{"given":"Jiawen","family":"Duan","sequence":"first","affiliation":[]},{"given":"Jian","family":"Xiang","sequence":"additional","affiliation":[]},{"given":"Zhiqiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Linlin","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Wan","family":"Xiang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.displa.2026.103429_b0005","doi-asserted-by":"crossref","unstructured":"N. Mehdi, V. Thomas, S. Ivaldi, F. Colas, 2022. Simultaneous pose and posture estimation with a two-stage particle filter for visuo-inertial fusion. 2022 International Conference on Advanced Robotics and Mechatronics (ICARM), Guilin, China, pp. 132\u2013137. 10.1109\/ICARM54641.2022.9959293.","DOI":"10.1109\/ICARM54641.2022.9959293"},{"issue":"4","key":"10.1016\/j.displa.2026.103429_b0010","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3072959.3073596","article-title":"VNect: real-time 3D human pose estimation with a single RGB camera","volume":"36","author":"Mehta","year":"2017","journal-title":"ACM Trans. Graphics (TOG)"},{"key":"10.1016\/j.displa.2026.103429_b0015","doi-asserted-by":"crossref","unstructured":"H.-Y. Lin, T.-W. Chen, Augmented reality with human body interaction based on monocular 3D pose estimation, International Conference on Advanced Concepts for Intelligent Vision Systems, pp. 
321\u2013331, Springer, DOI: 10.1007\/978-3-642-17688-3_31, 2010.","DOI":"10.1007\/978-3-642-17688-3_31"},{"key":"10.1016\/j.displa.2026.103429_b0020","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1159","article-title":"Recognizing human actions as the evolution of pose estimation maps","author":"Liu","year":"2018"},{"key":"10.1016\/j.displa.2026.103429_b0025","doi-asserted-by":"crossref","first-page":"346","DOI":"10.1016\/j.patcog.2017.02.030","article-title":"Enhanced skeleton visualization for view invariant human action recognition","volume":"68","author":"Liu","year":"2017","journal-title":"Pattern Recogn."},{"key":"10.1016\/j.displa.2026.103429_b0030","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"11656","article-title":"3D human pose estimation with spatial and temporal transformers","author":"Zheng","year":"2021"},{"key":"10.1016\/j.displa.2026.103429_b0035","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"602","article-title":"Conditional directed graph convolution for 3D human pose estimation","author":"Hu","year":"2021"},{"key":"10.1016\/j.displa.2026.103429_b0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7779","article-title":"Epipolar transformers","author":"He","year":"2020"},{"key":"10.1016\/j.displa.2026.103429_b0045","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"7718","article-title":"Learnable triangulation of human pose","author":"Iskakov","year":"2019"},{"key":"10.1016\/j.displa.2026.103429_b0050","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15190","article-title":"TesseTrack: End-to-end learnable multi-person articulated 3D pose 
tracking","author":"Reddy","year":"2021"},{"key":"10.1016\/j.displa.2026.103429_b0055","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"4790","article-title":"3D human pose estimation with spatio-temporal criss-cross attention","author":"Tang","year":"2023"},{"key":"10.1016\/j.displa.2026.103429_b0060","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"8877","article-title":"PoseFormerV2: Exploring frequency domain for efficient and robust 3D human pose estimation","author":"Zhao","year":"2023"},{"key":"10.1016\/j.displa.2026.103429_b0065","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"MotionBERT: a unified perspective on learning human motion representations","author":"Zhu","year":"2023"},{"key":"10.1016\/j.displa.2026.103429_b0070","article-title":"I2L-MeshNet: Image-to-lixel prediction network for accurate 3D human pose and mesh estimation from a single RGB image","author":"Moon","year":"2020","journal-title":"European Conference on Computer Vision (ECCV)"},{"key":"10.1016\/j.displa.2026.103429_b0075","article-title":"Ordinal depth supervision for 3D human pose estimation","author":"Pavlakos","year":"2018","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"10.1016\/j.displa.2026.103429_b0080","doi-asserted-by":"crossref","unstructured":"S. Zhang, X. Li, C. Hu, J. Xu, H. Liu, DSTFormer: 3D human pose estimation with a dual-scale spatial and temporal transformer network, 2024 International Conference on Advanced Robotics and Mechatronics (ICARM), Tokyo, Japan, 2024, pp. 484\u2013489. 
10.1109\/ICARM62033.2024.10715863.","DOI":"10.1109\/ICARM62033.2024.10715863"},{"key":"10.1016\/j.displa.2026.103429_b0085","article-title":"Cascaded pyramid network for multi-person pose estimation","author":"Chen","year":"2018","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"10.1016\/j.displa.2026.103429_b0090","doi-asserted-by":"crossref","unstructured":"A. Newell, K. Yang, J. Deng, Stacked hourglass networks for human pose estimation, European Conference on Computer Vision, 2016, pp. 483\u2013499, Springer, DOI: 10.1007\/978-3-319-46484-8_29.","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"10.1016\/j.displa.2026.103429_b0095","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5693","article-title":"Deep high-resolution representation learning for human pose estimation","author":"Sun","year":"2019"},{"key":"10.1016\/j.displa.2026.103429_b0100","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, \u0141. Kaiser, I. Polosukhin, Attention is all you need, Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS'17), pp. 6000\u20136010, Curran Associates Inc., Red Hook, NY, USA, 2017."},{"key":"10.1016\/j.displa.2026.103429_b0105","unstructured":"T.N. Kipf, M. Welling, Semi-supervised classification with graph convolutional networks, arXiv, DOI: 10.48550\/arXiv.1609.02907, 2016."},{"key":"10.1016\/j.displa.2026.103429_b0110","doi-asserted-by":"crossref","unstructured":"J. Zhang, Z. Tu, J. Yang, Y. Chen, J. Yuan, MixSTE: Seq2seq Mixed spatio-temporal encoder for 3D human pose estimation in video, arXiv, DOI: 10.48550\/arXiv.2203.00859, 2022.","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"10.1016\/j.displa.2026.103429_b0115","unstructured":"Y. Liu, Z. Shao, N. 
Hoffmann, Global attention mechanism: Retain information to enhance channel-spatial interactions, arXiv, DOI: 10.48550\/arXiv.2112.05561, 2021."},{"key":"10.1016\/j.displa.2026.103429_b0120","doi-asserted-by":"crossref","unstructured":"J. Hu, L. Shen, G. Sun, Squeeze-and-excitation networks, 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Salt Lake City, UT, USA, 2018, pp. 7132\u20137141. 10.1109\/CVPR.2018.00745.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.displa.2026.103429_b0125","doi-asserted-by":"crossref","unstructured":"Q. Dang, J. Yin, B. Wang, W. Zheng, Deep learning based 2D human pose estimation: A survey, Tsinghua Science and Technology, 24(6), 663\u2013676, 2019, 10.26599\/TST.2018.9010100.","DOI":"10.26599\/TST.2018.9010100"},{"key":"10.1016\/j.displa.2026.103429_b0130","doi-asserted-by":"crossref","unstructured":"C. Ionescu, F. Li, C. Sminchisescu, Latent structured models for human pose estimation, Proceedings of the 2011 International Conference on Computer Vision, Barcelona, Spain, 2011, pp. 2220\u20132227, 10.1109\/ICCV.2011.6126500.","DOI":"10.1109\/ICCV.2011.6126500"},{"issue":"1","key":"10.1016\/j.displa.2026.103429_b0135","doi-asserted-by":"crossref","first-page":"44","DOI":"10.1109\/TPAMI.2006.21","article-title":"Recovering 3D human pose from monocular images","volume":"28","author":"Agarwal","year":"2005","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.displa.2026.103429_b0140","doi-asserted-by":"crossref","first-page":"116","DOI":"10.1016\/j.robot.2015.03.001","article-title":"Action database for categorizing and inferring human poses from video sequences","volume":"70","author":"Takano","year":"2015","journal-title":"Rob. Auton. 
Syst."},{"issue":"4","key":"10.1016\/j.displa.2026.103429_b0145","doi-asserted-by":"crossref","first-page":"1646","DOI":"10.3390\/app14041646","article-title":"SCGFormer: semantic chebyshev graph convolution transformer for 3D human pose estimation","volume":"14","author":"Liang","year":"2024","journal-title":"Appl. Sci."},{"issue":"6","key":"10.1016\/j.displa.2026.103429_b0150","doi-asserted-by":"crossref","first-page":"3000","DOI":"10.1109\/TPAMI.2021.3051173","article-title":"HEMlets PoSh: learning part-centric heatmap triplets for 3D human pose and shape estimation","volume":"44","author":"Zhou","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.displa.2026.103429_b0155","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)","article-title":"3D human pose estimation using spatio-temporal networks with explicit occlusion training","author":"Cheng","year":"2020"},{"key":"10.1016\/j.displa.2026.103429_b0160","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"7753","article-title":"3D human pose estimation in video with temporal convolutions and semi-supervised training","author":"Pavllo","year":"2019"},{"key":"10.1016\/j.displa.2026.103429_b0165","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"13137","article-title":"MHFormer: Multi-hypothesis transformer for 3D human pose estimation","author":"Li","year":"2022"},{"key":"10.1016\/j.displa.2026.103429_b0170","doi-asserted-by":"crossref","unstructured":"W. Shan, Z. Liu, X. Zhang, S. Wang, S. Ma, W. Gao, 2022. P-STMO: Pre-trained spatial temporal many-to-one model for 3D human pose estimation. Computer Vision \u2013 ECCV 2022, Lecture Notes in Computer Science, vol. 13665, pp. 1\u201317. Springer, Cham. 
DOI: 10.1007\/978-3-031-20065-6_27.","DOI":"10.1007\/978-3-031-20065-6_27"},{"key":"10.1016\/j.displa.2026.103429_b0175","doi-asserted-by":"crossref","unstructured":"Z. Tang, Z. Qiu, Y. Hao, R. Hong, T. Yao, 3D human pose estimation with spatio-temporal criss-cross attention, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Vancouver, BC, Canada, 2023, pp. 4790\u20134799, 10.1109\/CVPR52729.2023.00464.","DOI":"10.1109\/CVPR52729.2023.00464"},{"key":"10.1016\/j.displa.2026.103429_b0180","doi-asserted-by":"crossref","unstructured":"L. Zhao, X. Peng, Y. Tian, M. Kapadia, D.N. Metaxas, Semantic graph convolutional networks for 3D human pose regression, Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, CA, USA, pp. 3420\u20133430, 10.1109\/CVPR.2019.00354, 2019.","DOI":"10.1109\/CVPR.2019.00354"},{"key":"10.1016\/j.displa.2026.103429_b0185","doi-asserted-by":"crossref","unstructured":"T. Xu, W. Takano, Graph stacked hourglass networks for 3D human pose estimation, Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Nashville, TN, USA, 2021, pp. 16100\u201316109. 10.1109\/CVPR46437.2021.01584.","DOI":"10.1109\/CVPR46437.2021.01584"},{"key":"10.1016\/j.displa.2026.103429_b0190","doi-asserted-by":"crossref","unstructured":"B.X.B. Yu, Z. Zhang, Y. Liu, S.-H. Zhong, Y. Liu, C. W. Chen, GLA-GCN: Global-local adaptive graph convolutional network for 3D human pose estimation from monocular video, Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), Paris, France, 2023, pp. 8784\u20138795, DOI: 10.1109\/ICCV51070.2023.00810.","DOI":"10.1109\/ICCV51070.2023.00810"},{"key":"10.1016\/j.displa.2026.103429_b0195","doi-asserted-by":"crossref","unstructured":"W. Zhao, W. Wang, Y. 
Tian, GraFormer: Graph-oriented transformer for 3D pose estimation, Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), New Orleans, LA, USA, 2022, pp. 20406\u201320415. 10.1109\/CVPR52688.2022.01979.","DOI":"10.1109\/CVPR52688.2022.01979"},{"key":"10.1016\/j.displa.2026.103429_b0200","first-page":"3844","article-title":"Convolutional neural networks on graphs with fast localized spectral filtering","volume":"29","author":"Defferrard","year":"2016","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.displa.2026.103429_b0205","doi-asserted-by":"crossref","unstructured":"J. Gong, L.G. Foo, Z. Fan, Q. Ke, H. Rahmani, J. Liu, DiffPose: Toward more reliable 3D pose estimation, Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Vancouver, BC, Canada, 2023, pp. 13041\u201313051. 10.1109\/CVPR52729.2023.01253.","DOI":"10.1109\/CVPR52729.2023.01253"},{"key":"10.1016\/j.displa.2026.103429_b0210","doi-asserted-by":"crossref","unstructured":"S. Mehraban, V. Adeli, B. Taati, MotionAGFormer: Enhancing 3D human pose estimation with a transformer-GCNFormer network, 2024 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), Waikoloa, HI, USA, 2024, pp. 6905\u20136915.","DOI":"10.1109\/WACV57701.2024.00677"},{"issue":"7","key":"10.1016\/j.displa.2026.103429_b0215","doi-asserted-by":"crossref","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","article-title":"Human 3.6M: Large scale datasets and predictive methods for 3D human sensing in natural environments","volume":"36","author":"Ionescu","year":"2013","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"10.1016\/j.displa.2026.103429_b0220","series-title":"2017 International Conference on 3D Vision (3DV)","first-page":"506","article-title":"Monocular 3D human pose estimation in the wild using improved CNN supervision","author":"Mehta","year":"2017"},{"key":"10.1016\/j.displa.2026.103429_b0225","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"International Conference on Learning Representations"},{"key":"10.1016\/j.displa.2026.103429_b0230","series-title":"Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence Main Track","first-page":"581","article-title":"HDFormer: High-order directed transformer for 3D human pose estimation","author":"Chen","year":"2023"},{"key":"10.1016\/j.displa.2026.103429_b0235","unstructured":"X. Qian, Y. Tang, N. Zhang, M. Han, J. Xiao, M.-C. Huang, R.-S. Lin, 2023. HSTFormer: Hierarchical spatial-temporal transformers for 3D human pose estimation. arXiv. 
DOI: 10.48550\/arXiv.2301.07322."}],"container-title":["Displays"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226000922?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226000922?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T13:54:27Z","timestamp":1778248467000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0141938226000922"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":47,"alternative-id":["S0141938226000922"],"URL":"https:\/\/doi.org\/10.1016\/j.displa.2026.103429","relation":{},"ISSN":["0141-9382"],"issn-type":[{"value":"0141-9382","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Dual-stream spatio-temporal GCN-transformer network for 3D human pose estimation","name":"articletitle","label":"Article Title"},{"value":"Displays","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.displa.2026.103429","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103429"}}