{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:08:03Z","timestamp":1780931283779,"version":"3.54.1"},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.113960","type":"journal-article","created":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T06:01:14Z","timestamp":1779084074000},"page":"113960","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["ETCMesh: Exploring temporal consistency for human pose and mesh reconstruction with state space models"],"prefix":"10.1016","volume":"180","author":[{"given":"Hehao","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0300-6144","authenticated-orcid":false,"given":"Zhengping","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jirui","family":"Di","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qiming","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhe","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113960_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.112805","article-title":"Structure and sensitivity in 3D human pose similarity quantification and estimation","volume":"173","author":"Lee","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113960_b2","doi-asserted-by":"crossref","first-page":"6962","DOI":"10.1109\/TMM.2025.3590928","article-title":"LiDAR-HMR: 3D human mesh recovery from LiDAR","volume":"27","author":"Fan","year":"2025","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113960_b3","doi-asserted-by":"crossref","first-page":"3285","DOI":"10.1109\/TIP.2024.3393716","article-title":"LEAPSE: Learning environment affordances for 3D human pose and shape estimation","volume":"33","author":"Tian","year":"2024","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.113960_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110848","article-title":"LiDARCapV2: 3D human pose estimation with human-object interaction from LiDAR point clouds","volume":"156","author":"Zhang","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113960_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111377","article-title":"SG-CLR: Semantic representation-guided contrastive learning for self-supervised skeleton-based action recognition","volume":"162","author":"Liu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113960_b6","doi-asserted-by":"crossref","first-page":"2399","DOI":"10.1109\/TCSVT.2023.3310525","article-title":"Personalized graph generation for monocular 3D human pose and shape estimation","volume":"34","author":"Hu","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113960_b7","doi-asserted-by":"crossref","unstructured":"S.K. Dwivedi, Y. Sun, P. Patel, M.J. Black, TokenHMR: Advancing human mesh recovery with a tokenized pose representation, in: IEEE Conference on Computer Vision and Pattern Recognition, 2024, pp. 1323\u20131333.","DOI":"10.1109\/CVPR52733.2024.00132"},{"key":"10.1016\/j.patcog.2026.113960_b8","doi-asserted-by":"crossref","unstructured":"Z. Huang, M. Shi, C. Liu, K. Xian, Z. Cao, Simhmr: A simple query-based framework for parameterized human mesh reconstruction, in: ACM International Conference on Multimedia, 2023, pp. 6918\u20136927.","DOI":"10.1145\/3581783.3611814"},{"key":"10.1016\/j.patcog.2026.113960_b9","doi-asserted-by":"crossref","unstructured":"K. Lin, C. Lin and, et al., MPT: Mesh Pre-Training with Transformers for Human Pose and Mesh Reconstruction, in: IEEE Winter Conference on Applications of Computer Vision, 2024, pp. 3403\u20133413.","DOI":"10.1109\/WACV57701.2024.00338"},{"key":"10.1016\/j.patcog.2026.113960_b10","doi-asserted-by":"crossref","unstructured":"C. Zheng, M. Mendieta, P. Wang, A. Lu, C. Chen, A Lightweight Graph Transformer Network for Human Mesh Reconstruction from 2D Human Pose, in: ACM International Conference on Multimedia, 2022, pp. 5496\u20135507.","DOI":"10.1145\/3503161.3547844"},{"key":"10.1016\/j.patcog.2026.113960_b11","doi-asserted-by":"crossref","first-page":"104","DOI":"10.1109\/TCSVT.2022.3199201","article-title":"A progressive quadric graph convolutional network for 3D human mesh recovery","volume":"33","author":"Wang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113960_b12","doi-asserted-by":"crossref","unstructured":"M. Lee, H. Lee, B. Kim, S. Kim, UNSPAT: Uncertainty-guided spatiotemporal transformer for 3D human pose and shape estimation on videos, in: IEEE Winter Conference on Applications of Computer Vision, 2024, pp. 3004\u20133013.","DOI":"10.1109\/WACV57701.2024.00298"},{"key":"10.1016\/j.patcog.2026.113960_b13","doi-asserted-by":"crossref","unstructured":"X. Shen, Z. Yang, X. Wang, J. Ma, C. Zhou, Y. Yang, Global-to-Local Modeling for Video-Based 3D Human Pose and Shape Estimation, in: IEEE Conference on Computer Vision and Pattern Recognition, 2023, pp. 8887\u20138896.","DOI":"10.1109\/CVPR52729.2023.00858"},{"key":"10.1016\/j.patcog.2026.113960_b14","doi-asserted-by":"crossref","unstructured":"B. Zhang, K. Ma, S. Wu, Z. Yuan, Two-stage co-segmentation network based on discriminative representation for recovering human mesh from videos, in: IEEE Conference on Computer Vision and Pattern Recognition, 2023, pp. 5662\u20135670.","DOI":"10.1109\/CVPR52729.2023.00548"},{"key":"10.1016\/j.patcog.2026.113960_b15","first-page":"10041","article-title":"Transformers are SSMs: generalized models and efficient algorithms through structured state space duality","volume":"vol. 235","author":"Dao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113960_b16","unstructured":"A. Gu, T. Dao, Linear-Time Sequence Modeling with Selective State Spaces, in: Conference on Language Modeling, 2024, pp. 1\u201332."},{"key":"10.1016\/j.patcog.2026.113960_b17","doi-asserted-by":"crossref","unstructured":"A. Stathopoulos, L. Han, D. Metaxas, Score-Guided Diffusion for 3D Human Recovery, in: IEEE Conference on Computer Vision and Pattern Recognition, 2024, pp. 906\u2013915.","DOI":"10.1109\/CVPR52733.2024.00092"},{"key":"10.1016\/j.patcog.2026.113960_b18","doi-asserted-by":"crossref","unstructured":"H. Choi, G. Moon, et al., Pose2Mesh: Graph Convolutional Network for 3D Human Pose and Mesh Recovery from a 2D Human Pose, in: European Conference on Computer Vision, 2020, pp. 769\u2013787.","DOI":"10.1007\/978-3-030-58571-6_45"},{"key":"10.1016\/j.patcog.2026.113960_b19","doi-asserted-by":"crossref","unstructured":"G. Pavlakos, V. Choutas and, et al., Expressive body capture: 3D hands, face, and body from a single image, in: IEEE Conference on Computer Vision and Pattern Recognition, 2019, pp. 10975\u201310985.","DOI":"10.1109\/CVPR.2019.01123"},{"key":"10.1016\/j.patcog.2026.113960_b20","doi-asserted-by":"crossref","unstructured":"H. Choi, G. Moon, et al., Beyond static features for temporally consistent 3D human pose and shape from a video, in: IEEE Conference on Computer Vision and Pattern Recognition, 2021, pp. 1964\u20131973.","DOI":"10.1109\/CVPR46437.2021.00200"},{"key":"10.1016\/j.patcog.2026.113960_b21","doi-asserted-by":"crossref","unstructured":"Z. Luo, S.A. Golestaneh, K.M. Kitani, 3D Human Motion Estimation via Motion Compression and Refinement, in: Asian Conference on Computer Vision, 2020, pp. 324\u2013340.","DOI":"10.1007\/978-3-030-69541-5_20"},{"key":"10.1016\/j.patcog.2026.113960_b22","doi-asserted-by":"crossref","first-page":"10564","DOI":"10.1109\/TCSVT.2024.3410400","article-title":"STAF: 3D human mesh recovery from video with spatio-temporal alignment fusion","volume":"34","author":"Yao","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113960_b23","unstructured":"L. Zhu, B. Liao, Q. Zhang, et al., Vision mamba: Efficient visual representation learning with bidirectional state space model, in: International Conference on Machine Learning, 2024, pp. 62429\u201362442."},{"key":"10.1016\/j.patcog.2026.113960_b24","article-title":"Vmamba: Visual state space model","volume":"vol. 37","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113960_b25","doi-asserted-by":"crossref","unstructured":"J. Park, H.S. Kim, K. Ko, M. Kim, C. Kim, VideoMamba: Spatio-Temporal Selective State Space Model, in: European Conference on Computer Vision, 2024, pp. 1\u201318.","DOI":"10.1007\/978-3-031-72698-9_1"},{"key":"10.1016\/j.patcog.2026.113960_b26","doi-asserted-by":"crossref","unstructured":"Y. Huang, J. Liu and, et al., PoseMamba: Monocular 3D Human Pose Estimation with Bidirectional Global-Local Spatio-Temporal State Space Model, in: AAAI Conference on Artificial Intelligence, 2025.","DOI":"10.1609\/aaai.v39i4.32401"},{"key":"10.1016\/j.patcog.2026.113960_b27","doi-asserted-by":"crossref","unstructured":"A. Mondal, S. Alletto, D. Tome, HumMUSS: Human Motion Understanding using State Space Models, in: IEEE Conference on Computer Vision and Pattern Recognition, 2024, pp. 2318\u20132330.","DOI":"10.1109\/CVPR52733.2024.00225"},{"key":"10.1016\/j.patcog.2026.113960_b28","doi-asserted-by":"crossref","unstructured":"N. Kolotouros, G. Pavlakos, et al., Learning to reconstruct 3D human pose and shape via model-fitting in the loop, in: IEEE International Conference on Computer Vision, 2019, pp. 2252\u20132261.","DOI":"10.1109\/ICCV.2019.00234"},{"key":"10.1016\/j.patcog.2026.113960_b29","doi-asserted-by":"crossref","unstructured":"T.V. Marcard, R. Henschel, et al., Recovering accurate 3d human pose in the wild using imus and a moving camera, in: European Conference on Computer Vision, 2018, pp. 601\u2013617.","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"10.1016\/j.patcog.2026.113960_b30","doi-asserted-by":"crossref","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","article-title":"Human3.6 m: Large scale datasets and predictive methods for 3d human sensing in natural environments","volume":"36","author":"Ionescu","year":"2013","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113960_b31","doi-asserted-by":"crossref","unstructured":"P. Wu, X. Lu, J. Shen, Y. Yin, Clip fusion with bi-level optimization for human mesh reconstruction from monocular videos, in: ACM International Conference on Multimedia, 2023, pp. 105\u2013115.","DOI":"10.1145\/3581783.3611978"},{"key":"10.1016\/j.patcog.2026.113960_b32","doi-asserted-by":"crossref","unstructured":"D. Mehta, H. Rhodin, D. Casas, et al., Monocular 3d human pose estimation in the wild using improved CNN supervision, in: International Conference on 3D Vision, 3DV, 2017, pp. 506\u2013516.","DOI":"10.1109\/3DV.2017.00064"},{"key":"10.1016\/j.patcog.2026.113960_b33","doi-asserted-by":"crossref","unstructured":"Z. Wan, Z. Li, M. Tian, et al., Encoder\u2013decoder with Multi-level Attention for 3D Human Shape and Pose Estimation, in: IEEE International Conference on Computer Vision, 2021, pp. 13013\u201313022.","DOI":"10.1109\/ICCV48922.2021.01279"},{"key":"10.1016\/j.patcog.2026.113960_b34","doi-asserted-by":"crossref","unstructured":"T. Lin, M. Maire, S. Belongie, et al., Microsoft COCO: Common objects in context, in: European Conference on Computer Vision, 2014, pp. 740\u2013755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10.1016\/j.patcog.2026.113960_b35","doi-asserted-by":"crossref","unstructured":"M. Andriluka, L. Pishchulin, et al., 2D human pose estimation: New benchmark and state of the art analysis, in: IEEE Conference on Computer Vision and Pattern Recognition, 2014, pp. 3686\u20133693.","DOI":"10.1109\/CVPR.2014.471"},{"key":"10.1016\/j.patcog.2026.113960_b36","doi-asserted-by":"crossref","unstructured":"W. Wei, J. Lin, T. Liu, H.M. Liao, Capturing humans in motion: Temporal-attentive 3D human pose and shape estimation from monocular video, in: IEEE Conference on Computer Vision and Pattern Recognition, 2022, pp. 13211\u201313220.","DOI":"10.1109\/CVPR52688.2022.01286"},{"key":"10.1016\/j.patcog.2026.113960_b37","first-page":"38571","article-title":"ViTPose: Simple vision transformer baselines for human pose estimation","volume":"35","author":"Xu","year":"2022","journal-title":"Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113960_b38","doi-asserted-by":"crossref","unstructured":"Y. Chen, Z. Wang, Y. Peng, et al., Cascaded pyramid network for multi-person pose estimation, in: IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 7103\u20137112.","DOI":"10.1109\/CVPR.2018.00742"},{"key":"10.1016\/j.patcog.2026.113960_b39","doi-asserted-by":"crossref","unstructured":"J. Hu, L. Shen, G. Sun, Squeeze-and-Excitation Networks, in: IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 7132\u20137141.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.patcog.2026.113960_b40","doi-asserted-by":"crossref","unstructured":"G. Moon, K.M. Lee, I2L-MeshNet: Image-to-Lixel Prediction Network for Accurate 3D Human Pose and Mesh Estimation from a Single RGB Image, in: European Conference on Computer Vision, 2020, pp. 752\u2013768.","DOI":"10.1007\/978-3-030-58571-6_44"},{"key":"10.1016\/j.patcog.2026.113960_b41","doi-asserted-by":"crossref","unstructured":"A. Newell, K. Yang, J. Deng, Stacked hourglass networks for human pose estimation, in: European Conference on Computer Vision, 2016, pp. 483\u2013499.","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"10.1016\/j.patcog.2026.113960_b42","doi-asserted-by":"crossref","unstructured":"D. Pavllo, C. Feichtenhofer, D. Grangier, M. Auli, 3d human pose estimation in video with temporal convolutions and semi-supervised training, in: IEEE Conference on Computer Vision and Pattern Recognition, 2019, pp. 7753\u20137762.","DOI":"10.1109\/CVPR.2019.00794"},{"key":"10.1016\/j.patcog.2026.113960_b43","unstructured":"S. Yang, W. Heng, G. Liu, et al., Capturing the motion of every joint: 3D human pose and shape estimation with independent tokens, in: International Conference on Learning Representations, 2023."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009258?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009258?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T14:56:42Z","timestamp":1780930602000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326009258"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":43,"alternative-id":["S0031320326009258"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113960","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"ETCMesh: Exploring temporal consistency for human pose and mesh reconstruction with state space models","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113960","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113960"}}