{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:36:59Z","timestamp":1775770619006,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"13","license":[{"start":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T00:00:00Z","timestamp":1757548800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T00:00:00Z","timestamp":1757548800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"the Natural Science and Technology Special Projects","award":["2019-1496"],"award-info":[{"award-number":["2019-1496"]}]},{"name":"the Natural Science and Technology Special Projects","award":["2019-1496"],"award-info":[{"award-number":["2019-1496"]}]},{"name":"the Natural Science and Technology Special Projects","award":["2019-1496"],"award-info":[{"award-number":["2019-1496"]}]},{"name":"Science and Technology Plan of Shenzhen","award":["JSGG20220831110607013"],"award-info":[{"award-number":["JSGG20220831110607013"]}]},{"name":"Science and Technology Plan of Shenzhen","award":["JSGG20220831110607013"],"award-info":[{"award-number":["JSGG20220831110607013"]}]},{"name":"Science and Technology Plan of Shenzhen","award":["JSGG20220831110607013"],"award-info":[{"award-number":["JSGG20220831110607013"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11760-025-04741-0","type":"journal-article","created":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T14:23:50Z","timestamp":1757600630000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["MTFT: Multimodal Temporal Fusion Transformer for 3D Human Pose and Shape Estimation"],"prefix":"10.1007","volume":"19","author":[{"given":"Ke","family":"Song","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunhe","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huiyuan","family":"Xiong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingtang","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fanbin","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingyuan","family":"Fan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,11]]},"reference":[{"key":"4741_CR1","first-page":"851","volume":"2","author":"M Loper","year":"2023","unstructured":"Loper, M., Mahmood, N., Romero, J., et al.: Smpl: A skinned multi-person linear model. Seminal Graphics Papers: Pushing the Boundaries 2, 851\u2013866 (2023)","journal-title":"Seminal Graphics Papers: Pushing the Boundaries"},{"key":"4741_CR2","doi-asserted-by":"crossref","unstructured":"Zheng C, Mendieta M, Wang P, et\u00a0al (2022) A lightweight graph transformer network for human mesh reconstruction from 2d human pose. In: Proceedings of the 30th ACM international conference on multimedia, pp 5496\u20135507","DOI":"10.1145\/3503161.3547844"},{"key":"4741_CR3","doi-asserted-by":"crossref","unstructured":"Kolotouros N, Pavlakos G, Black MJ, et\u00a0al (2019) Learning to reconstruct 3d human pose and shape via model-fitting in the loop. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2252\u20132261","DOI":"10.1109\/ICCV.2019.00234"},{"key":"4741_CR4","doi-asserted-by":"crossref","unstructured":"You Y, Liu H, Wang T, et\u00a0al (2023) Co-evolution of pose and mesh for 3d human body estimation from video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 14963\u201314973","DOI":"10.1109\/ICCV51070.2023.01374"},{"key":"4741_CR5","doi-asserted-by":"crossref","unstructured":"Yuan Y, Iqbal U, Molchanov P, et\u00a0al (2022) Glamr: Global occlusion-aware human mesh recovery with dynamic cameras. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 11038\u201311049","DOI":"10.1109\/CVPR52688.2022.01076"},{"key":"4741_CR6","doi-asserted-by":"crossref","unstructured":"Von\u00a0Marcard T, Henschel R, Black MJ, et\u00a0al (2018) Recovering accurate 3d human pose in the wild using imus and a moving camera. In: Proceedings of the European conference on computer vision (ECCV), pp 601\u2013617","DOI":"10.1007\/978-3-030-01249-6_37"},{"issue":"7","key":"4741_CR7","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2013","unstructured":"Ionescu, C., Papava, D., Olaru, V., et al.: Human3. 6m:Large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans. Pattern Anal. Mach. Intell. 36(7), 1325\u20131339 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4741_CR8","doi-asserted-by":"crossref","unstructured":"Kolotouros N, Pavlakos G, Daniilidis K (2019) Convolutional mesh regression for single-image human shape reconstruction. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4501\u20134510","DOI":"10.1109\/CVPR.2019.00463"},{"key":"4741_CR9","doi-asserted-by":"crossref","unstructured":"Cho J, Youwang K, Oh TH (2022) Cross-attention of disentangled modalities for 3d human mesh recovery with transformers. In: European Conference on Computer Vision, Springer, pp 342\u2013359","DOI":"10.1007\/978-3-031-19769-7_20"},{"issue":"1","key":"4741_CR10","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1109\/TCSVT.2022.3199201","volume":"33","author":"L Wang","year":"2022","unstructured":"Wang, L., Liu, X., Ma, X., et al.: A progressive quadric graph convolutional network for 3d human mesh recovery. IEEE Trans. Circuits Syst. Video Technol. 33(1), 104\u2013117 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"4","key":"4741_CR11","doi-asserted-by":"publisher","first-page":"2399","DOI":"10.1109\/TCSVT.2023.3310525","volume":"34","author":"J Hu","year":"2023","unstructured":"Hu, J., Zhang, H., Wang, Y., et al.: Personalized graph generation for monocular 3d human pose and shape estimation. IEEE Trans. Circuits Syst. Video Technol. 34(4), 2399\u20132413 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4741_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102651","volume":"114","author":"Y Zhang","year":"2025","unstructured":"Zhang, Y., Zhang, J., Xu, S., et al.: Multi-view human pose and shape estimation via mesh-aligned voxel interpolation. Information Fusion 114, 102651 (2025)","journal-title":"Information Fusion"},{"key":"4741_CR13","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, et\u00a0al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"4741_CR14","doi-asserted-by":"crossref","unstructured":"Kanazawa A, Zhang JY, Felsen P, et\u00a0al (2019) Learning 3d human dynamics from video. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5614\u20135623","DOI":"10.1109\/CVPR.2019.00576"},{"key":"4741_CR15","doi-asserted-by":"crossref","unstructured":"Choi H, Moon G, Chang JY, et\u00a0al (2021) Beyond static features for temporally consistent 3d human pose and shape from a video. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1964\u20131973","DOI":"10.1109\/CVPR46437.2021.00200"},{"issue":"2","key":"4741_CR16","doi-asserted-by":"publisher","first-page":"911","DOI":"10.1109\/TCSVT.2023.3286402","volume":"34","author":"Z Tang","year":"2023","unstructured":"Tang, Z., Hao, Y., Li, J., et al.: Ftcm: Frequency-temporal collaborative module for efficient 3d human pose estimation in video. IEEE Trans. Circuits Syst. Video Technol. 34(2), 911\u2013923 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"4741_CR17","doi-asserted-by":"crossref","unstructured":"Yao W, Zhang H, Sun Y, et\u00a0al (2024) Staf: 3d human mesh recovery from video with spatio-temporal alignment fusion. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2024.3410400"},{"key":"4741_CR18","doi-asserted-by":"crossref","unstructured":"Lee GH, Lee SW (2021) Uncertainty-aware human mesh recovery from video by learning part-based 3d dynamics. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 12375\u201312384","DOI":"10.1109\/ICCV48922.2021.01215"},{"key":"4741_CR19","doi-asserted-by":"crossref","unstructured":"Kocabas M, Yuan Y, Molchanov P, et\u00a0al (2024) Pace: Human and camera motion estimation from in-the-wild videos. In: 2024 International Conference on 3D Vision (3DV), IEEE, pp 397\u2013408","DOI":"10.1109\/3DV62453.2024.00103"},{"key":"4741_CR20","doi-asserted-by":"crossref","unstructured":"Shen Z, Pi H, Xia Y, et\u00a0al (2024) World-grounded human motion recovery via gravity-view coordinates. In: SIGGRAPH Asia 2024 Conference Papers, pp 1\u201311","DOI":"10.1145\/3680528.3687565"},{"key":"4741_CR21","unstructured":"Vaswani A, Shazeer N, Parmar N, et\u00a0al (2017) Attention is all you need. Advances in neural information processing systems 30"},{"key":"4741_CR22","doi-asserted-by":"crossref","unstructured":"Sun Y, Ye Y, Liu W, et\u00a0al (2019) Human mesh recovery from monocular images via a skeleton-disentangled representation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 5349\u20135358","DOI":"10.1109\/ICCV.2019.00545"},{"key":"4741_CR23","doi-asserted-by":"crossref","unstructured":"Pavlakos G, Malik J, Kanazawa A (2022) Human mesh recovery from multiple shots. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 1485\u20131495","DOI":"10.1109\/CVPR52688.2022.00154"},{"key":"4741_CR24","doi-asserted-by":"crossref","unstructured":"Lin K, Wang L, Liu Z (2021a) End-to-end human pose and mesh reconstruction with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1954\u20131963","DOI":"10.1109\/CVPR46437.2021.00199"},{"key":"4741_CR25","doi-asserted-by":"crossref","unstructured":"Lin K, Wang L, Liu Z (2021b) Mesh graphormer. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 12939\u201312948","DOI":"10.1109\/ICCV48922.2021.01270"},{"key":"4741_CR26","doi-asserted-by":"crossref","unstructured":"Shen X, Yang Z, Wang X, et\u00a0al (2023) Global-to-local modeling for video-based 3d human pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8887\u20138896","DOI":"10.1109\/CVPR52729.2023.00858"},{"key":"4741_CR27","doi-asserted-by":"crossref","unstructured":"Cho H, Ahn J, Cho Y, et\u00a0al (2023) Video inference for human mesh recovery with vision transformer. In: 2023 IEEE 17th International Conference on Automatic Face and Gesture Recognition (FG), IEEE, pp 1\u20136","DOI":"10.1109\/FG57933.2023.10042731"},{"key":"4741_CR28","doi-asserted-by":"crossref","unstructured":"Qiu Z, Yang Q, Wang J, et\u00a0al (2023) Psvt: End-to-end multi-person 3d pose and shape estimation with progressive video transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 21254\u201321263","DOI":"10.1109\/CVPR52729.2023.02036"},{"key":"4741_CR29","doi-asserted-by":"crossref","unstructured":"Kanazawa A, Zhang JY, Felsen P, et\u00a0al (2019) Learning 3d human dynamics from video. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5614\u20135623","DOI":"10.1109\/CVPR.2019.00576"},{"key":"4741_CR30","doi-asserted-by":"crossref","unstructured":"Kocabas M, Athanasiou N, Black MJ (2020) Vibe: Video inference for human body pose and shape estimation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5253\u20135263","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"4741_CR31","doi-asserted-by":"crossref","unstructured":"Luo Z, Golestaneh SA, Kitani KM (2020) 3d human motion estimation via motion compression and refinement. In: Proceedings of the Asian Conference on Computer Vision","DOI":"10.1007\/978-3-030-69541-5_20"},{"key":"4741_CR32","doi-asserted-by":"crossref","unstructured":"Wei WL, Lin JC, Liu TL, et\u00a0al (2022) Capturing humans in motion: Temporal-attentive 3d human pose and shape estimation from monocular video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13211\u201313220","DOI":"10.1109\/CVPR52688.2022.01286"},{"key":"4741_CR33","doi-asserted-by":"crossref","unstructured":"Sengupta A, Budvytis I, Cipolla R (2023) Humaniflow: Ancestor-conditioned normalising flows on so (3) manifolds for human pose and shape distribution estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4779\u20134789","DOI":"10.1109\/CVPR52729.2023.00463"},{"key":"4741_CR34","doi-asserted-by":"crossref","unstructured":"Black MJ, Patel P, Tesch J, et\u00a0al (2023) Bedlam: A synthetic dataset of bodies exhibiting detailed lifelike animated motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8726\u20138737","DOI":"10.1109\/CVPR52729.2023.00843"},{"key":"4741_CR35","doi-asserted-by":"crossref","unstructured":"Xu Y, Ma X, Su J, et\u00a0al (2024) Scorehypo: Probabilistic human mesh estimation with hypothesis scoring. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 979\u2013989","DOI":"10.1109\/CVPR52733.2024.00099"},{"key":"4741_CR36","doi-asserted-by":"crossref","unstructured":"Kolotouros N, Pavlakos G, Jayaraman D, et\u00a0al (2021) Probabilistic modeling for human mesh recovery. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 11605\u201311614","DOI":"10.1109\/ICCV48922.2021.01140"},{"key":"4741_CR37","doi-asserted-by":"crossref","unstructured":"Wehrbein T, Rudolph M, Rosenhahn B, et\u00a0al (2025) Utilizing uncertainty in 2d pose detectors for probabilistic 3d human mesh recovery. In: 2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), IEEE, pp 5852\u20135862","DOI":"10.1109\/WACV61041.2025.00571"},{"key":"4741_CR38","doi-asserted-by":"crossref","unstructured":"Mehta D, Rhodin H, Casas D, et\u00a0al (2017) Monocular 3d human pose estimation in the wild using improved cnn supervision. In: 2017 international conference on 3D vision (3DV), IEEE, pp 506\u2013516","DOI":"10.1109\/3DV.2017.00064"},{"key":"4741_CR39","doi-asserted-by":"crossref","unstructured":"Kaufmann M, Song J, Guo C, et\u00a0al (2023) Emdb: The electromagnetic database of global 3d human pose and shape in the wild. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 14632\u201314643","DOI":"10.1109\/ICCV51070.2023.01345"},{"key":"4741_CR40","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, et\u00a0al (2014) Microsoft coco: Common objects in context. In: Computer vision\u2013ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part v 13, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4741_CR41","doi-asserted-by":"crossref","unstructured":"Andriluka M, Pishchulin L, Gehler P, et\u00a0al (2014) 2d human pose estimation: New benchmark and state of the art analysis. In: Proceedings of the IEEE Conference on computer Vision and Pattern Recognition, pp 3686\u20133693","DOI":"10.1109\/CVPR.2014.471"},{"key":"4741_CR42","doi-asserted-by":"crossref","unstructured":"Moon G, Choi H, Lee KM (2022) Neuralannot: Neural annotator for 3d human mesh training sets. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2299\u20132307","DOI":"10.1109\/CVPRW56347.2022.00256"},{"key":"4741_CR43","doi-asserted-by":"crossref","unstructured":"Chen Y, Wang Z, Peng Y, et\u00a0al (2018) Cascaded pyramid network for multi-person pose estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7103\u20137112","DOI":"10.1109\/CVPR.2018.00742"},{"key":"4741_CR44","first-page":"38571","volume":"35","author":"Y Xu","year":"2022","unstructured":"Xu, Y., Zhang, J., Zhang, Q., et al.: Vitpose: Simple vision transformer baselines for human pose estimation. Adv. Neural. Inf. Process. Syst. 35, 38571\u201338584 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4741_CR45","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et\u00a0al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"4741_CR46","doi-asserted-by":"crossref","unstructured":"Newell A, Yang K, Deng J (2016) Stacked hourglass networks for human pose estimation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part VIII 14, Springer, pp 483\u2013499","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"4741_CR47","doi-asserted-by":"crossref","unstructured":"Pavllo D, Feichtenhofer C, Grangier D, et\u00a0al (2019) 3d human pose estimation in video with temporal convolutions and semi-supervised training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7753\u20137762","DOI":"10.1109\/CVPR.2019.00794"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04741-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-025-04741-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-025-04741-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T03:28:15Z","timestamp":1759980495000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-025-04741-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,11]]},"references-count":47,"journal-issue":{"issue":"13","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["4741"],"URL":"https:\/\/doi.org\/10.1007\/s11760-025-04741-0","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,11]]},"assertion":[{"value":"9 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 August 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 September 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 September 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"1132"}}