{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T08:43:10Z","timestamp":1773823390477,"version":"3.50.1"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T00:00:00Z","timestamp":1743724800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T00:00:00Z","timestamp":1743724800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. King Saud Univ. Comput. Inf. Sci."],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s44443-025-00023-4","type":"journal-article","created":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T17:00:53Z","timestamp":1743872453000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["MSTFormer: multi-granularity spatial-temporal transformers for 3D human pose estimation"],"prefix":"10.1007","volume":"37","author":[{"given":"Hao","family":"Lin","sequence":"first","affiliation":[]},{"given":"Sheng","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Chengyue","family":"Su","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,4]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Bajpai V, Sharma A, Subudhi BN, et\u00a0al (2021) Underwater u-net: Deep learning with u-net for visual underwater moving object detection. In: OCEANS 2021: San Diego\u2013Porto, IEEE, pp 1\u20134","DOI":"10.23919\/OCEANS44145.2021.9705761"},{"key":"23_CR2","unstructured":"Bazarevsky V (2020) Blazepose: On-device real-time body pose tracking. arXiv preprint arXiv:2006.10204"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Cai Y, Ge L, Liu J, et\u00a0al (2019) Exploiting spatial-temporal relationships for 3d pose estimation via graph convolutional networks. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2272\u20132281","DOI":"10.1109\/ICCV.2019.00236"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Cao Z, Simon T, Wei SE, et\u00a0al (2017) Realtime multi-person 2d pose estimation using part affinity fields. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7291\u20137299","DOI":"10.1109\/CVPR.2017.143"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, et\u00a0al (2020) End-to-end object detection with transformers. In: European conference on computer vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"issue":"1","key":"23_CR6","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/TCSVT.2021.3057267","volume":"32","author":"T Chen","year":"2021","unstructured":"Chen T, Fang C, Shen X et al (2021) Anatomy-aware 3d human pose estimation with bone-based pose decomposition. IEEE Trans Circ Syst Video Technol 32(1):198\u2013209","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Chen Y, Wang Z, Peng Y, et\u00a0al (2018) Cascaded pyramid network for multi-person pose estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7103\u20137112","DOI":"10.1109\/CVPR.2018.00742"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Chen Z, Wang Y, Yang W (2022) Video based fall detection using human poses. In: CCF Conference on Big Data, Springer, pp 283\u2013296","DOI":"10.1007\/978-981-16-9709-8_19"},{"issue":"3","key":"23_CR9","doi-asserted-by":"publisher","first-page":"1429","DOI":"10.1109\/TPAMI.2020.3019139","volume":"44","author":"H Ci","year":"2020","unstructured":"Ci H, Ma X, Wang C et al (2020) Locally connected network for monocular 3d human pose estimation. IEEE Trans Patt Anal Mach Intell 44(3):1429\u20131442","journal-title":"IEEE Trans Patt Anal Mach Intell"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Dai Z, Cai B, Lin Y, et\u00a0al (2021) Up-detr: Unsupervised pre-training for object detection with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1601\u20131610","DOI":"10.1109\/CVPR46437.2021.00165"},{"issue":"4","key":"23_CR11","doi-asserted-by":"publisher","first-page":"2555","DOI":"10.1007\/s00371-023-02936-5","volume":"40","author":"A Diaz-Arias","year":"2024","unstructured":"Diaz-Arias A, Shin D (2024) Convformer: parameter reduction in transformer models for 3d human pose estimation by leveraging dynamic multi-headed convolutional attention. Vis Comp 40(4):2555\u20132569","journal-title":"Vis Comp"},{"key":"23_CR12","unstructured":"Dosovitskiy A (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Du G, Cao X, Liang J, et\u00a0al (2020) Medical image segmentation based on u-net: A review. J Imag Sci Technol 64(2)","DOI":"10.2352\/J.ImagingSci.Technol.2020.64.2.020508"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Einfalt M, Ludwig K, Lienhart R (2023) Uplift and upsample: Efficient 3d human pose estimation with uplifting transformers. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 2903\u20132913","DOI":"10.1109\/WACV56688.2023.00292"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Fang HS, Xie S, Tai YW, et\u00a0al (2017) Rmpe: Regional multi-person pose estimation. In: Proceedings of the IEEE international conference on computer vision, pp 2334\u20132343","DOI":"10.1109\/ICCV.2017.256"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Hassan MT, Hamza AB (2023) Regular splitting graph network for 3d human pose estimation. IEEE Trans Image Process","DOI":"10.1109\/TIP.2023.3275914"},{"key":"23_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2022.3230846","volume":"60","author":"X He","year":"2022","unstructured":"He X, Zhou Y, Zhao J et al (2022) Swin transformer embedding unet for remote sensing image semantic segmentation. IEEE Trans Geosci Remote Sens 60:1\u201315","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Hu W, Zhang C, Zhan F, et\u00a0al (2021) Conditional directed graph convolution for 3d human pose estimation. In: Proceedings of the 29th ACM International Conference on Multimedia, pp 602\u2013611","DOI":"10.1145\/3474085.3475219"},{"key":"23_CR19","unstructured":"Hua G, Li W, Zhang Q, et\u00a0al (2021) Weakly-supervised cross-view 3d human pose estimation. arXiv:2105.10882"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Huang G, Sun Y, Liu Z, et\u00a0al (2016) Deep networks with stochastic depth. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, Springer, pp 646\u2013661","DOI":"10.1007\/978-3-319-46493-0_39"},{"issue":"7","key":"23_CR21","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2013","unstructured":"Ionescu C, Papava D, Olaru V et al (2013) Human3. 6m: Large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans Patt Anal Mach Intell 36(7):1325\u20131339","journal-title":"IEEE Trans Patt Anal Mach Intell"},{"key":"23_CR22","unstructured":"Kingma DP, Ba J (2014) Adam: A method for stochastic optimization. arXiv:1412.6980"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Lee K, Lee I, Lee S (2018) Propagating lstm: 3d pose estimation based on joint interdependency. In: Proceedings of the European conference on computer vision (ECCV), pp 119\u2013135","DOI":"10.1007\/978-3-030-01234-2_8"},{"issue":"6","key":"23_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3531004","volume":"21","author":"S Li","year":"2022","unstructured":"Li S, Man C, Shen A et al (2022) A fall detection network by 2d\/3d spatio-temporal joint models with tensor compression on edge. ACM Trans Embedded Comp Syst 21(6):1\u201319","journal-title":"ACM Trans Embedded Comp Syst"},{"issue":"7","key":"23_CR25","doi-asserted-by":"publisher","first-page":"2573","DOI":"10.3390\/s22072573","volume":"22","author":"W Li","year":"2022","unstructured":"Li W, Du R, Chen S (2022) Skeleton-based spatio-temporal u-network for 3d human pose estimation in video. Sensors 22(7):2573","journal-title":"Sensors"},{"key":"23_CR26","doi-asserted-by":"publisher","first-page":"1282","DOI":"10.1109\/TMM.2022.3141231","volume":"25","author":"W Li","year":"2022","unstructured":"Li W, Liu H, Ding R et al (2022) Exploiting temporal contexts with strided transformer for 3d human pose estimation. IEEE Trans Multimed 25:1282\u20131293","journal-title":"IEEE Trans Multimed"},{"key":"23_CR27","unstructured":"Lin J, Lee GH (2019) Trajectory space factorization for deep video-based 3d human pose estimation. arXiv:1908.08289"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Lin M, Lin L, Liang X, et\u00a0al (2017) Recurrent 3d pose sequence machines. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 810\u2013819","DOI":"10.1109\/CVPR.2017.588"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Liu J, Rojas J, Li Y, et\u00a0al (2021a) A graph attention spatio-temporal convolutional network for 3d human pose estimation in video. In: 2021 IEEE international conference on robotics and automation (ICRA), IEEE, pp 3374\u20133380","DOI":"10.1109\/ICRA48506.2021.9561605"},{"key":"23_CR30","doi-asserted-by":"crossref","unstructured":"Liu M, Yuan J (2018) Recognizing human actions as the evolution of pose estimation maps. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1159\u20131168","DOI":"10.1109\/CVPR.2018.00127"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Liu R, Shen J, Wang H, et\u00a0al (2020) Attention mechanism exploits temporal contexts: Real-time 3d human pose reconstruction. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5064\u20135073","DOI":"10.1109\/CVPR42600.2020.00511"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, et\u00a0al (2021b) Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"23_CR33","doi-asserted-by":"crossref","unstructured":"Martinez J, Hossain R, Romero J, et\u00a0al (2017) A simple yet effective baseline for 3d human pose estimation. In: Proceedings of the IEEE international conference on computer vision, pp 2640\u20132649","DOI":"10.1109\/ICCV.2017.288"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Moon G, Lee KM (2020) I2l-meshnet: Image-to-lixel prediction network for accurate 3d human pose and mesh estimation from a single rgb image. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part VII 16, Springer, pp 752\u2013768","DOI":"10.1007\/978-3-030-58571-6_44"},{"key":"23_CR35","doi-asserted-by":"publisher","first-page":"133330","DOI":"10.1109\/ACCESS.2020.3010248","volume":"8","author":"TL Munea","year":"2020","unstructured":"Munea TL, Jembre YZ, Weldegebriel HT et al (2020) The progress of human pose estimation: A survey and taxonomy of models applied in 2d human pose estimation. IEEE Access 8:133330\u2013133348","journal-title":"IEEE Access"},{"key":"23_CR36","unstructured":"Park S, Kwak N (2018) 3d human pose estimation with relational networks. arXiv:1805.08961"},{"key":"23_CR37","unstructured":"Paszke A, Gross S, Chintala S, et\u00a0al (2017) Automatic differentiation in pytorch. In: NIPS 2017 Workshop on Autodiff"},{"key":"23_CR38","doi-asserted-by":"crossref","unstructured":"Pavlakos G, Zhou X, Daniilidis K (2018) Ordinal depth supervision for 3d human pose estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7307\u20137316","DOI":"10.1109\/CVPR.2018.00763"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Pavllo D, Feichtenhofer C, Grangier D, et\u00a0al (2019) 3d human pose estimation in video with temporal convolutions and semi-supervised training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7753\u20137762","DOI":"10.1109\/CVPR.2019.00794"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Rayat Imtiaz\u00a0Hossain M, Little JJ (2017) Exploiting temporal information for 3d pose estimation. arXiv e-prints pp arXiv\u20131711","DOI":"10.1007\/978-3-030-01249-6_5"},{"key":"23_CR41","doi-asserted-by":"crossref","unstructured":"Sigal L, Balan AO, Black MJ (2010) Humaneva: Synchronized video and motion capture dataset and baseline algorithm for evaluation of articulated human motion. Inter J Comp Vision 87(1):4\u201327","DOI":"10.1007\/s11263-009-0273-6"},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Strudel R, Garcia R, Laptev I, et\u00a0al (2021) Segmenter: Transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7262\u20137272","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Tang W, Wu Y (2019) Does learning specific features for related parts help human pose estimation? In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1107\u20131116","DOI":"10.1109\/CVPR.2019.00120"},{"issue":"2","key":"23_CR44","doi-asserted-by":"publisher","first-page":"911","DOI":"10.1109\/TCSVT.2023.3286402","volume":"34","author":"Z Tang","year":"2023","unstructured":"Tang Z, Hao Y, Li J et al (2023) Ftcm: Frequency-temporal collaborative module for efficient 3d human pose estimation in video. IEEE Trans Circ Syst Video Technol 34(2):911\u2013923","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"23_CR45","unstructured":"Vaswani A (2017) Attention is all you need. Adv Neural Inf Process Syst"},{"key":"23_CR46","doi-asserted-by":"crossref","unstructured":"Wang J, Yan S, Xiong Y, et\u00a0al (2020) Motion guided 3d pose estimation from videos. In: European conference on computer vision, Springer, pp 764\u2013780","DOI":"10.1007\/978-3-030-58601-0_45"},{"key":"23_CR47","doi-asserted-by":"publisher","first-page":"364","DOI":"10.1109\/TIP.2022.3228497","volume":"32","author":"X Wu","year":"2022","unstructured":"Wu X, Hong D, Chanussot J (2022) Uiu-net: U-net in u-net for infrared small object detection. IEEE Trans Image Process 32:364\u2013376","journal-title":"IEEE Trans Image Process"},{"issue":"11","key":"23_CR48","doi-asserted-by":"publisher","first-page":"22301","DOI":"10.1109\/TITS.2021.3135251","volume":"23","author":"F Xu","year":"2021","unstructured":"Xu F, Xu F, Xie J et al (2021) Action recognition framework in traffic scene for autonomous driving system. IEEE Trans Intell Transp Syst 23(11):22301\u201322311","journal-title":"IEEE Trans Intell Transp Syst"},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Xu T, Takano W (2021) Graph stacked hourglass networks for 3d human pose estimation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16105\u201316114","DOI":"10.1109\/CVPR46437.2021.01584"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Yu BX, Zhang Z, Liu Y, et\u00a0al (2023) Gla-gcn: Global-local adaptive graph convolutional network for 3d human pose estimation from monocular video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 8818\u20138829","DOI":"10.1109\/ICCV51070.2023.00810"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Zeng A, Sun X, Huang F, et\u00a0al (2020) Srnet: Improving generalization in 3d human pose estimation with a split-and-recombine approach. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIV 16, Springer, pp 507\u2013523","DOI":"10.1007\/978-3-030-58568-6_30"},{"key":"23_CR52","doi-asserted-by":"crossref","unstructured":"Zhan Y, Li F, Weng R, et\u00a0al (2022) Ray3d: ray-based 3d human pose estimation for monocular absolute 3d localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13116\u201313125","DOI":"10.1109\/CVPR52688.2022.01277"},{"key":"23_CR53","doi-asserted-by":"crossref","unstructured":"Zhao W, Wang W, Tian Y (2022) Graformer: Graph-oriented transformer for 3d pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 20438\u201320447","DOI":"10.1109\/CVPR52688.2022.01979"},{"key":"23_CR54","doi-asserted-by":"crossref","unstructured":"Zheng C, Zhu S, Mendieta M, et\u00a0al (2021) 3d human pose estimation with spatial and temporal transformers. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 11656\u201311665","DOI":"10.1109\/ICCV48922.2021.01145"},{"issue":"1","key":"23_CR55","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3603618","volume":"56","author":"C Zheng","year":"2023","unstructured":"Zheng C, Wu W, Chen C et al (2023) Deep learning-based human pose estimation: A survey. ACM Comput Surv 56(1):1\u201337","journal-title":"ACM Comput Surv"},{"key":"23_CR56","doi-asserted-by":"publisher","first-page":"104699","DOI":"10.1016\/j.compbiomed.2021.104699","volume":"136","author":"H Zunair","year":"2021","unstructured":"Zunair H, Hamza AB (2021) Sharp u-net: Depthwise convolutional network for biomedical image segmentation. Comput Biol Med 136:104699","journal-title":"Comput Biol Med"}],"container-title":["Journal of King Saud University Computer and Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44443-025-00023-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44443-025-00023-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44443-025-00023-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T11:45:46Z","timestamp":1748605546000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44443-025-00023-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,4]]},"references-count":56,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["23"],"URL":"https:\/\/doi.org\/10.1007\/s44443-025-00023-4","relation":{},"ISSN":["1319-1578","2213-1248"],"issn-type":[{"value":"1319-1578","type":"print"},{"value":"2213-1248","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,4]]},"assertion":[{"value":"22 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests:"}}],"article-number":"15"}}