{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:19:13Z","timestamp":1777655953404,"version":"3.51.4"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2022,9,4]],"date-time":"2022-09-04T00:00:00Z","timestamp":1662249600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,9,4]],"date-time":"2022-09-04T00:00:00Z","timestamp":1662249600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.61907028"],"award-info":[{"award-number":["No.61907028"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.11872036"],"award-info":[{"award-number":["No.11872036"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Young science and technology stars in Shaanxi Province","award":["2021KJXX-91"],"award-info":[{"award-number":["2021KJXX-91"]}]},{"name":"Young Talent fund of University Association for Science and Technology in Shaanxi","award":["No.20200105"],"award-info":[{"award-number":["No.20200105"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2022,11]]},"DOI":"10.1007\/s00138-022-01334-6","type":"journal-article","created":{"date-parts":[[2022,9,4]],"date-time":"2022-09-04T17:02:21Z","timestamp":1662310941000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["U-shaped spatial\u2013temporal transformer network for 3D human pose estimation"],"prefix":"10.1007","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4124-5317","authenticated-orcid":false,"given":"Honghong","family":"Yang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Longfei","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yumei","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaojun","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,9,4]]},"reference":[{"key":"1334_CR1","doi-asserted-by":"crossref","unstructured":"Zheng, C., Zhu, S., Mendieta, M., et al: 3D human pose estimation with spatial and temporal transformers. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 11656\u201311665 (2021)","DOI":"10.1109\/ICCV48922.2021.01145"},{"issue":"3","key":"1334_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00138-022-01291-0","volume":"33","author":"Z Malik","year":"2022","unstructured":"Malik, Z., Shapiai, M.: Human action interpretation using convolutional neural network: a survey. Mach. Vis. Appl. 33(3), 1\u201323 (2022)","journal-title":"Mach. Vis. Appl."},{"key":"1334_CR3","doi-asserted-by":"crossref","unstructured":"Moon, G., Lee, K.M.: I2l-meshnet: Image to-lixel prediction network for accurate 3d human pose and mesh estimation from a single rgb image. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 752\u2013768 (2020)","DOI":"10.1007\/978-3-030-58571-6_44"},{"key":"1334_CR4","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Zhou, X., Daniilidis, K.: Ordinal depth supervision for 3D human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7307\u20137316 (2018)","DOI":"10.1109\/CVPR.2018.00763"},{"issue":"1","key":"1334_CR5","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/TCSVT.2021.3057267","volume":"32","author":"T Chen","year":"2022","unstructured":"Chen, T., Fang, C., Shen, X., Zhu, Y., Chen, Z., Luo, J.: Anatomy-aware 3D human pose estimation with bone-based pose decomposition. IEEE Trans. Circuits Syst. Video Technol. 32(1), 198\u2013209 (2022). https:\/\/doi.org\/10.1109\/TCSVT.2021.3057267","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1334_CR6","doi-asserted-by":"crossref","unstructured":"Wang, J., Yan, S., Xiong, Y., Lin, D.: Motion guided 3D pose estimation from videos. In: Proceedings of the European Conference on Computer Vision 2020 (ECCV), pp. 764\u2013780. Springer, (2020)","DOI":"10.1007\/978-3-030-58601-0_45"},{"key":"1334_CR7","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s00138-020-01104-2","volume":"31","author":"R Wang","year":"2020","unstructured":"Wang, R., Tong, J., Wang, X.: Enhancing feature fusion for human pose estimation. Mach. Vis. Appl. 31, 60 (2020). https:\/\/doi.org\/10.1007\/s00138-020-01104-2","journal-title":"Mach. Vis. Appl."},{"key":"1334_CR8","doi-asserted-by":"crossref","unstructured":"Cai, Y., Ge, L., Liu, J., et al.: exploiting spatial-temporal relationships for 3D pose estimation via graph convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 2272\u20132281 (2019)","DOI":"10.1109\/ICCV.2019.00236"},{"key":"1334_CR9","doi-asserted-by":"crossref","unstructured":"Hossain, M.R.I., Little, J.J.: Exploiting temporal information for 3D human pose estimation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 6869\u20138486, Springer, (2018)","DOI":"10.1007\/978-3-030-01249-6_5"},{"key":"1334_CR10","doi-asserted-by":"crossref","unstructured":"Pavllo, D., Feichtenhofer, C., Grangier, D., et al.: 3d human pose estimation in video with temporal convolutions and semi-supervised training. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7745\u20137754 (2019)","DOI":"10.1109\/CVPR.2019.00794"},{"key":"1334_CR11","doi-asserted-by":"publisher","unstructured":"Huang, Z., Shen, X., Tian, X., et al.: Spatio-temporal inception graph convolutional networks for skeleton-based action recognition. In: ACM Deep Learning of Multimedia, Seattle, WA, USA, pp. 2122\u20132130 (2020). https:\/\/doi.org\/10.1145\/3394171.3413666","DOI":"10.1145\/3394171.3413666"},{"key":"1334_CR12","doi-asserted-by":"crossref","unstructured":"Li, S., Chan, A.: 3D human pose estimation from monocular images with deep convolutional neural network. In: Asian Conference on Computer Vision, pp. 332\u2013347 (2014)","DOI":"10.1007\/978-3-319-16808-1_23"},{"key":"1334_CR13","doi-asserted-by":"crossref","unstructured":"Park, S., Hwang, J., Kwak, N.: 3D human pose estimation using convolutional neural networks with 2d pose information. In: European Conference on Computer Vision (ECCV), pp. 156\u2013169, Springer, (2016)","DOI":"10.1007\/978-3-319-49409-8_15"},{"key":"1334_CR14","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Zhou, X., Derpanis, K.G., Daniilidis, K.: Coarse-to-fine volumetric prediction for single-image 3D human pose. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7025\u20137034(2017)","DOI":"10.1109\/CVPR.2017.139"},{"key":"1334_CR15","doi-asserted-by":"crossref","unstructured":"Zeng, A., Sun, X., Huang, F., et al.: SRNet: improving generalization in 3D human pose estimation with a split-and-recombine approach. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 507\u2013523 (2020)","DOI":"10.1007\/978-3-030-58568-6_30"},{"key":"1334_CR16","doi-asserted-by":"publisher","unstructured":"Martinez, J., Hossain, R., Romero, J., Little, J.J: A simple yet effective baseline for 3d human pose estimation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 2659\u20132668 (2017) https:\/\/doi.org\/10.1109\/ICCV.2017.288.","DOI":"10.1109\/ICCV.2017.288"},{"key":"1334_CR17","doi-asserted-by":"crossref","unstructured":"Xu, T., Takano, W.: Graph stacked hourglass networks for 3d human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 16105\u201316114 (2021)","DOI":"10.1109\/CVPR46437.2021.01584"},{"key":"1334_CR18","doi-asserted-by":"crossref","unstructured":"Liu, J., Guang, Y., Rojas, J.: A graph attention spatio-temporal convolutional network for 3D human pose estimation in video. In: Proceedings of the IEEE International Conference on Robotics and Automation (ICRA), pp. 3374\u20133380 (2021)","DOI":"10.1109\/ICRA48506.2021.9561605"},{"key":"1334_CR19","doi-asserted-by":"crossref","unstructured":"Li, W., Liu, H., Tang, H., et al.: MHFormer: multi-hypothesis transformer for 3D human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13147\u201313156 (2022)","DOI":"10.1109\/CVPR52688.2022.01280"},{"key":"1334_CR20","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3141231","author":"W Li","year":"2022","unstructured":"Li, W., Liu, H., Ding, R., et al.: Exploiting temporal contexts with strided transformer for 3D human pose estimation. IEEE Trans. Multimed. (2022). https:\/\/doi.org\/10.1109\/TMM.2022.3141231","journal-title":"IEEE Trans. Multimed."},{"key":"1334_CR21","doi-asserted-by":"publisher","unstructured":"Lin, K., Wang, L., Liu, Z.: End-to-end human pose and mesh reconstruction with transformers. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1954\u20131963, (2021) https:\/\/doi.org\/10.1109\/CVPR46437.2021.00199","DOI":"10.1109\/CVPR46437.2021.00199"},{"key":"1334_CR22","doi-asserted-by":"publisher","unstructured":"Lin, T., Dollar, P., Girshick, R., He, K., Hariharan, H., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 936\u2013944 (2017) https:\/\/doi.org\/10.1109\/CVPR.2017.106","DOI":"10.1109\/CVPR.2017.106"},{"key":"1334_CR23","doi-asserted-by":"crossref","unstructured":"Newell, A., Yang, K., Deng, J.: Stacked hourglass networks for human pose estimation. In: Proceedings of the European Conference on Computer Vision 2020 (ECCV), pp. 483\u2013499 (2020)","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"1334_CR24","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., et al.: Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5686\u20135696 (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"1334_CR25","doi-asserted-by":"publisher","unstructured":"Zhao, Q., Sheng, T., Wang, Y., Tang, Z., Chen, Y., Cai, L., Ling, H.: M2det: a single-shot object detector based on multi-level feature pyramid network. In: The Thirty-Third AAAI Conference on Artificial Intellilgence (AAAI), pp. 9259\u20139266, (2019) https:\/\/doi.org\/10.1609\/aaai.v33i01.33019259","DOI":"10.1609\/aaai.v33i01.33019259"},{"key":"1334_CR26","doi-asserted-by":"publisher","unstructured":"Hua, G., Li, W., Zhang, Q., et al.: Weakly-supervised 3D human pose estimation with cross-view U-shaped graph convolutional network. In: IEEE Transactions on Multimedia, arXiv preprint http:\/\/arxiv.org\/abs\/2105.10882, (2022) https:\/\/doi.org\/10.48550\/arXiv.2105.10882","DOI":"10.48550\/arXiv.2105.10882"},{"key":"1334_CR27","doi-asserted-by":"publisher","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint http:\/\/arxiv.org\/abs\/2010.11929 (2021) https:\/\/doi.org\/10.48550\/arXiv.2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"issue":"8","key":"1334_CR28","doi-asserted-by":"publisher","first-page":"2011","DOI":"10.1109\/TPAMI.2019.2913372","volume":"42","author":"J Hu","year":"2020","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. IEEE Trans. Patt. Anal. Mach. Intell. 42(8), 2011\u20132023 (2020). https:\/\/doi.org\/10.1109\/TPAMI.2019.2913372","journal-title":"IEEE Trans. Patt. Anal. Mach. Intell."},{"issue":"7","key":"1334_CR29","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2014","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3.6m: large scale datasets and predictive methods for 3D human sensing in natural environments. IEEE Trans. Patt. Anal. Mach. Intell. 36(7), 1325\u20131339 (2014)","journal-title":"IEEE Trans. Patt. Anal. Mach. Intell."},{"issue":"1","key":"1334_CR30","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s11263-009-0273-6","volume":"87","author":"L Sigal","year":"2010","unstructured":"Sigal, L., Balan, A.O., Black, M.J.: Humaneva: synchronized video and motion capture dataset and baseline algorithm for evaluation of articulated human motion. Int. J. Comput. Vis. 87(1), 4\u201327 (2010)","journal-title":"Int. J. Comput. Vis."},{"key":"1334_CR31","doi-asserted-by":"publisher","unstructured":"Zheng, C., Wu, W., Yang, T., Zhu, S., Chen, C., Liu, R., Shen, J., Kehtarnavaz, N., Shah, M.: Deep learning-based human pose estimation: a http:\/\/arxiv.org\/abs\/2012.13392v4, https:\/\/doi.org\/10.48550\/arXiv.2012.13392","DOI":"10.48550\/arXiv.2012.13392"},{"key":"1334_CR32","doi-asserted-by":"publisher","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference on Learning Representations (ICLR), pp. 1\u201315 (2015), https:\/\/doi.org\/10.48550\/arXiv.1412.6980.","DOI":"10.48550\/arXiv.1412.6980"},{"issue":"56","key":"1334_CR33","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: 1Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15(56), 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"1334_CR34","doi-asserted-by":"crossref","unstructured":"Huang, G., Sun, Y., Liu, Z., Sedra, D., Weinberger, K.Q.: Deep networks with stochastic depth. In: Proceedings of the European conference on computer vision (ECCV), pp. 646\u2013661 (2016)","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"1334_CR35","doi-asserted-by":"crossref","unstructured":"Fang, H., Xu, Y., Wang, W., Liu, X., Zhu, S.: Learning pose grammar to encode human body configuration for 3D pose estimation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32, no. 1, pp. 6821\u20136828 (2018)","DOI":"10.1609\/aaai.v32i1.12270"},{"key":"1334_CR36","doi-asserted-by":"crossref","unstructured":"Zou, Z., Tang, W.: Modulated graph convolutional network for 3D human pose estimation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 11477\u201311487 (2021)","DOI":"10.1109\/ICCV48922.2021.01128"},{"key":"1334_CR37","doi-asserted-by":"crossref","unstructured":"Zhao, L., Peng, X., Tian, Y., Kapadia, M., Metaxas, D.N..: Semantic graph convolutional networks for 3D human pose regression. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp: 3425\u20133435 (2019)","DOI":"10.1109\/CVPR.2019.00354"},{"key":"1334_CR38","doi-asserted-by":"publisher","unstructured":"Yeh, R.A., Hu, Y., Schwing, A.G.: Chirality nets for human pose regression. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems (NIPS), pp. 8163\u20138173 (2019) https:\/\/doi.org\/10.48550\/arXiv.1911.00029","DOI":"10.48550\/arXiv.1911.00029"},{"key":"1334_CR39","doi-asserted-by":"publisher","unstructured":"Lin, J., Lee, G.H.: Trajectory space factorization for deep video-based 3d human pose estimation. In: Proceedings of the British Machine Vision Conference (BMVC), pp. 1\u201313(2019) https:\/\/doi.org\/10.48550\/arXiv.1908.08289","DOI":"10.48550\/arXiv.1908.08289"},{"key":"1334_CR40","doi-asserted-by":"publisher","unstructured":"Gong, K., Zhang, J., Poseaug, J.F.: A differentiable pose augmentation framework for 3D human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8575\u20138584(2021) https:\/\/doi.org\/10.48550\/arXiv.2105.02465","DOI":"10.48550\/arXiv.2105.02465"},{"key":"1334_CR41","doi-asserted-by":"publisher","unstructured":"Xu, J., Yu, Z., Ni, B., Yang, J., Yang, X., Zhang, W.: Deep kinematics analysis for monocular 3D human pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 896\u2013905, (2020) https:\/\/doi.org\/10.1109\/CVPR42600.2020.00098","DOI":"10.1109\/CVPR42600.2020.00098"},{"key":"1334_CR42","doi-asserted-by":"publisher","unstructured":"Liu, R., Shen, J., Wang, H., Chen, C., Cheung, S., Asari, V.: Attention mechanism exploits temporal contexts: real-time 3D human pose reconstruction. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5063\u20135072 (2020) https:\/\/doi.org\/10.1109\/CVPR42600.2020.00511.","DOI":"10.1109\/CVPR42600.2020.00511"},{"key":"1334_CR43","doi-asserted-by":"publisher","unstructured":"Lee, K., Lee, I., Lee, S.: Propagating lstm: 3D pose estimation based on joint interdependency. In Proceedings of the European Conference on Computer Vision (ECCV), pp. 123\u2013141 (2018) https:\/\/doi.org\/10.1007\/978-3-030-01234-2_8","DOI":"10.1007\/978-3-030-01234-2_8"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-022-01334-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-022-01334-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-022-01334-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,7]],"date-time":"2022-11-07T04:12:46Z","timestamp":1667794366000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-022-01334-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,4]]},"references-count":43,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2022,11]]}},"alternative-id":["1334"],"URL":"https:\/\/doi.org\/10.1007\/s00138-022-01334-6","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,9,4]]},"assertion":[{"value":"26 April 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 July 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 August 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"82"}}