{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,12]],"date-time":"2026-04-12T16:31:35Z","timestamp":1776011495351,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":45,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819787944","type":"print"},{"value":"9789819787951","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8795-1_26","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T23:02:46Z","timestamp":1730588566000},"page":"383-398","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["MPM: A Unified 2D-3D Human Pose Representation via Masked Pose Modeling"],"prefix":"10.1007","author":[{"given":"Zhenyu","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenhao","family":"Chai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongyu","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tian","family":"Ye","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingli","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jenq-Neng","family":"Hwang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gaoang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"26_CR1","doi-asserted-by":"crossref","unstructured":"Andriluka, M., Pishchulin, L., Gehler, P., Schiele, B.: 2d human pose estimation: new benchmark and state of the art analysis. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.471"},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"Cai, Y., Ge, L., Liu, J., Cai, J., Cham, T.J., Yuan, J., Thalmann, N.M.: Exploiting spatial-temporal relationships for 3d pose estimation via graph convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2272\u20132281 (2019)","DOI":"10.1109\/ICCV.2019.00236"},{"key":"26_CR3","doi-asserted-by":"crossref","unstructured":"Chai, W., Jiang, Z., Hwang, J.N., Wang, G.: Global adaptation meets local generalization: unsupervised domain adaptation for 3d human pose estimation. arXiv preprint arXiv:2303.16456 (2023)","DOI":"10.1109\/ICCV51070.2023.01347"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Chen, C.H., Ramanan, D.: 3d human pose estimation= 2d pose estimation+ matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7035\u20137043 (2017)","DOI":"10.1109\/CVPR.2017.610"},{"issue":"1","key":"26_CR5","doi-asserted-by":"publisher","first-page":"198","DOI":"10.1109\/TCSVT.2021.3057267","volume":"32","author":"T Chen","year":"2021","unstructured":"Chen, T., Fang, C., Shen, X., Zhu, Y., Chen, Z., Luo, J.: Anatomy-aware 3d human pose estimation with bone-based pose decomposition. IEEE Trans. Circuits Syst. Video Technol. 32(1), 198\u2013209 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: Uniter: universal image-text representation learning. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX. pp. 104\u2013120. Springer (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, Z., Peng, Y., Zhang, Z., Yu, G., Sun, J.: Cascaded pyramid network for multi-person pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7103\u20137112 (2018)","DOI":"10.1109\/CVPR.2018.00742"},{"key":"26_CR8","doi-asserted-by":"crossref","unstructured":"Ci, H., Wu, M., Zhu, W., Ma, X., Dong, H., Zhong, F., Wang, Y.: Gfpose: Learning 3d human pose prior with gradient fields. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4800\u20134810 (2023)","DOI":"10.1109\/CVPR52729.2023.00465"},{"key":"26_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Drover, D., MV, R., Chen, C.H., Agrawal, A., Tyagi, A., Phuoc\u00a0Huynh, C.: Can 3d pose be learned from 2d projections alone? In: Proceedings of the European Conference on Computer Vision (ECCV) Workshops, pp.\u00a00\u20130 (2018)","DOI":"10.1007\/978-3-030-11018-5_7"},{"key":"26_CR11","doi-asserted-by":"crossref","unstructured":"Fang, H.S., Xu, Y., Wang, W., Liu, X., Zhu, S.C.: Learning pose grammar to encode human body configuration for 3d pose estimation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.12270"},{"key":"26_CR12","first-page":"35946","volume":"35","author":"C Feichtenhofer","year":"2022","unstructured":"Feichtenhofer, C., Li, Y., He, K., et al.: Masked autoencoders as spatiotemporal learners. Adv. Neural. Inf. Process. Syst. 35, 35946\u201335958 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Gong, K., Zhang, J., Feng, J.: Poseaug: a differentiable pose augmentation framework for 3d human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8575\u20138584 (2021)","DOI":"10.1109\/CVPR46437.2021.00847"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Hao, S., Liu, P., Zhan, Y., Jin, K., Liu, Z., Song, M., Hwang, J.N., Wang, G.: Divotrack: a novel dataset and baseline method for cross-view multi-object tracking in diverse open scenes. arXiv preprint arXiv:2302.07676 (2023)","DOI":"10.1007\/s11263-023-01922-7"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"26_CR16","doi-asserted-by":"crossref","unstructured":"Hu, W., Zhang, C., Zhan, F., Zhang, L., Wong, T.T.: Conditional directed graph convolution for 3d human pose estimation. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 602\u2013611 (2021)","DOI":"10.1145\/3474085.3475219"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3. 6m: large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans. Pattern Anal. Mach. Intell. 36(7), 1325\u20131339 (2013)","DOI":"10.1109\/TPAMI.2013.248"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Li, C., Lee, G.H.: Generating multiple hypotheses for 3d human pose estimation with mixture density network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9887\u20139895 (2019)","DOI":"10.1109\/CVPR.2019.01012"},{"issue":"6","key":"26_CR19","doi-asserted-by":"publisher","first-page":"3316","DOI":"10.1109\/TPAMI.2021.3053765","volume":"44","author":"M Li","year":"2021","unstructured":"Li, M., Chen, S., Chen, X., Zhang, Y., Wang, Y., Tian, Q.: Symbiotic graph neural networks for 3d skeleton-based human action recognition and motion prediction. IEEE Trans. Pattern Anal. Mach. Intell. 44(6), 3316\u20133333 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"26_CR20","doi-asserted-by":"crossref","unstructured":"Li, S., Ke, L., Pratama, K., Tai, Y.W., Tang, C.K., Cheng, K.T.: Cascaded deep monocular 3d human pose estimation with evolutionary training data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6173\u20136183 (2020)","DOI":"10.1109\/CVPR42600.2020.00621"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Li, W., Liu, H., Ding, R., Liu, M., Wang, P., Yang, W.: Exploiting temporal contexts with strided transformer for 3d human pose estimation. IEEE Trans. Multimed. (2022)","DOI":"10.1109\/TMM.2022.3141231"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"Li, W., Liu, H., Tang, H., Wang, P., Van\u00a0Gool, L.: Mhformer: multi-hypothesis transformer for 3d human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13147\u201313156 (2022)","DOI":"10.1109\/CVPR52688.2022.01280"},{"key":"26_CR23","unstructured":"Lin, J., Lee, G.H.: Trajectory space factorization for deep video-based 3d human pose estimation. arXiv preprint arXiv:1908.08289 (2019)"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"26_CR25","doi-asserted-by":"crossref","unstructured":"Liu, R., Shen, J., Wang, H., Chen, C., Cheung, S.C., Asari, V.: Attention mechanism exploits temporal contexts: real-time 3d human pose reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5064\u20135073 (2020)","DOI":"10.1109\/CVPR42600.2020.00511"},{"key":"26_CR26","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"26_CR27","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"26_CR28","unstructured":"Luo, H., Ji, L., Shi, B., Huang, H., Duan, N., Li, T., Li, J., Bharti, T., Zhou, M.: Univl: a unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)"},{"key":"26_CR29","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: Amass: archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"26_CR30","doi-asserted-by":"crossref","unstructured":"Martinez, J., Hossain, R., Romero, J., Little, J.J.: A simple yet effective baseline for 3d human pose estimation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2640\u20132649 (2017)","DOI":"10.1109\/ICCV.2017.288"},{"key":"26_CR31","doi-asserted-by":"crossref","unstructured":"Mehta, D., Rhodin, H., Casas, D., Fua, P., Sotnychenko, O., Xu, W., Theobalt, C.: Monocular 3d human pose estimation in the wild using improved cnn supervision. In: 2017 International Conference on 3D Vision (3DV), pp. 506\u2013516. IEEE (2017)","DOI":"10.1109\/3DV.2017.00064"},{"key":"26_CR32","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"26_CR33","doi-asserted-by":"crossref","unstructured":"Pavllo, D., Feichtenhofer, C., Grangier, D., Auli, M.: 3d human pose estimation in video with temporal convolutions and semi-supervised training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7753\u20137762 (2019)","DOI":"10.1109\/CVPR.2019.00794"},{"key":"26_CR34","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Shan, W., Liu, Z., Zhang, X., Wang, S., Ma, S., Gao, W.: P-stmo: pre-trained spatial temporal many-to-one model for 3d human pose estimation. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part V, pp. 461\u2013478. Springer (2022)","DOI":"10.1007\/978-3-031-20065-6_27"},{"key":"26_CR36","doi-asserted-by":"crossref","unstructured":"Shan, W., Lu, H., Wang, S., Zhang, X., Gao, W.: Improving robustness and accuracy via relative information encoding in 3d human pose estimation. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3446\u20133454 (2021)","DOI":"10.1145\/3474085.3475504"},{"key":"26_CR37","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"26_CR38","doi-asserted-by":"crossref","unstructured":"Xu, J., Yu, Z., Ni, B., Yang, J., Yang, X., Zhang, W.: Deep kinematics analysis for monocular 3d human pose estimation. In: Proceedings of the IEEE\/CVF Conference on computer vision and Pattern recognition, pp. 899\u2013908 (2020)","DOI":"10.1109\/CVPR42600.2020.00098"},{"key":"26_CR39","doi-asserted-by":"crossref","unstructured":"Xu, P., Zhu, X., Clifton, D.A.: Multimodal learning with transformers: a survey. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"26_CR40","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"26_CR41","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1109\/TIP.2021.3129117","volume":"31","author":"H Yang","year":"2021","unstructured":"Yang, H., Yan, D., Zhang, L., Sun, Y., Li, D., Maybank, S.J.: Feedback graph convolutional network for skeleton-based action recognition. IEEE Trans. Image Process. 31, 164\u2013175 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"26_CR42","doi-asserted-by":"crossref","unstructured":"Zeng, A., Sun, X., Huang, F., Liu, M., Xu, Q., Lin, S.: Srnet: improving generalization in 3d human pose estimation with a split-and-recombine approach. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIV 16, pp. 507\u2013523. Springer (2020)","DOI":"10.1007\/978-3-030-58568-6_30"},{"key":"26_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, J., Tu, Z., Yang, J., Chen, Y., Yuan, J.: Mixste: Seq2seq mixed spatio-temporal encoder for 3d human pose estimation in video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13232\u201313242 (2022)","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"26_CR44","doi-asserted-by":"crossref","unstructured":"Zhao, Q., Zheng, C., Liu, M., Wang, P., Chen, C.: Poseformerv2: exploring frequency domain for efficient and robust 3d human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8877\u20138886 (2023)","DOI":"10.1109\/CVPR52729.2023.00857"},{"key":"26_CR45","doi-asserted-by":"crossref","unstructured":"Zheng, C., Zhu, S., Mendieta, M., Yang, T., Chen, C., Ding, Z.: 3d human pose estimation with spatial and temporal transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11656\u201311665 (2021)","DOI":"10.1109\/ICCV48922.2021.01145"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8795-1_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T23:06:07Z","timestamp":1730588767000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8795-1_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9789819787944","9789819787951"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8795-1_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}