{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T23:51:56Z","timestamp":1781826716593,"version":"3.54.5"},"reference-count":73,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T00:00:00Z","timestamp":1775520000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T00:00:00Z","timestamp":1775520000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s11263-026-02750-1","type":"journal-article","created":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T02:30:27Z","timestamp":1775529027000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["PointHPS: Cascaded 3D Human Pose and Shape Estimation from Point Clouds"],"prefix":"10.1007","volume":"134","author":[{"given":"Zhongang","family":"Cai","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liang","family":"Pan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wanqi","family":"Yin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chen","family":"Wei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fangzhou","family":"Hong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mingyuan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Atsushi","family":"Yamashita","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chen Change","family":"Loy","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lei","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4220-5958","authenticated-orcid":false,"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,4,7]]},"reference":[{"key":"2750_CR1","doi-asserted-by":"crossref","unstructured":"Bashirov, R., Ianina, A., Iskakov, K., Kononenko, Y., Strizhkova, V., Lempitsky, V., & Vakhitov, A. (2021). Real-time rgbd-based extended body pose estimation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 2807\u20132816","DOI":"10.1109\/WACV48630.2021.00285"},{"key":"2750_CR2","doi-asserted-by":"crossref","unstructured":"Bhatnagar, B.L., Sminchisescu, C., Theobalt, C., & Pons-Moll, G. (2020). Combining implicit function learning and parametric models for 3d human reconstruction. In: Proceedings of the European Conference on Computer Vision, Springer, 311\u2013329","DOI":"10.1007\/978-3-030-58536-5_19"},{"key":"2750_CR3","doi-asserted-by":"crossref","unstructured":"Bogo, F., Kanazawa, A., Lassner, C., Gehler, P., Romero, J., & Black, M.J. (2016). Keep it smpl: Automatic estimation of 3d human pose and shape from a single image. In: Proceedings of the European Conference on Computer Vision, Springer, pp 561\u2013578","DOI":"10.1007\/978-3-319-46454-1_34"},{"key":"2750_CR4","unstructured":"Cai, Z., Zhang, M., Ren, J., Wei, C., Ren, D., Li, J., Lin, Z., Zhao, H., Yi, S., Yang, L., et al. (2021). Playing for 3d human recovery. arXiv preprint arXiv:2110.07588"},{"key":"2750_CR5","doi-asserted-by":"crossref","unstructured":"Cai, Z., Ren, D., Zeng, A., Lin, Z., Yu, T., Wang, W., Fan, X., Gao, Y., Yu, Y., Pan, L., et al. (2022) Humman: Multi-modal 4d human dataset for versatile sensing and modeling. In: Proceedings of the European Conference on Computer Vision, Springer, 557\u2013577","DOI":"10.1007\/978-3-031-20071-7_33"},{"key":"2750_CR6","doi-asserted-by":"publisher","first-page":"11454","DOI":"10.52202\/075280-0506","volume":"36","author":"Z Cai","year":"2023","unstructured":"Cai, Z., Yin, W., Zeng, A., Wei, C., Sun, Q., Yanjun, W., Pang, H. E., Mei, H., Zhang, M., Zhang, L., et al. (2023). Smpler-x: scaling up expressive human pose and shape estimation. Advances in Neural Information Processing Systems, 36, 11454\u201311468.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2750_CR7","doi-asserted-by":"crossref","unstructured":"Cao, Z., Gao, H., Mangalam, K., Cai, Q.Z., Vo, M., & Malik, J. (2020). Long-term human motion prediction with scene context. In: Proceedings of the European Conference on Computer Vision, Springer, 387\u2013404","DOI":"10.1007\/978-3-030-58452-8_23"},{"key":"2750_CR8","doi-asserted-by":"crossref","unstructured":"Chen, K., Pang, J., Wang, J., Xiong, Y., Li, X., Sun, S., Feng, W., Liu, Z., Shi, J., Ouyang, W., et\u00a0al. (2019). Hybrid task cascade for instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4974\u20134983","DOI":"10.1109\/CVPR.2019.00511"},{"key":"2750_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, Z., Peng, Y., Zhang, Z., Yu, G.g., & Sun, J. (2018). Cascaded pyramid network for multi-person pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7103\u20137112","DOI":"10.1109\/CVPR.2018.00742"},{"key":"2750_CR10","doi-asserted-by":"crossref","unstructured":"Choi, H., Moon, G., & Lee, K.M. (2021). Beyond static features for temporally consistent 3d human pose and shape from a video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR46437.2021.00200"},{"key":"2750_CR11","doi-asserted-by":"crossref","unstructured":"Dai, Y., Lin, Y., Wen, C., Shen, S., Xu, L., Yu, J., Ma, Y., & Wang, C. (2022). Hsc4d: Human-centered 4d scene capture in large-scale indoor-outdoor space using wearable imus and lidar. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6792\u20136802","DOI":"10.1109\/CVPR52688.2022.00667"},{"key":"2750_CR12","doi-asserted-by":"crossref","unstructured":"Dwivedi, S.K., Athanasiou, N., Kocabas, M., Black, M.J. (2021). Learning to regress bodies from images using differentiable semantic rendering. In: Proceedings of the IEEE International Conference on Computer Vision, pp 11250\u201311259","DOI":"10.1109\/ICCV48922.2021.01106"},{"key":"2750_CR13","doi-asserted-by":"crossref","unstructured":"Fabbri, M., Lanzi, F., Calderara, S., Palazzi, A., Vezzani, R., & Cucchiara, R. (2018). Learning to detect and track visible and occluded body joints in a virtual world. In: Proceedings of the European Conference on Computer Vision, pp 430\u2013446","DOI":"10.1007\/978-3-030-01225-0_27"},{"key":"2750_CR14","doi-asserted-by":"crossref","unstructured":"Garau, N., Bisagno, N., Br\u00f3dka, P., & Conci, N. (2021). Deca: Deep viewpoint-equivariant human pose estimation using capsule autoencoders. In: Proceedings of the IEEE International Conference on Computer Vision, pp 11677\u201311686","DOI":"10.1109\/ICCV48922.2021.01147"},{"key":"2750_CR15","doi-asserted-by":"crossref","unstructured":"Georgakis, G., Li, R., Karanam, S., Chen, T., Ko\u0161eck\u00e1, J., & Wu, Z. (2020). Hierarchical kinematic human mesh recovery. In: Proceedings of the European Conference on Computer Vision, Springer, pp 768\u2013784","DOI":"10.1007\/978-3-030-58520-4_45"},{"key":"2750_CR16","doi-asserted-by":"crossref","unstructured":"Groueix, T., Fisher, M., Kim, V.G., Russell, B.C., & Aubry, M. (2018). 3d-coded: 3d correspondences by deep deformation. In: Proceedings of the European Conference on Computer Vision, pp 230\u2013246","DOI":"10.1007\/978-3-030-01216-8_15"},{"key":"2750_CR17","doi-asserted-by":"crossref","unstructured":"Guler, R.A., & Kokkinos, I. (2019). Holopose: Holistic 3d human reconstruction in-the-wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 10884\u201310894","DOI":"10.1109\/CVPR.2019.01114"},{"key":"2750_CR18","doi-asserted-by":"crossref","unstructured":"Haque, A., Peng, B., Luo, Z., Alahi, A., Yeung, S., & Fei-Fei, L. (2016). Towards viewpoint invariant 3d human pose estimation. In: Proceedings of the European Conference on Computer Vision, Springer, pp 160\u2013177","DOI":"10.1007\/978-3-319-46448-0_10"},{"key":"2750_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, B., Ren, X., Dou, M., Xue, X., Fu, Y., & Zhang, Y. (2022a) Lord: Local 4d implicit representation for high-fidelity dynamic human modeling. In: Proceedings of the European Conference on Computer Vision","DOI":"10.1007\/978-3-031-19809-0_18"},{"key":"2750_CR20","doi-asserted-by":"crossref","unstructured":"Jiang, B., Zhang, Y., Wei, X., Xue, X., & Fu, Y. (2022b). H4d: Human 4d modeling by learning neural compositional representation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR52688.2022.01875"},{"key":"2750_CR21","doi-asserted-by":"crossref","unstructured":"Jiang, H., Cai, J., & Zheng, J. (2019). Skeleton-aware 3d human shape reconstruction from point clouds. In: Proceedings of the IEEE International Conference on Computer Vision, pp 5431\u20135441","DOI":"10.1109\/ICCV.2019.00553"},{"key":"2750_CR22","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Black, M.J., Jacobs, D.W., & Malik, J. (2018). End-to-end recovery of human shape and pose. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7122\u20137131","DOI":"10.1109\/CVPR.2018.00744"},{"key":"2750_CR23","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Zhang, J.Y., Felsen, P., & Malik, J. (2019). Learning 3d human dynamics from video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5614\u20135623","DOI":"10.1109\/CVPR.2019.00576"},{"key":"2750_CR24","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Athanasiou, N., & Black, M.J. (2020). Vibe: Video inference for human body pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5253\u20135263","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"2750_CR25","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Huang, C.H.P., Hilliges, O., & Black, M.J. (2021). Pare: Part attention regressor for 3d human body estimation. arXiv preprint arXiv:2104.08527","DOI":"10.1109\/ICCV48922.2021.01094"},{"key":"2750_CR26","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Black, M.J., & Daniilidis, K. (2019a). Learning to reconstruct 3d human pose and shape via model-fitting in the loop. In: Proceedings of the IEEE International Conference on Computer Vision, pp 2252\u20132261","DOI":"10.1109\/ICCV.2019.00234"},{"key":"2750_CR27","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G.,& Daniilidis, K. (2019b). Convolutional mesh regression for single-image human shape reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4501\u20134510","DOI":"10.1109\/CVPR.2019.00463"},{"key":"2750_CR28","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Jayaraman, D., & Daniilidis, K. (2021). Probabilistic modeling for human mesh recovery. In: Proceedings of the IEEE International Conference on Computer Vision, pp 11605\u201311614","DOI":"10.1109\/ICCV48922.2021.01140"},{"key":"2750_CR29","doi-asserted-by":"crossref","unstructured":"Li, J., Xu, C., Chen, Z., Bian, S., Yang, L., & Lu, C. (2021). Hybrik: A hybrid analytical-neural inverse kinematics solution for 3d human pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Computer Vision Foundation \/ IEEE, pp 3383\u20133393","DOI":"10.1109\/CVPR46437.2021.00339"},{"key":"2750_CR30","doi-asserted-by":"crossref","unstructured":"Li, J., Zhang, J., Wang, Z., Shen, S., Wen, C., Ma, Y., Xu, L., Yu, J., & Wang, C. (2022a). Lidarcap: Long-range marker-less 3d human motion capture with lidar point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 20502\u201320512","DOI":"10.1109\/CVPR52688.2022.01985"},{"key":"2750_CR31","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, W., Hu, X., & Yang, J. (2019). Selective kernel networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 510\u2013519","DOI":"10.1109\/CVPR.2019.00060"},{"key":"2750_CR32","doi-asserted-by":"crossref","unstructured":"Li, Z., Liu, J., Zhang, Z., Xu, S., & Yan, Y. (2022b). Cliff: Carrying location information in full frames into human pose and shape estimation. In: Proceedings of the European Conference on Computer Vision, Springer, pp 590\u2013606","DOI":"10.1007\/978-3-031-20065-6_34"},{"key":"2750_CR33","doi-asserted-by":"crossref","unstructured":"Liu, G., Rong, Y., & Sheng, L. (2021). Votehmr: Occlusion-aware voting network for robust 3d human mesh recovery from partial point clouds. In: Proceedings of the 29th ACM International Conference on Multimedia, pp 955\u2013964","DOI":"10.1145\/3474085.3475309"},{"issue":"6","key":"2750_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2816795.2818013","volume":"34","author":"M Loper","year":"2015","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., & Black, M. J. (2015). SMPL: A skinned multi-person linear model. ACM Transactions on Graphics (ToG), 34(6), 1\u201316.","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"2750_CR35","doi-asserted-by":"crossref","unstructured":"Luo, Z., Golestaneh, S.A., Kitani, K.M.(2020). 3d human motion estimation via motion compression and refinement. In: Proceedings of the Asian Conference on Computer Vision","DOI":"10.1007\/978-3-030-69541-5_20"},{"issue":"4","key":"2750_CR36","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1145\/3386569.3392410","volume":"39","author":"D Mehta","year":"2020","unstructured":"Mehta, D., Sotnychenko, O., Mueller, F., Xu, W., Elgharib, M., Fua, P., Seidel, H. P., Rhodin, H., Pons-Moll, G., & Theobalt, C. (2020). Xnect: Real-time multi-person 3d motion capture with a single rgb camera. ACM Transactions on Graphics (ToG), 39(4), 82\u20131.","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"2750_CR37","first-page":"752","volume":"12352","author":"G Moon","year":"2020","unstructured":"Moon, G., & Lee, K. M. (2020). I2l-meshnet: Image-to-lixel prediction network for accurate 3d human pose and mesh estimation from a single RGB image. Proceedings of the European Conference on Computer Vision, Springer, Lecture Notes in Computer Science, 12352, 752\u2013768.","journal-title":"Proceedings of the European Conference on Computer Vision, Springer, Lecture Notes in Computer Science"},{"key":"2750_CR38","doi-asserted-by":"crossref","unstructured":"Newell, A., Yang, K., & Deng, J. (2016). Stacked hourglass networks for human pose estimation. In: Proceedings of the European Conference on Computer Vision, Springer, pp 483\u2013499","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"2750_CR39","doi-asserted-by":"crossref","unstructured":"Ofli, F., Chaudhry, R., Kurillo, G., Vidal, R., & Bajcsy, R. (2013). Berkeley mhad: A comprehensive multimodal human action database. In: 2013 IEEE workshop on applications of computer vision (WACV), IEEE, pp 53\u201360","DOI":"10.1109\/WACV.2013.6474999"},{"key":"2750_CR40","doi-asserted-by":"crossref","unstructured":"Omran, M., Lassner, C., Pons-Moll, G., Gehler, P., & Schiele, B. (2018). Neural body fitting: Unifying deep learning and model based human pose and shape estimation. In: 2018 International Conference on 3D Vision, IEEE, pp 484\u2013494","DOI":"10.1109\/3DV.2018.00062"},{"key":"2750_CR41","first-page":"598","volume":"12351","author":"AAA Osman","year":"2020","unstructured":"Osman, A. A. A., Bolkart, T., & Black, M. J. (2020). STAR: sparse trained articulated human body regressor. Proceedings of the European Conference on Computer Vision, Springer, Lecture Notes in Computer Science, 12351, 598\u2013613.","journal-title":"Proceedings of the European Conference on Computer Vision, Springer, Lecture Notes in Computer Science"},{"key":"2750_CR42","doi-asserted-by":"crossref","unstructured":"Palafox, P., Sarafianos, N., Tung, T., & Dai, A. (2022). Spams: Structured implicit parametric models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12851\u201312860","DOI":"10.1109\/CVPR52688.2022.01251"},{"key":"2750_CR43","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Zhu, L., Zhou, X., & Daniilidis, K. (2018). Learning to estimate 3d human pose and shape from a single color image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 459\u2013468","DOI":"10.1109\/CVPR.2018.00055"},{"key":"2750_CR44","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Choutas, V., Ghorbani, N., Bolkart, T., Osman, A.A., Tzionas, D., Black, M.J. (2019). Expressive body capture: 3d hands, face, and body from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 10975\u201310985","DOI":"10.1109\/CVPR.2019.01123"},{"key":"2750_CR45","unstructured":"Qi, C.R., Su, H., Mo, K., & Guibas, L.J. (2017a). Pointnet: Deep learning on point sets for 3d classification and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 652\u2013660"},{"key":"2750_CR46","unstructured":"Qi, C.R., Yi, L., Su, H., & Guibas, L.J. (2017b). Pointnet++: Deep hierarchical feature learning on point sets in a metric space. Advances in Neural Information Processing Systems 30"},{"key":"2750_CR47","unstructured":"Qin, H., Cai, Z., Zhang, M., Ding, Y., Zhao, H., Yi, S., Liu, X., & Su, H. (2020). Bipointnet: Binary neural network for point clouds. arXiv preprint arXiv:2010.05501"},{"key":"2750_CR48","doi-asserted-by":"crossref","unstructured":"Rajasegaran, J., Pavlakos, G., Kanazawa, A., & Malik, J. (2022). Tracking people by predicting 3d appearance, location and pose. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2740\u20132749","DOI":"10.1109\/CVPR52688.2022.00276"},{"key":"2750_CR49","doi-asserted-by":"crossref","unstructured":"Ren, Y., Han, X., Zhao, C., Wang, J., Xu, L., Yu, J., & Ma, Y. (2024). Livehps: lidar-based scene-level human pose and shape estimation in free environment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 1281\u20131291","DOI":"10.1109\/CVPR52733.2024.00128"},{"key":"2750_CR50","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S., & Li, H. (2021). Efficient attention: Attention with linear complexities. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 3531\u20133539"},{"key":"2750_CR51","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., & Lu, H. (2019). Skeleton-based action recognition with multi-stream adaptive graph convolutional networks. arXiv preprint arXiv:1912.06971","DOI":"10.1109\/TIP.2020.3028207"},{"key":"2750_CR52","doi-asserted-by":"crossref","unstructured":"Shotton, J., Fitzgibbon, A., Cook, M., Sharp, T., Finocchio, M., Moore, R., Kipman, A., & Blake, A. (2011). Real-time human pose recognition in parts from single depth images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Ieee, pp 1297\u20131304","DOI":"10.1109\/CVPR.2011.5995316"},{"key":"2750_CR53","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., & Wang, J. (2019a). Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5693\u20135703","DOI":"10.1109\/CVPR.2019.00584"},{"key":"2750_CR54","doi-asserted-by":"crossref","unstructured":"Sun, Q., Wang, Y., Zeng, A., Yin, W., Wei, C., Wang, W., Mei, H., Leung, C.S., Liu, Z., Yang, L., et\u00a0al. (2024). Aios: All-in-one-stage expressive human pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 1834\u20131843","DOI":"10.1109\/CVPR52733.2024.00180"},{"key":"2750_CR55","doi-asserted-by":"crossref","unstructured":"Sun, Y., Ye, Y., Liu, W., Gao, W., Fu, Y., & Mei, T. (2019b). Human mesh recovery from monocular images via a skeleton-disentangled representation. In: Proceedings of the IEEE International Conference on Computer Vision, pp 5349\u20135358","DOI":"10.1109\/ICCV.2019.00545"},{"key":"2750_CR56","doi-asserted-by":"crossref","unstructured":"Sun, Y., Bao, Q., Liu, W., Fu, Y., Michael,\u00a0J.B., & Mei, T. (2021). Monocular, one-stage, regression of multiple 3d people. In: Proceedings of the IEEE International Conference on Computer Vision","DOI":"10.1109\/ICCV48922.2021.01099"},{"key":"2750_CR57","doi-asserted-by":"crossref","unstructured":"Sun, Y., Liu, W., Bao, Q., Fu, Y., Mei, T., & Black, M.J. (2022). Putting people in their place: Monocular regression of 3d people in depth. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13243\u201313252","DOI":"10.1109\/CVPR52688.2022.01289"},{"key":"2750_CR58","doi-asserted-by":"crossref","unstructured":"Varol, G., Romero, J., Martin, X., Mahmood, N., Black, M.J., Laptev, I., & Schmid, C. (2017). Learning from synthetic humans. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition pp 4627\u20134635","DOI":"10.1109\/CVPR.2017.492"},{"key":"2750_CR59","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems 30"},{"key":"2750_CR60","doi-asserted-by":"crossref","unstructured":"Wang, K., Xie, J., Zhang, G., Liu, L., & Yang, J. (2020). Sequential 3d human pose and shape estimation from point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7275\u20137284","DOI":"10.1109\/CVPR42600.2020.00730"},{"key":"2750_CR61","doi-asserted-by":"crossref","unstructured":"Wang, K., Zheng, H., Zhang, G., & Yang, J. (2021a). Parametric model estimation for 3d clothed humans from point clouds. In: Proceedings of the International Symposium on Mixed and Augmented Reality","DOI":"10.1109\/ISMAR52148.2021.00030"},{"key":"2750_CR62","doi-asserted-by":"crossref","unstructured":"Wang, S., Geiger, A., & Tang, S. (2021b). Locally aware piecewise transformation fields for 3d human mesh registration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7639\u20137648","DOI":"10.1109\/CVPR46437.2021.00755"},{"key":"2750_CR63","first-page":"2810","volume":"34","author":"S Wang","year":"2021","unstructured":"Wang, S., Mihajlovic, M., Ma, Q., Geiger, A., & Tang, S. (2021). Metaavatar: Learning animatable clothed human models from few depth images. Advances in Neural Information Processing Systems, 34, 2810\u20132822.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2750_CR64","doi-asserted-by":"crossref","unstructured":"Wang, W., Ge, Y., Mei, H., Cai, Z., Sun, Q., Wang, Y., Shen, C., Yang, L., & Komura, T. (2023). Zolly: Zoom focal length correctly for perspective-distorted human mesh reconstruction. arXiv preprint arXiv:2303.13796","DOI":"10.1109\/ICCV51070.2023.00363"},{"issue":"5","key":"2750_CR65","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3326362","volume":"38","author":"Y Wang","year":"2019","unstructured":"Wang, Y., Sun, Y., Liu, Z., Sarma, S. E., Bronstein, M. M., & Solomon, J. M. (2019). Dynamic graph cnn for learning on point clouds. ACM Transactions on Graphics (ToG), 38(5), 1\u201312.","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"2750_CR66","doi-asserted-by":"crossref","unstructured":"Xiong, F., Zhang, B., Xiao, Y., Cao, Z., Yu, T., Zhou, J.T., & Yuan, J. (2019). A2j: Anchor-to-joint regression network for 3d articulated pose estimation from a single depth image. In: Proceedings of the IEEE International Conference on Computer Vision, pp 793\u2013802","DOI":"10.1109\/ICCV.2019.00088"},{"key":"2750_CR67","doi-asserted-by":"crossref","unstructured":"Xu, H., Bazavan, E.G., Zanfir, A., Freeman, W.T., Sukthankar, R., & Sminchisescu, C. (2020). Ghum & ghuml: Generative 3d human shape and articulated pose models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6184\u20136193","DOI":"10.1109\/CVPR42600.2020.00622"},{"key":"2750_CR68","doi-asserted-by":"crossref","unstructured":"Yin, W., Cai, Z., Wang, R., Wang, F., Wei, C., Mei, H., Xiao, W., Yang, Z., Sun, Q., Yamashita, A., et\u00a0al. (2024). Whac: World-grounded humans and cameras. In: Proceedings of the European Conference on Computer Vision, Springer, pp 20\u201337","DOI":"10.1007\/978-3-031-72754-2_2"},{"key":"2750_CR69","doi-asserted-by":"crossref","unstructured":"Yin, W., Cai, Z., Wang, R., Zeng, A., Wei, C., Sun, Q., Mei, H., Wang, Y., Pang, H.E., Zhang, M., et\u00a0al. (2025). Smplest-x: Ultimate scaling for expressive human pose and shape estimation. arXiv preprint arXiv:2501.09782","DOI":"10.1109\/TPAMI.2025.3618174"},{"key":"2750_CR70","unstructured":"Zhao, C., Ren, Y., He, Y., Cong, P., Liang, H., Yu, J., Xu, L., & Ma, Y. (2022). Lidar-aid inertial poser: Large-scale human motion capture by sparse inertial and lidar sensors. arXiv preprint arXiv:2205.15410"},{"key":"2750_CR71","doi-asserted-by":"crossref","unstructured":"Zhou, B., Meng, D., Franco, J.S., & Boyer, E. (2023). Human body shape completion with implicit shape and flow learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12901\u201312911","DOI":"10.1109\/CVPR52729.2023.01240"},{"key":"2750_CR72","unstructured":"Zhou, Q.Y., Park, J., & Koltun, V. (2018). Open3D: A modern library for 3D data processing. arXiv:1801.09847"},{"key":"2750_CR73","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Dong, H., & El Saddik, A. (2020). Learning to estimate 3d human pose from point cloud. IEEE Sensors Journal,20(20), 12334\u201312342.","DOI":"10.1109\/JSEN.2020.2999849"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02750-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02750-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02750-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T23:07:30Z","timestamp":1781824050000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02750-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,7]]},"references-count":73,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["2750"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02750-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4,7]]},"assertion":[{"value":"2 January 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"200"}}