{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:57:07Z","timestamp":1774022227734,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":160,"publisher":"ACM","funder":[{"name":"National Key R&D Program of China","award":["2024YFB2809105"],"award-info":[{"award-number":["2024YFB2809105"]}]},{"name":"NSFC","award":["U24B20154,62172364"],"award-info":[{"award-number":["U24B20154,62172364"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LR25F020003"],"award-info":[{"award-number":["LR25F020003"]}]},{"name":"Information Technology Center, Zhejiang University"},{"name":"State Key Lab of CAD&CG, Zhejiang University"},{"name":"Research Grants Council of the Hong Kong Special Administrative Region, China","award":["AoE\/E-601\/24-N"],"award-info":[{"award-number":["AoE\/E-601\/24-N"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730605","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HumanRAM: Feed-forward Human Reconstruction and Animation Model using Transformers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7220-7789","authenticated-orcid":false,"given":"Zhiyuan","family":"Yu","sequence":"first","affiliation":[{"name":"Department of Mathematics, Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4703-0875","authenticated-orcid":false,"given":"Zhe","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2662-0334","authenticated-orcid":false,"given":"Hujun","family":"Bao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CAD&amp;CG, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4407-3055","authenticated-orcid":false,"given":"Can","family":"Yang","sequence":"additional","affiliation":[{"name":"Department of Mathematics, Hong Kong University of Science and Technology, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1926-5597","authenticated-orcid":false,"given":"Xiaowei","family":"Zhou","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CAD&amp;CG, Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618153"},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"crossref","unstructured":"Timur Bagautdinov Chenglei Wu Tomas Simon Fabian Prada Takaaki Shiratori Shih-En Wei Weipeng Xu Yaser Sheikh and Jason Saragih. 2021. Driving-signal aware full-body avatars. ACM Transactions on Graphics (TOG) 40 4 (2021) 1\u201317.","DOI":"10.1145\/3450626.3459850"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1399504.1360698"},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00097"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00451"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58526-6_36"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00603"},{"key":"e_1_3_3_2_9_1","volume-title":"arXiv","author":"Chan Eric\u00a0R.","year":"2021","unstructured":"Eric\u00a0R. Chan, Connor\u00a0Z. Lin, Matthew\u00a0A. Chan, Koki Nagano, Boxiao Pan, Shalini\u00a0De Mello, Orazio Gallo, Leonidas Guibas, Jonathan Tremblay, Sameh Khamis, Tero Karras, and Gordon Wetzstein. 2021. Efficient Geometry-aware 3D Generative Adversarial Networks. In arXiv."},{"key":"e_1_3_3_2_10_1","volume-title":"ECCV","author":"Chatziagapi Aggelina","year":"2024","unstructured":"Aggelina Chatziagapi, Grigorios\u00a0G. Chrysos, and Dimitris Samaras. 2024. MIGS: Multi-Identity Gaussian Splatting via Tensor Decomposition. In ECCV."},{"key":"e_1_3_3_2_11_1","volume-title":"European Conference on Computer Vision (ECCV)","author":"Chen Anpei","year":"2024","unstructured":"Anpei Chen, Haofei Xu, Stefano Esposito, Siyu Tang, and Andreas Geiger. 2024c. LaRa: Efficient Large-Baseline Radiance Fields. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_2_12_1","unstructured":"Jinnan Chen Chen Li and Gim\u00a0Hee Lee. 2024a. DiHuR: Diffusion-Guided Generalizable Human Reconstruction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.11903 (2024)."},{"key":"e_1_3_3_2_13_1","unstructured":"Jinnan Chen Chen Li Jianfeng Zhang Lingting Zhu Buzhen Huang Hanlin Chen and Gim\u00a0Hee Lee. 2024b. Generalizable Human Gaussians from Single-View Image. arXiv preprint (2024)."},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01978"},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.168"},{"key":"e_1_3_3_2_16_1","unstructured":"Yushuo Chen Zerong Zheng Zhe Li Chao Xu and Yebin Liu. 2024d. MeshAvatar: Learning High-quality Triangular Human Avatars from Multi-view Videos. arxiv:https:\/\/arXiv.org\/abs\/2407.08414\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2407.08414"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"crossref","unstructured":"Alvaro Collet Ming Chuang Pat Sweeney Don Gillett Dennis Evseev David Calabrese Hugues Hoppe Adam Kirk and Steve Sullivan. 2015. High-quality streamable free-viewpoint video. ACM Transactions on Graphics (ToG) 34 4 (2015) 1\u201313.","DOI":"10.1145\/2766945"},{"key":"e_1_3_3_2_18_1","volume-title":"Blender - a 3D modelling and rendering package","author":"Community Blender\u00a0Online","year":"2018","unstructured":"Blender\u00a0Online Community. 2018. Blender - a 3D modelling and rendering package. Blender Foundation, Stichting Blender Foundation, Amsterdam. http:\/\/www.blender.org"},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00195"},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"crossref","unstructured":"Zheng Dong Ke Xu Yaoan Gao Hujun Bao Weiwei Xu and Rynson\u00a0WH Lau. 2024. Gaussian Surfel Splatting for Live Human Performance Capture. ACM Transactions on Graphics (TOG) 43 6 (2024) 1\u201317.","DOI":"10.1145\/3687993"},{"key":"e_1_3_3_2_22_1","unstructured":"Qingzhe Gao Yiming Wang Libin Liu Lingjie Liu Christian Theobalt and Baoquan Chen. 2023. Neural novel actor: Learning a generalized animatable neural representation for human actors. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_3_2_23_1","first-page":"10084","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Gao Xiangjun","year":"2024","unstructured":"Xiangjun Gao, Xiaoyu Li, Chaopeng Zhang, Qi Zhang, Yanpei Cao, Ying Shan, and Long Quan. 2024. ConTex-Human: Free-View Rendering of Human from a Single Image with Texture-Consistent Synthesis. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10084\u201310094."},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"crossref","unstructured":"Xiangjun Gao Jiaolong Yang Jongyoo Kim Sida Peng Zicheng Liu and Xin Tong. 2022. MPS-NeRF: Generalizable 3D Human Rendering From Multiview Images. IEEE Transactions on Pattern Analysis and Machine Intelligence (2022) 1\u201312. https:\/\/doi.org\/10.1109\/TPAMI.2022.3205910","DOI":"10.1109\/TPAMI.2022.3205910"},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"crossref","unstructured":"Kaiwen Guo Peter Lincoln Philip Davidson Jay Busch Xueming Yu Matt Whalen Geoff Harvey Sergio Orts-Escolano Rohit Pandey Jason Dourgarian et\u00a0al. 2019. The relightables: Volumetric performance capture of humans with realistic relighting. ACM Transactions on Graphics (TOG) 38 6 (2019) 1\u201319.","DOI":"10.1145\/3355089.3356571"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01086"},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"crossref","unstructured":"Xu He Xiaoyu Li Di Kang Jiangnan Ye Chaopeng Zhang Liyang Chen Xiangjun Gao Han Zhang Zhiyong Wu and Haolin Zhuang. 2024. MagicMan: Generative Novel View Synthesis of Humans with 3D-Aware Diffusion and Iterative Refinement. arxiv:https:\/\/arXiv.org\/abs\/2408.14211\u00a0[cs.CV]","DOI":"10.1609\/aaai.v39i3.32356"},{"key":"e_1_3_3_2_28_1","unstructured":"Yicong Hong Kai Zhang Jiuxiang Gu Sai Bi Yang Zhou Difan Liu Feng Liu Kalyan Sunkavalli Trung Bui and Hao Tan. 2023. Lrm: Large reconstruction model for single image to 3d. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.04400 (2023)."},{"key":"e_1_3_3_2_29_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Lixin\u00a0Xue Jie\u00a0Song Hsuan-I\u00a0Ho,","year":"2023","unstructured":"Jie\u00a0Song Hsuan-I\u00a0Ho, Lixin\u00a0Xue and Otmar Hilliges. 2023. Learning Locally Editable Virtual Humans. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_3_2_30_1","volume-title":"NeurIPS","author":"Hu Hezhen","year":"2024","unstructured":"Hezhen Hu, Zhiwen Fan, Tianhao Wu, Yihan Xi, Seoyoung Lee, Georgios Pavlakos, and Zhangyang Wang. 2024a. Expressive Gaussian Human Avatars from Monocular RGB Video. In NeurIPS."},{"key":"e_1_3_3_2_31_1","unstructured":"Li Hu Xin Gao Peng Zhang Ke Sun Bang Zhang and Liefeng Bo. 2023a. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17117 (2023)."},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00858"},{"key":"e_1_3_3_2_33_1","unstructured":"Yingdong Hu Zhening Liu Jiawei Shao Zehong Lin and Jun Zhang. 2024b. EVA-Gaussian: 3D Gaussian-based Real-time Human Novel View Synthesis under Diverse Camera Settings. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.01425 (2024)."},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00437"},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00316"},{"key":"e_1_3_3_2_36_1","first-page":"538","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"I\u00a0Ho Hsuan","year":"2024","unstructured":"Hsuan I\u00a0Ho, Jie Song, and Otmar Hilliges. 2024. SiTH: Single-view Textured Human Reconstruction with Image-Conditioned Diffusion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 538\u2013549."},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"crossref","unstructured":"Mustafa I\u015f\u0131k Martin R\u00fcnz Markos Georgopoulos Taras Khakhulin Jonathan Starck Lourdes Agapito and Matthias Nie\u00dfner. 2023. HumanRF: High-Fidelity Neural Radiance Fields for Humans in Motion. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201312. https:\/\/doi.org\/10.1145\/3592415","DOI":"10.1145\/3592415"},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00552"},{"key":"e_1_3_3_2_39_1","doi-asserted-by":"crossref","unstructured":"Hanwen Jiang Zexiang Xu Desai Xie Ziwen Chen Haian Jin Fujun Luan Zhixin Shu Kai Zhang Sai Bi Xin Sun Jiuxiang Gu Qixing Huang Georgios Pavlakos and Hao Tan. 2024. MegaSynth: Scaling Up 3D Scene Reconstruction with Synthesized Data. (2024).","DOI":"10.1109\/CVPR52734.2025.01533"},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"crossref","unstructured":"Tianjian Jiang Xu Chen Jie Song and Otmar Hilliges. 2022a. InstantAvatar: Learning Avatars from Monocular Video in 60 Seconds. arXiv (2022).","DOI":"10.1109\/CVPR52729.2023.01623"},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_24"},{"key":"e_1_3_3_2_42_1","unstructured":"Haian Jin Hanwen Jiang Hao Tan Kai Zhang Sai Bi Tianyuan Zhang Fujun Luan Noah Snavely and Zexiang Xu. 2024. Lvsm: A large view synthesis model with minimal 3d inductive bias. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.17242 (2024)."},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3d gaussian splatting for real-time radiance field rendering. ACM Transactions on Graphics 42 4 (2023) 139\u20131.","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00055"},{"key":"e_1_3_3_2_45_1","unstructured":"Nikos Kolotouros Thiemo Alldieck Andrei Zanfir Eduard Bazavan Mihai Fieraru and Cristian Sminchisescu. 2024. Dreamhuman: Animatable 3d avatars from text. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_46_1","unstructured":"Youngjoong Kwon Baole Fang Yixing Lu Haoye Dong Cheng Zhang Francisco\u00a0Vicente Carrasco Albert Mosella-Montoro Jianjin Xu Shingo Takagi Daeil Kim et\u00a0al. 2024. Generalizable Human Gaussians for Sparse View Synthesis. European Conference on Computer Vision (2024)."},{"key":"e_1_3_3_2_47_1","unstructured":"Youngjoong Kwon Dahun Kim Duygu Ceylan and Henry Fuchs. 2021. Neural human performer: Learning generalizable radiance fields for human performance rendering. Advances in Neural Information Processing Systems 34 (2021) 24741\u201324752."},{"key":"e_1_3_3_2_48_1","unstructured":"Youngjoong Kwon Dahun Kim Duygu Ceylan and Henry Fuchs. 2023. Neural image-based avatars: Generalizable radiance fields for human avatar modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.04897 (2023)."},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"crossref","unstructured":"Samuli Laine Janne Hellsten Tero Karras Yeongho Seol Jaakko Lehtinen and Timo Aila. 2020. Modular Primitives for High-Performance Differentiable Rendering. ACM Transactions on Graphics 39 6 (2020).","DOI":"10.1145\/3414685.3417861"},{"key":"e_1_3_3_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01879"},{"key":"e_1_3_3_2_51_1","unstructured":"Ruilong Li Julian Tanke Minh Vo Michael Zollhofer Jurgen Gall Angjoo Kanazawa and Christoph Lassner. 2022. TAVA: Template-free animatable volumetric actors. European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00910"},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591490"},{"key":"e_1_3_3_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01864"},{"key":"e_1_3_3_2_55_1","doi-asserted-by":"crossref","unstructured":"Hanwen Liang Junli Cao Vidit Goel Guocheng Qian Sergei Korolev Demetri Terzopoulos Konstantinos Plataniotis Sergey Tulyakov and Jian Ren. 2024a. Wonderland: Navigating 3D Scenes from a Single Image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.12091 (2024).","DOI":"10.1109\/CVPR52734.2025.00083"},{"key":"e_1_3_3_2_56_1","unstructured":"Hanxue Liang Jiawei Ren Ashkan Mirzaei Antonio Torralba Ziwei Liu Igor Gilitschenski Sanja Fidler Cengiz Oztireli Huan Ling Zan Gojcic et\u00a0al. 2024b. Feed-Forward Bullet-Time Reconstruction of Dynamic Scenes from Monocular Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03526 (2024)."},{"key":"e_1_3_3_2_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555376"},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657501"},{"key":"e_1_3_3_2_59_1","unstructured":"Lingjie Liu Marc Habermann Viktor Rudnev Kripasindhu Sarkar Jiatao Gu and Christian Theobalt. 2021. Neural Actor: Neural Free-view Synthesis of Human Actors with Pose Control. ACM Trans. Graph.(ACM SIGGRAPH Asia) (2021)."},{"key":"e_1_3_3_2_60_1","unstructured":"Minghua Liu Chao Xu Haian Jin Linghao Chen Mukund Varma\u00a0T Zexiang Xu and Hao Su. 2024b. One-2-3-45: Any single image to 3d mesh in 45 seconds without per-shape optimization. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_3_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00600"},{"key":"e_1_3_3_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00635"},{"key":"e_1_3_3_2_64_1","doi-asserted-by":"crossref","unstructured":"Yebin Liu Qionghai Dai and Wenli Xu. 2009. A point-cloud-based multiview stereo algorithm for free-viewpoint video. IEEE transactions on visualization and computer graphics 16 3 (2009) 407\u2013418.","DOI":"10.1109\/TVCG.2009.88"},{"key":"e_1_3_3_2_65_1","unstructured":"Yuxiao Liu Zhe Li Yebin Liu and Haoqian Wang. 2024a. TexVocab: Texture Vocabulary-conditioned Human Avatars. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.00524 (2024)."},{"key":"e_1_3_3_2_66_1","doi-asserted-by":"crossref","unstructured":"Matthew Loper Naureen Mahmood Javier Romero Gerard Pons-Moll and Michael\u00a0J. Black. 2015. SMPL: A Skinned Multi-Person Linear Model. ACM Trans. Graphics (Proc. SIGGRAPH Asia) 34 6 (Oct. 2015) 248:1\u2013248:16.","DOI":"10.1145\/2816795.2818013"},{"key":"e_1_3_3_2_67_1","unstructured":"Yue Ma Yingqing He Xiaodong Cun Xintao Wang Ying Shan Xiu Li and Qifeng Chen. 2023. Follow Your Pose: Pose-Guided Text-to-Video Generation using Pose-Free Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.01186 (2023)."},{"key":"e_1_3_3_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00554"},{"key":"e_1_3_3_2_69_1","unstructured":"Yifang Men Yuan Yao Miaomiao Cui and Liefeng Bo. 2024. MIMO: Controllable Character Video Synthesis with Spatial Decomposed Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.16160."},{"key":"e_1_3_3_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00459"},{"key":"e_1_3_3_2_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_11"},{"key":"e_1_3_3_2_72_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_3_2_73_1","volume-title":"ECCV","author":"Moon Gyeongsik","year":"2024","unstructured":"Gyeongsik Moon, Takaaki Shiratori, and Shunsuke Saito. 2024. Expressive Whole-Body 3D Gaussian Avatar. In ECCV."},{"key":"e_1_3_3_2_74_1","unstructured":"Jiteng Mu Shen Sang Nuno Vasconcelos and Xiaolong Wang. 2023. ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs. (2023) 18391\u201318401."},{"key":"e_1_3_3_2_75_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Theo Moutakanni Huy\u00a0V. Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby Russell Howes Po-Yao Huang Hu Xu Vasu Sharma Shang-Wen Li Wojciech Galuba Mike Rabbat Mido Assran Nicolas Ballas Gabriel Synnaeve Ishan Misra Herve Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2023. DINOv2: Learning Robust Visual Features without Supervision."},{"key":"e_1_3_3_2_76_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Pan Panwang","year":"2024","unstructured":"Panwang Pan, Zhuo Su, Chenguo Lin, Zhen Fan, Yongjie Zhang, Zeming Li, Tingting Shen, Yadong Mu, and Yebin Liu. 2024. HumanSplat: Generalizable Single-Image Human Gaussian Splatting with Structure Priors. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00025"},{"key":"e_1_3_3_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"e_1_3_3_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01405"},{"key":"e_1_3_3_2_80_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_31"},{"key":"e_1_3_3_2_81_1","unstructured":"Sida Peng Zhen Xu Junting Dong Qianqian Wang Shangzhan Zhang Qing Shuai Hujun Bao and Xiaowei Zhou. 2024. Animatable Implicit Neural Representations for Creating Realistic Avatars from Videos. TPAMI (2024)."},{"key":"e_1_3_3_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00894"},{"key":"e_1_3_3_2_83_1","doi-asserted-by":"crossref","unstructured":"Julius Pl\u00fccker. 1865. Xvii. on a new geometry of space. Philosophical Transactions of the Royal Society of London155 (1865) 725\u2013791.","DOI":"10.1098\/rstl.1865.0017"},{"key":"e_1_3_3_2_84_1","volume-title":"ICLR","author":"Poole Ben","year":"2023","unstructured":"Ben Poole, Ajay Jain, Jonathan\u00a0T Barron, and Ben Mildenhall. 2023. Dreamfusion: Text-to-3d using 2d diffusion. In ICLR."},{"key":"e_1_3_3_2_85_1","unstructured":"Lorenza Prospero Abdullah Hamdi Joao\u00a0F. Henriques and Christian Rupprecht. 2024. GST: Precise 3D Human Body from a Single Image with Gaussian Splatting Transformers. arxiv:https:\/\/arXiv.org\/abs\/2409.04196\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2409.04196"},{"key":"e_1_3_3_2_86_1","doi-asserted-by":"crossref","unstructured":"Zhiyin Qian Shaofei Wang Marko Mihajlovic Andreas Geiger and Siyu Tang. 2024. 3DGS-Avatar: Animatable Avatars via Deformable 3D Gaussian Splatting. (2024).","DOI":"10.1109\/CVPR52733.2024.00480"},{"key":"e_1_3_3_2_87_1","doi-asserted-by":"crossref","unstructured":"Amit Raj Michael Zollhoefer Tomas Simon Jason Saragih Shunsuke Saito James Hays and Stephen Lombardi. 2021. Pva: Pixel-aligned volumetric avatars. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2101.02697 (2021).","DOI":"10.1109\/CVPR46437.2021.01156"},{"key":"e_1_3_3_2_88_1","doi-asserted-by":"crossref","unstructured":"Ren\u00e9 Ranftl Alexey Bochkovskiy and Vladlen Koltun. 2021. Vision Transformers for Dense Prediction. ArXiv preprint (2021).","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"e_1_3_3_2_89_1","doi-asserted-by":"crossref","unstructured":"Yurui Ren Ge Li Shan Liu and Thomas\u00a0H Li. 2020. Deep spatial transformation for pose-guided person image generation and animation. IEEE Transactions on Image Processing 29 (2020) 8622\u20138635.","DOI":"10.1109\/TIP.2020.3018224"},{"key":"e_1_3_3_2_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00239"},{"key":"e_1_3_3_2_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00016"},{"key":"e_1_3_3_2_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00143"},{"key":"e_1_3_3_2_93_1","doi-asserted-by":"crossref","unstructured":"Ruizhi Shao Youxin Pang Zerong Zheng Jingxiang Sun and Yebin Liu. 2024a. Human4DiT: 360-degree Human Video Generation with 4D Diffusion Transformer. ACM Transactions on Graphics (TOG) 43 6 (2024).","DOI":"10.1145\/3687980"},{"key":"e_1_3_3_2_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01541"},{"key":"e_1_3_3_2_95_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00159"},{"key":"e_1_3_3_2_96_1","unstructured":"Qiuhong Shen Zike Wu Xuanyu Yi Pan Zhou Hanwang Zhang Shuicheng Yan and Xinchao Wang. 2024. Gamba: Marry gaussian splatting with mamba for single view 3d reconstruction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.18795 (2024)."},{"key":"e_1_3_3_2_97_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73337-6_3"},{"key":"e_1_3_3_2_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00248"},{"key":"e_1_3_3_2_99_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Siarohin Aliaksandr","year":"2019","unstructured":"Aliaksandr Siarohin, St\u00e9phane Lathuili\u00e8re, Sergey Tulyakov, Elisa Ricci, and Nicu Sebe. 2019b. First Order Motion Model for Image Animation. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_100_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"e_1_3_3_2_101_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_2_102_1","doi-asserted-by":"crossref","unstructured":"Jonathan Starck and Adrian Hilton. 2007. Surface capture for performance-based animation. IEEE computer graphics and applications 27 3 (2007) 21\u201331.","DOI":"10.1109\/MCG.2007.68"},{"key":"e_1_3_3_2_103_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01359"},{"key":"e_1_3_3_2_104_1","volume-title":"ECCV","author":"Sun Guoxing","year":"2024","unstructured":"Guoxing Sun, Rishabh Dabral, Pascal Fua, Christian Theobalt, and Marc Habermann. 2024a. MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view Human Performance Capture and Rendering. In ECCV."},{"key":"e_1_3_3_2_105_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00180"},{"key":"e_1_3_3_2_106_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_1"},{"key":"e_1_3_3_2_107_1","doi-asserted-by":"crossref","unstructured":"Sicong Tang Guangyuan Wang Qing Ran Lingzhi Li Li Shen and Ping Tan. 2023. High-resolution volumetric reconstruction for clothed humans. ACM Transactions on Graphics 42 5 (2023) 1\u201315.","DOI":"10.1145\/3606032"},{"key":"e_1_3_3_2_108_1","doi-asserted-by":"crossref","unstructured":"Felix Taubner Ruihang Zhang Mathieu Tuli and David\u00a0B. Lindell. 2024. CAP4D: Creating Animatable 4D Portrait Avatars with Morphable Multi-View Diffusion Models. (2024).","DOI":"10.1109\/CVPR52734.2025.00501"},{"key":"e_1_3_3_2_109_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14022"},{"key":"e_1_3_3_2_110_1","doi-asserted-by":"crossref","unstructured":"Justus Thies Michael Zollh\u00f6fer and Matthias Nie\u00dfner. 2019. Deferred neural rendering: Image synthesis using neural textures. Acm Transactions on Graphics (TOG) 38 4 (2019) 1\u201312.","DOI":"10.1145\/3306346.3323035"},{"key":"e_1_3_3_2_111_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657491"},{"key":"e_1_3_3_2_112_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_2"},{"key":"e_1_3_3_2_113_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_3_2_114_1","doi-asserted-by":"publisher","DOI":"10.1145\/1661412.1618520"},{"key":"e_1_3_3_2_115_1","volume-title":"European Conference on Computer Vision (ECCV)","author":"Voleti Vikram","year":"2024","unstructured":"Vikram Voleti, Chun-Han Yao, Mark Boss, Adam Letts, David Pankratz, Dmitrii Tochilkin, Christian Laforte, Robin Rombach, and Varun Jampani. 2024. SV3D: Novel Multi-view Synthesis and 3D Generation from a Single Image using Latent Video Diffusion. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_2_116_1","unstructured":"Peng Wang Hao Tan Sai Bi Yinghao Xu Fujun Luan Kalyan Sunkavalli Wenping Wang Zexiang Xu and Kai Zhang. 2023b. Pf-lrm: Pose-free large reconstruction model for joint pose and shape prediction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.12024 (2023)."},{"key":"e_1_3_3_2_117_1","unstructured":"Ruicheng Wang Sicheng Xu Cassie Dai Jianfeng Xiang Yu Deng Xin Tong and Jiaolong Yang. 2024. MoGe: Unlocking Accurate Monocular Geometry Estimation for Open-Domain Images with Optimal Training Supervision. arxiv:https:\/\/arXiv.org\/abs\/2410.19115\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2410.19115"},{"key":"e_1_3_3_2_118_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_1"},{"key":"e_1_3_3_2_119_1","unstructured":"Tan Wang Linjie Li Kevin Lin Yuanhao Zhai Chung-Ching Lin Zhengyuan Yang Hanwang Zhang Zicheng Liu and Lijuan Wang. 2023a. Disco: Disentangled control for realistic human dance generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.00040 (2023)."},{"key":"e_1_3_3_2_120_1","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C Bovik Hamid\u00a0R Sheikh and Eero\u00a0P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_2_121_1","unstructured":"Xinyue Wei Kai Zhang Sai Bi Hao Tan Fujun Luan Valentin Deschaintre Kalyan Sunkavalli Hao Su and Zexiang Xu. 2024. Meshlrm: Large reconstruction model for high-quality mesh. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.12385 (2024)."},{"key":"e_1_3_3_2_122_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00201"},{"key":"e_1_3_3_2_123_1","unstructured":"Zhenzhen Weng Jingyuan Liu Hao Tan Zhan Xu Yang Zhou Serena Yeung-Levy and Jimei Yang. 2023. Template-Free Single-View 3D Human Digitalization with Diffusion-Guided LRM. Preprint (2023)."},{"key":"e_1_3_3_2_124_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126358"},{"key":"e_1_3_3_2_125_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01899"},{"key":"e_1_3_3_2_126_1","unstructured":"Desai Xie Sai Bi Zhixin Shu Kai Zhang Zexiang Xu Yi Zhou S\u00f6ren Pirk Arie Kaufman Xin Sun and Hao Tan. 2024. LRM-Zero: Training Large Reconstruction Models with Synthesized Data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.09371 (2024)."},{"key":"e_1_3_3_2_127_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00057"},{"key":"e_1_3_3_2_128_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01294"},{"key":"e_1_3_3_2_129_1","doi-asserted-by":"crossref","unstructured":"Yuliang Xiu Yufei Ye Zhen Liu Dimitris Tzionas and Michael\u00a0J. Black. 2024. PuzzleAvatar: Assembling 3D Avatars from Personal Albums. ACM Trans. Graph. 43 6 Article 283 (Nov. 2024) 15\u00a0pages. https:\/\/doi.org\/10.1145\/3687771","DOI":"10.1145\/3687771"},{"key":"e_1_3_3_2_130_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01542"},{"key":"e_1_3_3_2_131_1","unstructured":"Yinghao Xu Zifan Shi Wang Yifan Hansheng Chen Ceyuan Yang Sida Peng Yujun Shen and Gordon Wetzstein. 2024b. Grm: Large gaussian reconstruction model for efficient 3d reconstruction and generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.14621 (2024)."},{"key":"e_1_3_3_2_132_1","unstructured":"Yinghao Xu Hao Tan Fujun Luan Sai Bi Peng Wang Jiahao Li Zifan Shi Kalyan Sunkavalli Gordon Wetzstein Zexiang Xu et\u00a0al. 2023a. Dmv3d: Denoising multi-view diffusion using 3d large reconstruction model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.09217 (2023)."},{"key":"e_1_3_3_2_133_1","unstructured":"Yuanyou Xu Zongxin Yang and Yi Yang. 2023b. Seeavatar: Photorealistic text-to-3d avatar generation with constrained geometry and appearance. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.08889 (2023)."},{"key":"e_1_3_3_2_134_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00100"},{"key":"e_1_3_3_2_135_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00147"},{"key":"e_1_3_3_2_136_1","unstructured":"Lihe Yang Bingyi Kang Zilong Huang Zhen Zhao Xiaogang Xu Jiashi Feng and Hengshuang Zhao. 2024b. Depth Anything V2. arXiv:https:\/\/arXiv.org\/abs\/2406.09414 (2024)."},{"key":"e_1_3_3_2_137_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00077"},{"key":"e_1_3_3_2_138_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01015"},{"key":"e_1_3_3_2_139_1","unstructured":"Xuanyu Yi Zike Wu Qiuhong Shen Qingshan Xu Pan Zhou Joo-Hwee Lim Shuicheng Yan Xinchao Wang and Hanwang Zhang. 2024. MVGamba: Unify 3D Content Generation as State Space Sequence Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06367 (2024)."},{"key":"e_1_3_3_2_140_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00340"},{"key":"e_1_3_3_2_141_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00569"},{"key":"e_1_3_3_2_142_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00690"},{"key":"e_1_3_3_2_143_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01625"},{"key":"e_1_3_3_2_144_1","doi-asserted-by":"crossref","unstructured":"Kai Zhang Sai Bi Hao Tan Yuanbo Xiangli Nanxuan Zhao Kalyan Sunkavalli and Zexiang Xu. 2024a. GS-LRM: Large Reconstruction Model for 3D Gaussian Splatting. European Conference on Computer Vision (2024).","DOI":"10.1007\/978-3-031-72670-5_1"},{"key":"e_1_3_3_2_145_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00756"},{"key":"e_1_3_3_2_146_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611707"},{"key":"e_1_3_3_2_147_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_2_148_1","unstructured":"Yuang Zhang Jiaxi Gu Li-Wen Wang Han Wang Junqi Cheng Yuefeng Zhu and Fangyuan Zou. 2024b. MimicMotion: High-Quality Human Motion Video Generation with Confidence-aware Pose Guidance. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.19680 (2024)."},{"key":"e_1_3_3_2_149_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00551"},{"key":"e_1_3_3_2_150_1","unstructured":"Zechuan Zhang Li Sun Zongxin Yang Ling Chen and Yi Yang. 2024c. Global-correlated 3d-decoupling transformer for clothed avatar reconstruction. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_151_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00948"},{"key":"e_1_3_3_2_152_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_3_2_153_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01861"},{"key":"e_1_3_3_2_154_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00618"},{"key":"e_1_3_3_2_155_1","doi-asserted-by":"crossref","unstructured":"Zerong Zheng Tao Yu Yebin Liu and Qionghai Dai. 2021b. Pamir: Parametric model-conditioned implicit representation for image-based human reconstruction. IEEE transactions on pattern analysis and machine intelligence 44 6 (2021) 3170\u20133184.","DOI":"10.1109\/TPAMI.2021.3050505"},{"key":"e_1_3_3_2_156_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00783"},{"key":"e_1_3_3_2_157_1","doi-asserted-by":"crossref","unstructured":"Boyao Zhou Shunyuan Zheng Hanzhang Tu Ruizhi Shao Boning Liu Shengping Zhang Liqiang Nie and Yebin Liu. 2024b. GPS-Gaussian+: Generalizable Pixel-wise 3D Gaussian Splatting for Real-Time Human-Scene Rendering from Sparse Views. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.11363 (2024).","DOI":"10.1109\/TPAMI.2025.3561248"},{"key":"e_1_3_3_2_158_1","doi-asserted-by":"crossref","unstructured":"Tiansong Zhou Jing Huang Tao Yu Ruizhi Shao and Kun Li. 2024a. HDhuman: High-Quality Human Novel-View Rendering From Sparse Views. IEEE Transactions on Visualization and Computer Graphics 30 8 (2024) 5328\u20135338.","DOI":"10.1109\/TVCG.2023.3290543"},{"key":"e_1_3_3_2_159_1","unstructured":"Shenhao Zhu Junming\u00a0Leo Chen Zuozhuo Dai Yinghui Xu Xun Cao Yao Yao Hao Zhu and Siyu Zhu. 2024. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. arxiv:https:\/\/arXiv.org\/abs\/2403.14781\u00a0[cs.CV]"},{"key":"e_1_3_3_2_160_1","doi-asserted-by":"crossref","unstructured":"Wojciech Zielonka Timur Bagautdinov Shunsuke Saito Michael Zollh\u00f6fer Justus Thies and Javier Romero. 2025. Drivable 3D Gaussian Avatars. (March 2025).","DOI":"10.1109\/3DV66043.2025.00095"},{"key":"e_1_3_3_2_161_1","unstructured":"Chen Ziwen Hao Tan Kai Zhang Sai Bi Fujun Luan Yicong Hong Li Fuxin and Zexiang Xu. 2024. Long-LRM: Long-sequence Large Reconstruction Model for Wide-coverage Gaussian Splats. arXiv preprint 2410.12781 (2024)."}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730605","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:57:10Z","timestamp":1774018630000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730605"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":160,"alternative-id":["10.1145\/3721238.3730605","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730605","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}