{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:07Z","timestamp":1776931147512,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":145,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3757377.3763935","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T16:27:29Z","timestamp":1765211249000},"page":"1-14","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MV-Performer: Taming Video Diffusion Model for Faithful and Synchronized Multi-view Performer Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-1183-5459","authenticated-orcid":false,"given":"Yihao","family":"Zhi","sequence":"first","affiliation":[{"name":"SSE, CUHKSZ, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0604-7421","authenticated-orcid":false,"given":"Chenghong","family":"Li","sequence":"additional","affiliation":[{"name":"FNii-Shenzhen, Shenzhen, China and SSE, CUHKSZ, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1992-7089","authenticated-orcid":false,"given":"Hongjie","family":"Liao","sequence":"additional","affiliation":[{"name":"SSE, CUHKSZ, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0066-9365","authenticated-orcid":false,"given":"Xihe","family":"Yang","sequence":"additional","affiliation":[{"name":"SSE, CUHKSZ, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3884-137X","authenticated-orcid":false,"given":"Zhengwentai","family":"Sun","sequence":"additional","affiliation":[{"name":"SSE, CUHKSZ, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6877-1649","authenticated-orcid":false,"given":"Jiahao","family":"Chang","sequence":"additional","affiliation":[{"name":"SSE, CUHKSZ, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3607-2236","authenticated-orcid":false,"given":"Xiaodong","family":"Cun","sequence":"additional","affiliation":[{"name":"Great Bay University, Dongguan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7315-337X","authenticated-orcid":false,"given":"Wensen","family":"Feng","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0162-3296","authenticated-orcid":false,"given":"Xiaoguang","family":"Han","sequence":"additional","affiliation":[{"name":"SSE, CUHKSZ, Shenzhen, China; FNii-Shenzhen, Shenzhen, China and Guangdong Provincial Key Laboratory of Future Networks of Intelligence, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.1997.609457"},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00764"},{"key":"e_1_3_3_2_4_1","unstructured":"Jianhong Bai Menghan Xia Xiao Fu Xintao Wang Lianrui Mu Jinwen Cao Zuozhu Liu Haoji Hu Xiang Bai Pengfei Wan and Di Zhang. 2025. ReCamMaster: Camera-Controlled Generative Rendering from A Single Video. arxiv:https:\/\/arXiv.org\/abs\/2503.11647\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2503.11647"},{"key":"e_1_3_3_2_5_1","unstructured":"Jianhong Bai Menghan Xia Xintao Wang Ziyang Yuan Xiao Fu Zuozhu Liu Haoji Hu Pengfei Wan and Di Zhang. 2024. SynCamMaster: Synchronizing Multi-Camera Video Generation from Diverse Viewpoints. arxiv:https:\/\/arXiv.org\/abs\/2412.07760\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2412.07760"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"crossref","unstructured":"Weikang Bian Zhaoyang Huang Xiaoyu Shi Yijin Li Fu-Yun Wang and Hongsheng Li. 2025. GS-DiT: Advancing Video Generation with Pseudo 4D Gaussian Fields through Efficient Dense 3D Point Tracking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.02690 (2025).","DOI":"10.1109\/CVPR52734.2025.02023"},{"key":"e_1_3_3_2_7_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et\u00a0al. 2023a. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00021"},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2011.01981.x"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_20"},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01386"},{"key":"e_1_3_3_2_13_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang Chao Weng and Ying Shan. 2023. VideoCrafter1: Open Diffusion Models for High-Quality Video Generation. arxiv:https:\/\/arXiv.org\/abs\/2310.19512\u00a0[cs.CV]"},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"crossref","unstructured":"Haoxin Chen Yong Zhang Xiaodong Cun Menghan Xia Xintao Wang Chao Weng and Ying Shan. 2024b. VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2401.09047\u00a0[cs.CV]","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01002"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20050-2_14"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596757"},{"key":"e_1_3_3_2_18_1","first-page":"370","volume-title":"European Conference on Computer Vision","author":"Chen Yuedong","year":"2024","unstructured":"Yuedong Chen, Haofei Xu, Chuanxia Zheng, Bohan Zhuang, Marc Pollefeys, Andreas Geiger, Tat-Jen Cham, and Jianfei Cai. 2024a. Mvsplat: Efficient 3d gaussian splatting from sparse multi-view images. In European Conference on Computer Vision. Springer, 370\u2013386."},{"key":"e_1_3_3_2_19_1","first-page":"250","volume-title":"European Conference on Computer Vision","author":"Chen Yushuo","year":"2024","unstructured":"Yushuo Chen, Zerong Zheng, Zhe Li, Chao Xu, and Yebin Liu. 2024c. Meshavatar: Learning high-quality triangular human avatars from multi-view videos. In European Conference on Computer Vision. Springer, 250\u2013269."},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"crossref","unstructured":"Wei Cheng Ruixiang Chen Wanqi Yin Siming Fan Keyu Chen Honglin He Huiwen Luo Zhongang Cai Jingbo Wang Yang Gao Zhengming Yu Zhengyu Lin Daxuan Ren Lei Yang Ziwei Liu Chen\u00a0Change Loy Chen Qian Wayne Wu Dahua Lin Bo Dai and Kwan-Yee Lin. 2023. DNA-Rendering: A Diverse Neural Actor Repository for High-Fidelity Human-centric Rendering. arXiv preprint arXiv:2307.10173 (2023).","DOI":"10.1109\/ICCV51070.2023.01829"},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657463"},{"key":"e_1_3_3_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01201"},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"crossref","unstructured":"Yasutaka Furukawa Carlos Hern\u00e1ndez et\u00a0al. 2015. Multi-view stereo: A tutorial. Foundations and trends\u00ae in Computer Graphics and Vision 9 1-2 (2015) 1\u2013148.","DOI":"10.1561\/0600000052"},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"crossref","unstructured":"Chris\u00a0A Glasbey and Kantilal\u00a0Vardichand Mardia. 1998. A review of image-warping methods. Journal of applied statistics 25 2 (1998) 155\u2013171.","DOI":"10.1080\/02664769823151"},{"key":"e_1_3_3_2_25_1","unstructured":"Zekai Gu Rui Yan Jiahao Lu Peng Li Zhiyang Dou Chenyang Si Zhen Dong Qifeng Liu Cheng Lin Ziwei Liu Wenping Wang and Yuan Liu. 2025. Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.03847 (2025)."},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01236"},{"key":"e_1_3_3_2_27_1","volume-title":"The Thirteenth International Conference on Learning Representations","author":"He Hao","unstructured":"Hao He, Yinghao Xu, Yuwei Guo, Gordon Wetzstein, Bo Dai, Hongsheng Li, and Ceyuan Yang. [n. d.]. CameraCtrl: Enabling Camera Control for Video Diffusion Models. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_2_28_1","unstructured":"Hao He Yinghao Xu Yuwei Guo Gordon Wetzstein Bo Dai Hongsheng Li and Ceyuan Yang. 2024. Cameractrl: Enabling camera control for text-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02101 (2024)."},{"key":"e_1_3_3_2_29_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022. Latent Video Diffusion Models for High-Fidelity Long Video Generation. (2022). arxiv:https:\/\/arXiv.org\/abs\/2211.13221\u00a0[cs.CV]"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"crossref","unstructured":"Anna Hilsmann Philipp Fechteler Wieland Morgenstern Wolfgang Paier Ingo Feldmann Oliver Schreer and Peter Eisert. 2020. Going beyond free viewpoint: creating animatable volumetric video of human performances. IET Computer Vision 14 6 (2020) 350\u2013358.","DOI":"10.1049\/iet-cvi.2019.0786"},{"key":"e_1_3_3_2_31_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_2_32_1","unstructured":"Wenyi Hong Ming Ding Wendi Zheng Xinghan Liu and Jie Tang. 2022. Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.15868 (2022)."},{"key":"e_1_3_3_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00858"},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01930"},{"key":"e_1_3_3_2_35_1","unstructured":"Tao Hu Haoyang Peng Xiao Liu and Yuewen Ma. 2025. EX-4D: EXtreme Viewpoint 4D Video Synthesis via Depth Watertight Mesh. arxiv:https:\/\/arXiv.org\/abs\/2506.05554\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2506.05554"},{"key":"e_1_3_3_2_36_1","unstructured":"Yingdong Hu Zhening Liu Jiawei Shao Zehong Lin and Jun Zhang. 2024b. Eva-Gaussian: 3D Gaussian-based real-time human novel view synthesis under diverse camera settings. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.01425 (2024)."},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657428"},{"key":"e_1_3_3_2_38_1","unstructured":"Hanzhuo Huang Yuan Liu Ge Zheng Jiepeng Wang Zhiyang Dou and Sibei Yang. 2025. Mvtokenflow: High-quality 4d content generation using multiview token flow. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.11697 (2025)."},{"key":"e_1_3_3_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00404"},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_21"},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00552"},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01623"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_24"},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00133"},{"key":"e_1_3_3_2_45_1","doi-asserted-by":"crossref","unstructured":"Yuheng Jiang Zhehao Shen Yu Hong Chengcheng Guo Yize Wu Yingliang Zhang Jingyi Yu and Lan Xu. 2024a. Robust dual gaussian splatting for immersive human-centric volumetric videos. ACM Transactions on Graphics (TOG) 43 6 (2024) 1\u201315.","DOI":"10.1145\/3687926"},{"key":"e_1_3_3_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01866"},{"key":"e_1_3_3_2_47_1","unstructured":"Yanqin Jiang Chaohui Yu Chenjie Cao Fan Wang Weiming Hu and Jin Gao. 2024c. Animate3d: Animating any 3d model with multi-view video diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.11398 (2024)."},{"key":"e_1_3_3_2_48_1","unstructured":"Yanqin Jiang Li Zhang Jin Gao Weimin Hu and Yao Yao. 2023b. Consistent4d: Consistent 360 {\\ deg} dynamic object generation from monocular video. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.02848 (2023)."},{"key":"e_1_3_3_2_49_1","volume-title":"International Conference on Computer Vision (ICCV)","author":"Jin Yudong","year":"2025","unstructured":"Yudong Jin, Sida Peng, Xuan Wang, Tao Xie, Zhen Xu, Yifan Yang, Yujun Shen, Hujun Bao, and Xiaowei Zhou. 2025. Diffuman4D: 4D Consistent Human View Synthesis from Sparse-View Videos with Spatio-Temporal Diffusion Models. In International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_3_2_50_1","doi-asserted-by":"crossref","unstructured":"Yash Kant Ethan Weber Jin\u00a0Kyu Kim Rawal Khirodkar Su Zhaoen Julieta Martinez Igor Gilitschenski Shunsuke Saito and Timur Bagautdinov. 2025. Pippo: High-Resolution Multi-View Humans from a Single Image. (2025).","DOI":"10.1109\/CVPR52734.2025.01531"},{"key":"e_1_3_3_2_51_1","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3d gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42 4 (2023) 139\u20131.","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"crossref","unstructured":"Rawal Khirodkar Timur Bagautdinov Julieta Martinez Su Zhaoen Austin James Peter Selednik Stuart Anderson and Shunsuke Saito. 2024. Sapiens: Foundation for Human Vision Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.12569 (2024).","DOI":"10.1007\/978-3-031-73235-5_12"},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00055"},{"key":"e_1_3_3_2_54_1","first-page":"451","volume-title":"European Conference on Computer Vision","author":"Kwon Youngjoong","year":"2024","unstructured":"Youngjoong Kwon, Baole Fang, Yixing Lu, Haoye Dong, Cheng Zhang, Francisco\u00a0Vicente Carrasco, Albert Mosella-Montoro, Jianjin Xu, Shingo Takagi, Daeil Kim, et\u00a0al. 2024. Generalizable human gaussians for sparse view synthesis. In European Conference on Computer Vision. Springer, 451\u2013468."},{"key":"e_1_3_3_2_55_1","unstructured":"Youngjoong Kwon Dahun Kim Duygu Ceylan and Henry Fuchs. 2021. Neural human performer: Learning generalizable radiance fields for human performance rendering. Advances in Neural Information Processing Systems 34 (2021) 24741\u201324752."},{"key":"e_1_3_3_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596759"},{"key":"e_1_3_3_2_57_1","unstructured":"Chenghong Li Hongjie Liao Yihao Zhi Xihe Yang Zhengwentai Sun Jiahao Chang Shuguang Cui and Xiaoguang Han. 2025. MVHumanNet++: A Large-scale Dataset of Multi-view Daily Dressing Human Captures with Richer Annotations for 3D Human Digitization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.01838 (2025)."},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_25"},{"key":"e_1_3_3_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00813"},{"key":"e_1_3_3_2_60_1","volume-title":"arxiv","author":"Li Zhengqi","year":"2024","unstructured":"Zhengqi Li, Richard Tucker, Forrester Cole, Qianqian Wang, Linyi Jin, Vickie Ye, Angjoo Kanazawa, Aleksander Holynski, and Noah Snavely. 2024b. MegaSaM: Accurate, Fast and Robust Structure and Motion from Casual Dynamic Videos. In arxiv."},{"key":"e_1_3_3_2_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591490"},{"key":"e_1_3_3_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01864"},{"key":"e_1_3_3_2_63_1","unstructured":"Bin Lin Yunyang Ge Xinhua Cheng Zongjian Li Bin Zhu Shaodong Wang Xianyi He Yang Ye Shenghai Yuan Liuhan Chen et\u00a0al. 2024b. Open-sora plan: Open-source large video generation model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.00131 (2024)."},{"key":"e_1_3_3_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01997"},{"key":"e_1_3_3_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00819"},{"key":"e_1_3_3_2_66_1","doi-asserted-by":"crossref","unstructured":"Lingjie Liu Marc Habermann Viktor Rudnev Kripasindhu Sarkar Jiatao Gu and Christian Theobalt. 2021. Neural actor: Neural free-view synthesis of human actors with pose control. ACM transactions on graphics (TOG) 40 6 (2021) 1\u201316.","DOI":"10.1145\/3478513.3480528"},{"key":"e_1_3_3_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00960"},{"key":"e_1_3_3_2_68_1","unstructured":"Minghua Liu Chao Xu Haian Jin Linghao Chen Mukund Varma\u00a0T Zexiang Xu and Hao Su. 2023c. One-2-3-45: Any single image to 3d mesh in 45 seconds without per-shape optimization. Advances in Neural Information Processing Systems 36 (2023) 22226\u201322246."},{"key":"e_1_3_3_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_3_2_70_1","unstructured":"Tianqi Liu Zihao Huang Zhaoxi Chen Guangcong Wang Shoukang Hu liao Shen Huiqiang Sun Zhiguo Cao Wei Li and Ziwei Liu. 2025. Free4D: Tuning-free 4D Scene Generation with Spatial-Temporal Consistency. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.20785 (2025)."},{"key":"e_1_3_3_2_71_1","unstructured":"Yuan Liu Cheng Lin Zijiao Zeng Xiaoxiao Long Lingjie Liu Taku Komura and Wenping Wang. 2023a. Syncdreamer: Generating multiview-consistent images from a single-view image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.03453 (2023)."},{"key":"e_1_3_3_2_72_1","doi-asserted-by":"crossref","unstructured":"Matthew Loper Naureen Mahmood Javier Romero Gerard Pons-Moll and Michael\u00a0J. Black. 2015. SMPL: A Skinned Multi-Person Linear Model. ACM Trans. Graphics (Proc. SIGGRAPH Asia) 34 6 (Oct. 2015) 248:1\u2013248:16.","DOI":"10.1145\/2816795.2818013"},{"key":"e_1_3_3_2_73_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_2_74_1","unstructured":"Yixing Lu Junting Dong Youngjoong Kwon Qin Zhao Bo Dai and Fernando De\u00a0la Torre. 2025. GAS: Generative Avatar Synthesis from a Single Image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.06957 (2025)."},{"key":"e_1_3_3_2_75_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_11"},{"key":"e_1_3_3_2_76_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_3_2_77_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul\u00a0P Srinivasan Matthew Tancik Jonathan\u00a0T Barron Ravi Ramamoorthi and Ren Ng. 2021. Nerf: Representing scenes as neural radiance fields for view synthesis. Commun. ACM 65 1 (2021) 99\u2013106.","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_2_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2984511.2984517"},{"key":"e_1_3_3_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00117"},{"key":"e_1_3_3_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02452"},{"key":"e_1_3_3_2_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00025"},{"key":"e_1_3_3_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"e_1_3_3_2_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01405"},{"key":"e_1_3_3_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00894"},{"key":"e_1_3_3_2_85_1","doi-asserted-by":"crossref","unstructured":"Luigi Piccinelli Christos Sakaridis Yung-Hsu Yang Mattia Segu Siyuan Li Wim Abbeloos and Luc\u00a0Van Gool. 2025. UniDepthV2: Universal Monocular Metric Depth Estimation Made Simpler. arxiv:https:\/\/arXiv.org\/abs\/2502.20110\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2502.20110","DOI":"10.1109\/TPAMI.2025.3628473"},{"key":"e_1_3_3_2_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"e_1_3_3_2_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01919"},{"key":"e_1_3_3_2_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00480"},{"key":"e_1_3_3_2_89_1","unstructured":"Jiawei Ren Liang Pan Jiaxiang Tang Chi Zhang Ang Cao Gang Zeng and Ziwei Liu. 2023. DreamGaussian4D: Generative 4D Gaussian Splatting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.17142 (2023)."},{"key":"e_1_3_3_2_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00574"},{"key":"e_1_3_3_2_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.19"},{"key":"e_1_3_3_2_93_1","doi-asserted-by":"crossref","unstructured":"Ruizhi Shao Youxin Pang Zerong Zheng Jingxiang Sun and Yebin Liu. 2024. Human4DiT: 360-degree Human Video Generation with 4D Diffusion Transformer. ACM Transactions on Graphics (TOG) 43 6 (2024).","DOI":"10.1145\/3687980"},{"key":"e_1_3_3_2_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01596"},{"key":"e_1_3_3_2_95_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Shen Tianchang","year":"2021","unstructured":"Tianchang Shen, Jun Gao, Kangxue Yin, Ming-Yu Liu, and Sanja Fidler. 2021a. Deep Marching Tetrahedra: a Hybrid Representation for High-Resolution 3D Shape Synthesis. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_96_1","unstructured":"Tianchang Shen Jun Gao Kangxue Yin Ming-Yu Liu and Sanja Fidler. 2021b. Deep marching tetrahedra: a hybrid representation for high-resolution 3d shape synthesis. Advances in Neural Information Processing Systems 34 (2021) 6087\u20136101."},{"key":"e_1_3_3_2_97_1","unstructured":"Ruoxi Shi Hansheng Chen Zhuoyang Zhang Minghua Liu Chao Xu Xinyue Wei Linghao Chen Chong Zeng and Hao Su. 2023a. Zero123++: a single image to consistent multi-view diffusion base model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.15110 (2023)."},{"key":"e_1_3_3_2_98_1","unstructured":"Yichun Shi Peng Wang Jianglong Ye Mai Long Kejie Li and Xiao Yang. 2023b. Mvdream: Multi-view diffusion for 3d generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.16512 (2023)."},{"key":"e_1_3_3_2_99_1","first-page":"2256","volume-title":"International conference on machine learning","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International conference on machine learning. pmlr, 2256\u20132265."},{"key":"e_1_3_3_2_100_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.02502 (2020)."},{"key":"e_1_3_3_2_101_1","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_2_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00521"},{"key":"e_1_3_3_2_103_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14022"},{"key":"e_1_3_3_2_104_1","doi-asserted-by":"crossref","unstructured":"Justus Thies Michael Zollh\u00f6fer and Matthias Nie\u00dfner. 2019. Deferred neural rendering: Image synthesis using neural textures. Acm Transactions on Graphics (TOG) 38 4 (2019) 1\u201312.","DOI":"10.1145\/3306346.3323035"},{"key":"e_1_3_3_2_105_1","doi-asserted-by":"crossref","unstructured":"Basile Van\u00a0Hoorick Rundi Wu Ege Ozguroglu Kyle Sargent Ruoshi Liu Pavel Tokmakov Achal Dave Changxi Zheng and Carl Vondrick. 2024. Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis. European Conference on Computer Vision (ECCV) (2024).","DOI":"10.1007\/978-3-031-72691-0_18"},{"key":"e_1_3_3_2_106_1","first-page":"439","volume-title":"European Conference on Computer Vision","author":"Voleti Vikram","year":"2024","unstructured":"Vikram Voleti, Chun-Han Yao, Mark Boss, Adam Letts, David Pankratz, Dmitry Tochilkin, Christian Laforte, Robin Rombach, and Varun Jampani. 2024. Sv3d: Novel multi-view synthesis and 3d generation from a single image using latent video diffusion. In European Conference on Computer Vision. Springer, 439\u2013457."},{"key":"e_1_3_3_2_107_1","unstructured":"Ang Wang Baole Ai Bin Wen Chaojie Mao Chen-Wei Xie Di Chen Feiwu Yu Haiming Zhao Jianxiao Yang Jianyuan Zeng Jiayu Wang Jingfeng Zhang Jingren Zhou Jinkai Wang Jixuan Chen Kai Zhu Kang Zhao Keyu Yan Lianghua Huang Mengyang Feng Ningyi Zhang Pandeng Li Pingyu Wu Ruihang Chu Ruili Feng Shiwei Zhang Siyang Sun Tao Fang Tianxing Wang Tianyi Gui Tingyu Weng Tong Shen Wei Lin Wei Wang Wei Wang Wenmeng Zhou Wente Wang Wenting Shen Wenyuan Yu Xianzhong Shi Xiaoming Huang Xin Xu Yan Kou Yangyu Lv Yifei Li Yijing Liu Yiming Wang Yingya Zhang Yitong Huang Yong Li You Wu Yu Liu Yulin Pan Yun Zheng Yuntao Hong Yupeng Shi Yutong Feng Zeyinzi Jiang Zhen Han Zhi-Fan Wu and Ziyu Liu. 2025a. Wan: Open and Advanced Large-Scale Video Generative Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.20314 (2025)."},{"key":"e_1_3_3_2_108_1","doi-asserted-by":"crossref","unstructured":"Hanyang Wang Fangfu Liu Jiawei Chi and Yueqi Duan. 2025b. VideoScene: Distilling Video Diffusion Model to Generate 3D Scenes in One Step. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.01956 (2025).","DOI":"10.1109\/CVPR52734.2025.01536"},{"key":"e_1_3_3_2_109_1","unstructured":"Peng Wang Lingjie Liu Yuan Liu Christian Theobalt Taku Komura and Wenping Wang. 2021. NeuS: Learning Neural Implicit Surfaces by Volume Rendering for Multi-view Reconstruction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.10689 (2021)."},{"key":"e_1_3_3_2_110_1","unstructured":"Peng Wang and Yichun Shi. 2023. Imagedream: Image-prompt multi-view diffusion for 3d generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.02201 (2023)."},{"key":"e_1_3_3_2_111_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00184"},{"key":"e_1_3_3_2_112_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_1"},{"key":"e_1_3_3_2_113_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02026"},{"key":"e_1_3_3_2_114_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_2_115_1","unstructured":"Daniel Watson William Chan Ricardo Martin-Brualla Jonathan Ho Andrea Tagliasacchi and Mohammad Norouzi. 2022. Novel view synthesis with diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.04628 (2022)."},{"key":"e_1_3_3_2_116_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00201"},{"key":"e_1_3_3_2_117_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01573"},{"key":"e_1_3_3_2_118_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01920"},{"key":"e_1_3_3_2_119_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00175"},{"key":"e_1_3_3_2_120_1","unstructured":"Rundi Wu Ruiqi Gao Ben Poole Alex Trevithick Changxi Zheng Jonathan\u00a0T Barron and Aleksander Holynski. 2024a. Cat4d: Create anything in 4d with multi-view video diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.18613 (2024)."},{"key":"e_1_3_3_2_121_1","first-page":"361","volume-title":"European Conference on Computer Vision","author":"Wu Zijie","year":"2024","unstructured":"Zijie Wu, Chaohui Yu, Yanqin Jiang, Chenjie Cao, Fan Wang, and Xiang Bai. 2024c. Sc4d: Sparse-controlled video-to-4d generation and motion transfer. In European Conference on Computer Vision. Springer, 361\u2013379."},{"key":"e_1_3_3_2_122_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00226"},{"key":"e_1_3_3_2_123_1","unstructured":"Yiming Xie Chun-Han Yao Vikram Voleti Huaizu Jiang and Varun Jampani. 2024. Sv4d: Dynamic 3d content generation with multi-frame and multi-view consistency. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.17470 (2024)."},{"key":"e_1_3_3_2_124_1","unstructured":"Jinbo Xing Menghan Xia Yong Zhang Haoxin Chen Xintao Wang Tien-Tsin Wong and Ying Shan. 2023. DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors. (2023). arxiv:https:\/\/arXiv.org\/abs\/2310.12190\u00a0[cs.CV]"},{"key":"e_1_3_3_2_125_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01872"},{"key":"e_1_3_3_2_126_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00536"},{"key":"e_1_3_3_2_127_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01893"},{"key":"e_1_3_3_2_128_1","doi-asserted-by":"crossref","unstructured":"Zhen Xu Yinghao Xu Zhiyuan Yu Sida Peng Jiaming Sun Hujun Bao and Xiaowei Zhou. 2024b. Representing Long Volumetric Video with Temporal Gaussian Hierarchy. ACM Transactions on Graphics 43 6 (November 2024). https:\/\/zju3dv.github.io\/longvolcap","DOI":"10.1145\/3687919"},{"key":"e_1_3_3_2_129_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024a. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_2_130_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yang Zeyu","year":"2024","unstructured":"Zeyu Yang, Hongye Yang, Zijie Pan, and Li Zhang. 2024b. Real-time Photorealistic Dynamic Scene Representation and Rendering with 4D Gaussian Splatting. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_131_1","unstructured":"Lior Yariv Jiatao Gu Yoni Kasten and Yaron Lipman. 2021. Volume rendering of neural implicit surfaces. Advances in Neural Information Processing Systems 34 (2021) 4805\u20134815."},{"key":"e_1_3_3_2_132_1","unstructured":"Mark YU Wenbo Hu Jinbo Xing and Ying Shan. 2025. TrajectoryCrafter: Redirecting Camera Trajectory for Monocular Videos via Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.05638 (2025)."},{"key":"e_1_3_3_2_133_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00569"},{"key":"e_1_3_3_2_134_1","unstructured":"Wangbo Yu Jinbo Xing Li Yuan Wenbo Hu Xiaoyu Li Zhipeng Huang Xiangjun Gao Tien-Tsin Wong Ying Shan and Yonghong Tian. 2024. ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.02048 (2024)."},{"key":"e_1_3_3_2_135_1","unstructured":"Zehao Yu Songyou Peng Michael Niemeyer Torsten Sattler and Andreas Geiger. 2022. Monosdf: Exploring monocular geometric cues for neural implicit surface reconstruction. Advances in neural information processing systems 35 (2022) 25018\u201325032."},{"key":"e_1_3_3_2_136_1","first-page":"163","volume-title":"European Conference on Computer Vision","author":"Zeng Yifei","year":"2024","unstructured":"Yifei Zeng, Yanqin Jiang, Siyu Zhu, Yuanxun Lu, Youtian Lin, Hao Zhu, Weiming Hu, Xun Cao, and Yao Yao. 2024. Stag4d: Spatial-temporal anchored generative 4d gaussians. In European Conference on Computer Vision. Springer, 163\u2013179."},{"key":"e_1_3_3_2_137_1","doi-asserted-by":"crossref","unstructured":"Fuqiang Zhao Yuheng Jiang Kaixin Yao Jiakai Zhang Liao Wang Haizhao Dai Yuhui Zhong Yingliang Zhang Minye Wu Lan Xu et\u00a0al. 2022a. Human performance modeling and rendering via neural animated mesh. ACM Transactions on Graphics (TOG) 41 6 (2022) 1\u201317.","DOI":"10.1145\/3550454.3555451"},{"key":"e_1_3_3_2_138_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00759"},{"key":"e_1_3_3_2_139_1","doi-asserted-by":"crossref","unstructured":"Yiqun Zhao Chenming Wu Binbin Huang Yihao Zhi Chen Zhao Jingdong Wang and Shenghua Gao. 2025. Surfel-based Gaussian Inverse Rendering for Fast and Relightable Dynamic Human Reconstruction from Monocular Videos. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025).","DOI":"10.1109\/TPAMI.2025.3599415"},{"key":"e_1_3_3_2_140_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01861"},{"key":"e_1_3_3_2_141_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01543"},{"key":"e_1_3_3_2_142_1","doi-asserted-by":"crossref","unstructured":"Zerong Zheng Xiaochen Zhao Hongwen Zhang Boning Liu and Yebin Liu. 2023. Avatarrex: Real-time expressive full-body avatars. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201319.","DOI":"10.1145\/3592101"},{"key":"e_1_3_3_2_143_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00048"},{"key":"e_1_3_3_2_144_1","doi-asserted-by":"crossref","unstructured":"Yihao Zhi Wanhu Sun Jiahao Chang Chongjie Ye Wensen Feng and Xiaoguang Han. 2025. StruGauAvatar: Learning Structured 3D Gaussians for Animatable Avatars from Monocular Videos. IEEE Transactions on Visualization and Computer Graphics (2025).","DOI":"10.1109\/TVCG.2025.3557457"},{"key":"e_1_3_3_2_145_1","unstructured":"Shenhao Zhu Junming\u00a0Leo Chen Zuozhuo Dai Yinghui Xu Xun Cao Yao Yao Hao Zhu and Siyu Zhu. 2024. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. arxiv:https:\/\/arXiv.org\/abs\/2403.14781\u00a0[cs.CV]"},{"key":"e_1_3_3_2_146_1","doi-asserted-by":"crossref","unstructured":"Yiyu Zhuang Jiaxi Lv Hao Wen Qing Shuai Ailing Zeng Hao Zhu Shifeng Chen Yujiu Yang Xun Cao and Wei Liu. 2024. IDOL: Instant Photorealistic 3D Human Creation from a Single Image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14963 (2024).","DOI":"10.1109\/CVPR52734.2025.02450"}],"event":{"name":"SA Conference Papers '25: SIGGRAPH Asia 2025 Conference Papers","location":"Hong Kong Hong Kong","acronym":"SA Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the SIGGRAPH Asia 2025 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3757377.3763935","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T03:19:34Z","timestamp":1765250374000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757377.3763935"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":145,"alternative-id":["10.1145\/3757377.3763935","10.1145\/3757377"],"URL":"https:\/\/doi.org\/10.1145\/3757377.3763935","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}