{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T16:00:59Z","timestamp":1774022459427,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730752","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:42:43Z","timestamp":1753260163000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Splat4D: Diffusion-Enhanced 4D Gaussian Splatting for Temporally and Spatially Consistent Content Creation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4252-2179","authenticated-orcid":false,"given":"Minghao","family":"Yin","sequence":"first","affiliation":[{"name":"University of Hong Kong, HongKong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0125-0015","authenticated-orcid":false,"given":"Yukang","family":"Cao","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6085-8059","authenticated-orcid":false,"given":"Songyou","family":"Peng","sequence":"additional","affiliation":[{"name":"Google DeepMind, San Francisco, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7995-9999","authenticated-orcid":false,"given":"Kai","family":"Han","sequence":"additional","affiliation":[{"name":"University of Hong Kong, HongKong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.150"},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00764"},{"key":"e_1_3_3_2_4_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et\u00a0al. 2023a. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02033"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02022"},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_3_2_10_1","unstructured":"Quankai Gao Qiangeng Xu Zhe Cao Ben Mildenhall Wenchao Ma Le Chen Danhang Tang and Ulrich Neumann. 2024. Gaussianflow: Splatting gaussian dynamics for 4d content creation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12365 (2024)."},{"key":"e_1_3_3_2_11_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2023. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_12_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022. Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.13221 (2022)."},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"crossref","unstructured":"Jonathan Ho Tim Salimans Alexey Gritsenko William Chan Mohammad Norouzi and David\u00a0J Fleet. 2022. Video diffusion models. Advances in Neural Information Processing Systems 35 (2022) 8633\u20138646.","DOI":"10.52202\/068431-0628"},{"key":"e_1_3_3_2_14_1","unstructured":"Zehuan Huang Yuan-Chen Guo Haoran Wang Ran Yi Lizhuang Ma Yan-Pei Cao and Lu Sheng. 2024. MV-Adapter: Multi-view Consistent Image Generation Made Easy. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03632 (2024)."},{"key":"e_1_3_3_2_15_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Jiang Yanqin","year":"2023","unstructured":"Yanqin Jiang, Li Zhang, Jin Gao, Weiming Hu, and Yao Yao. 2023. Consistent4D: Consistent 360\u00b0 Dynamic Object Generation from Monocular Video. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3D Gaussian Splatting for Real-Time Radiance Field Rendering. ACM Trans. Graph. 42 4 (2023) 139\u20131.","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"e_1_3_3_2_19_1","volume-title":"Advances in Neural Information Processing Systems","author":"Kulhanek Jonas","year":"2024","unstructured":"Jonas Kulhanek, Songyou Peng, Zuzana Kukelova, Marc Pollefeys, and Torsten Sattler. 2024. WildGaussians: 3D Gaussian Splatting in the Wild. In Advances in Neural Information Processing Systems , Vol.\u00a037."},{"key":"e_1_3_3_2_20_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Li Weiyu","year":"2024","unstructured":"Weiyu Li, Rui Chen, Xuelin Chen, and Ping Tan. 2024. SweetDreamer: Aligning Geometric Priors in 2D diffusion for Consistent Text-to-3D. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3519"},{"key":"e_1_3_3_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00623"},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_3_2_25_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Liu Yuan","year":"2024","unstructured":"Yuan Liu, Cheng Lin, Zijiao Zeng, Xiaoxiao Long, Lingjie Liu, Taku Komura, and Wenping Wang. 2024. SyncDreamer: Generating Multiview-consistent Images from a Single-view Image. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_26_1","first-page":"8744","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Lu Yuanxun","year":"2024","unstructured":"Yuanxun Lu, Jingyang Zhang, Shiwei Li, Tian Fang, David McKinnon, Yanghai Tsin, Long Quan, Xun Cao, and Yao Yao. 2024. Direct2. 5: Diverse text-to-3d generation via multi-view 2.5 d diffusion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8744\u20138753."},{"key":"e_1_3_3_2_27_1","first-page":"8446","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Melas-Kyriazi Luke","year":"2023","unstructured":"Luke Melas-Kyriazi, Iro Laina, Christian Rupprecht, and Andrea Vedaldi. 2023. Realfusion: 360deg reconstruction of any object from a single image. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8446\u20138455."},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul\u00a0P Srinivasan Matthew Tancik Jonathan\u00a0T Barron Ravi Ramamoorthi and Ren Ng. 2021. Nerf: Representing scenes as neural radiance fields for view synthesis. Commun. ACM 65 1 (2021) 99\u2013106.","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"crossref","unstructured":"Thomas M\u00fcller Alex Evans Christoph Schied and Alexander Keller. 2022. Instant neural graphics primitives with a multiresolution hash encoding. ACM transactions on graphics (TOG) 41 4 (2022) 1\u201315.","DOI":"10.1145\/3528223.3530127"},{"key":"e_1_3_3_2_30_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et\u00a0al. 2024. DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research Journal (2024) 1\u201331."},{"key":"e_1_3_3_2_31_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Pan Zijie","year":"2024","unstructured":"Zijie Pan, Jiachen Lu, Xiatian Zhu, and Li Zhang. 2024. Enhancing High-Resolution 3D Generation through Pixel-wise Gradient Clipping. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_32_1","volume-title":"The Eleventh International Conference on Learning Representations","author":"Poole Ben","year":"2023","unstructured":"Ben Poole, Ajay Jain, Jonathan\u00a0T. Barron, and Ben Mildenhall. 2023. DreamFusion: Text-to-3D using 2D Diffusion. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_2_33_1","unstructured":"Guocheng Qian Jinjie Mai Abdullah Hamdi Jian Ren Aliaksandr Siarohin Bing Li Hsin-Ying Lee Ivan Skorokhodov Peter Wonka Sergey Tulyakov et\u00a0al. 2023. Magic123: One image to high-quality 3d object generation using both 2d and 3d diffusion priors. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.17843 (2023)."},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00223"},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383317"},{"key":"e_1_3_3_2_36_1","unstructured":"Jiawei Ren Liang Pan Jiaxiang Tang Chi Zhang Ang Cao Gang Zeng and Ziwei Liu. 2023. Dreamgaussian4d: Generative 4d gaussian splatting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.17142 (2023)."},{"key":"e_1_3_3_2_37_1","first-page":"56828","volume-title":"Advances in Neural Information Processing Systems","author":"Ren Jiawei","year":"2024","unstructured":"Jiawei Ren, Kevin Xie, Ashkan Mirzaei, Xiaohui Zeng, Karsten Kreis, Ziwei Liu, Antonio Torralba, Sanja Fidler, Seung\u00a0Wook Kim, Huan Ling, et\u00a0al. 2024. L4GM: Large 4D Gaussian Reconstruction Model. In Advances in Neural Information Processing Systems , Vol.\u00a037. 56828\u201356858."},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00900"},{"key":"e_1_3_3_2_39_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Seitzer Maximilian","year":"2023","unstructured":"Maximilian Seitzer, Sjoerd van Steenkiste, Thomas Kipf, Klaus Greff, and Mehdi\u00a0SM Sajjadi. 2023. DyST: Towards Dynamic Neural Scene Representations on Real-World Videos. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_40_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Shi Yichun","year":"2023","unstructured":"Yichun Shi, Peng Wang, Jianglong Ye, Long Mai, Kejie Li, and Xiao Yang. 2023. MVDream: Multi-view Diffusion for 3D Generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_41_1","volume-title":"The Eleventh International Conference on Learning Representations","author":"Singer Uriel","year":"2023","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et\u00a0al. 2023a. Make-A-Video: Text-to-Video Generation without Text-Video Data. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/3618408.3619731"},{"key":"e_1_3_3_2_43_1","unstructured":"Jingxiang Sun Bo Zhang Ruizhi Shao Lizhen Wang Wen Liu Zhenda Xie and Yebin Liu. 2023. Dreamcraft3d: Hierarchical 3d generation with bootstrapped diffusion prior. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.16818 (2023)."},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00972"},{"key":"e_1_3_3_2_45_1","first-page":"1","volume-title":"European Conference on Computer Vision","author":"Tang Jiaxiang","year":"2024","unstructured":"Jiaxiang Tang, Zhaoxi Chen, Xiaokang Chen, Tengfei Wang, Gang Zeng, and Ziwei Liu. 2024a. Lgm: Large multi-view gaussian model for high-resolution 3d content creation. In European Conference on Computer Vision. Springer, 1\u201318."},{"key":"e_1_3_3_2_46_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Tang Jiaxiang","year":"2024","unstructured":"Jiaxiang Tang, Jiawei Ren, Hang Zhou, Ziwei Liu, and Gang Zeng. 2024b. DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content Creation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02086"},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"crossref","unstructured":"Vikram Voleti Alexia Jolicoeur-Martineau and Chris Pal. 2022. Mcvd-masked conditional video diffusion for prediction generation and interpolation. Advances in Neural Information Processing Systems 35 (2022) 23371\u201323385.","DOI":"10.52202\/068431-1698"},{"key":"e_1_3_3_2_49_1","first-page":"439","volume-title":"European Conference on Computer Vision","author":"Voleti Vikram","year":"2024","unstructured":"Vikram Voleti, Chun-Han Yao, Mark Boss, Adam Letts, David Pankratz, Dmitry Tochilkin, Christian Laforte, Robin Rombach, and Varun Jampani. 2024. Sv3d: Novel multi-view synthesis and 3d generation from a single image using latent video diffusion. In European Conference on Computer Vision. Springer, 439\u2013457."},{"key":"e_1_3_3_2_50_1","first-page":"321","volume-title":"European Conference on Computer Vision","author":"Wang Xinzhou","year":"2024","unstructured":"Xinzhou Wang, Yikai Wang, Junliang Ye, Fuchun Sun, Zhengyi Wang, Ling Wang, Pengkun Liu, Kai Sun, Xintong Wang, Wende Xie, et\u00a0al. 2024. Animatabledreamer: Text-guided non-rigid 3d model generation and reconstruction with canonical score distillation. In European Conference on Computer Vision. Springer, 321\u2013339."},{"key":"e_1_3_3_2_51_1","volume-title":"Proceedings of the European Conference on Computer Vision Workshops","author":"Wang Xintao","year":"2018","unstructured":"Xintao Wang, Ke Yu, Shixiang Wu, Jinjin Gu, Yihao Liu, Chao Dong, Yu Qiao, and Chen Change\u00a0Loy. 2018. Esrgan: Enhanced super-resolution generative adversarial networks. In Proceedings of the European Conference on Computer Vision Workshops."},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"crossref","unstructured":"Zhengyi Wang Cheng Lu Yikai Wang Fan Bao Chongxuan Li Hang Su and Jun Zhu. 2023. Prolificdreamer: High-fidelity and diverse text-to-3d generation with variational score distillation. Advances in Neural Information Processing Systems 36 (2023) 8406\u20138441.","DOI":"10.52202\/075280-0368"},{"key":"e_1_3_3_2_53_1","unstructured":"Haohan Weng Tianyu Yang Jianan Wang Yu Li Tong Zhang CL Chen and Lei Zhang. 2023. Consistent123: Improve consistency for one image to 3d object synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.08092 (2023)."},{"key":"e_1_3_3_2_54_1","unstructured":"Yiming Xie Chun-Han Yao Vikram Voleti Huaizu Jiang and Varun Jampani. 2024. Sv4d: Dynamic 3d content generation with multi-frame and multi-view consistency. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.17470 (2024)."},{"key":"e_1_3_3_2_55_1","first-page":"399","volume-title":"European Conference on Computer Vision","author":"Xing Jinbo","year":"2024","unstructured":"Jinbo Xing, Menghan Xia, Yong Zhang, Haoxin Chen, Wangbo Yu, Hanyuan Liu, Gongye Liu, Xintao Wang, Ying Shan, and Tien-Tsin Wong. 2024. Dynamicrafter: Animating open-domain images with video diffusion priors. In European Conference on Computer Vision. Springer, 399\u2013417."},{"key":"e_1_3_3_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00649"},{"key":"e_1_3_3_2_57_1","unstructured":"Yuyang Yin Dejia Xu Zhangyang Wang Yao Zhao and Yunchao Wei. 2023. 4dgen: Grounded 4d content generation with spatial-temporal consistency. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.17225 (2023)."},{"key":"e_1_3_3_2_58_1","unstructured":"Wangbo Yu Jinbo Xing Li Yuan Wenbo Hu Xiaoyu Li Zhipeng Huang Xiangjun Gao Tien-Tsin Wong Ying Shan and Yonghong Tian. 2024. Viewcrafter: Taming video diffusion models for high-fidelity novel view synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.02048 (2024)."},{"key":"e_1_3_3_2_59_1","first-page":"163","volume-title":"European Conference on Computer Vision","author":"Zeng Yifei","year":"2024","unstructured":"Yifei Zeng, Yanqin Jiang, Siyu Zhu, Yuanxun Lu, Youtian Lin, Hao Zhu, Weiming Hu, Xun Cao, and Yao Yao. 2024. Stag4d: Spatial-temporal anchored generative 4d gaussians. In European Conference on Computer Vision. Springer, 163\u2013179."},{"key":"e_1_3_3_2_60_1","doi-asserted-by":"crossref","unstructured":"Haiyu Zhang Xinyuan Chen Yaohui Wang Xihui Liu Yunhong Wang and Yu Qiao. 2024. 4diffusion: Multi-view video diffusion model for 4d generation. Advances in Neural Information Processing Systems 37 (2024) 15272\u201315295.","DOI":"10.52202\/079017-0488"},{"key":"e_1_3_3_2_61_1","unstructured":"Yuyang Zhao Zhiwen Yan Enze Xie Lanqing Hong Zhenguo Li and Gim\u00a0Hee Lee. 2023. Animate124: Animating one image to 4d dynamic scene. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.14603 (2023)."},{"key":"e_1_3_3_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00697"},{"key":"e_1_3_3_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00441"},{"key":"e_1_3_3_2_64_1","first-page":"145","volume-title":"European Conference on Computer Vision","author":"Zhu Shenhao","year":"2024","unstructured":"Shenhao Zhu, Junming\u00a0Leo Chen, Zuozhuo Dai, Zilong Dong, Yinghui Xu, Xun Cao, Yao Yao, Hao Zhu, and Siyu Zhu. 2024. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. In European Conference on Computer Vision. 145\u2013162."}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730752","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:03:29Z","timestamp":1774019009000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730752"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":63,"alternative-id":["10.1145\/3721238.3730752","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730752","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}