{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T20:25:45Z","timestamp":1777148745490,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":90,"publisher":"ACM","funder":[{"name":"Research Grant Council of Hong Kong","award":["17210222"],"award-info":[{"award-number":["17210222"]}]},{"name":"Innovation and Technology Commission of Hong Kong","award":["ITS\/319\/21FP, ITS\/335\/23FP"],"award-info":[{"award-number":["ITS\/319\/21FP, ITS\/335\/23FP"]}]},{"name":"InnoHK initiative","award":["TransGP"],"award-info":[{"award-number":["TransGP"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730607","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9739-1209","authenticated-orcid":false,"given":"Zekai","family":"Gu","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1793-4622","authenticated-orcid":false,"given":"Rui","family":"Yan","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3280-9447","authenticated-orcid":false,"given":"Jiahao","family":"Lu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1066-6642","authenticated-orcid":false,"given":"Peng","family":"Li","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0186-8269","authenticated-orcid":false,"given":"Zhiyang","family":"Dou","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3354-1968","authenticated-orcid":false,"given":"Chenyang","family":"Si","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0152-3300","authenticated-orcid":false,"given":"Zhen","family":"Dong","sequence":"additional","affiliation":[{"name":"Wuhan University, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6191-076X","authenticated-orcid":false,"given":"Qifeng","family":"Liu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3335-6623","authenticated-orcid":false,"given":"Cheng","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Hong Kong, Hongkong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4220-5958","authenticated-orcid":false,"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, 
Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2284-3952","authenticated-orcid":false,"given":"Wenping","family":"Wang","sequence":"additional","affiliation":[{"name":"Texas A&amp;M University, College Station, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2933-5667","authenticated-orcid":false,"given":"Yuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hongkong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"crossref","unstructured":"Sherwin Bahmani Ivan Skorokhodov Guocheng Qian Aliaksandr Siarohin Willi Menapace Andrea Tagliasacchi David\u00a0B Lindell and Sergey Tulyakov. 2024. AC3D: Analyzing and Improving 3D Camera Control in Video Diffusion Transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.18673 (2024).","DOI":"10.1109\/CVPR52734.2025.02130"},{"key":"e_1_3_3_2_3_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et\u00a0al. 2023. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_2_4_1","unstructured":"Aleksei Bochkovskii Ama\u00ebl Delaunoy Hugo Germain Marcel Santos Yichao Zhou Stephan\u00a0R Richter and Vladlen Koltun. 2024. Depth pro: Sharp monocular metric depth in less than a second. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02073 (2024)."},{"key":"e_1_3_3_2_5_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman et\u00a0al. 2024. Video generation models as world simulators. https:\/\/openai. com\/research\/video-generation-models-as-world-simulators"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00011"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00727"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00385"},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"crossref","unstructured":"Eric\u00a0R. Chan Koki Nagano Matthew\u00a0A. Chan Alexander\u00a0W. Bergman Jeong\u00a0Joon Park Axel Levy Miika Aittala Shalini\u00a0De Mello Tero Karras and Gordon Wetzstein. 2023. Generative Novel View Synthesis with 3D-Aware Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2304.02602\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2304.02602","DOI":"10.1109\/ICCV51070.2023.00389"},{"key":"e_1_3_3_2_10_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang et\u00a0al. 2023b. Videocrafter1: Open diffusion models for high-quality video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.19512 (2023)."},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"e_1_3_3_2_12_1","first-page":"74","volume-title":"European Conference on Computer Vision","author":"Chen Junsong","year":"2024","unstructured":"Junsong Chen, Chongjian Ge, Enze Xie, Yue Wu, Lewei Yao, Xiaozhe Ren, Zhongdao Wang, Ping Luo, Huchuan Lu, and Zhenguo Li. 2024a. PIXART-Sigma: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation. In European Conference on Computer Vision. 
Springer, 74\u201391."},{"key":"e_1_3_3_2_13_1","unstructured":"Tsai-Shien Chen Chieh\u00a0Hubert Lin Hung-Yu Tseng Tsung-Yi Lin and Ming-Hsuan Yang. 2023a. Motion-conditioned diffusion model for controllable video synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.14404 (2023)."},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_3_2_15_1","unstructured":"Ruoyu Feng Wenming Weng Yanhui Wang Yuhui Yuan Jianmin Bao Chong Luo Zhibo Chen and Baining Guo. 2024b. CCEdit: Creative and Controllable Video Editing via Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2309.16496\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2309.16496"},{"key":"e_1_3_3_2_16_1","unstructured":"Wanquan Feng Tianhao Qi Jiawei Liu Mingzhen Sun Pengqi Tu Tianxiang Ma Fei Dai Songtao Zhao Siyu Zhou and Qian He. 2024a. I2VControl: Disentangled and Unified Video Motion Synthesis Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.17765 (2024)."},{"key":"e_1_3_3_2_17_1","unstructured":"Daniel Geng Charles Herrmann Junhwa Hur Forrester Cole Serena Zhang Tobias Pfaff Tatiana Lopez-Guevara Carl Doersch Yusuf Aytar Michael Rubinstein et\u00a0al. 2024. Motion Prompting: Controlling Video Generation with Motion Trajectories. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.02700 (2024)."},{"key":"e_1_3_3_2_18_1","unstructured":"Michal Geyer Omer Bar-Tal Shai Bagon and Tali Dekel. 2023a. Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.10373 (2023)."},{"key":"e_1_3_3_2_19_1","unstructured":"Michal Geyer Omer Bar-Tal Shai Bagon and Tali Dekel. 2023b. TokenFlow: Consistent Diffusion Features for Consistent Video Editing. arxiv:https:\/\/arXiv.org\/abs\/2307.10373\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2307.10373"},{"key":"e_1_3_3_2_20_1","first-page":"330","volume-title":"ECCV","author":"Guo Yuwei","year":"2024","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2024. Sparsectrl: Adding sparse controls to text-to-video diffusion models. In ECCV. 330\u2013348."},{"key":"e_1_3_3_2_21_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Zhengyang Liang Yaohui Wang Yu Qiao Maneesh Agrawala Dahua Lin and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04725 (2023)."},{"key":"e_1_3_3_2_22_1","unstructured":"Hao He Yinghao Xu Yuwei Guo Gordon Wetzstein Bo Dai Hongsheng Li and Ceyuan Yang. 2024b. Cameractrl: Enabling camera control for text-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02101 (2024)."},{"key":"e_1_3_3_2_23_1","unstructured":"Xuanhua He Quande Liu Shengju Qian Xin Wang Tao Hu Ke Cao Keyu Yan and Jie Zhang. 2024a. Id-animator: Zero-shot identity-preserving human video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.15275 (2024)."},{"key":"e_1_3_3_2_24_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022. Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.13221 (2022)."},{"key":"e_1_3_3_2_25_1","unstructured":"Jack Hessel Ari Holtzman Maxwell Forbes Ronan\u00a0Le Bras and Yejin Choi. 2022. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. 
arxiv:https:\/\/arXiv.org\/abs\/2104.08718\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2104.08718"},{"key":"e_1_3_3_2_26_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. NeurIPS (2020)."},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"crossref","unstructured":"Jonathan Ho Tim Salimans Alexey Gritsenko William Chan Mohammad Norouzi and David\u00a0J Fleet. 2022. Video diffusion models. Advances in Neural Information Processing Systems 35 (2022) 8633\u20138646.","DOI":"10.52202\/068431-0628"},{"key":"e_1_3_3_2_28_1","unstructured":"Hsin-Ping Huang Yu-Chuan Su Deqing Sun Lu Jiang Xuhui Jia Yukun Zhu and Ming-Hsuan Yang. 2023. Fine-grained controllable video generation via object appearance and context. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.02919 (2023)."},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00772"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"crossref","unstructured":"Hyeonho Jeong Chun-Hao\u00a0Paul Huang Jong\u00a0Chul Ye Niloy Mitra and Duygu Ceylan. 2024. Track4Gen: Teaching Video Diffusion Models to Track Points Improves Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.06016 (2024).","DOI":"10.1109\/CVPR52734.2025.00682"},{"key":"e_1_3_3_2_31_1","unstructured":"Xuan Ju Yiming Gao Zhaoyang Zhang Ziyang Yuan Xintao Wang Ailing Zeng Yu Xiong Qiang Xu and Ying Shan. 2024. MiraData: A Large-Scale Video Dataset with Long Durations and Structured Captions. arxiv:https:\/\/arXiv.org\/abs\/2407.06358\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2407.06358"},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_2_33_1","unstructured":"Weijie Kong Qi Tian Zijian Zhang Rox Min Zuozhuo Dai et\u00a0al. 2024. HunyuanVideo: A Systematic Framework For Large Video Generative Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03603 (2024)."},{"key":"e_1_3_3_2_34_1","unstructured":"Mathis Koroglu Hugo Caselles-Dupr\u00e9 Guillaume\u00a0Jeanneret Sanmiguel and Matthieu Cord. 2024. OnlyFlow: Optical Flow based Motion Conditioning for Video Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.10501 (2024)."},{"key":"e_1_3_3_2_35_1","unstructured":"Kuaishou. 2024. Keling. https:\/\/kling.kuaishou.com\/"},{"key":"e_1_3_3_2_36_1","unstructured":"Black\u00a0Forest Labs. 2024. FLUX. https:\/\/github.com\/black-forest-labs\/flux"},{"key":"e_1_3_3_2_37_1","unstructured":"Guojun Lei Chi Wang Hong Li Rong Zhang Yikai Wang and Weiwei Xu. 2024. AnimateAnything: Consistent and Controllable Animation for Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.10836 (2024)."},{"key":"e_1_3_3_2_38_1","unstructured":"Yaowei Li Xintao Wang Zhaoyang Zhang Zhouxia Wang Ziyang Yuan Liangbin Xie Yuexian Zou and Ying Shan. 2024. Image conductor: Precision control for interactive video synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.15339 (2024)."},{"key":"e_1_3_3_2_39_1","unstructured":"Bin Lin Yunyang Ge Xinhua Cheng Zongjian Li Bin Zhu Shaodong Wang Xianyi He Yang Ye Shenghai Yuan Liuhan Chen et\u00a0al. 2024. Open-Sora Plan: Open-Source Large Video Generation Model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.00131 (2024)."},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596800"},{"key":"e_1_3_3_2_41_1","unstructured":"Jiahao Lu Tianyu Huang Peng Li Zhiyang Dou Cheng Lin Zhiming Cui Zhen Dong Sai-Kit Yeung Wenping Wang and Yuan Liu. 2024. 
Align3R: Aligned Monocular Depth Estimation for Dynamic Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03079 (2024)."},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687652"},{"key":"e_1_3_3_2_43_1","unstructured":"Yue Ma Yingqing He Hongfa Wang Andong Wang Chenyang Qi Chengfei Cai Xiu Li Zhifeng Li Heung-Yeung Shum Wei Liu et\u00a0al. 2024a. Follow-your-click: Open-domain regional image animation via short prompts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08268 (2024)."},{"key":"e_1_3_3_2_44_1","unstructured":"Tuna Han\u00a0Salih Meral Hidir Yesiltepe Connor Dunlop and Pinar Yanardag. 2024. MotionFlow: Attention-Driven Motion Transfer in Video Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05275 (2024)."},{"key":"e_1_3_3_2_45_1","unstructured":"Chong Mou Mingdeng Cao Xintao Wang Zhaoyang Zhang Ying Shan and Jian Zhang. 2024. ReVideo: Remake a Video with Motion and Content Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.13865 (2024)."},{"key":"e_1_3_3_2_46_1","unstructured":"Koichi Namekata Sherwin Bahmani Ziyi Wu Yash Kant Igor Gilitschenski and David\u00a0B Lindell. 2024. Sg-i2v: Self-guided trajectory control in image-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.04989 (2024)."},{"key":"e_1_3_3_2_47_1","doi-asserted-by":"crossref","unstructured":"Pauline\u00a0C Ng and Steven Henikoff. 2003. SIFT: Predicting amino acid changes that affect protein function. Nucleic acids research 31 13 (2003) 3812\u20133814.","DOI":"10.1093\/nar\/gkg509"},{"key":"e_1_3_3_2_48_1","volume-title":"ECCV","author":"Niu Muyao","year":"2024","unstructured":"Muyao Niu, Xiaodong Cun, Xintao Wang, Yong Zhang, Ying Shan, and Yinqiang Zheng. 2024. Mofa-video: Controllable image animation via generative motion field adaptions in frozen image-to-video diffusion model. In ECCV."},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"crossref","unstructured":"Karran Pandey Paul Guerrero Metheus Gadelha Yannick Hold-Geoffroy Karan Singh and Niloy\u00a0J. Mitra. 2024. Diffusion Handles: Enabling 3D Edits for Diffusion Models by Lifting Activations to 3D. CVPR (2024).","DOI":"10.1109\/CVPR52733.2024.00735"},{"key":"e_1_3_3_2_50_1","unstructured":"Geon\u00a0Yeong Park Hyeonho Jeong Sang\u00a0Wan Lee and Jong\u00a0Chul Ye. 2024. Spectral motion alignment for video motion transfer using diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.15249 (2024)."},{"key":"e_1_3_3_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_52_1","unstructured":"William Peebles and Saining Xie. 2023b. Scalable Diffusion Models with Transformers. arxiv:https:\/\/arXiv.org\/abs\/2212.09748\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2212.09748"},{"key":"e_1_3_3_2_53_1","unstructured":"Adam Polyak Amit Zohar Andrew Brown Andros Tjandra Animesh Sinha Ann Lee Apoorv Vyas Bowen Shi Chih-Yao Ma Ching-Yao Chuang et\u00a0al. 2024. Movie gen: A cast of media foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.13720 (2024)."},{"key":"e_1_3_3_2_54_1","unstructured":"Alexander Pondaven Aliaksandr Siarohin Sergey Tulyakov Philip Torr and Fabio Pizzati. 2024. Video Motion Transfer with Diffusion Transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.07776 (2024)."},{"key":"e_1_3_3_2_55_1","unstructured":"Jordi Pont-Tuset Federico Perazzi Sergi Caelles Pablo Arbel\u00e1ez Alexander Sorkine-Hornung and Luc Van Gool. 2017. 
The 2017 DAVIS Challenge on Video Object Segmentation. arXiv:https:\/\/arXiv.org\/abs\/1704.00675 (2017)."},{"key":"e_1_3_3_2_56_1","unstructured":"Haonan Qiu Zhaoxi Chen Zhouxia Wang Yingqing He Menghan Xia and Ziwei Liu. 2024. Freetraj: Tuning-free trajectory control in video diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16863 (2024)."},{"key":"e_1_3_3_2_57_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"crossref","unstructured":"Ren\u00e9 Ranftl Katrin Lasinger David Hafner Konrad Schindler and Vladlen Koltun. 2020. Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. IEEE transactions on pattern analysis and machine intelligence 44 3 (2020) 1623\u20131637.","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"e_1_3_3_2_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591503"},{"key":"e_1_3_3_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_61_1","unstructured":"Rahul Sajnani Jeroen Vanbaar Jie Min Kapil Katyal and Srinath Sridhar. 2025. GeoDiffuser: Geometry-Based Image Editing with Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2404.14403\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2404.14403"},{"key":"e_1_3_3_2_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657497"},{"key":"e_1_3_3_2_63_1","unstructured":"Vincent Sitzmann Semon Rezchikov Bill Freeman Josh Tenenbaum and Fredo Durand. 2021. Light field networks: Neural scene representations with single-evaluation rendering. Advances in Neural Information Processing Systems 34 (2021) 19313\u201319325."},{"key":"e_1_3_3_2_64_1","unstructured":"Yao Teng Enze Xie Yue Wu Haoyu Han Zhenguo Li and Xihui Liu. 2023. Drag-a-video: Non-rigid video editing with point-based interaction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.02936 (2023)."},{"key":"e_1_3_3_2_65_1","unstructured":"Thomas Unterthiner Sjoerd van Steenkiste Karol Kurach Raphael Marinier Marcin Michalski and Sylvain Gelly. 2019. Towards Accurate Generative Models of Video: A New Metric & Challenges. arxiv:https:\/\/arXiv.org\/abs\/1812.01717\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1812.01717"},{"key":"e_1_3_3_2_66_1","unstructured":"Jiawei Wang Yuchen Zhang Jiaxin Zou Yan Zeng Guoqiang Wei Liping Yuan and Hang Li. 2024f. Boximator: Generating rich and controllable motions for video synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.01566 (2024)."},{"key":"e_1_3_3_2_67_1","unstructured":"Ruicheng Wang Sicheng Xu Cassie Dai Jianfeng Xiang Yu Deng Xin Tong and Jiaolong Yang. 2024b. MoGe: Unlocking Accurate Monocular Geometry Estimation for Open-Domain Images with Optimal Training Supervision. arxiv:https:\/\/arXiv.org\/abs\/2410.19115\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2410.19115"},{"key":"e_1_3_3_2_68_1","unstructured":"Tianfu Wang Menelaos Kanakis Konrad Schindler Luc Van\u00a0Gool and Anton Obukhov. 2023. Breathing new life into 3d assets with generative repainting. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.08523 (2023)."},{"key":"e_1_3_3_2_69_1","unstructured":"Xiang Wang Hangjie Yuan Shiwei Zhang Dayou Chen Jiuniu Wang Yingya Zhang Yujun Shen Deli Zhao and Jingren Zhou. 2024d. Videocomposer: Compositional video synthesis with motion controllability. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_70_1","unstructured":"Yuelei Wang Jian Zhang Pengtao Jiang Hao Zhang Jinwei Chen and Bo Li. 2024e. CPA: Camera-pose-awareness Diffusion Transformer for Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.01429 (2024)."},{"key":"e_1_3_3_2_71_1","doi-asserted-by":"publisher","unstructured":"Zhou Wang A.C. Bovik H.R. Sheikh and E.P. Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE Transactions on Image Processing 13 4 (2004) 600\u2013612. 10.1109\/TIP.2003.819861","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_2_72_1","unstructured":"Zhouxia Wang Yushi Lan Shangchen Zhou and Chen\u00a0Change Loy. 2024a. ObjCtrl-2.5 D: Training-free Object Control with Camera Poses. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.07721 (2024)."},{"key":"e_1_3_3_2_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_2_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01929"},{"key":"e_1_3_3_2_75_1","unstructured":"Zeqi Xiao Wenqi Ouyang Yifan Zhou Shuai Yang Lei Yang Jianlou Si and Xingang Pan. 2024a. Trajectory Attention for Fine-grained Video Motion Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.19324 (2024)."},{"key":"e_1_3_3_2_76_1","first-page":"399","volume-title":"European Conference on Computer Vision","author":"Xing Jinbo","year":"2024","unstructured":"Jinbo Xing, Menghan Xia, Yong Zhang, Haoxin Chen, Wangbo Yu, Hanyuan Liu, Gongye Liu, Xintao Wang, Ying Shan, and Tien-Tsin Wong. 2024. Dynamicrafter: Animating open-domain images with video diffusion priors. In European Conference on Computer Vision. Springer, 399\u2013417."},{"key":"e_1_3_3_2_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657481"},{"key":"e_1_3_3_2_78_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024b. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00809"},{"key":"e_1_3_3_2_80_1","unstructured":"Shengming Yin Chenfei Wu Jian Liang Jie Shi Houqiang Li Gong Ming and Nan Duan. 2023. Dragnuwa: Fine-grained control in video generation by integrating text image and trajectory. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.08089 (2023)."},{"key":"e_1_3_3_2_81_1","unstructured":"Wangbo Yu Jinbo Xing Li Yuan Wenbo Hu Xiaoyu Li Zhipeng Huang Xiangjun Gao Tien-Tsin Wong Ying Shan and Yonghong Tian. 2024. Viewcrafter: Taming video diffusion models for high-fidelity novel view synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.02048 (2024)."},{"key":"e_1_3_3_2_82_1","doi-asserted-by":"crossref","unstructured":"Shenghai Yuan Jinfa Huang Xianyi He Yunyuan Ge Yujun Shi Liuhan Chen Jiebo Luo and Li Yuan. 2024. Identity-Preserving Text-to-Video Generation by Frequency Decomposition. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.17440 (2024).","DOI":"10.32388\/TZIID6"},{"key":"e_1_3_3_2_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_2_84_1","doi-asserted-by":"crossref","unstructured":"Qihang Zhang Shuangfei Zhai Miguel\u00a0Angel Bautista Kevin Miao Alexander Toshev Joshua Susskind and Jiatao Gu. 2024. World-consistent Video Diffusion with Explicit 3D Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.01821 (2024).","DOI":"10.1109\/CVPR52734.2025.02020"},{"key":"e_1_3_3_2_85_1","unstructured":"Richard Zhang Phillip Isola Alexei\u00a0A. Efros Eli Shechtman and Oliver Wang. 2018. The Unreasonable Effectiveness of Deep Features as a Perceptual Metric. arxiv:https:\/\/arXiv.org\/abs\/1801.03924\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1801.03924"},{"key":"e_1_3_3_2_86_1","unstructured":"Tingyang Zhang Chen Wang Zhiyang Dou Jiahui\u00a0Lei Qingzhe\u00a0Gao Baoquan Chen and Lingjie Liu. 2025. ProTracker: Probabilistic Integration for Robust and Accurate Point Tracking. arXiv preprint arxiv:https:\/\/arXiv.org\/abs\/2501.03220 (2025)."},{"key":"e_1_3_3_2_87_1","unstructured":"Guangcong Zheng Teng Li Rui Jiang Yehao Lu Tao Wu and Xi Li. 2024a. CamI2V: Camera-Controlled Image-to-Video Diffusion Model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.15957 (2024)."},{"key":"e_1_3_3_2_88_1","volume-title":"Open-Sora: Democratizing Efficient Video Production for All","author":"Zheng Zangwei","year":"2024","unstructured":"Zangwei Zheng, Xiangyu Peng, Tianji Yang, Chenhui Shen, Shenggui Li, Hongxin Liu, Yukun Zhou, Tianyi Li, and Yang You. 2024b. Open-Sora: Democratizing Efficient Video Production for All. https:\/\/github.com\/hpcaitech\/Open-Sora"},{"key":"e_1_3_3_2_89_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201323"},{"key":"e_1_3_3_2_90_1","unstructured":"Hanxin Zhu Tianyu He Anni Tang Junliang Guo Zhibo Chen and Jiang Bian. 2024b. Compositional 3D-aware Video Generation with LLM Director. arxiv:https:\/\/arXiv.org\/abs\/2409.00558\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2409.00558"},{"key":"e_1_3_3_2_91_1","unstructured":"Shenhao Zhu Junming\u00a0Leo Chen Zuozhuo Dai Yinghui Xu Xun Cao Yao Yao Hao Zhu and Siyu Zhu. 2024a. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. 
arxiv:https:\/\/arXiv.org\/abs\/2403.14781\u00a0[cs.CV]"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730607","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:57:32Z","timestamp":1774018652000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730607"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":90,"alternative-id":["10.1145\/3721238.3730607","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730607","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
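The object above is a Crossref REST API work record for the DOI it lists (10.1145/3721238.3730607). As a minimal, hedged sketch of how such a record can be retrieved and read programmatically, the snippet below queries the public api.crossref.org endpoint for that DOI and prints the title, venue, author list, and deposited reference count. The field names (message, title, container-title, issued, author, reference-count) are taken from the JSON shown above; the requests dependency and live network access are assumptions for illustration, not part of the record.

# Sketch only: fetch this Crossref work record and summarize it.
# Assumes the `requests` package and network access are available.
import requests

DOI = "10.1145/3721238.3730607"  # DOI listed in the record above

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # record body, as in the JSON above

# Title, venue, and publication year from the deposited metadata
title = work["title"][0]
venue = work["container-title"][0] if work.get("container-title") else ""
year = work["issued"]["date-parts"][0][0]

# Authors in "Given Family" order, as stored under "author"
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in work.get("author", [])]

print(f"{title} ({year})")
print(f"Venue: {venue}")
print(f"Authors: {', '.join(authors)}")
print(f"References deposited: {work.get('reference-count')}")

Running this reproduces the headline fields visible in the record (the paper title, the SIGGRAPH Conference Papers '25 container title, the twelve authors, and the reference count of 90); any further processing of the "reference" array would follow the same dictionary-access pattern.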