{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:47:34Z","timestamp":1774021654482,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":119,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730668","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Reenact Anything: Semantic Video Motion Transfer Using Motion-Textual Inversion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4238-7560","authenticated-orcid":false,"given":"Manuel","family":"Kansy","sequence":"first","affiliation":[{"name":"ETH Z\u00fcrich, Zurich, Switzerland and DisneyResearch|Studios, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2315-5166","authenticated-orcid":false,"given":"Jacek","family":"Naruniec","sequence":"additional","affiliation":[{"name":"DisneyResearch|Studios, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1473-1878","authenticated-orcid":false,"given":"Christopher","family":"Schroers","sequence":"additional","affiliation":[{"name":"DisneyResearch|Studios, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9324-779X","authenticated-orcid":false,"given":"Markus","family":"Gross","sequence":"additional","affiliation":[{"name":"ETH Z\u00fcrich, Zurich, Switzerland and DisneyResearch|Studios, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1196-5425","authenticated-orcid":false,"given":"Romann M.","family":"Weber","sequence":"additional","affiliation":[{"name":"DisneyResearch|Studios, Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_3_2_1","first-page":"e70093","volume-title":"Computer Graphics Forum","author":"Andreou Nefeli","year":"2024","unstructured":"Nefeli Andreou, Xi Wang, Victoria\u00a0Fern\u00e1ndez Abrevaya, Marie-Paule Cani, Yiorgos Chrysanthou, and Vicky Kalogeiton. 2024. Lead: Latent realignment for human motion diffusion. In Computer Graphics Forum. Wiley Online Library, e70093."},{"key":"e_1_3_3_3_3_1","unstructured":"Sherwin Bahmani Ivan Skorokhodov Aliaksandr Siarohin Willi Menapace Guocheng Qian Michael Vasilkovsky Hsin-Ying Lee Chaoyang Wang Jiaxu Zou Andrea Tagliasacchi et\u00a0al. 2024. VD3D: Taming Large Video Diffusion Transformers for 3D Camera Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.12781 (2024)."},{"key":"e_1_3_3_3_4_1","unstructured":"Jianhong Bai Tianyu He Yuchi Wang Junliang Guo Haoji Hu Zuozhu Liu and Jiang Bian. 2024. UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13185 (2024)."},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32182"},{"key":"e_1_3_3_3_7_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et\u00a0al. 2023a. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_3_3_9_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman Clarence Ng Ricky Wang and Aditya Ramesh. 2024. Video generation models as world simulators. (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"crossref","unstructured":"Ryan Burgert Yuancheng Xu Wenqi Xian Oliver Pilarski Pascal Clausen Mingming He Li Ma Yitong Deng Lingxiao Li Mohsen Mousavi et\u00a0al. 2025. Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using Real-Time Warped Noise. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.08331 (2025).","DOI":"10.1109\/CVPR52734.2025.00011"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00603"},{"key":"e_1_3_3_3_13_1","unstructured":"Hila Chefer Uriel Singer Amit Zohar Yuval Kirstain Adam Polyak Yaniv Taigman Lior Wolf and Shelly Sheynin. 2025. VideoJAM: Joint Appearance-Motion Representations for Enhanced Motion Generation in Video Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.02492 (2025)."},{"key":"e_1_3_3_3_14_1","doi-asserted-by":"crossref","unstructured":"Changgu Chen Junwei Shu Lianggangxu Chen Gaoqi He Changbo Wang and Yang Li. 2024. Motion-Zero: Zero-Shot Moving Object Control Framework for Diffusion-Based Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.10150 (2024).","DOI":"10.1609\/aaai.v39i2.32198"},{"key":"e_1_3_3_3_15_1","unstructured":"Tsai-Shien Chen Chieh\u00a0Hubert Lin Hung-Yu Tseng Tsung-Yi Lin and Ming-Hsuan Yang. 2023a. Motion-conditioned diffusion model for controllable video synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.14404 (2023)."},{"key":"e_1_3_3_3_16_1","unstructured":"Weifeng Chen Jie Wu Pan Xie Hefeng Wu Jiashi Li Xin Xia Xuefeng Xiao and Liang Lin. 2023b. Control-a-video: Controllable text-to-video generation with diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13840 (2023)."},{"key":"e_1_3_3_3_17_1","unstructured":"Soon\u00a0Yau Cheong Duygu Ceylan Armin Mustafa Andrew Gilbert and Chun-Hao\u00a0Paul Huang. 2024. Boosting camera motion control for video diffusion transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.10802 (2024)."},{"key":"e_1_3_3_3_18_1","unstructured":"Zuozhuo Dai Zhenghao Zhang Yao Yao Bingxue Qiu Siyu Zhu Long Qin and Weizhi Wang. 2023. AnimateAnything: Fine-Grained Open Domain Image Animation with Motion Guidance. arXiv e-prints (2023) arXiv\u20132311."},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547838"},{"key":"e_1_3_3_3_20_1","volume-title":"The Eleventh International Conference on Learning Representations","author":"Gal Rinon","year":"2023","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit\u00a0Haim Bermano, Gal Chechik, and Daniel Cohen-or. 2023. An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_3_21_1","first-page":"395","volume-title":"European Conference on Computer Vision","author":"Garibi Daniel","year":"2024","unstructured":"Daniel Garibi, Or Patashnik, Andrey Voynov, Hadar Averbuch-Elor, and Daniel Cohen-Or. 2024. Renoise: Real image inversion through iterative noising. In European Conference on Computer Vision. Springer, 395\u2013413."},{"key":"e_1_3_3_3_22_1","unstructured":"Daniel Geng Charles Herrmann Junhwa Hur Forrester Cole Serena Zhang Tobias Pfaff Tatiana Lopez-Guevara Carl Doersch Yusuf Aytar Michael Rubinstein et\u00a0al. 2024. Motion Prompting: Controlling Video Generation with Motion Trajectories. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.02700 (2024)."},{"key":"e_1_3_3_3_23_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Geyer Michal","year":"2024","unstructured":"Michal Geyer, Omer Bar-Tal, Shai Bagon, and Tali Dekel. 2024. TokenFlow: Consistent Diffusion Features for Consistent Video Editing. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00728"},{"key":"e_1_3_3_3_26_1","unstructured":"Zekai Gu Rui Yan Jiahao Lu Peng Li Zhiyang Dou Chenyang Si Zhen Dong Qifeng Liu Cheng Lin Ziwei Liu et\u00a0al. 2025. Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.03847 (2025)."},{"key":"e_1_3_3_3_27_1","unstructured":"Jianzhu Guo Dingyun Zhang Xiaoqiang Liu Zhizhou Zhong Yuan Zhang Pengfei Wan and Di Zhang. 2024b. LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.03168 (2024)."},{"key":"e_1_3_3_3_28_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Guo Yuwei","year":"2024","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2024a. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00751"},{"key":"e_1_3_3_3_30_1","unstructured":"Hao He Yinghao Xu Yuwei Guo Gordon Wetzstein Bo Dai Hongsheng Li and Ceyuan Yang. 2024. CameraCtrl: Enabling Camera Control for Text-to-Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02101 (2024)."},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"crossref","unstructured":"Eric Hedlin Gopal Sharma Shweta Mahajan Hossam Isack Abhishek Kar Andrea Tagliasacchi and Kwang\u00a0Moo Yi. 2023. Unsupervised semantic correspondence using stable diffusion. Advances in Neural Information Processing Systems 36 (2023) 8266\u20138279.","DOI":"10.52202\/075280-0363"},{"key":"e_1_3_3_3_32_1","volume-title":"The Eleventh International Conference on Learning Representations","author":"Hertz Amir","year":"2023","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-or. 2023. Prompt-to-Prompt Image Editing with Cross-Attention Control. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_3_3_33_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_3_34_1","volume-title":"NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_3_3_35_1","unstructured":"Chen Hou Guoqiang Wei Yan Zeng and Zhibo Chen. 2024. Training-free Camera Control for Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.10126 (2024)."},{"key":"e_1_3_3_3_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00072"},{"key":"e_1_3_3_3_37_1","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang Weizhu Chen et\u00a0al. 2022. Lora: Low-rank adaptation of large language models. ICLR 1 2 (2022) 3."},{"key":"e_1_3_3_3_38_1","first-page":"8153","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Hu Li","year":"2024","unstructured":"Li Hu. 2024. Animate anyone: Consistent and controllable image-to-video synthesis for character animation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8153\u20138163."},{"key":"e_1_3_3_3_39_1","unstructured":"Teng Hu Jiangning Zhang Ran Yi Yating Wang Hongrui Huang Jieyu Weng Yabiao Wang and Lizhuang Ma. 2024. MotionMaster: Training-free Camera Motion Transfer For Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.15789 (2024)."},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00772"},{"key":"e_1_3_3_3_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00880"},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"crossref","unstructured":"Tero Karras Miika Aittala Timo Aila and Samuli Laine. 2022. Elucidating the design space of diffusion-based generative models. Advances in Neural Information Processing Systems 35 (2022) 26565\u201326577.","DOI":"10.52202\/068431-1926"},{"key":"e_1_3_3_3_44_1","volume-title":"3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings","author":"Kingma Diederik\u00a0P.","year":"2015","unstructured":"Diederik\u00a0P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_3_3_45_1","unstructured":"Weijie Kong Qi Tian Zijian Zhang Rox Min Zuozhuo Dai Jin Zhou Jiangfeng Xiong Xin Li Bo Wu Jianwei Zhang et\u00a0al. 2024. Hunyuanvideo: A systematic framework for large video generative models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03603 (2024)."},{"key":"e_1_3_3_3_46_1","first-page":"409","volume-title":"European Conference on Computer Vision","author":"Li Mingxiao","year":"2024","unstructured":"Mingxiao Li, Bo Wan, Marie-Francine Moens, and Tinne Tuytelaars. 2024b. Animate your motion: Turning still images into dynamic videos. In European Conference on Computer Vision. Springer, 409\u2013425."},{"key":"e_1_3_3_3_47_1","unstructured":"Ruining Li Chuanxia Zheng Christian Rupprecht and Andrea Vedaldi. 2024c. Puppet-Master: Scaling Interactive Video Generation as a Motion Prior for Part-Level Dynamics. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.04631 (2024)."},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01723"},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680718"},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32533"},{"key":"e_1_3_3_3_51_1","unstructured":"Pengyang Ling Jiazi Bu Pan Zhang Xiaoyi Dong Yuhang Zang Tong Wu Huaian Chen Jiaqi Wang and Yi Jin. 2024. MotionClone: Training-Free Motion Cloning for Controllable Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.05338 (2024)."},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"crossref","unstructured":"Grace Luo Lisa Dunlap Dong\u00a0Huk Park Aleksander Holynski and Trevor Darrell. 2023. Diffusion hyperfeatures: Searching through time and space for semantic correspondence. Advances in Neural Information Processing Systems 36 (2023) 47500\u201347510.","DOI":"10.52202\/075280-2057"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687652"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"e_1_3_3_3_56_1","first-page":"1634","volume-title":"Proceedings of the Asian Conference on Computer Vision","author":"Materzy\u0144ska Joanna","year":"2024","unstructured":"Joanna Materzy\u0144ska, Josef Sivic, Eli Shechtman, Antonio Torralba, Richard Zhang, and Bryan Russell. 2024. NewMove: Customizing text-to-video models with novel motions. In Proceedings of the Asian Conference on Computer Vision. 1634\u20131651."},{"key":"e_1_3_3_3_57_1","unstructured":"Tuna Han\u00a0Salih Meral Hidir Yesiltepe Connor Dunlop and Pinar Yanardag. 2024. MotionFlow: Attention-Driven Motion Transfer in Video Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05275 (2024)."},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14616"},{"key":"e_1_3_3_3_60_1","unstructured":"Eyal Molad Eliahu Horwitz Dani Valevski Alex\u00a0Rav Acha Yossi Matias Yael Pritch Yaniv Leviathan and Yedid Hoshen. 2023. Dreamix: Video diffusion models are general video editors. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.01329 (2023)."},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"crossref","unstructured":"Chong Mou Mingdeng Cao Xintao Wang Zhaoyang Zhang Ying Shan and Jian Zhang. 2024. Revideo: Remake a video with motion and content control. Advances in Neural Information Processing Systems 37 (2024) 18481\u201318505.","DOI":"10.52202\/079017-0586"},{"key":"e_1_3_3_3_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00049"},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00728"},{"key":"e_1_3_3_3_64_1","first-page":"111","volume-title":"European Conference on Computer Vision","author":"Niu Muyao","year":"2024","unstructured":"Muyao Niu, Xiaodong Cun, Xintao Wang, Yong Zhang, Ying Shan, and Yinqiang Zheng. 2024. Mofa-video: Controllable image animation via generative motion field adaptions in frozen image-to-video diffusion model. In European Conference on Computer Vision. Springer, 111\u2013128."},{"key":"e_1_3_3_3_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591513"},{"key":"e_1_3_3_3_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_3_67_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Podell Dustin","year":"2024","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2024. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_3_68_1","unstructured":"Alexander Pondaven Aliaksandr Siarohin Sergey Tulyakov Philip Torr and Fabio Pizzati. 2024. Video Motion Transfer with Diffusion Transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.07776 (2024)."},{"key":"e_1_3_3_3_69_1","unstructured":"Haonan Qiu Zhaoxi Chen Zhouxia Wang Yingqing He Menghan Xia and Ziwei Liu. 2024. FreeTraj: Tuning-Free Trajectory Control in Video Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16863 (2024)."},{"key":"e_1_3_3_3_70_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_71_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_3_72_1","first-page":"332","volume-title":"European Conference on Computer Vision","author":"Ren Yixuan","year":"2024","unstructured":"Yixuan Ren, Yang Zhou, Jimei Yang, Jing Shi, Difan Liu, Feng Liu, Mingi Kwon, and Abhinav Shrivastava. 2024. Customize-a-video: One-shot motion customization of text-to-video diffusion models. In European Conference on Computer Vision. Springer, 332\u2013349."},{"key":"e_1_3_3_3_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_3_75_1","doi-asserted-by":"crossref","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L Denton Kamyar Ghasemipour Raphael Gontijo\u00a0Lopes Burcu Karagol\u00a0Ayan Tim Salimans et\u00a0al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems 35 (2022) 36479\u201336494.","DOI":"10.52202\/068431-2643"},{"key":"e_1_3_3_3_76_1","unstructured":"Aliaksandr Siarohin St\u00e9phane Lathuili\u00e8re Sergey Tulyakov Elisa Ricci and Nicu Sebe. 2019. First order motion model for image animation. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_3_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"e_1_3_3_3_78_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.02502 (2020)."},{"key":"e_1_3_3_3_79_1","volume-title":"International Conference on Learning Representations","author":"Song Yang","year":"2021","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik\u00a0P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2021. Score-Based Generative Modeling through Stochastic Differential Equations. In International Conference on Learning Representations."},{"key":"e_1_3_3_3_80_1","doi-asserted-by":"crossref","unstructured":"Luming Tang Menglin Jia Qianqian Wang Cheng\u00a0Perng Phoo and Bharath Hariharan. 2023. Emergent correspondence from image diffusion. Advances in Neural Information Processing Systems 36 (2023) 1363\u20131389.","DOI":"10.52202\/075280-0068"},{"key":"e_1_3_3_3_81_1","unstructured":"Maham Tanveer Yizhi Wang Ruiqi Wang Nanxuan Zhao Ali Mahdavi-Amiri and Hao Zhang. 2024. AnaMoDiff: 2D Analogical Motion Diffusion via Disentangled Denoising. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.03549 (2024)."},{"key":"e_1_3_3_3_82_1","doi-asserted-by":"crossref","unstructured":"Zhan Tong Yibing Song Jue Wang and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems 35 (2022) 10078\u201310093.","DOI":"10.52202\/068431-0732"},{"key":"e_1_3_3_3_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00753"},{"key":"e_1_3_3_3_84_1","unstructured":"Shuyuan Tu Qi Dai Zihao Zhang Sicheng Xie Zhi-Qi Cheng Chong Luo Xintong Han Zuxuan Wu and Yu-Gang Jiang. 2024b. MotionFollower: Editing Video Motion via Lightweight Score-Guided Diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.20325 (2024)."},{"key":"e_1_3_3_3_85_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_3_3_86_1","unstructured":"Patrick von Platen Suraj Patil Anton Lozhkov Pedro Cuenca Nathan Lambert Kashif Rasul Mishig Davaadorj Dhruv Nair Sayak Paul William Berman Yiyi Xu Steven Liu and Thomas Wolf. 2022. Diffusers: State-of-the-art diffusion models. https:\/\/github.com\/huggingface\/diffusers."},{"key":"e_1_3_3_3_87_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023b. Modelscope text-to-video technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06571 (2023)."},{"key":"e_1_3_3_3_88_1","first-page":"52274","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Yuchen Zhang, Jiaxin Zou, Yan Zeng, Guoqiang Wei, Liping Yuan, and Hang Li. 2024e. Boximator: generating rich and controllable motions for video synthesis. In Proceedings of the 41st International Conference on Machine Learning. 52274\u201352289."},{"key":"e_1_3_3_3_89_1","unstructured":"Luozhou Wang Guibao Shen Yixun Liang Xin Tao Pengfei Wan Di Zhang Yijun Li and Yingcong Chen. 2024b. Motion Inversion for Video Customization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.20193 (2024)."},{"key":"e_1_3_3_3_90_1","unstructured":"Qilin Wang Zhengkai Jiang Chengming Xu Jiangning Zhang Yabiao Wang Xinyi Zhang Yun Cao Weijian Cao Chengjie Wang and Yanwei Fu. 2024a. VividPose: Advancing Stable Video Diffusion for Realistic Human Image Animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.18156 (2024)."},{"key":"e_1_3_3_3_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_3_3_92_1","unstructured":"Wen Wang Yan Jiang Kangyang Xie Zide Liu Hao Chen Yue Cao Xinlong Wang and Chunhua Shen. 2023a. Zero-shot video editing using off-the-shelf image diffusion models."},{"key":"e_1_3_3_3_93_1","unstructured":"Xiang Wang Hangjie Yuan Shiwei Zhang Dayou Chen Jiuniu Wang Yingya Zhang Yujun Shen Deli Zhao and Jingren Zhou. 2024d. Videocomposer: Compositional video synthesis with motion controllability. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_3_95_1","first-page":"6537","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Wei Yujie","year":"2024","unstructured":"Yujie Wei, Shiwei Zhang, Zhiwu Qing, Hangjie Yuan, Zhiheng Liu, Yu Liu, Yingya Zhang, Jingren Zhou, and Hongming Shan. 2024. Dreamvideo: Composing your dream videos with customized subject and motion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6537\u20136549."},{"key":"e_1_3_3_3_96_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems","author":"Wu Jianzong","year":"2024","unstructured":"Jianzong Wu, Xiangtai Li, Yanhong Zeng, Jiangning Zhang, Qianyu Zhou, Yining Li, Yunhai Tong, and Kai Chen. 2024c. MotionBooth: Motion-Aware Customized Text-to-Video Generation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_3_97_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_3_3_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00677"},{"key":"e_1_3_3_3_99_1","first-page":"331","volume-title":"European Conference on Computer Vision","author":"Wu Weijia","year":"2024","unstructured":"Weijia Wu, Zhuang Li, Yuchao Gu, Rui Zhao, Yefei He, David\u00a0Junhao Zhang, Mike\u00a0Zheng Shou, Yan Li, Tingting Gao, and Di Zhang. 2024b. Draganything: Motion control for anything using entity representation. In European Conference on Computer Vision. Springer, 331\u2013348."},{"key":"e_1_3_3_3_100_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems","author":"Xiao Zeqi","year":"2024","unstructured":"Zeqi Xiao, Yifan Zhou, Shuai Yang, and Xingang Pan. 2024. Video Diffusion Models are Training-free Motion Interpreter and Controller. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_3_101_1","unstructured":"Dejia Xu Weili Nie Chao Liu Sifei Liu Jan Kautz Zhangyang Wang and Arash Vahdat. 2024. CamCo: Camera-Controllable 3D-Consistent Image-to-Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.02509 (2024)."},{"key":"e_1_3_3_3_102_1","unstructured":"Wilson Yan Andrew Brown Pieter Abbeel Rohit Girdhar and Samaneh Azadi. 2023. Motion-conditioned image animation for video editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.18827 (2023)."},{"key":"e_1_3_3_3_103_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657481"},{"key":"e_1_3_3_3_104_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618160"},{"key":"e_1_3_3_3_105_1","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Yang Zhuoyi","year":"2025","unstructured":"Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, Da Yin, Yuxuan.Zhang, Weihan Wang, Yean Cheng, Bin Xu, Xiaotao Gu, Yuxiao Dong, and Jie Tang. 2025. CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=LQzN6TRFg9"},{"key":"e_1_3_3_3_106_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00535"},{"key":"e_1_3_3_3_107_1","unstructured":"Danah Yatim Rafail Fridman Omer\u00a0Bar Tal Yoni Kasten and Tali Dekel. 2023. Space-Time Diffusion Features for Zero-Shot Text-Driven Motion Transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17009 (2023)."},{"key":"e_1_3_3_3_108_1","unstructured":"Shengming Yin Chenfei Wu Jian Liang Jie Shi Houqiang Li Gong Ming and Nan Duan. 2023. Dragnuwa: Fine-grained control in video generation by integrating text image and trajectory. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.08089 (2023)."},{"key":"e_1_3_3_3_109_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00297"},{"key":"e_1_3_3_3_110_1","doi-asserted-by":"crossref","unstructured":"Junyi Zhang Charles Herrmann Junhwa Hur Luisa Polania\u00a0Cabrera Varun Jampani Deqing Sun and Ming-Hsuan Yang. 2023a. A tale of two features: Stable diffusion complements dino for zero-shot semantic correspondence. Advances in Neural Information Processing Systems 36 (2023) 45533\u201345547.","DOI":"10.52202\/075280-1973"},{"key":"e_1_3_3_3_111_1","unstructured":"Shiwei Zhang Jiayu Wang Yingya Zhang Kang Zhao Hangjie Yuan Zhiwu Qin Xiang Wang Deli Zhao and Jingren Zhou. 2023c. I2vgen-xl: High-quality image-to-video synthesis via cascaded diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.04145 (2023)."},{"key":"e_1_3_3_3_112_1","unstructured":"Yuxin Zhang Fan Tang Nisha Huang Haibin Huang Chongyang Ma Weiming Dong and Changsheng Xu. 2023b. MotionCrafter: One-Shot Motion Customization of Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.05288 (2023)."},{"key":"e_1_3_3_3_113_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Zhang Yabo","year":"2024","unstructured":"Yabo Zhang, Yuxiang Wei, Dongsheng Jiang, XIAOPENG ZHANG, Wangmeng Zuo, and Qi Tian. 2024b. ControlVideo: Training-free Controllable Text-to-video Generation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_3_114_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_3_3_115_1","first-page":"273","volume-title":"European Conference on Computer Vision","author":"Zhao Rui","year":"2024","unstructured":"Rui Zhao, Yuchao Gu, Jay\u00a0Zhangjie Wu, David\u00a0Junhao Zhang, Jia-Wei Liu, Weijia Wu, Jussi Keppo, and Mike\u00a0Zheng Shou. 2024. Motiondirector: Motion customization of text-to-video diffusion models. In European Conference on Computer Vision. Springer, 273\u2013290."},{"key":"e_1_3_3_3_116_1","unstructured":"Yuyang Zhao Enze Xie Lanqing Hong Zhenguo Li and Gim\u00a0Hee Lee. 2023. Make-a-protagonist: Generic video editing with an ensemble of experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.08850 (2023)."},{"key":"e_1_3_3_3_117_1","unstructured":"Guangcong Zheng Teng Li Rui Jiang Yehao Lu Tao Wu and Xi Li. 2024. Cami2v: Camera-controlled image-to-video diffusion model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.15957 (2024)."},{"key":"e_1_3_3_3_118_1","unstructured":"Haitao Zhou Chuang Wang Rui Nie Jinxiao Lin Dongdong Yu Qian Yu and Changhu Wang. 2024. TrackGo: A Flexible and Efficient Method for Controllable Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.11475 (2024)."},{"key":"e_1_3_3_3_119_1","first-page":"145","volume-title":"European Conference on Computer Vision","author":"Zhu Shenhao","year":"2024","unstructured":"Shenhao Zhu, Junming\u00a0Leo Chen, Zuozhuo Dai, Zilong Dong, Yinghui Xu, Xun Cao, Yao Yao, Hao Zhu, and Siyu Zhu. 2024. Champ: Controllable and consistent human image animation with 3d parametric guidance. In European Conference on Computer Vision. Springer, 145\u2013162."},{"key":"e_1_3_3_3_120_1","unstructured":"Yi Zuo Lingling Li Licheng Jiao Fang Liu Xu Liu Wenping Ma Shuyuan Yang and Yuwei Guo. 2024. Edit-Your-Motion: Space-Time Diffusion Decoupling Learning for Video Motion Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.04496 (2024)."}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730668","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:53:20Z","timestamp":1774018400000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730668"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":119,"alternative-id":["10.1145\/3721238.3730668","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730668","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}