{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:34Z","timestamp":1765357714912,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681394","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"3332-3341","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Zero-Shot Controllable Image-to-Video Animation via Motion Decomposition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1670-0054","authenticated-orcid":false,"given":"Shoubin","family":"Yu","sequence":"first","affiliation":[{"name":"University of North Carolina at Chapel Hill, Chapel Hill, NC, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6740-6855","authenticated-orcid":false,"given":"Jacob Zhiyuan","family":"Fang","sequence":"additional","affiliation":[{"name":"Amazon, San Jose, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6455-0161","authenticated-orcid":false,"given":"Jian","family":"Zheng","sequence":"additional","affiliation":[{"name":"Amazon, San Jose, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8967-7322","authenticated-orcid":false,"given":"Gunnar","family":"Sigurdsson","sequence":"additional","affiliation":[{"name":"Amazon, San Jose, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0279-5275","authenticated-orcid":false,"given":"Vicente","family":"Ordonez","sequence":"additional","affiliation":[{"name":"Rice University, Houston, TX, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1767-8382","authenticated-orcid":false,"given":"Robinson","family":"Piramuthu","sequence":"additional","affiliation":[{"name":"Amazon, San Jose, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2965-5354","authenticated-orcid":false,"given":"Mohit","family":"Bansal","sequence":"additional","affiliation":[{"name":"University of North Carolina at Chapel Hill, Chapel Hill, NC, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Latent-shift: Latent diffusion with temporal shift for efficient text-to-video generation. arXiv preprint arXiv:2304.08477","author":"An Jie","year":"2023","unstructured":"Jie An, Songyang Zhang, Harry Yang, Sonal Gupta, Jia-Bin Huang, Jiebo Luo, and Xi Yin. 2023. Latent-shift: Latent diffusion with temporal shift for efficient text-to-video generation. arXiv preprint arXiv:2304.08477 (2023)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01448"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01444"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00513"},{"key":"e_1_3_2_2_6_1","volume-title":"Peek- aboo: Text to image diffusion models are zero-shot segmentors. arXiv preprint arXiv:2211.13224","author":"Burgert Ryan","year":"2022","unstructured":"Ryan Burgert, Kanchana Ranasinghe, Xiang Li, and Michael S Ryoo. 2022. Peek- aboo: Text to image diffusion models are zero-shot segmentors. arXiv preprint arXiv:2211.13224 (2022)."},{"key":"e_1_3_2_2_7_1","volume-title":"Motion-Zero: Zero-Shot Moving Object Control Framework for Diffusion-Based Video Generation. arXiv preprint arXiv:2401.10150","author":"Chen Changgu","year":"2024","unstructured":"Changgu Chen, Junwei Shu, Lianggangxu Chen, Gaoqi He, Changbo Wang, and Yang Li. 2024. Motion-Zero: Zero-Shot Moving Object Control Framework for Diffusion-Based Video Generation. arXiv preprint arXiv:2401.10150 (2024)."},{"key":"e_1_3_2_2_8_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang Chao Weng and Ying Shan. 2023. VideoCrafter1: Open Diffusion Models for High-Quality Video Generation. arXiv:2310.19512 [cs.CV]"},{"key":"e_1_3_2_2_9_1","volume-title":"Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arXiv preprint arXiv:2305.13840","author":"Chen Weifeng","year":"2023","unstructured":"Weifeng Chen, Jie Wu, Pan Xie, Hefeng Wu, Jiashi Li, Xin Xia, Xuefeng Xiao, and Liang Lin. 2023. Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arXiv preprint arXiv:2305.13840 (2023)."},{"key":"e_1_3_2_2_10_1","volume-title":"LivePhoto: Real Image Animation with Text-guided Motion Control. arXiv preprint arXiv:2312.02928","author":"Chen Xi","year":"2023","unstructured":"Xi Chen, Zhiheng Liu, Mengting Chen, Yutong Feng, Yu Liu, Yujun Shen, and Hengshuang Zhao. 2023. LivePhoto: Real Image Animation with Text-guided Motion Control. arXiv preprint arXiv:2312.02928 (2023)."},{"key":"e_1_3_2_2_11_1","volume-title":"Layoutdiffuse: Adapting foundational diffusion models for layout-to-image generation. arXiv preprint arXiv:2302.08908","author":"Cheng Jiaxin","year":"2023","unstructured":"Jiaxin Cheng, Xiao Liang, Xingjian Shi, Tong He, Tianjun Xiao, and Mu Li. 2023. Layoutdiffuse: Adapting foundational diffusion models for layout-to-image generation. arXiv preprint arXiv:2302.08908 (2023)."},{"key":"e_1_3_2_2_12_1","volume-title":"AnimateAnything: Fine-Grained Open Domain Image Animation with Motion Guidance. arXiv e-prints","author":"Dai Zuozhuo","year":"2023","unstructured":"Zuozhuo Dai, Zhenghao Zhang, Yao Yao, Bingxue Qiu, Siyu Zhu, Long Qin, and Weizhi Wang. 2023. AnimateAnything: Fine-Grained Open Domain Image Animation with Motion Guidance. arXiv e-prints (2023), arXiv--2311."},{"key":"e_1_3_2_2_13_1","volume-title":"Dragvideo: Interactive drag-style video editing. arXiv preprint arXiv:2312.02216","author":"Deng Yufan","year":"2023","unstructured":"Yufan Deng, Ruida Wang, Yuhao Zhang, Yu-Wing Tai, and Chi-Keung Tang. 2023. Dragvideo: Interactive drag-style video editing. arXiv preprint arXiv:2312.02216 (2023)."},{"key":"e_1_3_2_2_14_1","volume-title":"CATER: A diagnostic dataset for Com- positional Actions and TEmporal Reasoning. arXiv preprint arXiv:1910.04744","author":"Girdhar Rohit","year":"2019","unstructured":"Rohit Girdhar and Deva Ramanan. 2019. CATER: A diagnostic dataset for Com- positional Actions and TEmporal Reasoning. arXiv preprint arXiv:1910.04744 (2019)."},{"key":"e_1_3_2_2_15_1","volume-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00819"},{"key":"e_1_3_2_2_17_1","volume-title":"et al","author":"He Yingqing","year":"2023","unstructured":"Yingqing He, Menghan Xia, Haoxin Chen, Xiaodong Cun, Yuan Gong, Jinbo Xing, Yong Zhang, Xintao Wang, Chao Weng, Ying Shan, et al . 2023. Animate-a- story: Storytelling with retrieval-augmented video generation. arXiv preprint arXiv:2307.06940 (2023)."},{"key":"e_1_3_2_2_18_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In Advances in Neural Information Processing Systems, Vol. 30."},{"key":"e_1_3_2_2_19_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"e_1_3_2_2_21_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01768"},{"key":"e_1_3_2_2_23_1","volume-title":"Videocontrolnet: A motion-guided video-to-video translation framework by using diffusion model with controlnet. arXiv preprint arXiv:2307.14073","author":"Hu Zhihao","year":"2023","unstructured":"Zhihao Hu and Dong Xu. 2023. Videocontrolnet: A motion-guided video-to-video translation framework by using diffusion model with controlnet. arXiv preprint arXiv:2307.14073 (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"Free-bloom: Zero-shot text-to-video generator with llm director and ldm animator. arXiv preprint arXiv:2309.14494","author":"Huang Hanzhuo","year":"2023","unstructured":"Hanzhuo Huang, Yufan Feng, Cheng Shi, Lan Xu, Jingyi Yu, and Sibei Yang. 2023. Free-bloom: Zero-shot text-to-video generator with llm director and ldm animator. arXiv preprint arXiv:2309.14494 (2023)."},{"key":"e_1_3_2_2_25_1","volume-title":"Free-bloom: Zero-shot text-to-video generator with llm director and ldm animator. Advances in Neural Information Processing Systems 36","author":"Huang Hanzhuo","year":"2024","unstructured":"Hanzhuo Huang, Yufan Feng, Cheng Shi, Lan Xu, Jingyi Yu, and Sibei Yang. 2024. Free-bloom: Zero-shot text-to-video generator with llm director and ldm animator. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_2_26_1","volume-title":"Fine-grained controllable video generation via object appearance and context. arXiv preprint arXiv:2312.02919","author":"Huang Hsin-Ping","year":"2023","unstructured":"Hsin-Ping Huang, Yu-Chuan Su, Deqing Sun, Lu Jiang, Xuhui Jia, Yukun Zhu, and Ming-Hsuan Yang. 2023. Fine-grained controllable video generation via object appearance and context. arXiv preprint arXiv:2312.02919 (2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"Dreampose: Fashion image-to-video synthesis via stable diffu- sion. arXiv preprint arXiv:2304.06025","author":"Karras Johanna","year":"2023","unstructured":"Johanna Karras, Aleksander Holynski, Ting-Chun Wang, and Ira Kemelmacher- Shlizerman. 2023. Dreampose: Fashion image-to-video synthesis via stable diffu- sion. arXiv preprint arXiv:2304.06025 (2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"Zhangyang Wang, Shant Navasardyan, and Humphrey Shi.","author":"Khachatryan Levon","year":"2023","unstructured":"Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Hen- schel, Zhangyang Wang, Shant Navasardyan, and Humphrey Shi. 2023. Text2video-zero: Text-to-image diffusion models are zero-shot video genera- tors. arXiv preprint arXiv:2303.13439 (2023)."},{"key":"e_1_3_2_2_29_1","volume-title":"Backpropagation applied to handwritten zip code recognition. Neural computation 1, 4","author":"LeCun Yann","year":"1989","unstructured":"Yann LeCun, Bernhard Boser, John S Denker, Donnie Henderson, Richard E Howard, Wayne Hubbard, and Lawrence D Jackel. 1989. Backpropagation applied to handwritten zip code recognition. Neural computation 1, 4 (1989), 541--551."},{"key":"e_1_3_2_2_30_1","volume-title":"Qibin Hou, Yaxing Wang, and Jian Yang.","author":"Li Senmao","year":"2023","unstructured":"Senmao Li, Joost van de Weijer, Taihang Hu, Fahad Shahbaz Khan, Qibin Hou, Yaxing Wang, and Jian Yang. 2023. StyleDiffusion: Prompt-Embedding Inversion for Text-Based Editing. arXiv preprint arXiv:2303.15649 (2023)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_2_32_1","volume-title":"Gen- erative Image Dynamics. arXiv preprint arXiv:2309.07906","author":"Li Zhengqi","year":"2023","unstructured":"Zhengqi Li, Richard Tucker, Noah Snavely, and Aleksander Holynski. 2023. Gen- erative Image Dynamics. arXiv preprint arXiv:2309.07906 (2023)."},{"key":"e_1_3_2_2_33_1","volume-title":"LLM- GROUNDED VIDEO DIFFUSION MODELS. arXiv preprint arXiv:2309.17444","author":"Lian Long","year":"2023","unstructured":"Long Lian, Baifeng Shi, Adam Yala, Trevor Darrell, and Boyi Li. 2023. LLM- GROUNDED VIDEO DIFFUSION MODELS. arXiv preprint arXiv:2309.17444 (2023)."},{"key":"e_1_3_2_2_34_1","volume-title":"Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse Controls to Any Diffusion Model. arXiv preprint arXiv:2404.09967","author":"Lin Han","year":"2024","unstructured":"Han Lin, Jaemin Cho, Abhay Zala, and Mohit Bansal. 2024. Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse Controls to Any Diffusion Model. arXiv preprint arXiv:2404.09967 (2024)."},{"key":"e_1_3_2_2_35_1","volume-title":"VideoDirectorGPT: Consistent Multi-scene Video Generation via LLM-Guided Planning. arXiv preprint arXiv:2309.15091","author":"Lin Han","year":"2023","unstructured":"Han Lin, Abhay Zala, Jaemin Cho, and Mohit Bansal. 2023. VideoDirectorGPT: Consistent Multi-scene Video Generation via LLM-Guided Planning. arXiv preprint arXiv:2309.15091 (2023)."},{"key":"e_1_3_2_2_36_1","volume-title":"et al","author":"Liu Shilong","year":"2023","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, et al . 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_2_37_1","volume-title":"Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761","author":"Liu Shaoteng","year":"2023","unstructured":"Shaoteng Liu, Yuechen Zhang, Wenbo Li, Zhe Lin, and Jiaya Jia. 2023. Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761 (2023)."},{"key":"e_1_3_2_2_38_1","volume-title":"MagicStick: Controllable Video Editing via Control Handle Transformations. arXiv preprint arXiv:2312.03047","author":"Ma Yue","year":"2023","unstructured":"Yue Ma, Xiaodong Cun, Yingqing He, Chenyang Qi, Xintao Wang, Ying Shan, Xiu Li, and Qifeng Chen. 2023. MagicStick: Controllable Video Editing via Control Handle Transformations. arXiv preprint arXiv:2312.03047 (2023)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_2_41_1","unstructured":"openai. 2024. https:\/\/openai.com\/sora."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_2_2_43_1","volume-title":"Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535","author":"Qi Chenyang","year":"2023","unstructured":"Chenyang Qi, Xiaodong Cun, Yong Zhang, Chenyang Lei, Xintao Wang, Ying Shan, and Qifeng Chen. 2023. Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)."},{"key":"e_1_3_2_2_44_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_45_1","volume-title":"ConsistI2V: Enhancing Visual Consistency for Image-to- Video Generation. arXiv preprint arXiv:2402.04324","author":"Ren Weiming","year":"2024","unstructured":"Weiming Ren, Harry Yang, Ge Zhang, Cong Wei, Xinrun Du, Stephen Huang, and Wenhu Chen. 2024. ConsistI2V: Enhancing Visual Consistency for Image-to- Video Generation. arXiv preprint arXiv:2402.04324 (2024)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.308"},{"key":"e_1_3_2_2_48_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_2_49_1","volume-title":"BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis via Bridging Image and Video Diffusion Models. arXiv preprint arXiv:2312.02813","author":"Shi Fengyuan","year":"2023","unstructured":"Fengyuan Shi, Jiaxi Gu, Hang Xu, Songcen Xu, Wei Zhang, and Limin Wang. 2023. BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis via Bridging Image and Video Diffusion Models. arXiv preprint arXiv:2312.02813 (2023)."},{"key":"e_1_3_2_2_50_1","volume-title":"ACM SIGGRAPH 2024 Conference Papers. 1--11","author":"Qin Hongwei","year":"2024","unstructured":"Xiaoyu Shi, Zhaoyang Huang, Fu-Yun Wang, Weikang Bian, Dasong Li, Yi Zhang, Manyuan Zhang, Ka Chun Cheung, Simon See, Hongwei Qin, et al . 2024. Motion- i2v: Consistent and controllable image-to-video generation with explicit motion modeling. In ACM SIGGRAPH 2024 Conference Papers. 1--11."},{"key":"e_1_3_2_2_51_1","volume-title":"et al","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al . 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_2_52_1","volume-title":"Mo- tionZero: Exploiting Motion Priors for Zero-shot Text-to-Video Generation. arXiv preprint arXiv:2311.16635","author":"Su Sitong","year":"2023","unstructured":"Sitong Su, Litao Guo, Lianli Gao, Hengtao Shen, and Jingkuan Song. 2023. Mo- tionZero: Exploiting Motion Priors for Zero-shot Text-to-Video Generation. arXiv preprint arXiv:2311.16635 (2023)."},{"key":"e_1_3_2_2_53_1","volume-title":"Drag- A-Video: Non-rigid Video Editing with Point-based Interaction. arXiv preprint arXiv:2312.02936","author":"Teng Yao","year":"2023","unstructured":"Yao Teng, Enze Xie, Yue Wu, Haoyu Han, Zhenguo Li, and Xihui Liu. 2023. Drag- A-Video: Non-rigid Video Editing with Point-based Interaction. arXiv preprint arXiv:2312.02936 (2023)."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"e_1_3_2_2_55_1","volume-title":"ICLR 2019 Workshop.","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new Metric for Video Genera- tion. In Deep Generative Models for Highly Structured Data, ICLR 2019 Workshop."},{"key":"e_1_3_2_2_56_1","volume-title":"Generating videos with scene dynamics. Advances in neural information processing systems 29","author":"Vondrick Carl","year":"2016","unstructured":"Carl Vondrick, Hamed Pirsiavash, and Antonio Torralba. 2016. Generating videos with scene dynamics. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_2_58_1","volume-title":"Boximator: Generating Rich and Controllable Motions for Video Synthesis. arXiv preprint arXiv:2402.01566","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Yuchen Zhang, Jiaxin Zou, Yan Zeng, Guoqiang Wei, Liping Yuan, and Hang Li. 2024. Boximator: Generating Rich and Controllable Motions for Video Synthesis. arXiv preprint arXiv:2402.01566 (2024)."},{"key":"e_1_3_2_2_59_1","volume-title":"VideoComposer: Compositional Video Synthesis with Motion Controllability. arXiv preprint arXiv:2306.02018","author":"Wang Xiang","year":"2023","unstructured":"Xiang Wang, Hangjie Yuan, Shiwei Zhang, Dayou Chen, Jiuniu Wang, Yingya Zhang, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. VideoComposer: Compositional Video Synthesis with Motion Controllability. arXiv preprint arXiv:2306.02018 (2023)."},{"key":"e_1_3_2_2_60_1","volume-title":"Lavie: High- quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103","author":"Wang Yaohui","year":"2023","unstructured":"Yaohui Wang, Xinyuan Chen, Xin Ma, Shangchen Zhou, Ziqi Huang, Yi Wang, Ceyuan Yang, Yinan He, Jiashuo Yu, Peiqing Yang, et al. 2023. Lavie: High- quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103 (2023)."},{"key":"e_1_3_2_2_61_1","volume-title":"Motionctrl: A unified and flexible motion controller for video generation. arXiv preprint arXiv:2312.03641","author":"Wang Zhouxia","year":"2023","unstructured":"Zhouxia Wang, Ziyang Yuan, Xintao Wang, Tianshui Chen, Menghan Xia, Ping Luo, and Ying Shan. 2023. Motionctrl: A unified and flexible motion controller for video generation. arXiv preprint arXiv:2312.03641 (2023)."},{"key":"e_1_3_2_2_62_1","volume-title":"Dreamvideo: Composing your dream videos with customized subject and motion. arXiv preprint arXiv:2312.04433","author":"Wei Yujie","year":"2023","unstructured":"Yujie Wei, Shiwei Zhang, Zhiwu Qing, Hangjie Yuan, Zhiheng Liu, Yu Liu, Yingya Zhang, Jingren Zhou, and Hongming Shan. 2023. Dreamvideo: Composing your dream videos with customized subject and motion. arXiv preprint arXiv:2312.04433 (2023)."},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_2_2_64_1","volume-title":"LAMP: Learn A Motion Pattern for Few-Shot-Based Video Genera- tion. arXiv preprint arXiv:2310.10769","author":"Wu Ruiqi","year":"2023","unstructured":"Ruiqi Wu, Liangyu Chen, Tong Yang, Chunle Guo, Chongyi Li, and Xiangyu Zhang. 2023. LAMP: Learn A Motion Pattern for Few-Shot-Based Video Genera- tion. arXiv preprint arXiv:2310.10769 (2023)."},{"key":"e_1_3_2_2_65_1","volume-title":"Multi-object Video Generation from Single Frame Layouts. arXiv preprint arXiv:2305.03983","author":"Wu Yang","year":"2023","unstructured":"Yang Wu, Zhibin Liu, Hefeng Wu, and Liang Lin. 2023. Multi-object Video Generation from Single Frame Layouts. arXiv preprint arXiv:2305.03983 (2023)."},{"key":"e_1_3_2_2_66_1","volume-title":"Parameter-Efficient Fine-Tuning for Pre-Trained Vision Models: A Survey. arXiv preprint arXiv:2402.02242","author":"Xin Yi","year":"2024","unstructured":"Yi Xin, Siqi Luo, Haodi Zhou, Junlong Du, Xiaohong Liu, Yue Fan, Qing Li, and Yuntao Du. 2024. Parameter-Efficient Fine-Tuning for Pre-Trained Vision Models: A Survey. arXiv preprint arXiv:2402.02242 (2024)."},{"key":"e_1_3_2_2_67_1","volume-title":"DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors. arXiv preprint arXiv:2310.12190","author":"Xing Jinbo","year":"2023","unstructured":"Jinbo Xing, Menghan Xia, Yong Zhang, Haoxin Chen, Xintao Wang, Tien-Tsin Wong, and Ying Shan. 2023. DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors. arXiv preprint arXiv:2310.12190 (2023)."},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"e_1_3_2_2_69_1","volume-title":"Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion. arXiv preprint arXiv:2402.03162","author":"Yang Shiyuan","year":"2024","unstructured":"Shiyuan Yang, Liang Hou, Haibin Huang, Chongyang Ma, Pengfei Wan, Di Zhang, Xiaodong Chen, and Jing Liao. 2024. Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion. arXiv preprint arXiv:2402.03162 (2024)."},{"key":"e_1_3_2_2_70_1","volume-title":"Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. arXiv preprint arXiv:2306.07954","author":"Yang Shuai","year":"2023","unstructured":"Shuai Yang, Yifan Zhou, Ziwei Liu, and Chen Change Loy. 2023. Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. arXiv preprint arXiv:2306.07954 (2023)."},{"key":"e_1_3_2_2_71_1","volume-title":"Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory. arXiv preprint arXiv:2308.08089","author":"Yin Shengming","year":"2023","unstructured":"Shengming Yin, Chenfei Wu, Jian Liang, Jie Shi, Houqiang Li, Gong Ming, and Nan Duan. 2023. Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory. arXiv preprint arXiv:2308.08089 (2023)."},{"key":"e_1_3_2_2_72_1","volume-title":"Remove, Add, and Change Video Content with Auto-Generated Narratives. arXiv preprint arXiv:2405.18406","author":"Yoon Jaehong","year":"2024","unstructured":"Jaehong Yoon, Shoubin Yu, and Mohit Bansal. 2024. RACCooN: Remove, Add, and Change Video Content with Auto-Generated Narratives. arXiv preprint arXiv:2405.18406 (2024)."},{"key":"e_1_3_2_2_73_1","volume-title":"AnimateZero: Video Diffusion Models are Zero-Shot Image Animators. arXiv preprint arXiv:2312.03793","author":"Yu Jiwen","year":"2023","unstructured":"Jiwen Yu, Xiaodong Cun, Chenyang Qi, Yong Zhang, Xintao Wang, Ying Shan, and Jian Zhang. 2023. AnimateZero: Video Diffusion Models are Zero-Shot Image Animators. arXiv preprint arXiv:2312.03793 (2023)."},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00190"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_2_76_1","volume-title":"I2VGen-XL: High-Quality Image- to-Video Synthesis via Cascaded Diffusion Models. arXiv preprint arXiv:2311.04145","author":"Zhang Shiwei","year":"2023","unstructured":"Shiwei Zhang, Jiayu Wang, Yingya Zhang, Kang Zhao, Hangjie Yuan, Zhiwu Qin, Xiang Wang, Deli Zhao, and Jingren Zhou. 2023. I2VGen-XL: High-Quality Image- to-Video Synthesis via Cascaded Diffusion Models. arXiv preprint arXiv:2311.04145 (2023)."},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR59079.2023.00027"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01068"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00878"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_24"},{"key":"e_1_3_2_2_81_1","volume-title":"David Junhao Zhang, Jiawei Liu, Weijia Wu, Jussi Keppo, and Mike Zheng Shou.","author":"Zhao Rui","year":"2023","unstructured":"Rui Zhao, Yuchao Gu, Jay Zhangjie Wu, David Junhao Zhang, Jiawei Liu, Weijia Wu, Jussi Keppo, and Mike Zheng Shou. 2023. MotionDirector: Motion Cus- tomization of Text-to-Video Diffusion Models. arXiv preprint arXiv:2310.08465 (2023)."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02154"},{"key":"e_1_3_2_2_83_1","volume-title":"Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018","author":"Zhou Daquan","year":"2022","unstructured":"Daquan Zhou, Weimin Wang, Hanshu Yan, Weiwei Lv, Yizhe Zhu, and Jiashi Feng. 2022. Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)."},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00841"},{"volume-title":"Pro- ceedings of the 31st ACM International Conference on Multimedia. 8294--8303","author":"Zhuo Junbao","key":"e_1_3_2_2_85_1","unstructured":"Junbao Zhuo, Xingyu Zhao, Shuhui Wang, Huimin Ma, and Qingming Huang. 2023. Synthesizing Videos from Images for Image-to-Video Adaptation. In Pro- ceedings of the 31st ACM International Conference on Multimedia. 8294--8303"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681394","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681394","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:44Z","timestamp":1750295864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681394"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":84,"alternative-id":["10.1145\/3664647.3681394","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681394","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}