{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:28:09Z","timestamp":1780392489001,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,13]],"date-time":"2024-07-13T00:00:00Z","timestamp":1720828800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"RGC General Research Fund","award":["CityU 11216122"],"award-info":[{"award-number":["CityU 11216122"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,13]]},"DOI":"10.1145\/3641519.3657481","type":"proceedings-article","created":{"date-parts":[[2024,7,12]],"date-time":"2024-07-12T10:39:28Z","timestamp":1720780768000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":61,"title":["Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8213-5803","authenticated-orcid":false,"given":"Shiyuan","family":"Yang","sequence":"first","affiliation":[{"name":"Tianjin University, China and City University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4904-5694","authenticated-orcid":false,"given":"Liang","family":"Hou","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7787-6428","authenticated-orcid":false,"given":"Haibin","family":"Huang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8243-9513","authenticated-orcid":false,"given":"Chongyang","family":"Ma","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-565X","authenticated-orcid":false,"given":"Pengfei","family":"Wan","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5475-2728","authenticated-orcid":false,"given":"Di","family":"Zhang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1624-2680","authenticated-orcid":false,"given":"Xiaodong","family":"Chen","sequence":"additional","affiliation":[{"name":"Tianjin University, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7014-5377","authenticated-orcid":false,"given":"Jing","family":"Liao","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,7,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"ediffi: Text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324","author":"Balaji Yogesh","year":"2022","unstructured":"Yogesh Balaji, Seungjun Nah, Xun Huang, Arash Vahdat, Jiaming Song, Karsten Kreis, Miika Aittala, Timo Aila, Samuli Laine, Bryan Catanzaro, 2022. ediffi: Text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)."},{"key":"e_1_3_2_2_2_1","volume-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127","author":"Blattmann Andreas","year":"2023","unstructured":"Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, 2023a. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"e_1_3_2_2_6_1","volume-title":"MagicDance: Realistic Human Dance Video Generation with Motions & Facial Expressions Transfer. arXiv preprint arXiv:2311.12052","author":"Chang Di","year":"2023","unstructured":"Di Chang, Yichun Shi, Quankai Gao, Jessica Fu, Hongyi Xu, Guoxian Song, Qing Yan, Xiao Yang, and Mohammad Soleymani. 2023. MagicDance: Realistic Human Dance Video Generation with Motions & Facial Expressions Transfer. arXiv preprint arXiv:2311.12052 (2023)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"e_1_3_2_2_9_1","volume-title":"Motion-Conditioned Diffusion Model for Controllable Video Synthesis. arXiv preprint arXiv:2304.14404","author":"Chen Tsai-Shien","year":"2023","unstructured":"Tsai-Shien Chen, Chieh\u00a0Hubert Lin, Hung-Yu Tseng, Tsung-Yi Lin, and Ming-Hsuan Yang. 2023a. Motion-Conditioned Diffusion Model for Controllable Video Synthesis. arXiv preprint arXiv:2304.14404 (2023)."},{"key":"e_1_3_2_2_10_1","volume-title":"Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arXiv preprint arXiv:2305.13840","author":"Chen Weifeng","year":"2023","unstructured":"Weifeng Chen, Jie Wu, Pan Xie, Hefeng Wu, Jiashi Li, Xin Xia, Xuefeng Xiao, and Liang Lin. 2023b. Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arXiv preprint arXiv:2305.13840 (2023)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.6084\/m9.figshare.24078045.v2"},{"key":"e_1_3_2_2_12_1","volume-title":"DragVideo: Interactive Drag-style Video Editing. arXiv preprint arXiv:2312.02216","author":"Deng Yufan","year":"2023","unstructured":"Yufan Deng, Ruida Wang, Yuhao Zhang, Yu-Wing Tai, and Chi-Keung Tang. 2023. DragVideo: Interactive Drag-style Video Editing. arXiv preprint arXiv:2312.02216 (2023)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_2_2_14_1","volume-title":"DreaMoving: A Human Video Generation Framework based on Diffusion Models. arXiv e-prints","author":"Feng Mengyang","year":"2023","unstructured":"Mengyang Feng, Jinlin Liu, Kai Yu, Yuan Yao, Zheng Hui, Xiefan Guo, Xianhui Lin, Haolan Xue, Chen Shi, Xiaowen Li, 2023. DreaMoving: A Human Video Generation Framework based on Diffusion Models. arXiv e-prints (2023), arXiv\u20132312."},{"key":"e_1_3_2_2_15_1","volume-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618","author":"Gal Rinon","year":"2022","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit\u00a0H Bermano, Gal Chechik, and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)."},{"key":"e_1_3_2_2_16_1","volume-title":"Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373","author":"Geyer Michal","year":"2023","unstructured":"Michal Geyer, Omer Bar-Tal, Shai Bagon, and Tali Dekel. 2023. Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373 (2023)."},{"key":"e_1_3_2_2_17_1","volume-title":"VideoSwap: Customized Video Subject Swapping with Interactive Semantic Point Correspondence. arXiv preprint arXiv:2312.02087","author":"Gu Yuchao","year":"2023","unstructured":"Yuchao Gu, Yipin Zhou, Bichen Wu, Licheng Yu, Jia-Wei Liu, Rui Zhao, Jay\u00a0Zhangjie Wu, David\u00a0Junhao Zhang, Mike\u00a0Zheng Shou, and Kevin Tang. 2023. VideoSwap: Customized Video Subject Swapping with Interactive Semantic Point Correspondence. arXiv preprint arXiv:2312.02087 (2023)."},{"key":"e_1_3_2_2_18_1","volume-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)."},{"key":"e_1_3_2_2_19_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_2_20_1","volume-title":"Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan\u00a0Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_2_21_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_22_1","volume-title":"Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey Gritsenko, Diederik\u00a0P Kingma, Ben Poole, Mohammad Norouzi, David\u00a0J Fleet, 2022a. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_2_23_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems 33 (2020), 6840\u20136851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_24_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_2_25_1","volume-title":"Video diffusion models. arXiv:2204.03458","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Tim Salimans, Alexey Gritsenko, William Chan, Mohammad Norouzi, and David\u00a0J Fleet. 2022b. Video diffusion models. arXiv:2204.03458 (2022)."},{"key":"e_1_3_2_2_26_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu J","year":"2021","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_2_27_1","volume-title":"Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:2311.17117","author":"Hu Li","year":"2023","unstructured":"Li Hu, Xin Gao, Peng Zhang, Ke Sun, Bang Zhang, and Liefeng Bo. 2023. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:2311.17117 (2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"PEEKABOO: Interactive Video Generation via Masked-Diffusion. arXiv preprint arXiv:2312.07509","author":"Jain Yash","year":"2023","unstructured":"Yash Jain, Anshul Nasery, Vibhav Vineet, and Harkirat Behl. 2023. PEEKABOO: Interactive Video Generation via Masked-Diffusion. arXiv preprint arXiv:2312.07509 (2023)."},{"key":"e_1_3_2_2_29_1","volume-title":"VMC: Video Motion Customization using Temporal Attention Adaption for Text-to-Video Diffusion Models. arXiv preprint arXiv:2312.00845","author":"Jeong Hyeonho","year":"2023","unstructured":"Hyeonho Jeong, Geon\u00a0Yeong Park, and Jong\u00a0Chul Ye. 2023. VMC: Video Motion Customization using Temporal Attention Adaption for Text-to-Video Diffusion Models. arXiv preprint arXiv:2312.00845 (2023)."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480546"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_2_34_1","volume-title":"Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499","author":"Liu Shilong","year":"2023","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, 2023a. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_2_35_1","volume-title":"Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761","author":"Liu Shaoteng","year":"2023","unstructured":"Shaoteng Liu, Yuechen Zhang, Wenbo Li, Zhe Lin, and Jiaya Jia. 2023b. Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761 (2023)."},{"key":"e_1_3_2_2_36_1","volume-title":"Directed diffusion: Direct control of object placement through attention guidance. arXiv preprint arXiv:2302.13153","author":"Ma Duo\u00a0Kurt","year":"2023","unstructured":"Wan-Duo\u00a0Kurt Ma, JP Lewis, W\u00a0Bastiaan Kleijn, and Thomas Leung. 2023. Directed diffusion: Direct control of object placement through attention guidance. arXiv preprint arXiv:2302.13153 (2023)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_2_39_1","volume-title":"T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453","author":"Mou Chong","year":"2023","unstructured":"Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"e_1_3_2_2_41_1","volume-title":"Codef: Content deformation fields for temporally consistent video processing. arXiv preprint arXiv:2308.07926","author":"Ouyang Hao","year":"2023","unstructured":"Hao Ouyang, Qiuyu Wang, Yuxi Xiao, Qingyan Bai, Juntao Zhang, Kecheng Zheng, Xiaowei Zhou, Qifeng Chen, and Yujun Shen. 2023. Codef: Content deformation fields for temporally consistent video processing. arXiv preprint arXiv:2308.07926 (2023)."},{"key":"e_1_3_2_2_42_1","volume-title":"Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535","author":"Qi Chenyang","year":"2023","unstructured":"Chenyang Qi, Xiaodong Cun, Yong Zhang, Chenyang Lei, Xintao Wang, Ying Shan, and Qifeng Chen. 2023. Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)."},{"key":"e_1_3_2_2_43_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_2_44_1","volume-title":"Proceedings, Part XI 16","author":"Rao Anyi","year":"2020","unstructured":"Anyi Rao, Jiaze Wang, Linning Xu, Xuekun Jiang, Qingqiu Huang, Bolei Zhou, and Dahua Lin. 2020. A unified framework for shot type classification based on subject centric lens. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI 16. Springer, 17\u201334."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_2_47_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily\u00a0L Denton, Kamyar Ghasemipour, Raphael Gontijo\u00a0Lopes, Burcu Karagol\u00a0Ayan, Tim Salimans, 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems 35 (2022), 36479\u201336494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00416"},{"key":"e_1_3_2_2_49_1","volume-title":"Videoflow: Exploiting temporal cues for multi-frame optical flow estimation. arXiv preprint arXiv:2303.08340","author":"Qin Hongwei","year":"2023","unstructured":"Xiaoyu Shi, Zhaoyang Huang, Weikang Bian, Dasong Li, Manyuan Zhang, Ka\u00a0Chun Cheung, Simon See, Hongwei Qin, Jifeng Dai, and Hongsheng Li. 2023. Videoflow: Exploiting temporal cues for multi-frame optical flow estimation. arXiv preprint arXiv:2303.08340 (2023)."},{"key":"e_1_3_2_2_50_1","volume-title":"Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_2_51_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_2_52_1","volume-title":"Emergent Correspondence from Image Diffusion. arXiv preprint arXiv:2306.03881","author":"Tang Luming","year":"2023","unstructured":"Luming Tang, Menglin Jia, Qianqian Wang, Cheng\u00a0Perng Phoo, and Bharath Hariharan. 2023. Emergent Correspondence from Image Diffusion. arXiv preprint arXiv:2306.03881 (2023)."},{"key":"e_1_3_2_2_53_1","volume-title":"Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717","author":"Unterthiner Thomas","year":"2018","unstructured":"Thomas Unterthiner, Sjoerd Van\u00a0Steenkiste, Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly. 2018. Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)."},{"key":"e_1_3_2_2_54_1","volume-title":"Autoencoder-based conditional optimal transport generative adversarial network for medical image generation. Visual Informatics","author":"Wang Jun","year":"2023","unstructured":"Jun Wang, Bohan Lei, Liya Ding, Xiaoyin Xu, Xianfeng Gu, and Min Zhang. 2023a. Autoencoder-based conditional optimal transport generative adversarial network for medical image generation. Visual Informatics (2023)."},{"key":"e_1_3_2_2_55_1","volume-title":"Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571","author":"Wang Jiuniu","year":"2023","unstructured":"Jiuniu Wang, Hangjie Yuan, Dayou Chen, Yingya Zhang, Xiang Wang, and Shiwei Zhang. 2023d. Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571 (2023)."},{"key":"e_1_3_2_2_56_1","volume-title":"DisCo: Disentangled Control for Realistic Human Dance Generation. arXiv preprint arXiv:2307.00040","author":"Wang Tan","year":"2023","unstructured":"Tan Wang, Linjie Li, Kevin Lin, Yuanhao Zhai, Chung-Ching Lin, Zhengyuan Yang, Hanwang Zhang, Zicheng Liu, and Lijuan Wang. 2023b. DisCo: Disentangled Control for Realistic Human Dance Generation. arXiv preprint arXiv:2307.00040 (2023)."},{"key":"e_1_3_2_2_57_1","volume-title":"Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599","author":"Wang Wen","year":"2023","unstructured":"Wen Wang, Kangyang Xie, Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen. 2023c. Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599 (2023)."},{"key":"e_1_3_2_2_58_1","unstructured":"Xiang Wang Hangjie Yuan Shiwei Zhang Dayou Chen Jiuniu Wang Yingya Zhang Yujun Shen Deli Zhao and Jingren Zhou. 2023f. VideoComposer: Compositional Video Synthesis with Motion Controllability. In Advances in Neural Information Processing Systems. 7594\u20137611."},{"key":"e_1_3_2_2_59_1","volume-title":"MotionCtrl: A Unified and Flexible Motion Controller for Video Generation. arXiv preprint arXiv:2312.03641","author":"Wang Zhouxia","year":"2023","unstructured":"Zhouxia Wang, Ziyang Yuan, Xintao Wang, Tianshui Chen, Menghan Xia, Ping Luo, and Ying Shan. 2023e. MotionCtrl: A Unified and Flexible Motion Controller for Video Generation. arXiv preprint arXiv:2312.03641 (2023)."},{"key":"e_1_3_2_2_60_1","volume-title":"Dreamvideo: Composing your dream videos with customized subject and motion. arXiv preprint arXiv:2312.04433","author":"Wei Yujie","year":"2023","unstructured":"Yujie Wei, Shiwei Zhang, Zhiwu Qing, Hangjie Yuan, Zhiheng Liu, Yu Liu, Yingya Zhang, Jingren Zhou, and Hongming Shan. 2023. Dreamvideo: Composing your dream videos with customized subject and motion. arXiv preprint arXiv:2312.04433 (2023)."},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_2_2_62_1","volume-title":"Lamp: Learn a motion pattern for few-shot-based video generation. arXiv preprint arXiv:2310.10769","author":"Wu Ruiqi","year":"2023","unstructured":"Ruiqi Wu, Liangyu Chen, Tong Yang, Chunle Guo, Chongyi Li, and Xiangyu Zhang. 2023a. Lamp: Learn a motion pattern for few-shot-based video generation. arXiv preprint arXiv:2310.10769 (2023)."},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_2_64_1","volume-title":"MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model. arXiv preprint arXiv:2311.16498","author":"Xu Zhongcong","year":"2023","unstructured":"Zhongcong Xu, Jianfeng Zhang, Jun\u00a0Hao Liew, Hanshu Yan, Jia-Wei Liu, Chenxu Zhang, Jiashi Feng, and Mike\u00a0Zheng Shou. 2023. MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model. arXiv preprint arXiv:2311.16498 (2023)."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00787"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612200"},{"key":"e_1_3_2_2_67_1","volume-title":"Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. arXiv preprint arXiv:2306.07954","author":"Yang Shuai","year":"2023","unstructured":"Shuai Yang, Yifan Zhou, Ziwei Liu, and Chen\u00a0Change Loy. 2023b. Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. arXiv preprint arXiv:2306.07954 (2023)."},{"key":"e_1_3_2_2_68_1","volume-title":"Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory. arXiv preprint arXiv:2308.08089","author":"Yin Shengming","year":"2023","unstructured":"Shengming Yin, Chenfei Wu, Jian Liang, Jie Shi, Houqiang Li, Gong Ming, and Nan Duan. 2023. Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory. arXiv preprint arXiv:2308.08089 (2023)."},{"key":"e_1_3_2_2_69_1","volume-title":"DiffMat: Latent diffusion models for image-guided material generation. Visual Informatics","author":"Yuan Liang","year":"2024","unstructured":"Liang Yuan, Dingkun Yan, Suguru Saito, and Issei Fujishiro. 2024. DiffMat: Latent diffusion models for image-guided material generation. Visual Informatics (2024)."},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_2_71_1","volume-title":"ControlVideo: Adding Conditional Control for One Shot Text-to-Video Editing. arXiv preprint arXiv:2305.17098","author":"Zhao Min","year":"2023","unstructured":"Min Zhao, Rongzhen Wang, Fan Bao, Chongxuan Li, and Jun Zhu. 2023b. ControlVideo: Adding Conditional Control for One Shot Text-to-Video Editing. arXiv preprint arXiv:2305.17098 (2023)."},{"key":"e_1_3_2_2_72_1","volume-title":"Motiondirector: Motion customization of text-to-video diffusion models. arXiv preprint arXiv:2310.08465","author":"Zhao Rui","year":"2023","unstructured":"Rui Zhao, Yuchao Gu, Jay\u00a0Zhangjie Wu, David\u00a0Junhao Zhang, Jiawei Liu, Weijia Wu, Jussi Keppo, and Mike\u00a0Zheng Shou. 2023a. Motiondirector: Motion customization of text-to-video diffusion models. arXiv preprint arXiv:2310.08465 (2023)."},{"key":"e_1_3_2_2_73_1","volume-title":"Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018","author":"Zhou Daquan","year":"2022","unstructured":"Daquan Zhou, Weimin Wang, Hanshu Yan, Weiwei Lv, Yizhe Zhu, and Jiashi Feng. 2022. Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)."}],"event":{"name":"SIGGRAPH '24: Special Interest Group on Computer Graphics and Interactive Techniques Conference","location":"Denver CO USA","acronym":"SIGGRAPH '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641519.3657481","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3641519.3657481","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:50Z","timestamp":1750291550000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641519.3657481"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,13]]},"references-count":73,"alternative-id":["10.1145\/3641519.3657481","10.1145\/3641519"],"URL":"https:\/\/doi.org\/10.1145\/3641519.3657481","relation":{},"subject":[],"published":{"date-parts":[[2024,7,13]]},"assertion":[{"value":"2024-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}