{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:03:21Z","timestamp":1750309401511,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSFC","award":["#62172279 # 61932020"],"award-info":[{"award-number":["#62172279 # 61932020"]}]},{"name":"Program of Shanghai Academic Research Leader"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681147","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"3761-3770","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HeroMaker: Human-centric Video Editing with Motion Priors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6483-0454","authenticated-orcid":false,"given":"Shiyu","family":"Liu","sequence":"first","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4831-8418","authenticated-orcid":false,"given":"Zibo","family":"Zhao","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1183-5459","authenticated-orcid":false,"given":"Yihao","family":"Zhi","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong (Shenzhen), Shenzhen, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6405-1712","authenticated-orcid":false,"given":"Yiqun","family":"Zhao","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7495-2406","authenticated-orcid":false,"given":"Binbin","family":"Huang","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1699-1887","authenticated-orcid":false,"given":"Shuo","family":"Wang","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8835-7568","authenticated-orcid":false,"given":"Ruoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3404-7981","authenticated-orcid":false,"given":"Michael","family":"Xuan","sequence":"additional","affiliation":[{"name":"UniDT Co. Ltd, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8410-2976","authenticated-orcid":false,"given":"Zhengxin","family":"Li","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1626-2040","authenticated-orcid":false,"given":"Shenghua","family":"Gao","sequence":"additional","affiliation":[{"name":"The University of Hong Kong &amp; HKU Shanghai Advanced Computing and Intelligent Technology Research Institute, HKSAR, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00875"},{"key":"e_1_3_2_1_2_1","volume-title":"Blended Latent Diffusion. arXiv preprint arXiv:2206.02779","author":"Avrahami Omri","year":"2022","unstructured":"Omri Avrahami, Ohad Fried, and Dani Lischinski. 2022. Blended Latent Diffusion. arXiv preprint arXiv:2206.02779 (2022). 
arXiv:2206.02779."},{"key":"e_1_3_2_1_3_1","volume-title":"SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field. In The IEEE\/CVF Computer Vision and Pattern Recognition Conference (CVPR).","author":"Bao Chong","year":"2023","unstructured":"Chong Bao, Yinda Zhang, Bangbang Yang, Tianxing Fan, Zesong Yang, Hujun Bao, Guofeng Zhang, and Zhaopeng Cui. 2023. SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field. In The IEEE\/CVF Computer Vision and Pattern Recognition Conference (CVPR)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"e_1_3_2_1_5_1","volume-title":"Sanja Fidler, and Karsten Kreis.","author":"Blattmann Andreas","year":"2023","unstructured":"Andreas Blattmann, Robin Rombach, Huan Ling, Tim Dockhorn, Seung Wook Kim, Sanja Fidler, and Karsten Kreis. 2023. Align your Latents: High-Resolution Video Synthesis with Latent Diffusion Models. arxiv: 2304.08818 [cs.CV]"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"e_1_3_2_1_7_1","volume-title":"StableVideo: Text-driven Consistency-aware Diffusion Video Editing. arXiv preprint arXiv:2308.09592","author":"Chai Wenhao","year":"2023","unstructured":"Wenhao Chai, Xun Guo, Gaoang Wang, and Yan Lu. 2023. StableVideo: Text-driven Consistency-aware Diffusion Video Editing. arXiv preprint arXiv:2308.09592 (2023)."},{"key":"e_1_3_2_1_8_1","unstructured":"Weifeng Chen Jie Wu Pan Xie Hefeng Wu Jiashi Li Xin Xia Xuefeng Xiao and Liang Lin. 2023. Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arxiv: 2305.13840 [cs.CV]"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01139"},{"key":"e_1_3_2_1_10_1","volume-title":"Segment and Track Anything. 
arXiv preprint arXiv:2305.06558","author":"Cheng Yangming","year":"2023","unstructured":"Yangming Cheng, Liulei Li, Yuanyou Xu, Xiaodi Li, Zongxin Yang, Wenguan Wang, and Yi Yang. 2023. Segment and Track Anything. arXiv preprint arXiv:2305.06558 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the International Conference on Computer Vision (ICCV).","author":"Cong Yuren","year":"2023","unstructured":"Yuren Cong, Mengmeng Xu, Christian Simon, Shoufa Chen, Jiawei Ren, Yanping Xie, Juan-Manuel Perez-Rua, Bodo Rosenhahn, Tao Xiang, and Sen He. 2023. FLATTEN: Optical Flow-guided ATTENtion for Consistent Text-to-Video Editing. In Proceedings of the International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_12_1","unstructured":"MMPose Contributors. 2020. OpenMMLab Pose Estimation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmpose."},{"key":"e_1_3_2_1_13_1","unstructured":"Paul Couairon Cl\u00e9ment Rambour Jean-Emmanuel Haugeard and Nicolas Thome. 2023. VidEdit: Zero-Shot and Spatially Aware Text-Driven Video Editing. arxiv: 2306.08707 [cs.CV]"},{"key":"e_1_3_2_1_14_1","volume-title":"Barr","author":"Desbrun Mathieu","year":"1999","unstructured":"Mathieu Desbrun, Mark Meyer, Peter Schr\u00f6der, and Alan H. Barr. 1999. Implicit Fairing of Irregular Meshes using Diffusion and Curvature Flow. In SIGGRAPH."},{"key":"e_1_3_2_1_15_1","unstructured":"Zhongjie Duan Lizhou You Chengyu Wang Cen Chen Ziheng Wu Weining Qian and Jun Huang. 2023. DiffSynth: Latent In-Iteration Deflickering for Realistic Video Synthesis. arxiv: 2308.03463 [cs.CV]"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Patrick Esser Johnathan Chiu Parmida Atighehchian Jonathan Granskog and Anastasis Germanidis. 2023. Structure and Content-Guided Video Synthesis with Diffusion Models. 
arxiv: 2302.03011 [cs.CV]","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_2_1_17_1","unstructured":"Ruoyu Feng Wenming Weng Yanhui Wang Yuhui Yuan Jianmin Bao Chong Luo Zhibo Chen and Baining Guo. 2023. CCEdit: Creative and Controllable Video Editing via Diffusion Models. arxiv: 2309.16496 [cs.CV]"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555423"},{"key":"e_1_3_2_1_19_1","volume-title":"TokenFlow: Consistent Diffusion Features for Consistent Video Editing. arXiv preprint arxiv:2307.10373","author":"Geyer Michal","year":"2023","unstructured":"Michal Geyer, Omer Bar-Tal, Shai Bagon, and Tali Dekel. 2023. TokenFlow: Consistent Diffusion Features for Consistent Video Editing. arXiv preprint arxiv:2307.10373 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01236"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Learning Representations","author":"Guo Yuwei","year":"2024","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2024. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. International Conference on Learning Representations (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"Yingqing He Menghan Xia Haoxin Chen Xiaodong Cun Yuan Gong Jinbo Xing Yong Zhang Xintao Wang Chao Weng Ying Shan and Qifeng Chen. 2023. Animate-A-Story: Storytelling with Retrieval-Augmented Video Generation. arxiv: 2307.06940 [cs.CV]"},{"key":"e_1_3_2_1_23_1","volume-title":"Prompt-to-Prompt Image Editing with Cross Attention Control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-Prompt Image Editing with Cross Attention Control. 
arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Imagen Video: High Definition Video Generation with Diffusion Models. arxiv: 2210.02303 [cs.CV]","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey Gritsenko, Diederik P. Kingma, Ben Poole, Mohammad Norouzi, David J. Fleet, and Tim Salimans. 2022. Imagen Video: High Definition Video Generation with Diffusion Models. arxiv: 2210.02303 [cs.CV]"},{"key":"e_1_3_2_1_25_1","volume-title":"Video diffusion models. arXiv:2204.03458","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Tim Salimans, Alexey Gritsenko, William Chan, Mohammad Norouzi, and David J Fleet. 2022. Video diffusion models. arXiv:2204.03458 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:2311.17117","author":"Hu Li","year":"2023","unstructured":"Li Hu, Xin Gao, Peng Zhang, Ke Sun, Bang Zhang, and Liefeng Bo. 2023. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:2311.17117 (2023)."},{"key":"e_1_3_2_1_27_1","unstructured":"Zhihao Hu and Dong Xu. 2023. VideoControlNet: A Motion-Guided Video-to-Video Translation Framework by Using Diffusion Model with ControlNet. arxiv: 2307.14073 [cs.CV]"},{"key":"e_1_3_2_1_28_1","volume-title":"Oliver Wang, and Joon-Young Lee.","author":"Huang Jiahui","year":"2023","unstructured":"Jiahui Huang, Leonid Sigal, Kwang Moo Yi, Oliver Wang, and Joon-Young Lee. 2023. INVE: Interactive Neural Video Editing. 
arxiv: 2307.07663 [cs.CV]"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00552"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480546"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00411"},{"key":"e_1_3_2_1_32_1","volume-title":"Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators. arXiv preprint arXiv:2303.13439","author":"Khachatryan Levon","year":"2023","unstructured":"Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Henschel, Zhangyang Wang, Shant Navasardyan, and Humphrey Shi. 2023. Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators. arXiv preprint arXiv:2303.13439 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Yi-Ting Chen, Elizabeth Qiu, and Jia-Bin Huang.","author":"Lee Yao-Chih","year":"2023","unstructured":"Yao-Chih Lee, Ji-Ze Genevieve Jang, Yi-Ting Chen, Elizabeth Qiu, and Jia-Bin Huang. 2023. Shape-aware Text-driven Layered Video Editing. arxiv: 2301.13173 [cs.CV]"},{"volume-title":"International Conference on 3D Vision (3DV).","author":"Liao Tingting","key":"e_1_3_2_1_34_1","unstructured":"Tingting Liao, Hongwei Yi, Yuliang Xiu, Jiaxiang Tang, Yangyi Huang, Justus Thies, and Michael J. Black. 2024. TADA! Text to Animatable Digital Avatars. In International Conference on 3D Vision (3DV)."},{"key":"e_1_3_2_1_35_1","unstructured":"Zhenyi Liao and Zhijie Deng. 2023. LOVECon: Text-driven Training-Free Long Video Editing with ControlNet. arxiv: 2310.09711 [cs.CV]"},{"key":"e_1_3_2_1_36_1","unstructured":"Jun Hao Liew Hanshu Yan Jianfeng Zhang Zhongcong Xu and Jiashi Feng. 2023. MagicEdit: High-Fidelity and Temporally Coherent Video Editing. 
In arXiv."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02027"},{"key":"e_1_3_2_1_38_1","volume-title":"Weijia Mao, Yuchao Gu, Rui Zhao, Jussi Keppo, Ying Shan, and Mike Zheng Shou.","author":"Liu Jia-Wei","year":"2023","unstructured":"Jia-Wei Liu, Yan-Pei Cao, Jay Zhangjie Wu, Weijia Mao, Yuchao Gu, Rui Zhao, Jussi Keppo, Ying Shan, and Mike Zheng Shou. 2023. DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion-and View-Change Human-Centric Video Editing. arXiv preprint arXiv:2310.10624 (2023)."},{"key":"e_1_3_2_1_39_1","unstructured":"Shaoteng Liu Yuechen Zhang Wenbo Li Zhe Lin and Jiaya Jia. 2023. Video-P2P: Video Editing with Cross-attention Control."},{"key":"e_1_3_2_1_40_1","volume-title":"Appearance Transfer and Novel View Synthesis. In The IEEE International Conference on Computer Vision (ICCV).","author":"Liu Wen","year":"2019","unstructured":"Wen Liu, Zhixin Piao, Min Jie, Wenhan Luo, Lin Ma, and Shenghua Gao. 2019. Liquid Warping GAN: A Unified Framework for Human Motion Imitation, Appearance Transfer and Novel View Synthesis. In The IEEE International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_41_1","volume-title":"Liquid warping GAN with attention: A unified framework for human image synthesis","author":"Liu Wen","year":"2021","unstructured":"Wen Liu, Zhixin Piao, Zhi Tu, Wenhan Luo, Lin Ma, and Shenghua Gao. 2021. Liquid warping GAN with attention: A unified framework for human image synthesis. IEEE Transactions on Pattern Analysis and Machine Intelligence (2021)."},{"key":"e_1_3_2_1_42_1","unstructured":"Zhengxiong Luo Dayou Chen Yingya Zhang Yan Huang Liang Wang Yujun Shen Deli Zhao Jingren Zhou and Tieniu Tan. 2023. VideoFusion: Decomposed Diffusion Models for High-Quality Video Generation. arxiv: 2303.08320 [cs.CV]"},{"key":"e_1_3_2_1_43_1","volume-title":"SKED: Sketch-guided Text-based 3D Editing. 
ICCV","author":"Mikaeili Aryan","year":"2023","unstructured":"Aryan Mikaeili, Or Perel, Mehdi Safaee, Daniel Cohen-Or, and Ali Mahdavi-Amiri. 2023. SKED: Sketch-guided Text-based 3D Editing. ICCV (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Yossi Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen.","author":"Molad Eyal","year":"2023","unstructured":"Eyal Molad, Eliahu Horwitz, Dani Valevski, Alex Rav Acha, Yossi Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen. 2023. Dreamix: Video Diffusion Models are General Video Editors. arxiv: 2302.01329 [cs.CV]"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Andrew Nealen Takeo Igarashi Olga Sorkine and Marc Alexa. 2006. Laplacian Mesh Optimization. In GRAPHITE.","DOI":"10.1145\/1174429.1174494"},{"key":"e_1_3_2_1_46_1","volume-title":"CoDeF: Content Deformation Fields for Temporally Consistent Video Processing. arXiv preprint arXiv:2308.07926","author":"Ouyang Hao","year":"2023","unstructured":"Hao Ouyang, Qiuyu Wang, Yuxi Xiao, Qingyan Bai, Juntao Zhang, Kecheng Zheng, Xiaowei Zhou, Qifeng Chen, and Yujun Shen. 2023. CoDeF: Content Deformation Fields for Temporally Consistent Video Processing. arXiv preprint arXiv:2308.07926 (2023)."},{"volume-title":"Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR).","author":"Pavlakos Georgios","key":"e_1_3_2_1_47_1","unstructured":"Georgios Pavlakos, Vasileios Choutas, Nima Ghorbani, Timo Bolkart, Ahmed A. A. Osman, Dimitrios Tzionas, and Michael J. Black. 2019. Expressive Body Capture: 3D Hands, Face, and Body from a Single Image. In Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_48_1","volume-title":"FateZero: Fusing Attentions for Zero-shot Text-based Video Editing. arXiv:2303.09535","author":"Qi Chenyang","year":"2023","unstructured":"Chenyang Qi, Xiaodong Cun, Yong Zhang, Chenyang Lei, Xintao Wang, Ying Shan, and Qifeng Chen. 2023. 
FateZero: Fusing Attentions for Zero-shot Text-based Video Editing. arXiv:2303.09535 (2023)."},{"key":"e_1_3_2_1_49_1","unstructured":"Bosheng Qin Juncheng Li Siliang Tang Tat-Seng Chua and Yueting Zhuang. 2023. InstructVid2Vid: Controllable Video Editing with Natural Language Instructions. arxiv: 2305.12328 [cs.CV]"},{"key":"e_1_3_2_1_50_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv preprint arXiv:2103.00020 (2021)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Ruizhi Shao Jingxiang Sun Cheng Peng Zerong Zheng Boyao Zhou Hongwen Zhang and Yebin Liu. 2024. Control4D: Efficient 4D Portrait Editing with Text. (2024).","DOI":"10.1109\/CVPR52733.2024.00436"},{"key":"e_1_3_2_1_52_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni Devi Parikh Sonal Gupta and Yaniv Taigman. 2022. Make-A-Video: Text-to-Video Generation without Text-Video Data. arxiv: 2209.14792 [cs.CV]"},{"key":"e_1_3_2_1_53_1","volume-title":"Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen.","author":"Wang Wen","year":"2023","unstructured":"Wen Wang, Kangyang Xie, Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen. 2023. Zero-Shot Video Editing Using Off-The-Shelf Image Diffusion Models. 
arXiv preprint arXiv:2303.17599 (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01573"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_2_1_56_1","volume-title":"SimDA: Simple Diffusion Adapter for Efficient Video Generation. arXiv preprint arXiv:2308.09710","author":"Xing Zhen","year":"2023","unstructured":"Zhen Xing, Qi Dai, Han Hu, Zuxuan Wu, and Yu-Gang Jiang. 2023. SimDA: Simple Diffusion Adapter for Efficient Video Generation. arXiv preprint arXiv:2308.09710 (2023)."},{"key":"e_1_3_2_1_57_1","unstructured":"Zhongcong Xu Jianfeng Zhang Jun Hao Liew Hanshu Yan Jia-Wei Liu Chenxu Zhang Jiashi Feng and Mike Zheng Shou. [n. d.]. MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model."},{"key":"e_1_3_2_1_58_1","volume-title":"Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. In ACM SIGGRAPH Asia Conference Proceedings.","author":"Yang Shuai","year":"2023","unstructured":"Shuai Yang, Yifan Zhou, Ziwei Liu, and Chen Change Loy. 2023. Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation. In ACM SIGGRAPH Asia Conference Proceedings."},{"key":"e_1_3_2_1_59_1","unstructured":"Shengming Yin Chenfei Wu Huan Yang Jianfeng Wang Xiaodong Wang Minheng Ni Zhengyuan Yang Linjie Li Shuguang Liu Fan Yang Jianlong Fu Gong Ming Lijuan Wang Zicheng Liu Houqiang Li and Nan Duan. 2023. NUWA-XL: Diffusion over Diffusion for eXtremely Long Video Generation. arxiv: 2303.12346 [cs.CV]"},{"key":"e_1_3_2_1_60_1","volume-title":"Jia-Wei Liu, Rui Zhao, Lingmin Ran, Yuchao Gu, Difei Gao, and Mike Zheng Shou.","author":"Zhang David Junhao","year":"2023","unstructured":"David Junhao Zhang, Jay Zhangjie Wu, Jia-Wei Liu, Rui Zhao, Lingmin Ran, Yuchao Gu, Difei Gao, and Mike Zheng Shou. 2023. Show-1: Marrying Pixel and Latent Diffusion Models for Text-to-Video Generation. 
arXiv preprint arXiv:2309.15818 (2023)."},{"key":"e_1_3_2_1_61_1","unstructured":"Jianfeng Zhang Hanshu Yan Zhongcong Xu Jiashi Feng and Jun Hao Liew. 2023 d. MagicAvatar: Multi-modal Avatar Generation and Animation. In arXiv."},{"key":"e_1_3_2_1_62_1","unstructured":"Lvmin Zhang Anyi Rao and Maneesh Agrawala. [n. d.]. Adding Conditional Control to Text-to-Image Diffusion Models."},{"key":"e_1_3_2_1_63_1","unstructured":"Shangzhan Zhang Sida Peng Yinji ShenTu Qing Shuai Tianrun Chen Kaicheng Yu Hujun Bao and Xiaowei Zhou. 2023. Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields. arxiv: 2307.12909 [cs.CV]"},{"key":"e_1_3_2_1_64_1","volume-title":"ControlVideo: Training-free Controllable Text-to-Video Generation. arXiv preprint arXiv:2305.13077","author":"Zhang Yabo","year":"2023","unstructured":"Yabo Zhang, Yuxiang Wei, Dongsheng Jiang, Xiaopeng Zhang, Wangmeng Zuo, and Qi Tian. 2023. ControlVideo: Training-free Controllable Text-to-Video Generation. arXiv preprint arXiv:2305.13077 (2023)."},{"key":"e_1_3_2_1_65_1","unstructured":"Daquan Zhou Weimin Wang Hanshu Yan Weiwei Lv Yizhe Zhu and Jiashi Feng. 2023. MagicVideo: Efficient Video Generation With Latent Diffusion Models. arxiv: 2211.11018 [cs.CV]"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3001644"},{"key":"e_1_3_2_1_67_1","volume-title":"European Conference on Computer Vision (ECCV).","author":"Zhu Shenhao","year":"2024","unstructured":"Shenhao Zhu, Junming Leo Chen, Zuozhuo Dai, Yinghui Xu, Xun Cao, Yao Yao, Hao Zhu, and Siyu Zhu. 2024. Champ: Controllable and Consistent Human Image Animation with 3D Parametric Guidance. 
In European Conference on Computer Vision (ECCV)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681147","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681147","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:53Z","timestamp":1750294673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681147"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":67,"alternative-id":["10.1145\/3664647.3681147","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681147","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}