{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:23:53Z","timestamp":1780392233277,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":82,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,13]],"date-time":"2024-07-13T00:00:00Z","timestamp":1720828800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,13]]},"DOI":"10.1145\/3641519.3657497","type":"proceedings-article","created":{"date-parts":[[2024,7,12]],"date-time":"2024-07-12T10:39:28Z","timestamp":1720780768000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":82,"title":["Motion-I2V: Consistent and Controllable Image-to-Video Generation with Explicit Motion Modeling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3696-4442","authenticated-orcid":false,"given":"Xiaoyu","family":"Shi","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7688-1471","authenticated-orcid":false,"given":"Zhaoyang","family":"Huang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1323-4933","authenticated-orcid":false,"given":"Fu-Yun","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9986-3348","authenticated-orcid":false,"given":"Weikang","family":"Bian","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6766-7529","authenticated-orcid":false,"given":"Dasong","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1704-4144","authenticated-orcid":false,"given":"Yi","family":"Zhang","sequence":"additional","affiliation":[{"name":"SenseTime, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2148-1085","authenticated-orcid":false,"given":"Manyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5610-1298","authenticated-orcid":false,"given":"Ka Chun","family":"Cheung","sequence":"additional","affiliation":[{"name":"NVIDIA, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8310-7820","authenticated-orcid":false,"given":"Simon","family":"See","sequence":"additional","affiliation":[{"name":"NVIDIA, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7498-6755","authenticated-orcid":false,"given":"Hongwei","family":"Qin","sequence":"additional","affiliation":[{"name":"SenseTime, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5037-3695","authenticated-orcid":false,"given":"Jifeng","family":"Dai","sequence":"additional","affiliation":[{"name":"Tsinghua University, China, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2664-7975","authenticated-orcid":false,"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,7,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance Editing. arXiv preprint arXiv:2402.13185","author":"Bai Jianhong","year":"2024","unstructured":"Jianhong Bai, Tianyu He, Yuchi Wang, Junliang Guo, Haoji Hu, Zuozhu Liu, and Jiang Bian. 2024. UniEdit: A Unified Tuning-Free Framework for Video Motion and Appearance Editing. arXiv preprint arXiv:2402.13185 (2024)."},{"key":"e_1_3_2_2_2_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE International Conference on Computer Vision.","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_2_3_1","volume-title":"A framework for the robust estimation of optical flow. In 1993 (4th) International Conference on Computer Vision","author":"Black J","unstructured":"Michael\u00a0J Black and Padmanabhan Anandan. 1993. A framework for the robust estimation of optical flow. In 1993 (4th) International Conference on Computer Vision. IEEE, 231\u2013236."},{"key":"e_1_3_2_2_4_1","volume-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127","author":"Blattmann Andreas","year":"2023","unstructured":"Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, 2023a. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00513"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_2_7_1","volume-title":"International journal of computer vision 61, 3","author":"Bruhn Andr\u00e9s","year":"2005","unstructured":"Andr\u00e9s Bruhn, Joachim Weickert, and Christoph Schn\u00f6rr. 2005. Lucas\/Kanade meets Horn\/Schunck: Combining local and global optic flow methods. International journal of computer vision 61, 3 (2005), 211\u2013231."},{"key":"e_1_3_2_2_8_1","volume-title":"Motion-Zero: Zero-Shot Moving Object Control Framework for Diffusion-Based Video Generation. arXiv preprint arXiv:2401.10150","author":"Chen Changgu","year":"2024","unstructured":"Changgu Chen, Junwei Shu, Lianggangxu Chen, Gaoqi He, Changbo Wang, and Yang Li. 2024. Motion-Zero: Zero-Shot Moving Object Control Framework for Diffusion-Based Video Generation. arXiv preprint arXiv:2401.10150 (2024)."},{"key":"e_1_3_2_2_9_1","unstructured":"Zuozhuo Dai Zhenghao Zhang Yao Yao Bingxue Qiu Siyu Zhu Long Qin and Weizhi Wang. 2023. Fine-Grained Open Domain Image Animation with Motion Guidance. arxiv:2311.12886\u00a0[cs.CV]"},{"key":"e_1_3_2_2_10_1","volume-title":"NICE: Non-linear Independent Components Estimation. In 3rd International Conference on Learning Representations, ICLR","author":"Dinh Laurent","year":"2015","unstructured":"Laurent Dinh, David Krueger, and Yoshua Bengio. 2015. NICE: Non-linear Independent Components Estimation. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Workshop Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1410.8516"},{"key":"e_1_3_2_2_11_1","volume-title":"TAP-Vid: A Benchmark for Tracking Any Point in a Video. arXiv preprint arXiv:2211.03726","author":"Doersch Carl","year":"2022","unstructured":"Carl Doersch, Ankush Gupta, Larisa Markeeva, Adri\u00e0 Recasens, Lucas Smaira, Yusuf Aytar, Jo\u00e3o Carreira, Andrew Zisserman, and Yi Yang. 2022. TAP-Vid: A Benchmark for Tracking Any Point in a Video. arXiv preprint arXiv:2211.03726 (2022)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.316"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3272127.3275043"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3272127.3275043"},{"key":"e_1_3_2_2_15_1","volume-title":"Advances in Neural Information Processing Systems, Z.\u00a0Ghahramani, M.\u00a0Welling, C.\u00a0Cortes, N.\u00a0Lawrence, and K","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative Adversarial Nets. In Advances in Neural Information Processing Systems, Z.\u00a0Ghahramani, M.\u00a0Welling, C.\u00a0Cortes, N.\u00a0Lawrence, and K.Q. Weinberger (Eds.). Vol.\u00a027. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2014\/file\/5ca3e9b122f61f8f06494c97b1afccf3-Paper.pdf"},{"key":"e_1_3_2_2_16_1","volume-title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. arXiv preprint arXiv:2307.04725","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, and Bo Dai. 2023. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. arXiv preprint arXiv:2307.04725 (2023)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_4"},{"key":"e_1_3_2_2_18_1","volume-title":"Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv abs\/2210.02303","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey\u00a0A. Gritsenko, Diederik\u00a0P. Kingma, Ben Poole, David\u00a0J. Fleet, and Tim Salimans. 2022. Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv abs\/2210.02303 (2022). https:\/\/api.semanticscholar.org\/CorpusID:252715883"},{"key":"e_1_3_2_2_19_1","volume-title":"Denoising Diffusion Probabilistic Models. arXiv preprint arxiv:2006.11239","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. arXiv preprint arxiv:2006.11239 (2020)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00575"},{"key":"e_1_3_2_2_21_1","volume-title":"Determining optical flow. Artificial intelligence 17, 1-3","author":"Horn KP","year":"1981","unstructured":"Berthold\u00a0KP Horn and Brian\u00a0G Schunck. 1981. Determining optical flow. Artificial intelligence 17, 1-3 (1981), 185\u2013203."},{"key":"e_1_3_2_2_22_1","volume-title":"Region-aware diffusion for zero-shot text-driven image editing. arXiv preprint arXiv:2302.11797","author":"Huang Nisha","year":"2023","unstructured":"Nisha Huang, Fan Tang, Weiming Dong, Tong-Yee Lee, and Changsheng Xu. 2023. Region-aware diffusion for zero-shot text-driven image editing. arXiv preprint arXiv:2302.11797 (2023)."},{"key":"e_1_3_2_2_23_1","volume-title":"FlowFormer: A Transformer Architecture for Optical Flow. arXiv preprint arXiv:2203.16194","author":"Huang Zhaoyang","year":"2022","unstructured":"Zhaoyang Huang, Xiaoyu Shi, Chao Zhang, Qiang Wang, Ka\u00a0Chun Cheung, Hongwei Qin, Jifeng Dai, and Hongsheng Li. 2022. FlowFormer: A Transformer Architecture for Optical Flow. arXiv preprint arXiv:2203.16194 (2022)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00936"},{"key":"e_1_3_2_2_25_1","volume-title":"A lightweight optical flow CNN\u2014Revisiting data fidelity and regularization","author":"Hui Tak-Wai","year":"2020","unstructured":"Tak-Wai Hui, Xiaoou Tang, and Chen\u00a0Change Loy. 2020. A lightweight optical flow CNN\u2014Revisiting data fidelity and regularization. IEEE transactions on pattern analysis and machine intelligence 43, 8 (2020), 2555\u20132569."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.179"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2500031"},{"key":"e_1_3_2_2_28_1","volume-title":"Cotracker: It is better to track together. arXiv preprint arXiv:2307.07635","author":"Karaev Nikita","year":"2023","unstructured":"Nikita Karaev, Ignacio Rocco, Benjamin Graham, Natalia Neverova, Andrea Vedaldi, and Christian Rupprecht. 2023. Cotracker: It is better to track together. arXiv preprint arXiv:2307.07635 (2023)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Johanna Karras Aleksander Holynski Ting-Chun Wang and Ira Kemelmacher-Shlizerman. 2023. DreamPose: Fashion Image-to-Video Synthesis via Stable Diffusion. arxiv:2304.06025\u00a0[cs.CV]","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"e_1_3_2_2_30_1","volume-title":"Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators. arXiv preprint arXiv:2303.13439","author":"Khachatryan Levon","year":"2023","unstructured":"Levon Khachatryan, Andranik Movsisyan, Vahram Tadevosyan, Roberto Henschel, Zhangyang Wang, Shant Navasardyan, and Humphrey Shi. 2023. Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators. arXiv preprint arXiv:2303.13439 (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Kingma and Max Welling","author":"P.","year":"2014","unstructured":"Diederik\u00a0P. Kingma and Max Welling. 2014. Auto-Encoding Variational Bayes. In 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1312.6114"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00446"},{"key":"e_1_3_2_2_33_1","unstructured":"Zhengqi Li Richard Tucker Noah Snavely and Aleksander Holynski. 2023b. Generative Image Dynamics. arxiv:2309.07906\u00a0[cs.CV]"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00532"},{"key":"e_1_3_2_2_35_1","unstructured":"Shaoteng Liu Yuechen Zhang Wenbo Li Zhe Lin and Jiaya Jia. 2023. Video-P2P: Video Editing with Cross-attention Control."},{"key":"e_1_3_2_2_36_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00365"},{"key":"e_1_3_2_2_38_1","volume-title":"Text-Guided Synthesis of Eulerian Cinemagraphs. arXiv preprint arXiv:2307.03190","author":"Mahapatra Aniruddha","year":"2023","unstructured":"Aniruddha Mahapatra, Aliaksandr Siarohin, Hsin-Ying Lee, Sergey Tulyakov, and Jun-Yan Zhu. 2023. Text-Guided Synthesis of Eulerian Cinemagraphs. arXiv preprint arXiv:2307.03190 (2023)."},{"key":"e_1_3_2_2_39_1","volume-title":"Dense Optical Tracking: Connecting the Dots. arXiv preprint arXiv:2312.00786","author":"Moing Guillaume\u00a0Le","year":"2023","unstructured":"Guillaume\u00a0Le Moing, Jean Ponce, and Cordelia Schmid. 2023. Dense Optical Tracking: Connecting the Dots. arXiv preprint arXiv:2312.00786 (2023)."},{"key":"e_1_3_2_2_40_1","volume-title":"Dragondiffusion: Enabling drag-style manipulation on diffusion models. arXiv preprint arXiv:2307.02421","author":"Mou Chong","year":"2023","unstructured":"Chong Mou, Xintao Wang, Jiechong Song, Ying Shan, and Jian Zhang. 2023. Dragondiffusion: Enabling drag-style manipulation on diffusion models. arXiv preprint arXiv:2307.02421 (2023)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"e_1_3_2_2_42_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. CoRR abs\/2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. CoRR abs\/2112.10741 (2021). arXiv:2112.10741https:\/\/arxiv.org\/abs\/2112.10741"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00548"},{"key":"e_1_3_2_2_44_1","volume-title":"Takeo Igarashi, and Hans-Peter Seidel.","author":"Okabe Makoto","year":"2009","unstructured":"Makoto Okabe, Ken ichi Anjyo, Takeo Igarashi, and Hans-Peter Seidel. 2009. Animating Pictures of Fluid using Video Examples. Computer Graphics Forum 28 (2009). https:\/\/api.semanticscholar.org\/CorpusID:31177574"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591500"},{"key":"e_1_3_2_2_46_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445"},{"key":"e_1_3_2_2_47_1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter\u00a0J. Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21, 1, Article 140 (jan 2020), 67\u00a0pages.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_2_48_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv abs\/2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv abs\/2204.06125 (2022). https:\/\/api.semanticscholar.org\/CorpusID:248097655"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.291"},{"key":"e_1_3_2_2_50_1","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv:2112.10752\u00a0[cs.CV]"},{"key":"e_1_3_2_2_51_1","volume-title":"IEEE International Conference on.","author":"Rott\u00a0Shaham Tamar","year":"2019","unstructured":"Tamar Rott\u00a0Shaham, Tali Dekel, and Tomer Michaeli. 2019. SinGAN: Learning a Generative Model from a Single Natural Image. In Computer Vision (ICCV), IEEE International Conference on."},{"key":"e_1_3_2_2_52_1","volume-title":"Raphael Gontijo-Lopes, Burcu\u00a0Karagol Ayan, Tim Salimans, Jonathan Ho, David\u00a0J. Fleet, and Mohammad Norouzi.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar\u00a0Seyed Ghasemipour, Raphael Gontijo-Lopes, Burcu\u00a0Karagol Ayan, Tim Salimans, Jonathan Ho, David\u00a0J. Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Advances in Neural Information Processing Systems, Alice\u00a0H. Oh, Alekh Agarwal, Danielle Belgrave, and Kyunghyun Cho (Eds.). https:\/\/openreview.net\/forum?id=08Yk-n5l2Al"},{"key":"e_1_3_2_2_53_1","volume-title":"VideoFlow: Exploiting Temporal Cues for Multi-frame Optical Flow Estimation. arXiv preprint arXiv:2303.08340","author":"Qin Hongwei","year":"2023","unstructured":"Xiaoyu Shi, Zhaoyang Huang, Weikang Bian, Dasong Li, Manyuan Zhang, Ka\u00a0Chun Cheung, Simon See, Hongwei Qin, Jifeng Dai, and Hongsheng Li. 2023a. VideoFlow: Exploiting Temporal Cues for Multi-frame Optical Flow Estimation. arXiv preprint arXiv:2303.08340 (2023)."},{"key":"e_1_3_2_2_54_1","volume-title":"Masked Cost","author":"Qin Hongwei","year":"2023","unstructured":"Xiaoyu Shi, Zhaoyang Huang, Dasong Li, Manyuan Zhang, Ka\u00a0Chun Cheung, Simon See, Hongwei Qin, Jifeng Dai, and Hongsheng Li. 2023b. FlowFormer++: Masked Cost Volume Autoencoding for Pretraining Optical Flow Estimation. arXiv preprint arXiv:2303.01237 (2023)."},{"key":"e_1_3_2_2_55_1","volume-title":"DragDiffusion: Harnessing Diffusion Models for Interactive Point-based Image Editing. arXiv preprint arXiv:2306.14435","author":"Shi Yujun","year":"2023","unstructured":"Yujun Shi, Chuhui Xue, Jiachun Pan, Wenqing Zhang, Vincent\u00a0YF Tan, and Song Bai. 2023c. DragDiffusion: Harnessing Diffusion Models for Interactive Point-based Image Editing. arXiv preprint arXiv:2306.14435 (2023)."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"crossref","unstructured":"Aliaksandr Siarohin Oliver Woodford Jian Ren Menglei Chai and Sergey Tulyakov. 2021. Motion Representations for Articulated Animation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"e_1_3_2_2_57_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni Devi Parikh Sonal Gupta and Yaniv Taigman. 2022. Make-A-Video: Text-to-Video Generation without Text-Video Data. arxiv:2209.14792\u00a0[cs.CV]"},{"key":"e_1_3_2_2_58_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=St1giarCHLP","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=St1giarCHLP"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0644-x"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00931"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00881"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"e_1_3_2_2_63_1","volume-title":"Boximator: Generating Rich and Controllable Motions for Video Synthesis. arXiv preprint arXiv:2402.01566","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Yuchen Zhang, Jiaxin Zou, Yan Zeng, Guoqiang Wei, Liping Yuan, and Hang Li. 2024. Boximator: Generating Rich and Controllable Motions for Video Synthesis. arXiv preprint arXiv:2402.01566 (2024)."},{"key":"e_1_3_2_2_64_1","volume-title":"Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen.","author":"Wang Wen","year":"2023","unstructured":"Wen Wang, kangyang Xie, Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen. 2023a. Zero-Shot Video Editing Using Off-The-Shelf Image Diffusion Models. arXiv preprint arXiv:2303.17599 (2023)."},{"key":"e_1_3_2_2_65_1","unstructured":"Xiang* Wang Hangjie* Yuan Shiwei* Zhang Dayou* Chen Jiuniu Wang Yingya Zhang Yujun Shen Deli Zhao and Jingren Zhou. 2023c. VideoComposer: Compositional Video Synthesis with Motion Controllability. (2023)."},{"key":"e_1_3_2_2_66_1","volume-title":"ImaGINator: Conditional Spatio-Temporal GAN for Video Generation. In The IEEE Winter Conference on Applications of Computer Vision (WACV).","author":"Piotr Bilinski Yaohui WANG","year":"2020","unstructured":"Yaohui WANG, Piotr Bilinski, Francois Bremond, and Antitza Dantcheva. 2020. ImaGINator: Conditional Spatio-Temporal GAN for Video Generation. In The IEEE Winter Conference on Applications of Computer Vision (WACV)."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093492"},{"key":"e_1_3_2_2_68_1","volume-title":"International Conference on Learning Representations.","author":"Wang Yaohui","year":"2022","unstructured":"Yaohui Wang, Di Yang, Francois Bremond, and Antitza Dantcheva. 2022. Latent Image Animator: Learning to Animate Images via Latent Space Navigation. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_69_1","volume-title":"Motionctrl: A unified and flexible motion controller for video generation. arXiv preprint arXiv:2312.03641","author":"Wang Zhouxia","year":"2023","unstructured":"Zhouxia Wang, Ziyang Yuan, Xintao Wang, Tianshui Chen, Menghan Xia, Ping Luo, and Ying Shan. 2023b. Motionctrl: A unified and flexible motion controller for video generation. arXiv preprint arXiv:2312.03641 (2023)."},{"key":"e_1_3_2_2_70_1","volume-title":"Context-PIPs: Persistent Independent Particles Demands Context Features. In Thirty-seventh Conference on Neural Information Processing Systems.","author":"Weikang BIAN","year":"2023","unstructured":"BIAN Weikang, Zhaoyang Huang, Xiaoyu Shi, Yitong Dong, Yijin Li, and Hongsheng Li. 2023. Context-PIPs: Persistent Independent Particles Demands Context Features. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_71_1","volume-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Weng Chung-Yi","year":"2018","unstructured":"Chung-Yi Weng, Brian Curless, and Ira Kemelmacher-Shlizerman. 2018. Photo Wake-Up: 3D Character Animation From a Single Photo. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2018), 5901\u20135910. https:\/\/api.semanticscholar.org\/CorpusID:54446715"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109458"},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02099"},{"key":"e_1_3_2_2_74_1","volume-title":"DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors. arXiv preprint arXiv:2310.12190","author":"Xing Jinbo","year":"2023","unstructured":"Jinbo Xing, Menghan Xia, Yong Zhang, Haoxin Chen, Wangbo Yu, Hanyuan Liu, Xintao Wang, Tien-Tsin Wong, and Ying Shan. 2023. DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors. arXiv preprint arXiv:2310.12190 (2023)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00251"},{"key":"e_1_3_2_2_76_1","volume-title":"Volumetric correspondence networks for optical flow. Advances in neural information processing systems 32","author":"Yang Gengshan","year":"2019","unstructured":"Gengshan Yang and Deva Ramanan. 2019. Volumetric correspondence networks for optical flow. Advances in neural information processing systems 32 (2019), 794\u2013805."},{"key":"e_1_3_2_2_77_1","volume-title":"Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion. arXiv preprint arXiv:2402.03162","author":"Yang Shiyuan","year":"2024","unstructured":"Shiyuan Yang, Liang Hou, Haibin Huang, Chongyang Ma, Pengfei Wan, Di Zhang, Xiaodong Chen, and Jing Liao. 2024. Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion. arXiv preprint arXiv:2402.03162 (2024)."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"crossref","unstructured":"Lvmin Zhang Anyi Rao and Maneesh Agrawala. 2023a. Adding Conditional Control to Text-to-Image Diffusion Models.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_2_79_1","unstructured":"Shiwei Zhang Jiayu Wang Yingya Zhang Kang Zhao Hangjie Yuan Zhiwu Qing Xiang Wang Deli Zhao and Jingren Zhou. 2023b. I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models. (2023)."},{"key":"e_1_3_2_2_80_1","volume-title":"PIA: Your Personalized Image Animator via Plug-and-Play Modules in Text-to-Image Models. arxiv:2312.13964\u00a0[cs.CV]","author":"Zhang Yiming","year":"2023","unstructured":"Yiming Zhang, Zhening Xing, Yanhong Zeng, Youqing Fang, and Kai Chen. 2023c. PIA: Your Personalized Image Animator via Plug-and-Play Modules in Text-to-Image Models. arxiv:2312.13964\u00a0[cs.CV]"},{"key":"e_1_3_2_2_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01818"},{"key":"e_1_3_2_2_82_1","unstructured":"Daquan Zhou Weimin Wang Hanshu Yan Weiwei Lv Yizhe Zhu and Jiashi Feng. 2023. MagicVideo: Efficient Video Generation With Latent Diffusion Models. arxiv:2211.11018\u00a0[cs.CV]"}],"event":{"name":"SIGGRAPH '24: Special Interest Group on Computer Graphics and Interactive Techniques Conference","location":"Denver CO USA","acronym":"SIGGRAPH '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641519.3657497","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3641519.3657497","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:50Z","timestamp":1750291550000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641519.3657497"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,13]]},"references-count":82,"alternative-id":["10.1145\/3641519.3657497","10.1145\/3641519"],"URL":"https:\/\/doi.org\/10.1145\/3641519.3657497","relation":{},"subject":[],"published":{"date-parts":[[2024,7,13]]},"assertion":[{"value":"2024-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}