{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:48:53Z","timestamp":1777657733227,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","funder":[{"name":"Kuaishou","award":["DJHL-20230925-297"],"award-info":[{"award-number":["DJHL-20230925-297"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730735","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Motion Inversion for Video Customization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-9761-1580","authenticated-orcid":false,"given":"Luozhou","family":"Wang","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Guangzhou, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2402-8928","authenticated-orcid":false,"given":"Ziyang","family":"Mai","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Guangzhou, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3252-9326","authenticated-orcid":false,"given":"Guibao","family":"Shen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Guangzhou, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4750-8875","authenticated-orcid":false,"given":"Yixun","family":"Liang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9126-4746","authenticated-orcid":false,"given":"Xin","family":"Tao","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-565X","authenticated-orcid":false,"given":"Pengfei","family":"Wan","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5475-2728","authenticated-orcid":false,"given":"Di","family":"Zhang","sequence":"additional","affiliation":[{"name":"Kuaishou Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7295-8750","authenticated-orcid":false,"given":"Yijun","family":"Li","sequence":"additional","affiliation":[{"name":"Adobe Research, Seatle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9565-8205","authenticated-orcid":false,"given":"Ying-Cong","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Guangzhou, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"e_1_3_3_2_5_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo et\u00a0al. 2023. Improving image generation with better captions. Computer Science. https:\/\/cdn. openai. com\/papers\/dall-e-3. pdf 2 3 (2023) 8."},{"key":"e_1_3_3_2_6_1","unstructured":"Mingdeng Cao Xintao Wang Zhongang Qi Ying Shan Xiaohu Qie and Yinqiang Zheng. 2023. MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.08465 (2023)."},{"key":"e_1_3_3_2_7_1","unstructured":"cerspense. 2023. zeroscope_v2. https:\/\/huggingface.co\/cerspense\/zeroscope_v2_576w. Accessed: 2023-02-03."},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Yuval Alaluf Yael Vinker Lior Wolf and Daniel Cohen-Or. 2023. Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201310.","DOI":"10.1145\/3592116"},{"key":"e_1_3_3_2_9_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang et\u00a0al. 2023b. Videocrafter1: Open diffusion models for high-quality video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.19512 (2023)."},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"crossref","unstructured":"Haoxin Chen Yong Zhang Xiaodong Cun Menghan Xia Xintao Wang Chao Weng and Ying Shan. 2024. Videocrafter2: Overcoming data limitations for high-quality video diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.09047 (2024).","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"e_1_3_3_2_11_1","unstructured":"Weifeng Chen Jie Wu Pan Xie Hefeng Wu Jiashi Li Xin Xia Xuefeng Xiao and Liang Lin. 2023a. Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13840 (2023)."},{"key":"e_1_3_3_2_12_1","unstructured":"Nathaniel Cohen Vladimir Kulikov Matan Kleiner Inbar Huberman-Spiegelglas and Tomer Michaeli. 2024. Slicedit: Zero-shot video editing with text-to-image diffusion models using spatio-temporal slices. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.12211 (2024)."},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_3_2_14_1","unstructured":"Rinon Gal Yuval Alaluf Yuval Atzmon Or Patashnik Amit\u00a0H Bermano Gal Chechik and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01618 (2022)."},{"key":"e_1_3_3_2_15_1","unstructured":"Michal Geyer Omer Bar-Tal Shai Bagon and Tali Dekel. 2023. Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.10373 (2023)."},{"key":"e_1_3_3_2_16_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Yaohui Wang Yu Qiao Dahua Lin and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04725 (2023)."},{"key":"e_1_3_3_2_17_1","unstructured":"Hao He Yinghao Xu Yuwei Guo Gordon Wetzstein Bo Dai Hongsheng Li and Ceyuan Yang. 2024. Cameractrl: Enabling camera control for text-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02101 (2024)."},{"key":"e_1_3_3_2_18_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022. Latent video diffusion models for high-fidelity video generation with arbitrary lengths. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.13221 (2022)."},{"key":"e_1_3_3_2_19_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01626 (2022)."},{"key":"e_1_3_3_2_20_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_21_1","unstructured":"Wenyi Hong Ming Ding Wendi Zheng Xinghan Liu and Jie Tang. 2022. Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.15868 (2022)."},{"key":"e_1_3_3_2_22_1","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.09685 (2021)."},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"crossref","unstructured":"Hyeonho Jeong Jinho Chang Geon\u00a0Yeong Park and Jong\u00a0Chul Ye. 2024. DreamMotion: Space-Time Self-Similarity Score Distillation for Zero-Shot Video Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12002 (2024).","DOI":"10.1007\/978-3-031-73404-5_21"},{"key":"e_1_3_3_2_24_1","unstructured":"Hyeonho Jeong Geon\u00a0Yeong Park and Jong\u00a0Chul Ye. 2023. VMC: Video Motion Customization using Temporal Attention Adaption for Text-to-Video Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.00845 (2023)."},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"crossref","unstructured":"Kumara Kahatapitiya Adil Karjauv Davide Abati Fatih Porikli Yuki\u00a0M Asano and Amirhossein Habibian. 2024. Object-Centric Diffusion for Efficient Video Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.05735 (2024).","DOI":"10.1007\/978-3-031-72998-0_6"},{"key":"e_1_3_3_2_26_1","unstructured":"Nikita Karaev Ignacio Rocco Benjamin Graham Natalia Neverova Andrea Vedaldi and Christian Rupprecht. 2023. Cotracker: It is better to track together. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.07635 (2023)."},{"key":"e_1_3_3_2_27_1","unstructured":"Guillaume Le\u00a0Moing Jean Ponce and Cordelia Schmid. 2021. CCVS: context-aware controllable video synthesis. Advances in Neural Information Processing Systems 34 (2021) 14042\u201314055."},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"e_1_3_3_2_29_1","unstructured":"Jun\u00a0Hao Liew Hanshu Yan Jianfeng Zhang Zhongcong Xu and Jiashi Feng. 2023. Magicedit: High-fidelity and temporally coherent video editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.14749 (2023)."},{"key":"e_1_3_3_2_30_1","unstructured":"Pengyang Ling Jiazi Bu Pan Zhang Xiaoyi Dong Yuhang Zang Tong Wu Huaian Chen Jiaqi Wang and Yi Jin. 2024. MotionClone: Training-Free Motion Cloning for Controllable Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.05338 (2024)."},{"key":"e_1_3_3_2_31_1","unstructured":"Shaoteng Liu Yuechen Zhang Wenbo Li Zhe Lin and Jiaya Jia. 2023. Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.04761 (2023)."},{"key":"e_1_3_3_2_32_1","unstructured":"Wan-Duo\u00a0Kurt Ma JP Lewis W\u00a0Bastiaan Kleijn and Thomas Leung. 2023b. Directed diffusion: Direct control of object placement through attention guidance. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13153 (2023)."},{"key":"e_1_3_3_2_33_1","unstructured":"Wan-Duo\u00a0Kurt Ma John\u00a0P Lewis and W\u00a0Bastiaan Kleijn. 2023a. TrailBlazer: Trajectory Control for Diffusion-Based Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.00896 (2023)."},{"key":"e_1_3_3_2_34_1","unstructured":"Chenlin Meng Yutong He Yang Song Jiaming Song Jiajun Wu Jun-Yan Zhu and Stefano Ermon. 2021. Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.01073 (2021)."},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"crossref","unstructured":"Siwei Meng Yawei Luo and Ping Liu. 2025. Grounding Creativity in Physics: A Brief Survey of Physical Priors in AIGC. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.07007 (2025).","DOI":"10.24963\/ijcai.2025\/1176"},{"key":"e_1_3_3_2_36_1","unstructured":"Chong Mou Xintao Wang Liangbin Xie Jian Zhang Zhongang Qi Ying Shan and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.08453 (2023)."},{"key":"e_1_3_3_2_37_1","unstructured":"Alex Nichol Prafulla Dhariwal Aditya Ramesh Pranav Shyam Pamela Mishkin Bob McGrew Ilya Sutskever and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2112.10741 (2021)."},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3127905"},{"key":"e_1_3_3_2_39_1","doi-asserted-by":"crossref","unstructured":"Or Patashnik Daniel Garibi Idan Azuri Hadar Averbuch-Elor and Daniel Cohen-Or. 2023. Localizing Object-level Shape Variations with Text-to-Image Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.11306 (2023).","DOI":"10.1109\/ICCV51070.2023.02107"},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_3_2_41_1","unstructured":"Chenyang Qi Xiaodong Cun Yong Zhang Chenyang Lei Xintao Wang Ying Shan and Qifeng Chen. 2023. Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.09535 (2023)."},{"key":"e_1_3_3_2_42_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_43_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 (2022)."},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_2_46_1","doi-asserted-by":"crossref","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L Denton Kamyar Ghasemipour Raphael Gontijo\u00a0Lopes Burcu Karagol\u00a0Ayan Tim Salimans et\u00a0al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems 35 (2022) 36479\u201336494.","DOI":"10.52202\/068431-2643"},{"key":"e_1_3_3_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.308"},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00248"},{"key":"e_1_3_3_2_49_1","unstructured":"Aliaksandr Siarohin St\u00e9phane Lathuili\u00e8re Sergey Tulyakov Elisa Ricci and Nicu Sebe. 2019b. First order motion model for image animation. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_2_50_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni et\u00a0al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14792 (2022)."},{"key":"e_1_3_3_2_51_1","unstructured":"Kihyuk Sohn Nataniel Ruiz Kimin Lee Daniel\u00a0Castro Chin Irina Blok Huiwen Chang Jarred Barber Lu Jiang Glenn Entis Yuanzhen Li et\u00a0al. 2023. StyleDrop: Text-to-Image Generation in Any Style. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.00983 (2023)."},{"key":"e_1_3_3_2_52_1","unstructured":"Yu Tian Jian Ren Menglei Chai Kyle Olszewski Xi Peng Dimitris\u00a0N Metaxas and Sergey Tulyakov. 2021. A good image generator is what you need for high-resolution video synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.15069 (2021)."},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_3_2_54_1","unstructured":"Carl Vondrick Hamed Pirsiavash and Antonio Torralba. 2016. Generating videos with scene dynamics. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_2_55_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023. Modelscope text-to-video technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06571 (2023)."},{"key":"e_1_3_3_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_2_57_1","first-page":"720","volume-title":"European conference on computer vision","author":"Wu Chenfei","year":"2022","unstructured":"Chenfei Wu, Jian Liang, Lei Ji, Fan Yang, Yuejian Fang, Daxin Jiang, and Nan Duan. 2022. N\u00fcwa: Visual synthesis pre-training for neural visual world creation. In European conference on computer vision. Springer, 720\u2013736."},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_3_2_59_1","unstructured":"Wilson Yan Yunzhi Zhang Pieter Abbeel and Aravind Srinivas. 2021. Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.10157 (2021)."},{"key":"e_1_3_3_2_60_1","unstructured":"Danah Yatim Rafail Fridman Omer\u00a0Bar Tal Yoni Kasten and Tali Dekel. 2023. Space-Time Diffusion Features for Zero-Shot Text-Driven Motion Transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17009 (2023)."},{"key":"e_1_3_3_2_61_1","unstructured":"Hu Ye Jun Zhang Sibo Liu Xiao Han and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06721 (2023)."},{"key":"e_1_3_3_2_62_1","unstructured":"Weihao Ye Qiong Wu Wenhao Lin and Yiyi Zhou. 2024. Fit and Prune: Fast and Training-free Visual Token Pruning for Multi-modal Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.10197 (2024)."},{"key":"e_1_3_3_2_63_1","unstructured":"Shengming Yin Chenfei Wu Jian Liang Jie Shi Houqiang Li Gong Ming and Nan Duan. 2023. Dragnuwa: Fine-grained control in video generation by integrating text image and trajectory. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.08089 (2023)."},{"key":"e_1_3_3_2_64_1","unstructured":"David\u00a0Junhao Zhang Dongxu Li Hung Le Mike\u00a0Zheng Shou Caiming Xiong and Doyen Sahoo. 2024. Moonshot: Towards controllable video generation and editing with multimodal conditions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.01827 (2024)."},{"key":"e_1_3_3_2_65_1","unstructured":"David\u00a0Junhao Zhang Jay\u00a0Zhangjie Wu Jia-Wei Liu Rui Zhao Lingmin Ran Yuchao Gu Difei Gao and Mike\u00a0Zheng Shou. 2023. Show-1: Marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.15818 (2023)."},{"key":"e_1_3_3_2_66_1","doi-asserted-by":"crossref","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2023. Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.05543 (2023).","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_2_67_1","unstructured":"Yuyao Zhang Jinghao Li and Yu-Wing Tai. 2025. LayerCraft: Enhancing Text-to-Image Generation with CoT Reasoning and Layered Object Integration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.00010 (2025)."},{"key":"e_1_3_3_2_68_1","unstructured":"Rui Zhao Yuchao Gu Jay\u00a0Zhangjie Wu David\u00a0Junhao Zhang Jiawei Liu Weijia Wu Jussi Keppo and Mike\u00a0Zheng Shou. 2023. Motiondirector: Motion customization of text-to-video diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.08465 (2023)."}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730735","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:58:27Z","timestamp":1774018707000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730735"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":67,"alternative-id":["10.1145\/3721238.3730735","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730735","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}