{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:57:05Z","timestamp":1774022225378,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730604","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MotionCanvas: Cinematic Shot Design with Controllable Image-to-Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2181-1879","authenticated-orcid":false,"given":"Jinbo","family":"Xing","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong and Adobe Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1403-9884","authenticated-orcid":false,"given":"Long","family":"Mai","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2686-052X","authenticated-orcid":false,"given":"Cusuh","family":"Ham","sequence":"additional","affiliation":[{"name":"Adobe Research, Cambridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0389-1721","authenticated-orcid":false,"given":"Jiahui","family":"Huang","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1480-7302","authenticated-orcid":false,"given":"Aniruddha","family":"Mahapatra","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5238-593X","authenticated-orcid":false,"given":"Chi-Wing","family":"Fu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7792-9307","authenticated-orcid":false,"given":"Tien-Tsin","family":"Wong","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5399-6214","authenticated-orcid":false,"given":"Feng","family":"Liu","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"crossref","unstructured":"Sherwin Bahmani Ivan Skorokhodov Guocheng Qian Aliaksandr Siarohin Willi Menapace Andrea Tagliasacchi David\u00a0B. Lindell and Sergey Tulyakov. 2024a. AC3D: Analyzing and Improving 3D Camera Control in Video Diffusion Transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.18673 (2024).","DOI":"10.1109\/CVPR52734.2025.02130"},{"key":"e_1_3_3_2_3_1","unstructured":"Sherwin Bahmani Ivan Skorokhodov Aliaksandr Siarohin Willi Menapace Guocheng Qian Michael Vasilkovsky Hsin-Ying Lee Chaoyang Wang Jiaxu Zou Andrea Tagliasacchi et\u00a0al. 2024b. VD3D: Taming large video diffusion transformers for 3D camera control. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.12781 (2024)."},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"e_1_3_3_2_5_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et\u00a0al. 2023. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_2_7_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman Clarence Ng Ricky Wang and Aditya Ramesh. 2024. Video generation models as world simulators. (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"e_1_3_3_2_8_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang et\u00a0al. 2023. VideoCrafter1: Open diffusion models for high-quality video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.19512 (2023)."},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00127"},{"key":"e_1_3_3_2_10_1","volume-title":"ECCV","author":"Cho Seokju","year":"2024","unstructured":"Seokju Cho, Jiahui Huang, Jisu Nam, Honggyu An, Seungryong Kim, and Joon-Young Lee. 2024. Local All-Pair Correspondence for Point Tracking. In ECCV."},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1186822.1073273"},{"key":"e_1_3_3_2_12_1","volume-title":"Perception of Space and Motion","author":"Epstein W.","year":"1995","unstructured":"W. Epstein and S. Rogers. 1995. Perception of Space and Motion. Academic Press."},{"key":"e_1_3_3_2_13_1","unstructured":"Yoav HaCohen Nisan Chiprut Benny Brazowski Daniel Shalem Dudu Moshe Eitan Richardson Eran Levin Guy Shiran Nir Zabari Ori Gordon et\u00a0al. 2024. LTX-Video: Realtime Video Latent Diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.00103 (2024)."},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/861369"},{"key":"e_1_3_3_2_15_1","unstructured":"Hao He Yinghao Xu Yuwei Guo Gordon Wetzstein Bo Dai Hongsheng Li and Ceyuan Yang. 2024. CameraCtrl: Enabling camera control for text-to-video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02101 (2024)."},{"key":"e_1_3_3_2_16_1","volume-title":"NeurIPS","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In NeurIPS."},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00575"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/258734.258854"},{"key":"e_1_3_3_2_19_1","unstructured":"Chen Hou Guoqiang Wei Yan Zeng and Zhibo Chen. 2024. Training-free Camera Control for Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.10126 (2024)."},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00364"},{"key":"e_1_3_3_2_21_1","unstructured":"Rahima Khanam and Muhammad Hussain. 2024. YOLOv11: An overview of the key architectural enhancements. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.17725 (2024)."},{"key":"e_1_3_3_2_22_1","unstructured":"Pengxiang Li Kai Chen Zhili Liu Ruiyuan Gao Lanqing Hong Guo Zhou Hua Yao Dit-Yan Yeung Huchuan Lu and Xu Jia. 2023. TrackDiffusion: Tracklet-Conditioned Video Generation via Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.00651 (2023)."},{"key":"e_1_3_3_2_23_1","volume-title":"ICLR","author":"Lipman Yaron","year":"2023","unstructured":"Yaron Lipman, Ricky T.\u00a0Q. Chen, Heli Ben-Hamu, Maximilian Nickel, and Matthew Le. 2023. Flow Matching for Generative Modeling. In ICLR."},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01419"},{"key":"e_1_3_3_2_25_1","unstructured":"Shaoteng Liu Tianyu Wang Jui-Hsien Wang Qing Liu Zhifei Zhang Joon-Young Lee Yijun Li Bei Yu Zhe Lin Soo\u00a0Ye Kim et\u00a0al. 2024a. Generative Video Propagation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.19761 (2024)."},{"key":"e_1_3_3_2_26_1","volume-title":"ICLR","author":"Liu Xingchao","year":"2024","unstructured":"Xingchao Liu, Xiwen Zhang, Jianzhu Ma, Jian Peng, et\u00a0al. 2024b. InstaFlow: One step is enough for high-quality diffusion-based text-to-image generation. In ICLR."},{"key":"e_1_3_3_2_27_1","unstructured":"I Loshchilov. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_2_28_1","unstructured":"Wan-Duo\u00a0Kurt Ma J.\u00a0P. Lewis and W.\u00a0Bastiaan Kleijn. 2023. TrailBlazer: Trajectory Control for Diffusion-Based Video Generation. arxiv:https:\/\/arXiv.org\/abs\/2401.00896\u00a0[cs.CV]"},{"key":"e_1_3_3_2_29_1","unstructured":"Aniruddha Mahapatra Aliaksandr Siarohin Hsin-Ying Lee Sergey Tulyakov and Jun-Yan Zhu. 2023. Text-Guided Synthesis of Eulerian Cinemagraphs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.03190 (2023)."},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02036"},{"key":"e_1_3_3_2_31_1","volume-title":"NeurIPS","author":"Mou Chong","year":"2024","unstructured":"Chong Mou, Mingdeng Cao, Xintao Wang, Zhaoyang Zhang, Ying Shan, and Jian Zhang. 2024. ReVideo: Remake a Video with Motion and Content Control. In NeurIPS."},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"crossref","unstructured":"Simon Niklaus Long Mai Jimei Yang and Feng Liu. 2019. 3D Ken Burns effect from a single image. ACM Trans. Graph. 38 6 Article 184 (Nov. 2019) 15\u00a0pages.","DOI":"10.1145\/3355089.3356528"},{"key":"e_1_3_3_2_33_1","volume-title":"ECCV","author":"Niu Muyao","year":"2024","unstructured":"Muyao Niu, Xiaodong Cun, Xintao Wang, Yong Zhang, Ying Shan, and Yinqiang Zheng. 2024. MOFA-video: Controllable image animation via generative motion field adaptions in frozen image-to-video diffusion model. In ECCV."},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_35_1","unstructured":"Adam Polyak Amit Zohar Andrew Brown Andros Tjandra Animesh Sinha Ann Lee Apoorv Vyas Bowen Shi Chih-Yao Ma Ching-Yao Chuang et\u00a0al. 2024. Movie Gen: A cast of media foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.13720 (2024)."},{"key":"e_1_3_3_2_36_1","unstructured":"Haonan Qiu Zhaoxi Chen Zhouxia Wang Yingqing He Menghan Xia and Ziwei Liu. 2024. FreeTraj: Tuning-free trajectory control in video diffusion models. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16863 (2024)."},{"key":"e_1_3_3_2_37_1","unstructured":"Nikhila Ravi Valentin Gabeur Yuan-Ting Hu Ronghang Hu Chaitanya Ryali Tengyu Ma Haitham Khedr Roman R\u00e4dle Chloe Rolland Laura Gustafson et\u00a0al. 2024. SAM 2: Segment anything in images and videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00714 (2024)."},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657497"},{"key":"e_1_3_3_2_39_1","unstructured":"Chen Qian Kwan-Yee Lin Hongsheng\u00a0Li Siming\u00a0Fan Jingtan\u00a0Piao. 2022. Simulating Fluids in Real-World Still Images. arXiv preprint arXiv:2204.11335 (2022)."},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"crossref","unstructured":"Takafumi Taketomi Hideaki Uchiyama and Sei Ikeda. 2017. Visual SLAM algorithms: A survey from 2010 to 2016. IPSJ trans. on computer vision and app. 9 (2017) 1\u201311.","DOI":"10.1186\/s41074-017-0027-2"},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"crossref","unstructured":"Yunlong Tang Junjia Guo Pinxin Liu Zhiyuan Wang Hang Hua Jia-Xing Zhong Yunzhong Xiao Chao Huang Luchuan Song Susan Liang Yizhi Song Liu He Jing Bi Mingqian Feng Xinyang Li Zeliang Zhang and Chenliang Xu. 2025. Generative AI for Cel-Animation: A Survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.06250 (2025).","DOI":"10.1109\/ICCVW69036.2025.00400"},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.5555\/2031421"},{"key":"e_1_3_3_2_44_1","volume-title":"ICLR workshop","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new metric for video generation. In ICLR workshop."},{"key":"e_1_3_3_2_45_1","volume-title":"ICML","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Yuchen Zhang, Jiaxin Zou, Yan Zeng, Guoqiang Wei, Liping Yuan, and Hang Li. 2024d. Boximator: Generating Rich and Controllable Motions for Video Synthesis. In ICML."},{"key":"e_1_3_3_2_46_1","unstructured":"Ruicheng Wang Sicheng Xu Cassie Dai Jianfeng Xiang Yu Deng Xin Tong and Jiaolong Yang. 2024b. MoGe: Unlocking accurate monocular geometry estimation for open-domain images with optimal training supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.19115 (2024)."},{"key":"e_1_3_3_2_47_1","unstructured":"Xi Wang Robin Courant Marc Christie and Vicky Kalogeiton. 2024a. AKiRa: Augmentation Kit on Rays for optical video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14158 (2024)."},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00749"},{"key":"e_1_3_3_2_50_1","unstructured":"Jianzong Wu Xiangtai Li Yanhong Zeng Jiangning Zhang Qianyu Zhou Yining Li Yunhai Tong and Kai Chen. 2024b. MotionBooth: Motion-Aware Customized Text-to-Video Generation. NeurIPS (2024)."},{"key":"e_1_3_3_2_51_1","volume-title":"ECCV","author":"Wu Weijia","year":"2024","unstructured":"Weijia Wu, Zhuang Li, Yuchao Gu, Rui Zhao, Yefei He, David\u00a0Junhao Zhang, Mike\u00a0Zheng Shou, Yan Li, Tingting Gao, and Di Zhang. 2024a. DragAnything: Motion control for anything using entity representation. 
In ECCV."},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"crossref","unstructured":"Jinbo Xing Hanyuan Liu Menghan Xia Yong Zhang Xintao Wang Ying Shan and Tien-Tsin Wong. 2024a. ToonCrafter: Generative cartoon interpolation. ACM TOG 43 6 (2024) 1\u201311.","DOI":"10.1145\/3687761"},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"crossref","unstructured":"Jinbo Xing Menghan Xia Yuxin Liu Yuechen Zhang Yong Zhang Yingqing He Hanyuan Liu Haoxin Chen Xiaodong Cun Xintao Wang et\u00a0al. 2025. Make-your-video: Customized video generation using textual and structural guidance. IEEE TVCG 31 02 (2025) 1526\u20131541.","DOI":"10.1109\/TVCG.2024.3365804"},{"key":"e_1_3_3_2_54_1","volume-title":"ECCV","author":"Xing Jinbo","year":"2024","unstructured":"Jinbo Xing, Menghan Xia, Yong Zhang, Haoxin Chen, Wangbo Yu, Hanyuan Liu, Gongye Liu, Xintao Wang, Ying Shan, and Tien-Tsin Wong. 2024b. DynamiCrafter: Animating open-domain images with video diffusion priors. In ECCV."},{"key":"e_1_3_3_2_55_1","unstructured":"Dejia Xu Weili Nie Chao Liu Sifei Liu Jan Kautz Zhangyang Wang and Arash Vahdat. 2024. CamCo: Camera-Controllable 3D-Consistent Image-to-Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.02509 (2024)."},{"key":"e_1_3_3_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1457515.1409070"},{"key":"e_1_3_3_2_57_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024. CogVideoX: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_2_58_1","unstructured":"Wangbo Yu Jinbo Xing Li Yuan Wenbo Hu Xiaoyu Li Zhipeng Huang Xiangjun Gao Tien-Tsin Wong Ying Shan and Yonghong Tian. 2024. ViewCrafter: Taming video diffusion models for high-fidelity novel view synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.02048 (2024)."},{"key":"e_1_3_3_2_59_1","volume-title":"Open-Sora: Democratizing Efficient Video Production for All","author":"Zheng Zangwei","year":"2024","unstructured":"Zangwei Zheng, Xiangyu Peng, Tianji Yang, Chenhui Shen, Shenggui Li, Hongxin Liu, Yukun Zhou, Tianyi Li, and Yang You. 2024. Open-Sora: Democratizing Efficient Video Production for All. https:\/\/github.com\/hpcaitech\/Open-Sora"},{"key":"e_1_3_3_2_60_1","doi-asserted-by":"crossref","unstructured":"Tinghui Zhou Richard Tucker John Flynn Graham Fyffe and Noah Snavely. 2018. Stereo Magnification: Learning view synthesis using multiplane images. 
ACM TOG 37 4 (2018).","DOI":"10.1145\/3197517.3201323"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730604","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:57:18Z","timestamp":1774018638000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730604"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":59,"alternative-id":["10.1145\/3721238.3730604","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730604","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
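
The record above is a standard Crossref works API response: an envelope with "status", "message-type", and a "message" object holding the bibliographic fields (title, DOI, authors, references, event, dates). As a minimal sketch of how such a record can be retrieved and read, the Python snippet below fetches the same DOI from the public Crossref REST API using only the standard library. It assumes network access; the mailto address is a placeholder you should replace with your own (Crossref recommends identifying yourself so requests are routed to its "polite" pool).

import json
import urllib.request

# Fetch the Crossref work record for this DOI.
# NOTE: the mailto value is a placeholder, not a real contact address.
DOI = "10.1145/3721238.3730604"
url = f"https://api.crossref.org/works/{DOI}?mailto=you@example.org"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)          # same envelope as shown above

msg = record["message"]               # the "message" object
print(msg["title"][0])                # MotionCanvas: Cinematic Shot Design ...
print(msg["DOI"], msg["type"])        # 10.1145/3721238.3730604 proceedings-article
print(msg["references-count"], "references")
for author in msg["author"]:          # authors with given/family names
    print(author.get("given", ""), author["family"])

The field names used in the snippet ("title", "DOI", "type", "references-count", "author") correspond directly to the keys visible in the record above; "title" and "container-title" are arrays, hence the [0] index.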