{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:47:35Z","timestamp":1774021655537,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62441615, 62422606, 624B2124"],"award-info":[{"award-number":["62441615, 62422606, 624B2124"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730662","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LayerFlow: A Unified Model for Layer-aware Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8552-4985","authenticated-orcid":false,"given":"Sihui","family":"Ji","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6405-4011","authenticated-orcid":false,"given":"Hao","family":"Luo","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5008-4720","authenticated-orcid":false,"given":"Xi","family":"Chen","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2978-666X","authenticated-orcid":false,"given":"Yuanpeng","family":"Tu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1352-5883","authenticated-orcid":false,"given":"Yiyang","family":"Wang","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8277-2706","authenticated-orcid":false,"given":"Hengshuang","family":"Zhao","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-vl: A versatile vision-language model for understanding localization text reading and beyond. arXiv:https:\/\/arXiv.org\/abs\/2308.12966 (2023)."},{"key":"e_1_3_3_2_3_1","unstructured":"Jianhong Bai Tianyu He Yuchi Wang Junliang Guo Haoji Hu Zuozhu Liu and Jiang Bian. 2024. Uniedit: A unified tuning-free framework for video motion and appearance editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13185 (2024)."},{"key":"e_1_3_3_2_4_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman et\u00a0al. 2024. Video generation models as world simulators. 2024. URL https:\/\/openai. com\/research\/video-generation-models-as-world-simulators (2024)."},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Shiran Zada Roni Paiss Ariel Ephrat Omer Tov Michael Rubinstein Lior Wolf Tali Dekel Tomer Michaeli and Inbar Mosseri. 2024. Still-moving: Customized video generation without customized video data. ACM Transactions on Graphics (TOG) 43 6 (2024) 1\u201311.","DOI":"10.1145\/3687945"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"crossref","unstructured":"Haoxin Chen Yong Zhang Xiaodong Cun Menghan Xia Xintao Wang Chao Weng and Ying Shan. 2024. VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2401.09047\u00a0[cs.CV]","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"e_1_3_3_2_7_1","unstructured":"Xuewei Chen Zhimin Chen and Yiren Song. 2025. TransAnimate: Taming Layer Diffusion to Generate RGBA Video. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.17934 (2025)."},{"key":"e_1_3_3_2_8_1","unstructured":"Yangming Cheng Liulei Li Yuanyou Xu Xiaodi Li Zongxin Yang Wenguan Wang and Yi Yang. 2023. Segment and track anything. arXiv:https:\/\/arXiv.org\/abs\/2305.06558 (2023)."},{"key":"e_1_3_3_2_9_1","unstructured":"Zeqi Gu Wenqi Xian Noah Snavely and Abe Davis. 2023. Factormatte: Redefining video matting for re-composition tasks. TOG (2023)."},{"key":"e_1_3_3_2_10_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Zhengyang Liang Yaohui Wang Yu Qiao Maneesh Agrawala Dahua Lin and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv:https:\/\/arXiv.org\/abs\/2307.04725 (2023)."},{"key":"e_1_3_3_2_11_1","volume-title":"NeurIPS","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In NeurIPS."},{"key":"e_1_3_3_2_12_1","unstructured":"Wenyi Hong Ming Ding Wendi Zheng Xinghan Liu and Jie Tang. 2022. Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv:https:\/\/arXiv.org\/abs\/2205.15868 (2022)."},{"key":"e_1_3_3_2_13_1","unstructured":"Wenyi Hong Weihan Wang Ming Ding Wenmeng Yu Qingsong Lv Yan Wang Yean Cheng Shiyu Huang Junhui Ji Zhao Xue et\u00a0al. 2024. Cogvlm2: Visual language models for image and video understanding. arXiv:https:\/\/arXiv.org\/abs\/2408.16500 (2024)."},{"key":"e_1_3_3_2_14_1","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv:https:\/\/arXiv.org\/abs\/2106.09685 (2021)."},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_3_2_16_1","unstructured":"Yao-Chih Lee Erika Lu Sarah Rumbley Michal Geyer Jia-Bin Huang Tali Dekel and Forrester Cole. 2024. Generative Omnimatte: Learning to Decompose Video into Layers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.16683 (2024)."},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00448"},{"key":"e_1_3_3_2_18_1","unstructured":"Wan-Duo\u00a0Kurt Ma John\u00a0P Lewis and W\u00a0Bastiaan Kleijn. 2023. Trailblazer: Trajectory control for diffusion-based video generation. arXiv:https:\/\/arXiv.org\/abs\/2401.00896 (2023)."},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_3_2_21_1","unstructured":"Fabio Quattrini Vittorio Pippi Silvia Cascianelli and Rita Cucchiara. 2024. Alfie: Democratising RGBA Image Generation With No $$$. arXiv:https:\/\/arXiv.org\/abs\/2408.14826 (2024)."},{"key":"e_1_3_3_2_22_1","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR (2020)."},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02115"},{"key":"e_1_3_3_2_24_1","volume-title":"NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS."},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"crossref","unstructured":"Luozhou Wang Yijun Li Zhifei Chen Jui-Hsien Wang Zhifei Zhang He Zhang Zhe Lin and Yingcong Chen. 2025. TransPixeler: Advancing Text-to-Video Generation with Transparency. CoRR (2025).","DOI":"10.1109\/CVPR52734.2025.01699"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657481"},{"key":"e_1_3_3_2_28_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024b. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_2_29_1","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2024. Transparent image layer diffusion using latent transparency. arXiv:https:\/\/arXiv.org\/abs\/2402.17113 (2024)."},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00684"},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-96530-3"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730662","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:53:25Z","timestamp":1774018405000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730662"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":31,"alternative-id":["10.1145\/3721238.3730662","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730662","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}