{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T20:58:52Z","timestamp":1767992332896,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680683","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"6745-6754","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Fuse Your Latents: Video Editing with Multi-source Latent Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7803-194X","authenticated-orcid":false,"given":"Tianyi","family":"Lu","sequence":"first","affiliation":[{"name":"Shanghai Key Lab of Intel. Info. Proc., School of CS, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1721-035X","authenticated-orcid":false,"given":"Xing","family":"Zhang","sequence":"additional","affiliation":[{"name":"National Institute of Informatics, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6201-9503","authenticated-orcid":false,"given":"Jiaxi","family":"Gu","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7513-6576","authenticated-orcid":false,"given":"Renjing","family":"Pei","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8689-5807","authenticated-orcid":false,"given":"Songcen","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2099-4973","authenticated-orcid":false,"given":"Xingjun","family":"Ma","sequence":"additional","affiliation":[{"name":"Shanghai Key Lab of Intel. Info. Proc., School of CS, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0022-0906","authenticated-orcid":false,"given":"Hang","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3645-8972","authenticated-orcid":false,"given":"Zuxuan","family":"Wu","sequence":"additional","affiliation":[{"name":"Shanghai Key Lab of Intel. Info. Proc., School of CS, Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"e_1_3_2_1_6_1","volume-title":"Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. 
arXiv preprint arXiv:2305.13840","author":"Chen Weifeng","year":"2023","unstructured":"Weifeng Chen, Jie Wu, Pan Xie, Hefeng Wu, Jiashi Li, Xin Xia, Xuefeng Xiao, and Liang Lin. 2023. Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arXiv preprint arXiv:2305.13840 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems, Vol. 34 (2021), 8780--8794."},{"key":"e_1_3_2_1_8_1","first-page":"19822","article-title":"2021. Cogview: Mastering text-to-image generation via transformers","volume":"34","author":"Ding Ming","year":"2021","unstructured":"Ming Ding, Zhuoyi Yang, Wenyi Hong, Wendi Zheng, Chang Zhou, Da Yin, Junyang Lin, Xu Zou, Zhou Shao, Hongxia Yang, et al. 2021. Cogview: Mastering text-to-image generation via transformers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 19822--19835.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011","author":"Esser Patrick","year":"2023","unstructured":"Patrick Esser, Johnathan Chiu, Parmida Atighehchian, Jonathan Granskog, and Anastasis Germanidis. 2023. Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011 (2023)."},{"key":"e_1_3_2_1_10_1","unstructured":"Ruoyu Feng Wenming Weng Yanhui Wang Yuhui Yuan Jianmin Bao Chong Luo Zhibo Chen and Baining Guo. 2024. CCEdit: Creative and Controllable Video Editing via Diffusion Models. arxiv: 2309.16496 [cs.CV] https:\/\/arxiv.org\/abs\/2309.16496"},{"key":"e_1_3_2_1_11_1","volume-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618","author":"Gal Rinon","year":"2022","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit H Bermano, Gal Chechik, and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373","author":"Geyer Michal","year":"2023","unstructured":"Michal Geyer, Omer Bar-Tal, Shai Bagon, and Tali Dekel. 2023. Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation. arXiv preprint arXiv:2309.03549","author":"Gu Jiaxi","year":"2023","unstructured":"Jiaxi Gu, Shicong Wang, Haoyu Zhao, Tianyi Lu, Xing Zhang, Zuxuan Wu, Songcen Xu, Wei Zhang, Yu-Gang Jiang, and Hang Xu. 2023. Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation. arXiv preprint arXiv:2309.03549 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, and Bo Dai. 2023. 
Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_1_16_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_18_1","volume-title":"Oliver Wang, and Joon-Young Lee.","author":"Huang Jiahui","year":"2023","unstructured":"Jiahui Huang, Leonid Sigal, Kwang Moo Yi, Oliver Wang, and Joon-Young Lee. 2023. Inve: Interactive neural video editing. arXiv preprint arXiv:2307.07663 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"RAVE: Randomized Noise Shuffling for Fast and Consistent Video Editing with Diffusion Models. arxiv: 2312.04524 [cs.CV] https:\/\/arxiv.org\/abs\/2312.04524","author":"Kara Ozgur","year":"2023","unstructured":"Ozgur Kara, Bariscan Kurtkaya, Hidir Yesiltepe, James M. Rehg, and Pinar Yanardag. 2023. RAVE: Randomized Noise Shuffling for Fast and Consistent Video Editing with Diffusion Models. arxiv: 2312.04524 [cs.CV] https:\/\/arxiv.org\/abs\/2312.04524"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_2_1_21_1","unstructured":"Yuval Kirstain Adam Polyak Uriel Singer Shahbuland Matiana Joe Penna and Omer Levy. 2023. Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_1_23_1","volume-title":"Sdm: Spatial diffusion model for large hole image inpainting. arXiv preprint arXiv:2212.02963","author":"Li Wenbo","year":"2022","unstructured":"Wenbo Li, Xin Yu, Kun Zhou, Yibing Song, Zhe Lin, and Jiaya Jia. 2022. Sdm: Spatial diffusion model for large hole image inpainting. arXiv preprint arXiv:2212.02963 (2022)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_1_25_1","volume-title":"Weijia Mao, Yuchao Gu, Rui Zhao, Jussi Keppo, Ying Shan, and Mike Zheng Shou.","author":"Liu Jia-Wei","year":"2023","unstructured":"Jia-Wei Liu, Yan-Pei Cao, Jay Zhangjie Wu, Weijia Mao, Yuchao Gu, Rui Zhao, Jussi Keppo, Ying Shan, and Mike Zheng Shou. 2023. DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion- and View-Change Human-Centric Video Editing. arxiv: 2310.10624 [cs.CV]"},{"key":"e_1_3_2_1_26_1","volume-title":"Video-p2p: Video editing with cross-attention control. arXiv preprint arXiv:2303.04761","author":"Liu Shaoteng","year":"2023","unstructured":"Shaoteng Liu, Yuechen Zhang, Wenbo Li, Zhe Lin, and Jiaya Jia. 2023. Video-p2p: Video editing with cross-attention control. 
arXiv preprint arXiv:2303.04761 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"e_1_3_2_1_28_1","unstructured":"Zhengxiong Luo Dayou Chen Yingya Zhang Yan Huang Liang Wang Yujun Shen Deli Zhao Jingren Zhou and Tieniu Tan. 2023. VideoFusion: Decomposed Diffusion Models for High-Quality Video Generation. arxiv: 2303.08320 [cs.CV]"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_30_1","volume-title":"Yossi Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen.","author":"Molad Eyal","year":"2023","unstructured":"Eyal Molad, Eliahu Horwitz, Dani Valevski, Alex Rav Acha, Yossi Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen. 2023. Dreamix: Video diffusion models are general video editors. arXiv preprint arXiv:2302.01329 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_32_1","volume-title":"Codef: Content deformation fields for temporally consistent video processing. arXiv preprint arXiv:2308.07926","author":"Ouyang Hao","year":"2023","unstructured":"Hao Ouyang, Qiuyu Wang, Yuxi Xiao, Qingyan Bai, Juntao Zhang, Kecheng Zheng, Xiaowei Zhou, Qifeng Chen, and Yujun Shen. 2023. Codef: Content deformation fields for temporally consistent video processing. arXiv preprint arXiv:2308.07926 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"The 2017 davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675","author":"Pont-Tuset Jordi","year":"2017","unstructured":"Jordi Pont-Tuset, Federico Perazzi, Sergi Caelles, Pablo Arbel\u00e1ez, Alex Sorkine-Hornung, and Luc Van Gool. 2017. The 2017 davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)."},{"key":"e_1_3_2_1_34_1","volume-title":"Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535","author":"Qi Chenyang","year":"2023","unstructured":"Chenyang Qi, Xiaodong Cun, Yong Zhang, Chenyang Lei, Xintao Wang, Ying Shan, and Qifeng Chen. 2023. Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Machine Learning. PMLR, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821--8831."},{"key":"e_1_3_2_1_37_1","volume-title":"International conference on machine learning. 
PMLR, 1060--1069","author":"Reed Scott","year":"2016","unstructured":"Scott Reed, Zeynep Akata, Xinchen Yan, Lajanugen Logeswaran, Bernt Schiele, and Honglak Lee. 2016. Generative adversarial text to image synthesis. In International conference on machine learning. PMLR, 1060--1069."},{"key":"e_1_3_2_1_38_1","volume-title":"Appeared in the Proceedings of the International Conference on Visualization, Imaging and Image Processing (VIIP","author":"Richard MMOBB","year":"2001","unstructured":"MMOBB Richard and MYS Chang. 2001. Fast digital image inpainting. In Appeared in the Proceedings of the International Conference on Visualization, Imaging and Image Processing (VIIP 2001), Marbella, Spain. 106--107."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530757"},{"key":"e_1_3_2_1_42_1","first-page":"36479","article-title":"2022. Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","unstructured":"Xincheng Shuai Henghui Ding Xingjun Ma Rongcheng Tu Yu-Gang Jiang and Dacheng Tao. 2024. A Survey of Multimodal-Guided Image Editing with Text-to-Image Diffusion Models. arxiv: 2406.14555 [cs.CV] https:\/\/arxiv.org\/abs\/2406.14555"},{"key":"e_1_3_2_1_44_1","volume-title":"Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_2_1_47_1","volume-title":"VideoComposer: Compositional Video Synthesis with Motion Controllability. arXiv preprint arXiv:2306.02018","author":"Wang Xiang","year":"2023","unstructured":"Xiang Wang, Hangjie Yuan, Shiwei Zhang, Dayou Chen, Jiuniu Wang, Yingya Zhang, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. VideoComposer: Compositional Video Synthesis with Motion Controllability. arXiv preprint arXiv:2306.02018 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Lavie: High-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103","author":"Wang Yaohui","year":"2023","unstructured":"Yaohui Wang, Xinyuan Chen, Xin Ma, Shangchen Zhou, Ziqi Huang, Yi Wang, Ceyuan Yang, Yinan He, Jiashuo Yu, Peiqing Yang, et al. 2023. 
Lavie: High-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103 (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. arXiv preprint arXiv:2212.11565","author":"Wu Jay Zhangjie","year":"2022","unstructured":"Jay Zhangjie Wu, Yixiao Ge, Xintao Wang, Weixian Lei, Yuchao Gu, Wynne Hsu, Ying Shan, Xiaohu Qie, and Mike Zheng Shou. 2022. Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. arXiv preprint arXiv:2212.11565 (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"CVPR 2023 Text Guided Video Editing Competition. arxiv: 2310","author":"Wu Jay Zhangjie","year":"2023","unstructured":"Jay Zhangjie Wu, Xiuyu Li, Difei Gao, Zhen Dong, Jinbin Bai, Aishani Singh, Xiaoyu Xiang, Youzeng Li, Zuwei Huang, Yuanxi Sun, Rui He, Feng Hu, Junhua Hu, Hai Huang, Hanyu Zhu, Xu Cheng, Jie Tang, Mike Zheng Shou, Kurt Keutzer, and Forrest Iandola. 2023. CVPR 2023 Text Guided Video Editing Competition. arxiv: 2310.16003 [cs.CV]"},{"key":"e_1_3_2_1_51_1","unstructured":"Zhen Xing Qi Dai Han Hu Zuxuan Wu and Yu-Gang Jiang. 2023. SimDA: Simple Diffusion Adapter for Efficient Video Generation. arxiv: 2308.09710 [cs.CV]"},{"key":"e_1_3_2_1_52_1","unstructured":"Zhen Xing Qijun Feng Haoran Chen Qi Dai Han Hu Hang Xu Zuxuan Wu and Yu-Gang Jiang. 2023. A Survey on Video Diffusion Models. arxiv: 2310.10647 [cs.CV] https:\/\/arxiv.org\/abs\/2310.10647"},{"key":"e_1_3_2_1_53_1","volume-title":"Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al.","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al. 2022. Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789, Vol. 2, 3 (2022), 5."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"e_1_3_2_1_55_1","volume-title":"Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543","author":"Zhang Lvmin","year":"2023","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2023. Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)."},{"key":"e_1_3_2_1_56_1","unstructured":"Haoyu Zhao Tianyi Lu Jiaxi Gu Xing Zhang Qingping Zheng Zuxuan Wu Hang Xu and Yu-Gang Jiang. 2024. MagDiff: Multi-Alignment Diffusion for High-Fidelity Video Generation and Editing. 
arxiv: 2311.17338 [cs.CV] https:\/\/arxiv.org\/abs\/2311.17338"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680683","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680683","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680683"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":56,"alternative-id":["10.1145\/3664647.3680683","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680683","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
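The record above is a single work object from the public Crossref REST API (the /works/{DOI} route). As a minimal sketch of how such a record can be retrieved and picked apart, assuming the third-party requests package is available (any HTTP client would do):

```python
# Minimal sketch: fetch the work record shown above from the Crossref REST API
# and extract a few common fields. Assumes the third-party `requests` package;
# error handling beyond raise_for_status() is omitted for brevity.
import requests

DOI = "10.1145/3664647.3680683"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()

# The payload is an envelope: {"status": "ok", ..., "message": {<work record>}}.
work = resp.json()["message"]

title = work["title"][0]  # titles are stored as a list
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in work.get("author", [])
)
print(title)
print(authors)
print("deposited references:", work.get("reference-count"))
```

Note that entries in the "reference" array come in two shapes, both visible above: DOI-only entries ("doi-asserted-by" plus "DOI") and publisher-deposited citation strings ("unstructured", sometimes with extra fields like "volume-title"), so any consumer of that array should handle both.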