{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T10:12:34Z","timestamp":1768990354159,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100006785","name":"Google","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006785","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687623","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Fashion-VDM: Video Diffusion Model for Virtual Try-On"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9999-5510","authenticated-orcid":false,"given":"Johanna","family":"Karras","sequence":"first","affiliation":[{"name":"Google Inc., Seattle, United States of America and University of Washington, Seattle, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2968-0735","authenticated-orcid":false,"given":"Yingwei","family":"Li","sequence":"additional","affiliation":[{"name":"Google Inc., Mountain View, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1380-5428","authenticated-orcid":false,"given":"Nan","family":"Liu","sequence":"additional","affiliation":[{"name":"Google Inc., New York City, United States of 
America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8543-9177","authenticated-orcid":false,"given":"Luyang","family":"Zhu","sequence":"additional","affiliation":[{"name":"Google Inc., Seattle, United States of America and University of Washington, Seattle, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4616-4644","authenticated-orcid":false,"given":"Innfarn","family":"Yoo","sequence":"additional","affiliation":[{"name":"Google Inc., Mountain View, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5154-6116","authenticated-orcid":false,"given":"Andreas","family":"Lugmayr","sequence":"additional","affiliation":[{"name":"Google Inc., Mountain View, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5703-9605","authenticated-orcid":false,"given":"Chris","family":"Lee","sequence":"additional","affiliation":[{"name":"Google Inc., New York City, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9498-584X","authenticated-orcid":false,"given":"Ira","family":"Kemelmacher-Shlizerman","sequence":"additional","affiliation":[{"name":"Google Inc., Seattle, United States of America and University of Washington, Seattle, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_2_2_1","unstructured":"Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts Varun Jampani Robin\u00a0Rombach Andreas\u00a0Blattmann Tim\u00a0Dockhorn. 2023. Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets."},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_24"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_2_6_1","unstructured":"Shouyuan Chen Sherman Wong Liangjian Chen and Yuandong Tian. 
2023b. Extending Context Window of Large Language Models via Positional Interpolation. arXiv:arXiv:2306.15595"},{"key":"e_1_3_3_2_7_1","unstructured":"Xinyuan Chen Yaohui Wang Lingjun Zhang Shaobin Zhuang Xin Ma Jiashuo Yu Yali Wang Dahua Lin Yu Qiao and Ziwei Liu. 2023a. SEINE: Short-to-Long Video Diffusion Model for Generative Transition and Prediction. arXiv:https:\/\/arXiv.org\/abs\/2310.20700"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"e_1_3_3_2_9_1","unstructured":"Aiyu Cui Jay Mahajan Viraj Shah Preeti Gomathinayagam and Svetlana Lazebnik. 2023. Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person Images. arXiv:https:\/\/arXiv.org\/abs\/2311.16094"},{"key":"e_1_3_3_2_10_1","unstructured":"Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. arXiv:arXiv:2105.05233"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00347"},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"crossref","unstructured":"Rohit Girdhar Mannat Singh Andrew Brown Quentin Duval Samaneh Azadi Sai\u00a0Saketh Rambhatla Akbar Shah Xi Yin Devi Parikh and Ishan Misra. 2023. Emu Video: Factorizing Text-to-Video Generation by Explicit Image Conditioning. arXiv:arXiv:2311.10709","DOI":"10.1007\/978-3-031-73033-7_12"},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"crossref","unstructured":"Ke Gong Yiming Gao Xiaodan Liang Xiaohui Shen Meng Wang and Liang Lin. 2019. Graphonomy: Universal Human Parsing via Graph Transfer Learning. arXiv:arXiv:1904.04536","DOI":"10.1109\/CVPR.2019.00763"},{"key":"e_1_3_3_2_14_1","unstructured":"Jiaxi Gu Shicong Wang Haoyu Zhao Tianyi Lu Xing Zhang Zuxuan Wu Songcen Xu Wei Zhang Yu-Gang Jiang and Hang Xu. 2023. Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation. arXiv:arXiv:2309.03549"},{"key":"e_1_3_3_2_15_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Yaohui Wang Yu Qiao Dahua Lin and Bo Dai. 2023. 
AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. arXiv:arXiv:2307.04725"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00787"},{"key":"e_1_3_3_2_17_1","first-page":"1161\u20131170","volume-title":"2019 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Xiaodan\u00a0Liang Xiaohui Shen B. Wu Bing cheng\u00a0Chen Haoye\u00a0Dong,","year":"2019","unstructured":"Xiaohui Shen B. Wu Bing cheng\u00a0Chen Haoye\u00a0Dong, Xiaodan\u00a0Liang and J. Yin. 2019. FW-GAN: Flow-Navigated Warping GAN for Video Virtual Try-On. In 2019 IEEE\/CVF International Conference on Computer Vision (ICCV). Yunlin, Taiwan, 1161\u20131170."},{"key":"e_1_3_3_2_18_1","unstructured":"William Harvey Saeid Naderiparizi Vaden Masrani Christian Weilbach and Frank Wood. 2022. Flexible Diffusion Modeling of Long Videos. arXiv:arXiv:2205.11495"},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00346"},{"key":"e_1_3_3_2_20_1","unstructured":"Yingqing He Tianyu Yang Yong Zhang Ying Shan and Qifeng Chen. 2022b. Latent Video Diffusion Models for High-Fidelity Long Video Generation. arXiv:arXiv:2211.13221"},{"key":"e_1_3_3_2_21_1","volume-title":"Advances in Neural Information Processing Systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. 
Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/8a1d694707eb0fefe65871369074926d-Paper.pdf https:\/\/dl.acm.org\/doi\/10.5555\/3295222.3295408"},{"key":"e_1_3_3_2_22_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik\u00a0P. Kingma Ben Poole Mohammad Norouzi David\u00a0J. Fleet and Tim Salimans. 2022a. Imagen Video: High Definition Video Generation with Diffusion Models. arXiv:arXiv:2210.02303"},{"key":"e_1_3_3_2_23_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. arXiv:arXiv:2006.11239"},{"key":"e_1_3_3_2_24_1","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-Free Diffusion Guidance. arXiv:arXiv:2207.12598"},{"key":"e_1_3_3_2_25_1","unstructured":"Jonathan Ho Tim Salimans Alexey Gritsenko William Chan Mohammad Norouzi and David\u00a0J. Fleet. 2022b. Video Diffusion Models. arXiv:arXiv:2204.03458"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"crossref","unstructured":"Li Hu Xin Gao Peng Zhang Ke Sun Bang Zhang and Liefeng Bo. 2023. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv:arXiv:2311.17117","DOI":"10.1109\/CVPR52733.2024.00779"},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"crossref","unstructured":"Gao Huang Yu Sun Zhuang Liu Daniel Sedra and Kilian Weinberger. 2016. Deep Networks with Stochastic Depth. arXiv:arXiv:1603.09382","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01053"},{"key":"e_1_3_3_2_29_1","first-page":"22680","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Karras Johanna","year":"2023","unstructured":"Johanna Karras, Aleksander Holynski, Ting-Chun Wang, and Ira Kemelmacher-Shlizerman. 2023. DreamPose: Fashion Video Synthesis with Stable Diffusion. 
In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 22680\u201322690."},{"key":"e_1_3_3_2_30_1","unstructured":"Jeongho Kim Gyojung Gu Minho Park Sunghyun Park and Jaegul Choo. 2023. StableVITON: Learning Semantic Correspondence with Latent Diffusion Model for Virtual Try-On. arXiv:https:\/\/arXiv.org\/abs\/2312.01725"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_13"},{"key":"e_1_3_3_2_32_1","unstructured":"Seung\u00a0Hyun Lee Sieun Kim Innfarn Yoo Feng Yang Donghyeon Cho Youngseo Kim Huiwen Chang Jinkyu Kim and Sangpil Kim. 2023. Soundini: Sound-Guided Diffusion for Natural Video Editing. arXiv:https:\/\/arXiv.org\/abs\/2304.06818"},{"key":"e_1_3_3_2_33_1","doi-asserted-by":"publisher","unstructured":"Kathleen\u00a0M Lewis Srivatsan Varadharajan and Ira Kemelmacher-Shlizerman. 2021. TryOnGAN: body-aware try-on via layered interpolation. ACM Trans. Graph. 40 4 Article 115 (jul 2021) 10\u00a0pages. 10.1145\/3450626.3459884https:\/\/dl.acm.org\/doi\/10.1145\/3450626.3459884","DOI":"10.1145\/3450626.3459884"},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_26"},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"crossref","unstructured":"Kangfu Mei and Vishal Patel. 2023. VIDM: Video Implicit Diffusion Models. Proceedings of the AAAI Conference on Artificial Intelligence 37 8 (Jun. 2023) 9117\u20139125.","DOI":"10.1609\/aaai.v37i8.26094"},{"key":"e_1_3_3_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00513"},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"crossref","unstructured":"William Peebles and Saining Xie. 2022. Scalable Diffusion Models with Transformers. arXiv:arXiv:2212.09748","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_38_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. 
Learning Transferable Visual Models From Natural Language Supervision. arXiv:arXiv:2103.00020"},{"key":"e_1_3_3_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01317"},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_41_1","unstructured":"Tim Salimans and Jonathan Ho. 2022. Progressive Distillation for Fast Sampling of Diffusion Models. arXiv:arXiv:2202.00512"},{"key":"e_1_3_3_2_42_1","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman Patrick Schramowski Srivatsa Kundurthy Katherine Crowson Ludwig Schmidt Robert Kaczmarczyk and Jenia Jitsev. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. arXiv:arXiv:2210.08402"},{"key":"e_1_3_3_2_43_1","unstructured":"Jascha Sohl-Dickstein Eric\u00a0A. Weiss Niru Maheswaranathan and Surya Ganguli. 2015. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. arXiv:arXiv:1503.03585"},{"key":"e_1_3_3_2_44_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. arXiv:arXiv:2010.02502"},{"key":"e_1_3_3_2_45_1","unstructured":"Yang Song and Stefano Ermon. 2019. Generative Modeling by Estimating Gradients of the Data Distribution. arXiv:arXiv:1907.05600"},{"key":"e_1_3_3_2_46_1","unstructured":"Thomas Unterthiner Sjoerd van Steenkiste Karol Kurach Raphael Marinier Marcin Michalski and Sylvain Gelly. 2018. Towards Accurate Generative Models of Video: A New Metric & Challenges. arXiv:arXiv:1812.01717"},{"key":"e_1_3_3_2_47_1","unstructured":"Fu-Yun Wang Wenshuo Chen Guanglu Song Han-Jia Ye Yu Liu and Hongsheng Li. 2023b. Gen-L-Video: Multi-Text to Long Video Generation via Temporal Co-Denoising. 
arXiv:https:\/\/arXiv.org\/abs\/2305.18264"},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"crossref","unstructured":"Yaohui Wang Xinyuan Chen Xin Ma Shangchen Zhou Ziqi Huang Yi Wang Ceyuan Yang Yinan He Jiashuo Yu Peiqing Yang Yuwei Guo Tianxing Wu Chenyang Si Yuming Jiang Cunjian Chen Chen\u00a0Change Loy Bo Dai Dahua Lin Yu Qiao and Ziwei Liu. 2023a. LAVIE: High-Quality Video Generation with Cascaded Latent Diffusion Models. arXiv:arXiv:2309.15103","DOI":"10.1007\/s11263-024-02295-1"},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"crossref","unstructured":"Yi-Cheng\u00a0Tien Wen-Jiin\u00a0Tsai. 2023. Attention-based Video Virtual Try-On. ACM Proceedings of the 2023 ACM International Conference on Multimedia Retrieval 209\u2013216.","DOI":"10.1145\/3591106.3592252"},{"key":"e_1_3_3_2_50_1","unstructured":"Weilin\u00a0Huang Xintong\u00a0Han Xiaojun\u00a0Hu and Matthew\u00a0R Scott. 2020. Clothflow: A flow-based model for clothed person generation. Proceedings of the IEEE\/CVF international conference on computer vision 139\u2013144 ."},{"key":"e_1_3_3_2_51_1","unstructured":"Zhongcong Xu Jianfeng Zhang Jun\u00a0Hao Liew Hanshu Yan Jia-Wei Liu Chenxu Zhang Jiashi Feng and Mike\u00a0Zheng Shou. 2023. MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model. arXiv:arXiv:2311.16498"},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00787"},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"crossref","unstructured":"Ruiyun Yu Xiaoqi Wang and Xiaohui Xie. 2019. Vtnfp: An image-based virtual try-on network with body and clothing feature preservation. Proceedings of the IEEE\/CVF international conference on computer vision 10511\u201310520.","DOI":"10.1109\/ICCV.2019.01061"},{"key":"e_1_3_3_2_54_1","unstructured":"Polina Zablotskaia Aliaksandr Siarohin Bo Zhao and Leonid Sigal. 2019. DwNet: Dense warp-based network for pose-guided human video generation. 
arXiv:arXiv:1910.09139"},{"key":"e_1_3_3_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00789"},{"key":"e_1_3_3_2_56_1","unstructured":"Xujie Zhang Xiu Li Michael Kampffmeyer Xin Dong Zhenyu Xie Feida Zhu Haoye Dong and Xiaodan Liang. 2023. WarpDiffusion: Efficient Diffusion Model for High-Fidelity Virtual Try-on. arXiv:https:\/\/arXiv.org\/abs\/2312.03667"},{"key":"e_1_3_3_2_57_1","doi-asserted-by":"crossref","unstructured":"Xiaojing Zhong Zhonghua Wu Taizhe Tan Guosheng Lin and Qingyao Wu. 2021. MV-TON: Memory-based Video Virtual Try-on network. (2021). arXiv:arXiv:2108.07502","DOI":"10.1145\/3474085.3475269"},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"crossref","unstructured":"Luyang Zhu Yingwei Li Nan Liu Hao Peng Dawei Yang and Ira Kemelmacher-Shlizerman. 2024. M&M VTO: Multi-Garment Virtual Try-On and Editing.","DOI":"10.1109\/CVPR52733.2024.00134"},{"key":"e_1_3_3_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00447"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference 
Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687623","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687623","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:27Z","timestamp":1750294707000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687623"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":58,"alternative-id":["10.1145\/3680528.3687623","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687623","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}