{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T01:25:54Z","timestamp":1776043554052,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"Italian Ministry of University"},{"name":"European Union"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612137","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"8580-8589","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":103,"title":["LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7918-6220","authenticated-orcid":false,"given":"Davide","family":"Morelli","sequence":"first","affiliation":[{"name":"University of Modena and Reggio Emilia, Modena, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5012-5800","authenticated-orcid":false,"given":"Alberto","family":"Baldrati","sequence":"additional","affiliation":[{"name":"University of Florence, Florence, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5590-3253","authenticated-orcid":false,"given":"Giuseppe","family":"Cartella","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia, Modena, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia, Modena, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1364-218X","authenticated-orcid":false,"given":"Marco","family":"Bertini","sequence":"additional","affiliation":[{"name":"University of Florence, Florence, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2239-283X","authenticated-orcid":false,"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[{"name":"University of Modena and Reggio Emilia, Modena, Italy"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01762"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_24"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01407"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00543"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02138"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Bhunia Ankan Kumar","year":"2023","unstructured":"Ankan Kumar Bhunia, Salman Khan, Hisham Cholakkal, Rao Muhammad Anwer, Jorma Laaksonen, Mubarak Shah, and Fahad Shahbaz Khan. 2023. Person Image Synthesis via Denoising Diffusion Model. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Bi\u0144kowski Miko\u0142aj","year":"2018","unstructured":"Miko\u0142aj Bi\u0144kowski, Dougal J Sutherland, Michael Arbel, and Arthur Gretton. 2018. Demystifying MMD GANs. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01290"},{"key":"e_1_3_2_1_11_1","volume-title":"Multiresolution Textual Inversion. In Advances in Neural Information Processing Systems Workshops.","author":"Daras Giannis","year":"2022","unstructured":"Giannis Daras and Alexandros G Dimakis. 2022. Multiresolution Textual Inversion. In Advances in Neural Information Processing Systems Workshops."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531017"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.04101"},{"key":"e_1_3_2_1_14_1","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Jean Duchon. 1977. Splines minimizing rotation-invariant semi-norms in Sobolev spaces. In Constructive Theory of Functions of Several Variables.","DOI":"10.1007\/BFb0086566"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00226"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00246"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412052"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Gal Rinon","year":"2023","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit H. Bermano, Gal Chechik, and Daniel Cohen-Or. 2023. An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00838"},{"key":"e_1_3_2_1_22_1","unstructured":"Ian Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative adversarial nets. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.382"},{"key":"e_1_3_2_1_25_1","volume-title":"Highly Personalized Text Embedding for Image Manipulation by Stable Diffusion. arXiv preprint arXiv:2303.08767","author":"Han Inhwa","year":"2023","unstructured":"Inhwa Han, Serin Yang, Taesung Kwon, and Jong Chul Ye. 2023. Highly Personalized Text Embedding for Image Manipulation by Stable Diffusion. arXiv preprint arXiv:2303.08767 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00787"},{"key":"e_1_3_2_1_27_1","volume-title":"Gaussian Error Linear Units (GELUs). arXiv preprint arXiv:1606.08415","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian Error Linear Units (GELUs). arXiv preprint arXiv:1606.08415 (2016)."},{"key":"e_1_3_2_1_28_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler G\u00fcnter Klambauer and Sepp Hochreiter. 2017. GANs trained by a two time-scale update rule converge to a Nash equilibrium. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_29_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_30_1","volume-title":"Classifier-Free Diffusion Guidance. In Advances in Neural Information Processing Systems Workshops.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In Advances in Neural Information Processing Systems Workshops."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00748"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_37"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3528223.3530104","article-title":"Text2human: Text-driven controllable human image generation","volume":"41","author":"Jiang Yuming","year":"2022","unstructured":"Yuming Jiang, Shuai Yang, Haonan Qju, Wayne Wu, Chen Change Loy, and Ziwei Liu. 2022. Text2human: Text-driven controllable human image generation. ACM Transactions on Graphics, Vol. 41, 4 (2022), 1--11.","journal-title":"ACM Transactions on Graphics"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_13"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01529"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Meng Chenlin","year":"2022","unstructured":"Chenlin Meng, Yutong He adnd Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2022. SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_43_1","volume-title":"CEUR Workshop Proceedings.","author":"Morelli Davide","year":"2021","unstructured":"Davide Morelli, Marcella Cornia, Rita Cucchiara, et al. 2021. FashionSearch: Improving consumer-to-shop clothes retrieval with hard negatives. In CEUR Workshop Proceedings."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00243"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Nichol Alexander Quinn","year":"2021","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Nichol Alexander Quinn","year":"2022","unstructured":"Alexander Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, A. Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_49_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530757"},{"key":"e_1_3_2_1_54_1","volume-title":"Raphael Gontijo-Lopes, Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Raphael Gontijo-Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022b. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00359"},{"key":"e_1_3_2_1_56_1","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade W Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman Patrick Schramowski Srivatsa R Kundurthy Katherine Crowson Ludwig Schmidt Robert Kaczmarczyk and Jenia Jitsev. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.5555\/2627435.2670313"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_36"},{"key":"e_1_3_2_1_61_1","volume-title":"Pretraining is All You Need for Image-to-Image Translation. arXiv preprint arXiv:2205.12952","author":"Wang Tengfei","year":"2022","unstructured":"Tengfei Wang, Ting Zhang, Bo Zhang, Hao Ouyang, Dong Chen, Qifeng Chen, and Fang Wen. 2022. Pretraining is All You Need for Image-to-Image Translation. arXiv preprint arXiv:2205.12952 (2022)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00787"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_68_1","volume-title":"EGSDE: Unpaired Image-to-Image Translation via Energy-Guided Stochastic Differential Equations. In Advances in Neural Information Processing Systems.","author":"Zhao Min","year":"2022","unstructured":"Min Zhao, Fan Bao, Chongxuan Li, and Jun Zhu. 2022. EGSDE: Unpaired Image-to-Image Translation via Energy-Guided Stochastic Differential Equations. In Advances in Neural Information Processing Systems."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612137","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612137","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:53:53Z","timestamp":1755820433000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612137"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":68,"alternative-id":["10.1145\/3581783.3612137","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612137","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}