{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T22:46:04Z","timestamp":1775861164237,"version":"3.50.1"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726422","type":"print"},{"value":"9783031726439","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72643-9_8","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T20:47:10Z","timestamp":1732222030000},"page":"123-139","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["WildVidFit: Video Virtual Try-On in\u00a0the\u00a0Wild via\u00a0Image-Based Controlled Diffusion Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1845-2195","authenticated-orcid":false,"given":"Zijian","family":"He","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8641-1048","authenticated-orcid":false,"given":"Peixin","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7760-1339","authenticated-orcid":false,"given":"Guangrun","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4805-0926","authenticated-orcid":false,"given":"Guanbin","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0259-5732","authenticated-orcid":false,"given":"Philip H. S.","family":"Torr","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2248-3755","authenticated-orcid":false,"given":"Liang","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"8_CR1","doi-asserted-by":"publisher","unstructured":"Bai, S., Zhou, H., Li, Z., Zhou, C., Yang, H.: Single stage virtual try-on via\u00a0deformable attention flows. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XV, pp. 409\u2013425. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19784-0_24","DOI":"10.1007\/978-3-031-19784-0_24"},{"key":"8_CR2","unstructured":"Bi\u0144kowski, M., Sutherland, D.J., Arbel, M., Gretton, A.: Demystifying MMD GANs. arXiv preprint arXiv:1801.01401 (2018)"},{"key":"8_CR3","unstructured":"Cao, Z., Hidalgo Martinez, G., Simon, T., Wei, S., Sheikh, Y.A.: Openpose: realtime multi-person 2d pose estimation using part affinity fields. IEEE Trans. Pattern Anal. Mach. Intell. (2019)"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Chen, X., Huang, L., Liu, Y., Shen, Y., Zhao, D., Zhao, H.: Anydoor: zero-shot object-level image customization. arXiv preprint arXiv:2307.09481 (2023)","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Choi, S., Park, S., Lee, M., Choo, J.: Viton-hd: high-resolution virtual try-on via misalignment-aware normalization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14131\u201314140 (2021)","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"8_CR7","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Dong, H., Liang, X., Shen, X., Wu, B., Chen, B.C., Yin, J.: Fw-gan: flow-navigated warping gan for video virtual try-on. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1161\u20131170 (2019)","DOI":"10.1109\/ICCV.2019.00125"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Dong, X.,et al.: Dressing in the wild by watching dance videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3480\u20133489 (2022)","DOI":"10.1109\/CVPR52688.2022.00347"},{"key":"8_CR10","first-page":"35946","volume":"35","author":"C Feichtenhofer","year":"2022","unstructured":"Feichtenhofer, C., Li, Y., He, K., et al.: Masked autoencoders as spatiotemporal learners. Adv. Neural. Inf. Process. Syst. 35, 35946\u201335958 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Ge, Y., Song, Y., Zhang, R., Ge, C., Liu, W., Luo, P.: Parser-free virtual try-on via distilling appearance flows. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8485\u20138493 (2021)","DOI":"10.1109\/CVPR46437.2021.00838"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Gou, J., Sun, S., Zhang, J., Si, J., Qian, C., Zhang, L.: Taming the power of diffusion models for high-quality virtual try-on with appearance flow. arXiv preprint arXiv:2308.06101 (2023)","DOI":"10.1145\/3581783.3612255"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Han, X., Hu, X., Huang, W., Scott, M.R.: Clothflow: a flow-based model for clothed person generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10471\u201310480 (2019)","DOI":"10.1109\/ICCV.2019.01057"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"Han, X., Wu, Z., Wu, Z., Yu, R., Davis, L.S.: Viton: an image-based virtual try-on network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7543\u20137552 (2018)","DOI":"10.1109\/CVPR.2018.00787"},{"key":"8_CR15","doi-asserted-by":"crossref","unstructured":"He, S., Song, Y.Z., Xiang, T.: Style-based global appearance flow for virtual try-on. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3470\u20133479 (2022)","DOI":"10.1109\/CVPR52688.2022.00346"},{"key":"8_CR16","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"8_CR17","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR18","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"8_CR19","first-page":"32736","volume":"35","author":"Z Huang","year":"2022","unstructured":"Huang, Z., Li, H., Xie, Z., Kampffmeyer, M., Liang, X., et al.: Towards hard-pose virtual try-on via 3d-aware global correspondence learning. Adv. Neural. Inf. Process. Syst. 35, 32736\u201332748 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"619","DOI":"10.1007\/978-3-030-58565-5_37","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Issenhuth","year":"2020","unstructured":"Issenhuth, T., Mary, J., Calauz\u00e8nes, C.: Do not mask what you do not need to mask: a parser-free virtual try-on. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 619\u2013635. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_37"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Jafarian, Y., Park, H.S.: Self-supervised 3d representation learning of dressed humans from social media videos. IEEE Trans. Pattern Anal. Mach. Intell. (2022)","DOI":"10.1109\/TPAMI.2022.3231558"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Jiang, J., Wang, T., Yan, H., Liu, J.: Clothformer: taming video virtual try-on in all module. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10799\u201310808 (2022)","DOI":"10.1109\/CVPR52688.2022.01053"},{"issue":"2","key":"8_CR23","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1109\/4.996","volume":"23","author":"N Kanopoulos","year":"1988","unstructured":"Kanopoulos, N., Vasanthavada, N., Baker, R.L.: Design of an image edge detection filter using the sobel operator. IEEE J. Solid-State Circuits 23(2), 358\u2013367 (1988)","journal-title":"IEEE J. Solid-State Circuits"},{"key":"8_CR24","unstructured":"Kirillov, A., et al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"8_CR25","doi-asserted-by":"publisher","unstructured":"Lee, S., Gu, G., Park, S., Choi, S., Choo, J.: High-resolution virtual try-on with\u00a0misalignment and\u00a0occlusion-handled conditions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XVII, pp. 204\u2013219. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_13","DOI":"10.1007\/978-3-031-19790-1_13"},{"key":"8_CR26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3048039","author":"P Li","year":"2020","unstructured":"Li, P., Xu, Y., Wei, Y., Yang, Y.: Self-correction for human parsing. IEEE Trans. Pattern Anal. Mach. Intell. (2020). https:\/\/doi.org\/10.1109\/TPAMI.2020.3048039","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"8_CR27","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"8_CR28","doi-asserted-by":"crossref","unstructured":"Men, Y., Mao, Y., Jiang, Y., Ma, W.Y., Lian, Z.: Controllable person image synthesis with attribute-decomposed GAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5084\u20135093 (2020)","DOI":"10.1109\/CVPR42600.2020.00513"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Morelli, D., Baldrati, A., Cartella, G., Cornia, M., Bertini, M., Cucchiara, R.: Ladi-vton: latent diffusion textual-inversion enhanced virtual try-on. arXiv preprint arXiv:2305.13501 (2023)","DOI":"10.1145\/3581783.3612137"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Morelli, D., Fincato, M., Cornia, M., Landi, F., Cesari, F., Cucchiara, R.: Dress code: high-resolution multi-category virtual try-on. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2231\u20132235 (2022)","DOI":"10.1109\/CVPRW56347.2022.00243"},{"key":"8_CR31","unstructured":"Oquab, M., et\u00a0al.: Dinov2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Ren, Y., Fan, X., Li, G., Liu, S., Li, T.H.: Neural texture extraction and distribution for controllable person image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13535\u201313544 (2022)","DOI":"10.1109\/CVPR52688.2022.01317"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR34","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"8_CR35","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"8_CR36","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"8_CR37","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)"},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"Wang, B., Zheng, H., Liang, X., Chen, Y., Lin, L., Yang, M.: Toward characteristic-preserving image-based virtual try-on network. In: Proceedings of the European Conference on Computer Vision (ECCV) pp. 589\u2013604 (2018)","DOI":"10.1007\/978-3-030-01261-8_36"},{"issue":"4","key":"8_CR39","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004)","journal-title":"IEEE Trans. Image Process."},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Xie, Z., et al.: Gp-vton: towards general purpose virtual try-on via collaborative local-flow global-parsing learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23550\u201323559 (2023)","DOI":"10.1109\/CVPR52729.2023.02255"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Yang, H., Yu, X., Liu, Z.: Full-range virtual try-on with recurrent tri-level transform. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3460\u20133469 (2022)","DOI":"10.1109\/CVPR52688.2022.00345"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Yang, H., Zhang, R., Guo, X., Liu, W., Zuo, W., Luo, P.: Towards photo-realistic virtual try-on by adaptively generating-preserving image content. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7850\u20137859 (2020)","DOI":"10.1109\/CVPR42600.2020.00787"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Yu, R., Wang, X., Xie, X.: Vtnfp: an image-based virtual try-on network with body and clothing feature preservation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10511\u201310520 (2019)","DOI":"10.1109\/ICCV.2019.01061"},{"key":"8_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, J., Li, K., Lai, Y.K., Yang, J.: Pise: person image synthesis and editing with decoupled GAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7982\u20137990 (2021)","DOI":"10.1109\/CVPR46437.2021.00789"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"8_CR46","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: Controlvideo: training-free controllable text-to-video generation. arXiv preprint arXiv:2305.13077 (2023)"},{"key":"8_CR47","doi-asserted-by":"crossref","unstructured":"Zhong, X., Wu, Z., Tan, T., Lin, G., Wu, Q.: Mv-ton: memory-based video virtual try-on network. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 908\u2013916 (2021)","DOI":"10.1145\/3474085.3475269"},{"key":"8_CR48","doi-asserted-by":"crossref","unstructured":"Zhu, L., et al.: Tryondiffusion: a tale of two unets. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4606\u20134615 (2023)","DOI":"10.1109\/CVPR52729.2023.00447"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72643-9_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T21:25:49Z","timestamp":1732224349000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72643-9_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031726422","9783031726439"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72643-9_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}