{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T02:11:34Z","timestamp":1768356694872,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":34,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556786","type":"print"},{"value":"9789819556793","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5679-3_32","type":"book-chapter","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T18:37:03Z","timestamp":1768329423000},"page":"460-474","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ITVTON: Virtual Try-On Diffusion Transformer Based on\u00a0Integrated Image and\u00a0Text"],"prefix":"10.1007","author":[{"given":"Haifeng","family":"Ni","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,14]]},"reference":[{"key":"32_CR1","unstructured":"Bi\u0144kowski, M., Sutherland, D.J., Arbel, M., Gretton, A.: Demystifying MMD GANs (2021). https:\/\/arxiv.org\/abs\/1801.01401"},{"key":"32_CR2","doi-asserted-by":"crossref","unstructured":"Cao, Z., Simon, T., Wei, S.E., Sheikh, Y.: Realtime multi-person 2D pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.143"},{"key":"32_CR3","doi-asserted-by":"crossref","unstructured":"Chen, C.Y., Chen, Y.C., Shuai, H.H., Cheng, W.H.: Size does matter: size-aware virtual try-on via clothing-oriented transformation try-on network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7513\u20137522 (2023)","DOI":"10.1109\/ICCV51070.2023.00691"},{"key":"32_CR4","doi-asserted-by":"crossref","unstructured":"Choi, S., Park, S., Lee, M., Choo, J.: VITON-HD: high-resolution virtual try-on via misalignment-aware normalization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14131\u201314140 (2021)","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"32_CR5","doi-asserted-by":"crossref","unstructured":"Choi, Y., Kwak, S., Lee, K., Choi, H., Shin, J.: Improving diffusion models for authentic virtual try-on in the wild (2024). https:\/\/arxiv.org\/abs\/2403.05139","DOI":"10.1007\/978-3-031-73016-0_13"},{"key":"32_CR6","unstructured":"Chong, Z., et al.: CatVTON: concatenation is all you need for virtual try-on with diffusion models (2024). https:\/\/arxiv.org\/abs\/2407.15886"},{"key":"32_CR7","doi-asserted-by":"crossref","unstructured":"Dong, H., et al.: Towards multi-pose guided virtual try-on network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00912"},{"key":"32_CR8","unstructured":"Esser, P., et al.: Scaling rectified flow transformers for high-resolution image synthesis. In: Forty-first International Conference on Machine Learning (2024). https:\/\/openreview.net\/forum?id=FPnUhsQJ5B"},{"key":"32_CR9","doi-asserted-by":"crossref","unstructured":"Gao, B., Ren, J., Shen, F., Wei, M., Huang, Z.: Exploring warping-guided features via adaptive latent diffusion model for virtual try-on. In: 2024 IEEE International Conference on Multimedia and Expo (ICME), pp.\u00a01\u20136. IEEE (2024)","DOI":"10.1109\/ICME57554.2024.10687416"},{"key":"32_CR10","doi-asserted-by":"publisher","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020). https:\/\/doi.org\/10.1145\/3422622","DOI":"10.1145\/3422622"},{"key":"32_CR11","doi-asserted-by":"publisher","unstructured":"Gou, J., Sun, S., Zhang, J., Si, J., Qian, C., Zhang, L.: Taming the power of diffusion models for high-quality virtual try-on with appearance flow. In: Proceedings of the 31st ACM International Conference on Multimedia, MM 2023, pp. 7599\u20137607. Association for Computing Machinery, New York (2023). https:\/\/doi.org\/10.1145\/3581783.3612255","DOI":"10.1145\/3581783.3612255"},{"key":"32_CR12","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. Adv. Neural. Inf. Proc. Syst. 30 (2017)"},{"key":"32_CR13","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models (2021). https:\/\/arxiv.org\/abs\/2106.09685"},{"key":"32_CR14","unstructured":"Huang, L., et al.: In-context lora for diffusion transformers (2024). https:\/\/arxiv.org\/abs\/2410.23775"},{"key":"32_CR15","doi-asserted-by":"crossref","unstructured":"Kim, J., Gu, G., Park, M., Park, S., Choo, J.: StableVITON: learning semantic correspondence with latent diffusion model for virtual try-on. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8176\u20138185 (2024)","DOI":"10.1109\/CVPR52733.2024.00781"},{"issue":"6","key":"32_CR16","doi-asserted-by":"publisher","first-page":"3260","DOI":"10.1109\/TPAMI.2020.3048039","volume":"44","author":"P Li","year":"2022","unstructured":"Li, P., Xu, Y., Wei, Y., Yang, Y.: Self-correction for human parsing. IEEE Trans. Pattern Anal. Mach. Intell. 44(6), 3260\u20133271 (2022). https:\/\/doi.org\/10.1109\/TPAMI.2020.3048039","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"32_CR17","doi-asserted-by":"publisher","unstructured":"Morelli, D., et al.: LaDI-VTON: latent diffusion textual-inversion enhanced virtual try-on. In: Proceedings of the 31st ACM International Conference on Multimedia, MM 2023, pp. 8580\u20138589. Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3581783.3612137","DOI":"10.1145\/3581783.3612137"},{"key":"32_CR18","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"32_CR19","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"32_CR20","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020). http:\/\/jmlr.org\/papers\/v21\/20-074.html"},{"key":"32_CR21","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"32_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"32_CR23","unstructured":"Shen, F., et al.: IMAGDressing-v1: customizable virtual dressing (2024). https:\/\/arxiv.org\/abs\/2407.12705"},{"key":"32_CR24","unstructured":"Shen, F., et al.: Long-term TalkingFace generation via motion-prior conditional diffusion model. arXiv preprint arXiv:2502.09533 (2025)"},{"key":"32_CR25","unstructured":"Shen, F., et al.: Boosting consistency in story visualization with rich-contextual conditional diffusion models. arXiv preprint arXiv:2407.02482 (2024)"},{"key":"32_CR26","unstructured":"Shen, F., Ye, H., Zhang, J., Wang, C., Han, X., Yang, W.: Advancing pose-guided image synthesis with progressive conditional diffusion models. arXiv preprint arXiv:2310.06313 (2023)"},{"key":"32_CR27","doi-asserted-by":"crossref","unstructured":"Simon, T., Joo, H., Matthews, I., Sheikh, Y.: Hand keypoint detection in single images using multiview bootstrapping. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.494"},{"issue":"4","key":"32_CR28","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A., Sheikh, H., Simoncelli, E.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004). https:\/\/doi.org\/10.1109\/TIP.2003.819861","journal-title":"IEEE Trans. Image Process."},{"key":"32_CR29","doi-asserted-by":"crossref","unstructured":"Wei, S.E., Ramakrishna, V., Kanade, T., Sheikh, Y.: Convolutional pose machines. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.511"},{"key":"32_CR30","doi-asserted-by":"crossref","unstructured":"Xie, Z., et al.: GP-VTON: Towards general purpose virtual try-on via collaborative local-flow global-parsing learning (2023). https:\/\/arxiv.org\/abs\/2303.13756","DOI":"10.1109\/CVPR52729.2023.02255"},{"key":"32_CR31","unstructured":"Xu, Y., Gu, T., Chen, W., Chen, C.: OOTDiffusion: outfitting fusion based latent diffusion for controllable virtual try-on (2024). https:\/\/arxiv.org\/abs\/2403.01779"},{"key":"32_CR32","doi-asserted-by":"crossref","unstructured":"Yang, B., et al.: Paint by example: exemplar-based image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18381\u201318391 (2023)","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"32_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"32_CR34","doi-asserted-by":"crossref","unstructured":"Zhu, L., et al.: TryOnDiffusion: a tale of two UNets. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4606\u20134615 (2023)","DOI":"10.1109\/CVPR52729.2023.00447"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5679-3_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T18:37:06Z","timestamp":1768329426000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5679-3_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556786","9789819556793"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5679-3_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"14 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}