{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:53Z","timestamp":1777865333374,"version":"3.51.4"},"reference-count":73,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100023542","name":"Centre for Perceptual and Interactive Intelligence (CPII) Ltd","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100023542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003452","name":"Innovation and Technology Commission (ITC)'s InnoHK, and HKU Startup Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003452","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01110","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"11937-11947","source":"Crossref","is-referenced-by-count":0,"title":["Multi-Identity Human Image Animation with Structural Video Diffusion"],"prefix":"10.1109","author":[{"given":"Zhenzhi","family":"Wang","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Yixuan","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Yanhong","family":"Zeng","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory"}]},{"given":"Yuwei","family":"Guo","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Tianfan","family":"Xue","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Bo","family":"Dai","sequence":"additional","affiliation":[{"name":"The University of Hong Kong"}]}],"member":"263","reference":[{"key":"ref2","article-title":"ediffi: Text-to-image diffusion models with an ensemble of expert denoisers","author":"Balaji","year":"2022","journal-title":"arXiv preprint"},{"key":"ref3","article-title":"Hspace: Synthetic parametric humans animated in complex environments","author":"Gabriel Bazavan","year":"2021","journal-title":"arXiv preprint"},{"key":"ref4","article-title":"Zoedepth: Zero-shot transfer by combining relative and metric depth","author":"Farooq Bhat","year":"2023","journal-title":"arXiv preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00843"},{"key":"ref6","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3450537"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00603"},{"key":"ref12","article-title":"Magicdance: Realistic human dance video generation with motions & facial expressions transfer","author":"Chang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00762"},{"key":"ref14","article-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning","author":"Guo","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","article-title":"Cameractrl: Enabling camera control for text-to-video generation","author":"He","year":"2024","journal-title":"arXiv preprint"},{"key":"ref16","first-page":"6626","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","author":"Heusel","year":"2017","journal-title":"NIPS"},{"key":"ref17","article-title":"Denoising diffusion probabilistic models","author":"Ho","year":"2020","journal-title":"NeurIPS"},{"key":"ref18","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"NeurIPS"},{"key":"ref19","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv preprint"},{"key":"ref20","article-title":"Video diffusion models","author":"Ho","year":"2022","journal-title":"arXiv preprint"},{"key":"ref21","article-title":"Animate anyone: Consistent and controllable image-to-video synthesis for character animation","volume-title":"arXiv preprint","author":"Hu","year":"2023"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00193"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01256"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_12"},{"key":"ref25","article-title":"Auto-encoding variational bayes","author":"Kingma","year":"2013","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Hunyuanvideo: A systematic framework for large video generative models","author":"Kong","year":"2024","journal-title":"arXiv preprint"},{"key":"ref27","volume-title":"Flux","year":"2024"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref29","article-title":"Hyperhuman: Hyper-realistic human generation with latent structural diffusion","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"ref31","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"arXiv preprint"},{"key":"ref32","article-title":"Dpm-solver: A fast ODE solver for diffusion probabilistic model sampling in around 10 steps","author":"Lu","year":"2022","journal-title":"NeurIPS"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"ref34","article-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Nichol","year":"2021","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01326"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref37","article-title":"Sdxl: improving latent diffusion models for high-resolution image synthesis","volume-title":"arXiv preprint","author":"Podell","year":"2023"},{"key":"ref38","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref39","article-title":"Sam 2: Segment anything in images and videos","author":"Ravi","year":"2024","journal-title":"arXiv preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3018224"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"ref44","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","author":"Saharia","year":"2022","journal-title":"NeurIPS"},{"key":"ref45","article-title":"Human4dit: Free-view human video generation with 4d diffusion transformer","author":"Shao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref46","first-page":"7135","article-title":"First order motion model for image animation","author":"Siarohin","year":"2019","journal-title":"NeurIPS"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00248"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.494"},{"key":"ref50","article-title":"Make-a-video: Text-to-video generation without text-video data","author":"Singer","year":"2022","journal-title":"arXiv preprint"},{"key":"ref51","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","author":"Sohl-Dickstein","year":"2015","journal-title":"ICML"},{"key":"ref52","article-title":"Denoising diffusion implicit models","author":"Song","year":"2021","journal-title":"ICLR. OpenReview.net"},{"key":"ref53","article-title":"Score-based generative modeling through stochastic differential equations","author":"Song","journal-title":"arXiv preprint"},{"key":"ref54","first-page":"16558","article-title":"DROID-SLAM: deep visual SLAM for monocular, stereo, and RGB-D cameras","author":"Teed","year":"2021","journal-title":"NeurIPS"},{"key":"ref55","article-title":"FVD: A new metric for video generation","author":"Unterthiner","year":"2019","journal-title":"DGS@ICLR"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02295-1"},{"key":"ref57","article-title":"Leo: Generative latent image animator for human video synthesis","volume-title":"arXiv preprint","author":"Wang","year":"2023"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73247-8_27"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref60","article-title":"Humanvid: Demystifying training data for camera-controllable human image animation","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.511"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00147"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01855"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"ref65","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref66","article-title":"Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory","author":"Yin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00690"},{"key":"ref68","article-title":"Dwnet: Dense warp-based network for pose-guided human video generation","author":"Zablotskaia","journal-title":"arXiv preprint"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref71","article-title":"Mimicmotion: High-quality human motion video generation with confidence-aware pose guidance","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"ref73","article-title":"Magicvideo: Efficient video generation with latent diffusion models","author":"Zhou","year":"2022","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73001-6_9"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445009.pdf?arnumber=11445009","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:17:33Z","timestamp":1777529853000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445009\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":73,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01110","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}