{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T23:31:38Z","timestamp":1779147098635,"version":"3.51.4"},"reference-count":228,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076144"],"award-info":[{"award-number":["62076144"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Science and Technology Program","award":["WDZC20220816140515001"],"award-info":[{"award-number":["WDZC20220816140515001"]}]},{"name":"Shenzhen Science and Technology Program","award":["JCYJ20220818101014030"],"award-info":[{"award-number":["JCYJ20220818101014030"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1109\/tpami.2025.3594034","type":"journal-article","created":{"date-parts":[[2025,7,31]],"date-time":"2025-07-31T18:31:16Z","timestamp":1753986676000},"page":"10709-10730","source":"Crossref","is-referenced-by-count":12,"title":["Human Motion Video Generation: A Survey"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7318-9682","authenticated-orcid":false,"given":"Haiwei","family":"Xue","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2828-387X","authenticated-orcid":false,"given":"Xiangyang","family":"Luo","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0840-2796","authenticated-orcid":false,"given":"Zhanghao","family":"Hu","sequence":"additional","affiliation":[{"name":"01.AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Xi&#x2019;an Jiaotong University, Xi&#x2019;An, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xunzhi","family":"Xiang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuqin","family":"Dai","sequence":"additional","affiliation":[{"name":"PCA Lab, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7960-9382","authenticated-orcid":false,"given":"Jianzhuang","family":"Liu","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7911-7564","authenticated-orcid":false,"given":"Zhensong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab, Shenzhen, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minglei","family":"Li","sequence":"additional","affiliation":[{"name":"01.AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4800-832X","authenticated-orcid":false,"given":"Jian","family":"Yang","sequence":"additional","affiliation":[{"name":"PCA Lab, Nanjing University of Science and Technology, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5388-9125","authenticated-orcid":false,"given":"Fei","family":"Ma","sequence":"additional","affiliation":[{"name":"Guangdong Laboratory of Artificial Intelligence and Digital Economy (SZ), Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changpeng","family":"Yang","sequence":"additional","affiliation":[{"name":"01.AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7723-4130","authenticated-orcid":false,"given":"Zonghong","family":"Dai","sequence":"additional","affiliation":[{"name":"Artificial Intelligence Innovation and Incubation (Al&#x2019;) Institute of Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1136-5377","authenticated-orcid":false,"given":"Fei Richard","family":"Yu","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201283"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"ref3","article-title":"Make-a-video: Text-to-video generation without text-video data","author":"Singer","year":"2022"},{"key":"ref4","article-title":"AnimateDiff: Animate your personalized text-to-image diffusion models without specific tuning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Guo"},{"key":"ref5","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00842"},{"key":"ref7","article-title":"MagicDance: Realistic human dance video generation with motions & facial expressions transfer","author":"Chang","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3575656"},{"key":"ref11","article-title":"A comprehensive survey on human video generation: Challenges, methods, and insights","author":"Lei","year":"2024"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02305-2"},{"key":"ref13","article-title":"A comprehensive taxonomy and analysis of talking head synthesis: Techniques for portrait generation, driving mechanisms, and editing","author":"Meng","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.3389\/frsip.2023.1230755"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"ref17","article-title":"Affective faces for goal-driven dyadic communication","author":"Geng","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-91578-9_8"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32877"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3330935"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICEAAI64185.2025.10956370"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3191852"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447878"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01621"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687587"},{"key":"ref26","article-title":"LivePortrait: Efficient portrait animation with stitching and retargeting control","author":"Guo","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657459"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01484"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00603"},{"key":"ref30","article-title":"Human motionformer: Transferring human motions with vision transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu"},{"key":"ref31","article-title":"Bidirectional temporal diffusion model for temporally consistent human animation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Adiya"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00891"},{"key":"ref33","first-page":"8153","article-title":"Animate Anyone: Consistent and controllable image-to-video synthesis for character animation","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Hu"},{"key":"ref34","article-title":"Follow-your-pose V2: Multiple-condition guided character image animation for stable pose control","author":"Xue","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3687980"},{"key":"ref36","article-title":"MimicMotion: High-quality human motion video generation with confidence-aware pose guidance","author":"Zhang","year":"2024"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657407"},{"key":"ref38","article-title":"ViViD: Video virtual try-on using diffusion models","author":"Fang","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00668"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16286"},{"key":"ref42","article-title":"ID-animator: Zero-shot identity-preserving human video generation","author":"He","year":"2024"},{"key":"ref43","article-title":"Edit-your-motion: Space-time diffusion decoupling learning for video motion editing","author":"Zuo","year":"2024"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02079"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_23"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32241"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"ref51","article-title":"Hallo: Hierarchical audio-driven visual synthesis for portrait image animation","author":"Xu","year":"2024"},{"key":"ref52","article-title":"Emotional conversation: Empowering talking faces with cohesive expression, gaze and pose generation","author":"Liang","year":"2024"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480484"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01482"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00502"},{"key":"ref57","article-title":"Auto-encoding variational Bayes","author":"Kingma","year":"2013"},{"key":"ref58","article-title":"GeneFace: Generalized and high-fidelity audio-driven 3D talking face synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ye"},{"key":"ref59","article-title":"GeneFace: Generalized and stable real-time audio-driven 3D talking face generation","author":"Ye","year":"2023"},{"key":"ref60","first-page":"6306","article-title":"Neural discrete representation learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Van DenOord"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2024.3364018"},{"key":"ref65","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sohl-Dickstein"},{"key":"ref66","first-page":"12438","article-title":"Improved techniques for training score-based generative models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Song"},{"key":"ref67","first-page":"8162","article-title":"Improved denoising diffusion probabilistic models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nichol"},{"key":"ref68","article-title":"Denoising diffusion implicit models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Song"},{"key":"ref69","first-page":"8780","article-title":"Diffusion models beat GANs on image synthesis","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dhariwal"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3361474"},{"key":"ref71","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ho"},{"key":"ref72","article-title":"Dance your latents: Consistent dance generation through spatial-temporal subspace attention guided by motion flow","author":"Fang","year":"2023"},{"key":"ref73","article-title":"Human modelling and pose estimation overview","author":"Knap","year":"2024"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73001-6_9"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"ref77","article-title":"VividPose: Advancing stable video diffusion for realistic human image animation","author":"Wang","year":"2024"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00147"},{"key":"ref79","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01232"},{"key":"ref82","article-title":"MegActor: Harness the power of raw video for vivid portrait animation","author":"Yang","year":"2024"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00349"},{"key":"ref84","first-page":"13981","article-title":"FineMoGen: Fine-grained spatio-temporal motion generation and editing","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref85","article-title":"Plan, posture and go: Towards open-world text-to-motion generation","author":"Liu","year":"2023"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00135"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28567"},{"key":"ref88","article-title":"MotionScript: Natural language descriptions for expressive 3D human motions","author":"Yazdian","year":"2023"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00925"},{"key":"ref90","first-page":"105397","article-title":"InterControl: Zero-shot human interaction generation by controlling every joint","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00062"},{"key":"ref92","article-title":"Style-preserving lip sync via audio-aware style reference","author":"Zhong","year":"2024"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613753"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00639"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2024.3371530"},{"key":"ref96","first-page":"6263","article-title":"MagicPose: Realistic human poses and facial expressions retargeting with identity-aware diffusion","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chang"},{"key":"ref97","article-title":"DreaMoving: A human dance video generation framework based on diffusion models","author":"Feng","year":"2023"},{"key":"ref98","article-title":"Disentangling foreground and background motion for enhanced realism in human video generation","author":"Liu","year":"2024"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00753"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4592-3"},{"key":"ref101","article-title":"Synthesizing moving people with 3D control","author":"Li","year":"2024"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73202-7_19"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72633-0_8"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref105","first-page":"2664","article-title":"Gromov-Wasserstein averaging of kernel and distance matrices","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Peyr\u00e9"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref107","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/198"},{"key":"ref110","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","volume-title":"Proc. Conf. Lang. Model.","author":"Gu","year":"2024"},{"key":"ref111","article-title":"IP-Adapter: Text compatible image prompt adapter for text-to-image diffusion models","author":"Ye","year":"2023"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref113","first-page":"84839","article-title":"Visual autoregressive modeling: Scalable image generation via next-scale prediction","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Tian"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680836"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_8"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1145\/3626235"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00206"},{"key":"ref118","first-page":"7594","article-title":"VideoComposer: Compositional video synthesis with motion controllability","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref119","article-title":"Text-to-video: A two-stage framework for zero-shot identity-agnostic talking-head generation","author":"Wang","year":"2023"},{"key":"ref120","article-title":"Neural text to articulate talk: Deep text to audiovisual speech synthesis achieving both auditory and photo-realism","author":"Milis","year":"2023"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00472"},{"key":"ref122","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Casanova"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00550-z"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-92808-6_2"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00706"},{"key":"ref127","article-title":"MagicAvatar: Multimodal avatar generation and animation","author":"Zhang","year":"2023"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00762"},{"key":"ref129","article-title":"VideoCrafter1: Open diffusion models for high-quality video generation","author":"Chen","year":"2023"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref131","article-title":"AnimateZero: Video diffusion models are zero-shot image animators","author":"Yu","year":"2023"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2020.3023573"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.104911"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref137","article-title":"AniPortrait: Audio-driven synthesis of photorealistic portrait animation","author":"Wei","year":"2024"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3308441"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681198"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00070"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72684-2_8"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681675"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1145\/3610661.3616547"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3386836"},{"key":"ref146","article-title":"Dual-stream diffusion net for text-to-video generation","author":"Liu","year":"2023"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73024-5_20"},{"key":"ref148","article-title":"Tokenflow: Consistent diffusion features for consistent video editing","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Geyer"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00746"},{"key":"ref150","article-title":"ZeroSmooth: Training-free diffuser adaptation for high frame rate video generation","author":"Yang","year":"2024"},{"key":"ref151","article-title":"Dancing avatar: Pose and text-guided human motion videos synthesis with image diffusion model","author":"Qin","year":"2023"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-894"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1007\/11550907_126"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"ref157","article-title":"Real3D-portrait: One-shot realistic 3D talking portrait synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ye"},{"key":"ref158","article-title":"DreamTalk: When expressive talking head generation meets diffusion probabilistic models","author":"Ma","year":"2023"},{"key":"ref159","article-title":"V-Express: Conditional dropout for progressive training of portrait video generation","author":"Wang","year":"2024"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3099900"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00441"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00129"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00812"},{"key":"ref166","article-title":"Listen, disentangle, and control: Controllable speech-driven talking head generation","author":"Cai","year":"2024"},{"key":"ref167","article-title":"Vasa-1: Lifelike audio-driven talking faces generated in real time","author":"Xu","year":"2024"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW59549.2023.00073"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657413"},{"key":"ref170","first-page":"21386","article-title":"Audio-driven co-speech gesture video generation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Liu"},{"key":"ref171","article-title":"Make your actor talk: Generalizable and high-fidelity lip sync with motion and appearance disentanglement","author":"Yu","year":"2024"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00320"},{"key":"ref173","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Baevski"},{"key":"ref174","article-title":"RealTalk: Real-time and realistic audio-driven face generation with 3D facial prior-guided identity alignment network","author":"Ji","year":"2024"},{"key":"ref175","article-title":"R2-Talker: Realistic real-time talking head synthesis with hash grid landmarks encoding and progressive multilayer conditioning","author":"Ye","year":"2023"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054103"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681627"},{"key":"ref179","article-title":"Large-scale multilingual audio visual dubbing","author":"Yang","year":"2020"},{"key":"ref180","first-page":"30599","article-title":"Towards robust blind face restoration with codebook lookup transformer","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhou"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_12"},{"key":"ref182","article-title":"StreamDiffusion: A pipeline-level solution for real-time interactive generation","author":"Kodaira","year":"2023"},{"key":"ref183","article-title":"Looking backward: Streaming video-to-video translation with feature banks","author":"Liang","year":"2024"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"ref185","first-page":"111000","article-title":"Motion consistency model: Accelerating video diffusion with disentangled motion-appearance distillation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhai"},{"key":"ref186","first-page":"6626","article-title":"GANs trained by a two time-scale update rule converge to a local nash equilibrium","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Heusel"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/QOMEX.2009.5246972"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2131660"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2012.2227726"},{"key":"ref194","article-title":"Towards accurate generative models of video: A new metric & challenges","author":"Unterthiner","year":"2018"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00134"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.261"},{"key":"ref202","first-page":"7135","article-title":"First order motion model for image animation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Siarohin"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02090"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01256"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01965"},{"key":"ref209","article-title":"Animate-X: Universal character image animation with enhanced motion representation","author":"Tan","year":"2024"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_38"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-22419-5_25"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01422"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_8"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.427"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00276"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3320236"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475438"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00865"},{"key":"ref222","article-title":"3DYoga90: A hierarchical video dataset for yoga pose understanding","author":"Kim","year":"2023"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00332"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00168"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01254"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_19"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11192800\/11106267.pdf?arnumber=11106267","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T19:35:41Z","timestamp":1776195341000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11106267\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11]]},"references-count":228,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3594034","relation":{"has-preprint":[{"id-type":"doi","id":"10.36227\/techrxiv.172793202.22697340\/v1","asserted-by":"object"}]},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11]]}}}