{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T07:52:13Z","timestamp":1776757933454,"version":"3.51.2"},"reference-count":76,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T00:00:00Z","timestamp":1776729600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T00:00:00Z","timestamp":1776729600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s11263-026-02766-7","type":"journal-article","created":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T07:17:11Z","timestamp":1776755831000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["AI killed the Video Star. Audio-Driven Diffusion Model for Expressive Talking Head Generation"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7156-7632","authenticated-orcid":false,"given":"Baptiste","family":"Chopin","sequence":"first","affiliation":[]},{"given":"Tashvik","family":"Dhamija","sequence":"additional","affiliation":[]},{"given":"Pranav","family":"Balaji","sequence":"additional","affiliation":[]},{"given":"Yaohui","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Antitza","family":"Dantcheva","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,4,21]]},"reference":[{"key":"2766_CR1","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., & Sebe, N. (2019). First order motion model for image animation. In: Proceedings of NeurIPS"},{"key":"2766_CR2","unstructured":"Wang, Y., Yang, D., Bremond, F., & Dantcheva, A. (2022). Latent image animator: Learning to animate images via latent space navigation. In: Proceedings of ICLR."},{"key":"2766_CR3","doi-asserted-by":"crossref","unstructured":"Zhao, J., & Zhang, H. (2022). Thin-plate spline motion model for image animation. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"2766_CR4","doi-asserted-by":"crossref","unstructured":"Siarohin, A., Woodford, O., Ren, J., Chai, M., & Tulyakov, S. (2021). Motion representations for articulated animation. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"2766_CR5","doi-asserted-by":"crossref","unstructured":"Hong, F.-T., Zhang, L., Shen, L., & Xu, D. (2022). Depth-aware generative adversarial network for talking head video generation. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"2766_CR6","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y., Bao, H., & Zhang, J. (2021). Ad-nerf: Audio driven neural radiance fields for talking head synthesis. In: Proceedings of ICCV.","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"2766_CR7","doi-asserted-by":"crossref","unstructured":"Prajwal, K. R., Mukhopadhyay, R., Namboodiri, V. P., & Jawahar, C. V. (2020). A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of ACM Multimedia","DOI":"10.1145\/3394171.3413532"},{"key":"2766_CR8","doi-asserted-by":"crossref","unstructured":"Guan, J., Zhang, Z., Zhou, H., HU, T., Wang, K., He, D., Feng, H., Liu, J., Ding, E., Liu, Z., & Wang, J. (2023). Stylesync: High-fidelity generalized and personalized lip sync in style-based generator. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"2766_CR9","doi-asserted-by":"crossref","unstructured":"Ma, Y., Wang, S., Hu, Z., Fan, C., Lv, T., Ding, Y., Deng, Z., & Yu, X. (2023a). Styletalk: One-shot talking head generation with controllable speaking styles. In: Proceedings of AAAI.","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"2766_CR10","unstructured":"Ye, Z., Zhong, T., Ren, Y., Yang, J., Li, W., Huang, J., Jiang, Z., He, J., Huang, R., Liu, J., Zhang, C., Yin, X., MA, Z., & Zhao, Z. (2024). Real3d-portrait: One-shot realistic 3d talking portrait synthesis. In: Proceedings of ICLR"},{"key":"2766_CR11","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Xu, S., Chen, D., Jia, Y., & Tong, X. (2019). Accurate 3d face reconstruction with weakly-supervised learning: From single image to image set. In: Proceedings of CVPRW.","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"2766_CR12","doi-asserted-by":"crossref","unstructured":"Oorloff, T., & Yacoob, Y. (2023). Expressive talking head video encoding in stylegan2 latent space. In: Proceedings of ICCVW.","DOI":"10.1109\/ICCVW60793.2023.00322"},{"key":"2766_CR13","doi-asserted-by":"crossref","unstructured":"Bounareli, S., Tzelepis, C., Argyriou, V., Patras, I., & Tzimiropoulos, G. (2023). Hyperreenact: One-shot reenactment via jointly learning to refine and retarget faces. In: Proceedings ICCV.","DOI":"10.1109\/ICCV51070.2023.00657"},{"key":"2766_CR14","unstructured":"Tao, J., Gu, S., Li, W., & Duan, L. (2023). Learning motion refinement for unsupervised face animation. In: Proceedings of NeurIPS."},{"key":"2766_CR15","doi-asserted-by":"crossref","unstructured":"Ni, H., Shi, C., Li, K., Huang, S. X., & Min, M. R. (2023). Conditional image-to-video generation with latent flow diffusion models. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"2766_CR16","doi-asserted-by":"crossref","unstructured":"Pang, Y., Zhang, Y., Quan, W., Fan, Y., Cun, X., Shan, Y., & Yan, D.-M. (2023). Dpe: Disentanglement of pose and expression for general video portrait editing. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52729.2023.00049"},{"key":"2766_CR17","doi-asserted-by":"crossref","unstructured":"Ji, X., Zhou, H., Wang, K., Wu, W., Loy, C.C., Cao, X., & Xu, F. (2021). Audio-driven emotional video portraits. In: Proceedings of CVPR","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"2766_CR18","doi-asserted-by":"crossref","unstructured":"Fried, O., Tewari, A., Zollh\u00f6fer, M., Finkelstein, A., Shechtman, E., Goldman, D.B., Genova, K., Jin, Z., Theobalt, C., & Agrawala, M. (2019). Text-based editing of talking-head video. ACM Transaction on Graphics.","DOI":"10.1145\/3306346.3323028"},{"key":"2766_CR19","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., & Liu, Z. (2021). Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of CVPR","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"2766_CR20","doi-asserted-by":"crossref","unstructured":"Wang, D., Deng, Y., Yin, Z., Shum, H.-Y., & Wang, B. (2023). Progressive disentangled representation learning for fine-grained controllable talking head synthesis. In: Proceedings of CVPR","DOI":"10.1109\/CVPR52729.2023.01724"},{"key":"2766_CR21","doi-asserted-by":"crossref","unstructured":"Gururani, S., Mallya, A., Wang, T.-C., Valle, R., & Liu, M. -Y. (2023). Space: Speech-driven portrait animation with controllable expression. In: Proceedings of CVPR.","DOI":"10.1109\/ICCV51070.2023.01912"},{"key":"2766_CR22","unstructured":"Suzhen, W., Lincheng, L., Yu, D., Changjie, F., & Xin, Y. (2021). Audio2head: Audio-driven one-shot talking-head generation with natural head motion. In: Proceedings of IJCAI."},{"key":"2766_CR23","doi-asserted-by":"crossref","unstructured":"Zhang, W., Cun, X., Wang, X., Zhang, Y., Shen, X., Guo, Y., Shan, Y., & Wang, F. (2023). Sadtalker: Learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"2766_CR24","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., & Fan, C. (2021). Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"2766_CR25","doi-asserted-by":"crossref","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., & Nie\u00dfner, M. (2020). Neural voice puppetry: Audio-driven facial reenactment. Proceedings of ECCV.","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"2766_CR26","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., & Li, D. (2020). Makelttalk: speaker-aware talking-head animation. ACM Transaction Graphics.","DOI":"10.1145\/3414685.3417774"},{"key":"2766_CR27","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yin, Z., Zhou, D., Wang, D., Wong, F., & Wang, B. (2023). Talking head generation with probabilistic audio-to-visual diffusion priors. In: Proceedings of ICCV","DOI":"10.1109\/ICCV51070.2023.00703"},{"key":"2766_CR28","unstructured":"Wei, H., Yang, Z., & Wang, Z. (2024). Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694"},{"key":"2766_CR29","unstructured":"Zhiyuan, C., Jiajiong, C., Zhiquan, C., Yuming, L., & Chenguang, M. (2024). Echomimic: Lifelike audio-driven portrait animations through editable landmark conditioning. In: Proceedings of AAAI"},{"key":"2766_CR30","doi-asserted-by":"crossref","unstructured":"Thambiraja, B., Habibie, I., Aliakbarian, S., Cosker, D., Theobalt, C., & Thies, J. (2023). Imitator: Personalized speech-driven 3d facial animation. In: Proceedings of ICCV","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"2766_CR31","doi-asserted-by":"crossref","unstructured":"Ye, Z., Zhang, L.-G., Zeng, D., Lu, Q., & Jiang, N. (2025). Realistic real-time talking head synthesis with grid encoding and progressive conditioning. In: Proceedings of ICASSP","DOI":"10.1109\/ICASSP49660.2025.10887890"},{"key":"2766_CR32","doi-asserted-by":"crossref","unstructured":"Nocentini, F., Besnier, T., Ferrari, C., Arguillere, S., Berretti, S., & Daoudi, M. (2024). Scantalk: 3d talking heads from unregistered scans. In: Proceedings of the European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-031-73397-0_2"},{"key":"2766_CR33","unstructured":"Ye, Z., Jiang, Z., Ren, Y., Liu, J., He, J., & Zhao, Z. (2023a). Geneface: Generalized and high-fidelity audio-driven 3d talking face synthesis. In: Proceedings of ICLR."},{"key":"2766_CR34","unstructured":"Ye, Z., He, J., Jiang, Z., Huang, R., Huang, J., Liu, J., Ren, Y., Yin, X., Ma, Z., & Zhao, Z. (2023b). Geneface++: Generalized and stable real-time audio-driven 3d talking face generation. arXiv preprint arXiv:2305.00787"},{"key":"2766_CR35","unstructured":"Ma, Y., Zhang, S., Wang, J., Wang, X., Zhang, Y., & Deng, Z. (2023b). Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767."},{"key":"2766_CR36","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., & Auli, M. (2020). wav2vec 2.0: A framework for self-supervised learning of speech representations. In: Proceedings of NeurIPS."},{"key":"2766_CR37","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., & Ganguli, S. (2015). Deep unsupervised learning using nonequilibrium thermodynamics. In: Proceedings of ICML."},{"key":"2766_CR38","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. In: Proceedings of NeurIPS."},{"key":"2766_CR39","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Liu, B., Zhu, Y., Yang, X., Chen, C., & Xu, J. (2023). Shifted diffusion for text-to-image generation. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52729.2023.00979"},{"key":"2766_CR40","doi-asserted-by":"crossref","unstructured":"Yu, H.-C., & Chien, J.-T. (2025). Attention disentanglement for semantic diffusion modeling in text-to-image generation. In: Proceedings of ICASSP.","DOI":"10.1109\/ICASSP49660.2025.10888688"},{"key":"2766_CR41","unstructured":"Hoogeboom, E., Satorras, V.G., Vignac, C., & Welling, M. (2022). Equivariant diffusion for molecule generation in 3D. In: Proceedings of ICML"},{"key":"2766_CR42","doi-asserted-by":"crossref","unstructured":"Harvey, W., Naderiparizi, S., Masrani, V., Weilbach, C., & Wood, F. (2022). Flexible diffusion modeling of long videos. In: Proceedings of NeurIPS.","DOI":"10.52202\/068431-2027"},{"key":"2766_CR43","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., Ge, Y., Wang, X., Lei, S.W., Gu, Y., Shi, Y., Hsu, W., Shan, Y., Qie, X., & Shou, M. Z. (2023). Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of ICCV","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"2766_CR44","unstructured":"Zhang, Y., Liu, Y., Xia, B., Peng, B., Yan, Z., Lo, E., & Jia, J. (2025). Magic mirror: Id-preserved video generation in video diffusion transformers. arXiv preprint arXiv:2501.03931"},{"key":"2766_CR45","unstructured":"Lin, S., Xia, X., Ren, Y., Yang, C., Xiao, X., & Jiang, L. (2025). Diffusion adversarial post-training for one-step video generation. arXiv preprint arXiv:2501.08316"},{"key":"2766_CR46","doi-asserted-by":"crossref","unstructured":"Huang, Z., Chan, K.C., Jiang, Y., & Liu, Z. (2023). Collaborative diffusion for multi-modal face generation and editing. In: Proceedings of CVPR","DOI":"10.1109\/CVPR52729.2023.00589"},{"key":"2766_CR47","doi-asserted-by":"crossref","unstructured":"Kim, M., Liu, F., Jain, A., & Liu, X. (2023). Dcface: Synthetic face generation with dual condition diffusion model. In: Proceedings of CVPR","DOI":"10.1109\/CVPR52729.2023.01223"},{"key":"2766_CR48","doi-asserted-by":"crossref","unstructured":"Song, W., Ye, Z., Sun, M., Hou, X., Li, S., & Hao, A. (2025). Attridiffuser: Adversarially enhanced diffusion model for text-to-facial attribute image synthesis. Pattern Recognition","DOI":"10.2139\/ssrn.4965105"},{"key":"2766_CR49","doi-asserted-by":"crossref","unstructured":"Stypu\u0142kowski, M., Vougioukas, K., He, S., Zieba, M., Petridis, S., & Pantic, M. (2024). Diffused heads: Diffusion models beat gans on talking-face generation. In: Proceedings of WACV","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"2766_CR50","doi-asserted-by":"crossref","unstructured":"Du, C., Chen, Q., He, T., Tan, X., Chen, X., Yu, K., Zhao, S., & Bian, J. (2023). Dae-talker: High fidelity speech-driven talking face generation with diffusion autoencoder. In: Proceedings of ACM Multimedia","DOI":"10.1145\/3581783.3613753"},{"key":"2766_CR51","doi-asserted-by":"crossref","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., & Romero, J. (2017). Learning a model of facial shape and expression from 4D scans. Proceedings of SIGGRAPH Asia)","DOI":"10.1145\/3130800.3130813"},{"key":"2766_CR52","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo\u00a0Lopes, R., Karagol\u00a0Ayan, B., Salimans, T., et al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. In: Proceedings of NeurIPS"},{"key":"2766_CR53","unstructured":"Singer, U., Polyak, A., Hayes, T., Yin, X., An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O., Parikh, D., Gupta, S., & Taigman, Y. (2023). Make-a-video: Text-to-video generation without text-video data. In: Proceedings of ICLR"},{"key":"2766_CR54","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., & Bermano, A.H. (2023). Human motion diffusion model. In: Proceedings of ICLR."},{"key":"2766_CR55","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125"},{"key":"2766_CR56","doi-asserted-by":"crossref","unstructured":"Chen, S., Wang, C., Chen, Z., Wu, Y., Liu, S., Chen, Z., Li, J., Kanda, N., Yoshioka, T., Xiao, X., et al. (2022). Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE Journal of Selected Topics in Signal Processing","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"2766_CR57","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S., & Li, H. (2021). Efficient attention: Attention with linear complexities. In: Proceedings of WACV."},{"key":"2766_CR58","doi-asserted-by":"crossref","unstructured":"Ren, Y., Li, G., Chen, Y., Li, T.H., & Liu, S. (2021). Pirenderer: Controllable portrait image generation via semantic neural rendering. In: Proceedings of ICCV.","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"2766_CR59","doi-asserted-by":"crossref","unstructured":"Wang, Z., Zhang, J., Chen, T., Wang, W., & Luo, P. (2023). Restoreformer++: Towards real-world blind face restoration from undegraded key-value pairs. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2023.3315753"},{"key":"2766_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., & Fan, C. (2021). Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of CVPR","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"2766_CR61","doi-asserted-by":"crossref","unstructured":"Chung, J.S., & Zisserman, A. (2017). Lip reading in the wild. In: Proceedings of ACCV","DOI":"10.1007\/978-3-319-54184-6_6"},{"key":"2766_CR62","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., & Zisserman, A. (2018). Voxceleb2: Deep speaker recognition. In: Proceedings of INTERSPEECH.","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"2766_CR63","doi-asserted-by":"crossref","unstructured":"Zhu, H., Wu, W., Zhu, W., Jiang, L., Tang, S., Zhang, L., Liu, Z., & Loy, C. C. (2022). CelebV-HQ: A large-scale video facial attributes dataset. In: Proceedings of ECCV.","DOI":"10.1007\/978-3-031-20071-7_38"},{"key":"2766_CR64","doi-asserted-by":"crossref","unstructured":"Bulat, A., & Tzimiropoulos, G. (2017). How far are we from solving the 2d & 3d face alignment problem? (and a dataset of 230,000 3d facial landmarks). In: Proceedings of ICCV.","DOI":"10.1109\/ICCV.2017.116"},{"key":"2766_CR65","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, Z., Maddox, R.K., Duan, Z., & Xu, C. (2018). Lip movements generation at a glance. In: Proceedings ECCV.","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"2766_CR66","unstructured":"King, D. E. (2009). Dlib-ml: A machine learning toolkit. Journal of Machine Learning Research."},{"key":"2766_CR67","doi-asserted-by":"crossref","unstructured":"Umeyama, S. (1991). Least-squares estimation of transformation parameters between two point patterns. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/34.88573"},{"key":"2766_CR68","doi-asserted-by":"crossref","unstructured":"Chung, J. S., & Zisserman, A. (2017). Out of time: automated lip sync in the wild. In: Proceedings of ACCV.","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"2766_CR69","doi-asserted-by":"crossref","unstructured":"Huang, Z., He, Y., Yu, J., Zhang, F., Si, C., Jiang, Y., Zhang, Y., Wu, T., Jin, Q., Chanpaisit, N., Wang, Y., Chen, X., Wang, L., Lin, D., Qiao, Y., & Liu, Z. (2024). VBench: Comprehensive benchmark suite for video generative models. In: Proceedings CVPR.","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"2766_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, W., Zhu, C., Gao, J., Yan, Y., Zhai, G., & Yang, X. (2024). A comparative study of perceptual quality metrics for audio-driven talking head videos. In: Proceedings of ICIP.","DOI":"10.1109\/ICIP51287.2024.10647543"},{"key":"2766_CR71","doi-asserted-by":"crossref","unstructured":"Yaman, D., Eyiokur, F. I., B\u00e4rmann, L., Ekenel, H. K., & Waibel, A. (2024). Audio-driven talking face generation with stabilized synchronization loss. In: Proceedings of ECCV.","DOI":"10.1007\/978-3-031-72655-2_24"},{"key":"2766_CR72","doi-asserted-by":"crossref","unstructured":"Tian, L., Wang, Q., Zhang, B., & Bo, L. (2024). Emo: Emote portrait alive generating expressive portrait videos with audio2video diffusion model under weak conditions. In: Proceedings of ECCV.","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"2766_CR73","doi-asserted-by":"crossref","unstructured":"Drobyshev, N., Casademunt, A.B., Vougioukas, K., Landgraf, Z., Petridis, S., & Pantic, M. (2024). Emoportraits: Emotion-enhanced multimodal one-shot head avatars. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52733.2024.00812"},{"key":"2766_CR74","doi-asserted-by":"crossref","unstructured":"Xu, S., Chen, G., Guo, Y.-X., Yang, J., Li, C., Zang, Z., Zhang, Y., Tong, X., & Guo, B. (2024). Vasa-1: Lifelike audio-driven talking faces generated in real time. Proceedings of NeurIPS.","DOI":"10.52202\/079017-0021"},{"key":"2766_CR75","unstructured":"Lin, G., Jiang, J., Liang, C., Zhong, T., Yang, J., Zheng, Z., & Zheng, Y. (2025). Cyberhost: A one-stage diffusion framework for audio-driven talking body generation. In: Proceedings of ICLR."},{"key":"2766_CR76","doi-asserted-by":"crossref","unstructured":"Corona, E., Zanfir, A., Bazavan, E.G., Kolotouros, N., Alldieck, T., & Sminchisescu, C. (2025). Vlogger: Multimodal diffusion for embodied avatar synthesis. In: Proceedings of CVPR.","DOI":"10.1109\/CVPR52734.2025.01482"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02766-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02766-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02766-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T07:18:01Z","timestamp":1776755881000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02766-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,21]]},"references-count":76,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["2766"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02766-7","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4,21]]},"assertion":[{"value":"15 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"239"}}