{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T21:08:19Z","timestamp":1773176899364,"version":"3.50.1"},"reference-count":226,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T00:00:00Z","timestamp":1764806400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T00:00:00Z","timestamp":1764806400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"name":"Variable Energy Cyclotron Centre"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s00371-025-04232-w","type":"journal-article","created":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T17:10:45Z","timestamp":1764868245000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Advancements in talking head generation: a comprehensive review of techniques, metrics, and challenges"],"prefix":"10.1007","volume":"42","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-7102-6564","authenticated-orcid":false,"given":"Vineet Kumar","family":"Rakesh","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3521-9557","authenticated-orcid":false,"given":"Soumya","family":"Mazumdar","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5326-4180","authenticated-orcid":false,"given":"Research 
Pratim","family":"Maity","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5246-7052","authenticated-orcid":false,"given":"Sarbajit","family":"Pal","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1460-8308","authenticated-orcid":false,"given":"Amitabha","family":"Das","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0521-0747","authenticated-orcid":false,"given":"Tapas","family":"Samanta","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,4]]},"reference":[{"key":"4232_CR1","unstructured":"Gowda, S.N., Pandey, D., Gowda, S.N.: From pixels to portraits: a comprehensive survey of talking head generation techniques and applications. arXiv (Cornell University) (2023) arxiv:2308.16041"},{"key":"4232_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.105104","volume":"148","author":"A-H Shin","year":"2024","unstructured":"Shin, A.-H., Lee, J.-H., Hwang, J., Kim, Y., Park, G.-M.: Wav2NeRF: audio-driven realistic talking head generation via wavelet-based NeRF. Image Vis. Comput. 148, 105104 (2024). https:\/\/doi.org\/10.1016\/j.imavis.2024.105104","journal-title":"Image Vis. Comput."},{"key":"4232_CR3","doi-asserted-by":"crossref","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NERF: representing scenes as neural radiance fields for view synthesis. arXiv (Cornell University) (2020) arxiv:2003.08934","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"4232_CR4","unstructured":"Meng, M., Zhao, Y., Zhang, B., Zhu, Y., Shi, W., Wen, M., Fan, Z.: A comprehensive taxonomy and analysis of talking head synthesis: techniques for portrait generation, driving mechanisms, and editing. 
arXiv (Cornell University) (2024) arxiv:2406.10553"},{"key":"4232_CR5","doi-asserted-by":"publisher","unstructured":"Ciregan, D., Meier, U., Schmidhuber, J.: Multi-column deep neural networks for image classification. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3642\u20133649 (2012). https:\/\/doi.org\/10.1109\/CVPR.2012.6248110","DOI":"10.1109\/CVPR.2012.6248110"},{"key":"4232_CR6","doi-asserted-by":"crossref","unstructured":"N\u00d3, R.L.: Vestibulo-ocular reflex arc. Arch. Neurol. Psychiatry 30(2), 245\u2013291 (1933)","DOI":"10.1001\/archneurpsyc.1933.02240140009001"},{"issue":"12","key":"4232_CR7","doi-asserted-by":"publisher","first-page":"755","DOI":"10.3390\/info15120755","volume":"15","author":"ID Mienye","year":"2024","unstructured":"Mienye, I.D., Swart, T.G.: A comprehensive review of deep learning: architectures, recent advances, and applications. Information 15(12), 755 (2024). https:\/\/doi.org\/10.3390\/info15120755","journal-title":"Information"},{"issue":"6","key":"4232_CR8","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. Commun. ACM 60(6), 84\u201390 (2017). https:\/\/doi.org\/10.1145\/3065386","journal-title":"Commun. ACM"},{"key":"4232_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.physd.2019.132306","volume":"404","author":"A Sherstinsky","year":"2020","unstructured":"Sherstinsky, A.: Fundamentals of recurrent neural network (RNN) and long short-term memory (LSTM) network. Physica D 404, 132306 (2020). 
https:\/\/doi.org\/10.1016\/j.physd.2019.132306","journal-title":"Physica D"},{"issue":"5","key":"4232_CR10","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1016\/0893-6080(89)90020-8","volume":"2","author":"K Hornik","year":"1989","unstructured":"Hornik, K., Stinchcombe, M., White, H.: Multilayer feedforward networks are universal approximators. Neural Netw. 2(5), 359\u2013366 (1989). https:\/\/doi.org\/10.1016\/0893-6080(89)90020-8","journal-title":"Neural Netw."},{"key":"4232_CR11","unstructured":"Thies, J., Zollh\u00f6fer, M., Stamminger, M., Theobalt, C., Nie\u00dfner, M.: Face2Face: real-time face capture and reenactment of RGB videos. arXiv (Cornell University) (2020) arxiv:2007.14808"},{"key":"4232_CR12","doi-asserted-by":"crossref","unstructured":"Shen, S., Zhao, W., Meng, Z., Li, W., Zhu, Z., Zhou, J., Lu, J.: DiffTalk: crafting diffusion models for generalized audio-driven portraits animation. arXiv (Cornell University) (2023) arxiv:2301.03786","DOI":"10.1109\/CVPR52729.2023.00197"},{"issue":"2","key":"4232_CR13","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1007\/bf01931367","volume":"16","author":"S Linnainmaa","year":"1976","unstructured":"Linnainmaa, S.: Taylor expansion of the accumulated rounding error. BIT Numer. Math. 16(2), 146\u2013160 (1976). https:\/\/doi.org\/10.1007\/bf01931367","journal-title":"BIT Numer. Math."},{"key":"4232_CR14","unstructured":"Ollivier, Y., Charpiat, G.: Training recurrent networks online without backtracking. arXiv (Cornell University) (2015) arxiv:1507.07680"},{"issue":"1\u20133","key":"4232_CR15","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1016\/j.neucom.2005.12.126","volume":"70","author":"G-B Huang","year":"2006","unstructured":"Huang, G.-B., Zhu, Q.-Y., Siew, C.-K.: Extreme learning machine: theory and applications. Neurocomputing 70(1\u20133), 489\u2013501 (2006). 
https:\/\/doi.org\/10.1016\/j.neucom.2005.12.126","journal-title":"Neurocomputing"},{"key":"4232_CR16","doi-asserted-by":"publisher","first-page":"182","DOI":"10.1016\/j.neunet.2012.09.020","volume":"37","author":"B Widrow","year":"2012","unstructured":"Widrow, B., Greenblatt, A., Kim, Y., Park, D.: The No-Prop algorithm: a new learning algorithm for multilayer neural networks. Neural Netw. 37, 182\u2013188 (2012). https:\/\/doi.org\/10.1016\/j.neunet.2012.09.020","journal-title":"Neural Netw."},{"key":"4232_CR17","doi-asserted-by":"publisher","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: Voxceleb: a large-scale speaker identification dataset. In: Interspeech 2017, pp. 2616\u20132620 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-950","DOI":"10.21437\/Interspeech.2017-950"},{"key":"4232_CR18","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A., Group, V.G.: VoxCeleb2: Deep Speaker Recognition. Technical report, Visual Geometry Group (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"4232_CR19","doi-asserted-by":"crossref","unstructured":"Wang, T.-C., Mallya, A., Liu, M.-Y.: One-shot free-view neural talking-head synthesis for video conferencing (talking head\u20131kh dataset). In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00991"},{"issue":"4","key":"4232_CR20","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/taffc.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao, H., Cooper, D.G., Keutmann, M.K., Gur, R.C., Nenkova, A., Verma, R.: CREMA-D: crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014). https:\/\/doi.org\/10.1109\/taffc.2014.2336244","journal-title":"IEEE Trans. Affect. 
Comput."},{"key":"4232_CR21","doi-asserted-by":"publisher","unstructured":"Yang, S., Zhang, Y., Feng, D., Yang, M., Wang, C., Xiao, J., Long, K., Shan, S., Chen, X.: Lrw-1000: a naturally-distributed large-scale benchmark for lip reading in the wild. In: 2019 14th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2019), pp. 1\u20138 (2019). https:\/\/doi.org\/10.1109\/FG.2019.8756582","DOI":"10.1109\/FG.2019.8756582"},{"key":"4232_CR22","doi-asserted-by":"crossref","unstructured":"Wang, K., Wu, Q., Song, L., Yang, Z., Wu, W., Qian, C., He, R., Qiao, Y., Loy, C.C.: Mead: a large-scale audio-visual dataset for emotional talking-face generation. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"4232_CR23","doi-asserted-by":"crossref","unstructured":"R\u00f6ssler, A., Cozzolino, D., Verdoliva, L., Riess, C., Thies, J., Nie\u00dfner, M.: FaceForensics++: Learning to detect manipulated facial images. In: International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00009"},{"key":"4232_CR24","doi-asserted-by":"publisher","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 3660\u20133669 (2021) https:\/\/doi.org\/10.1109\/cvpr46437.2021.00366","DOI":"10.1109\/cvpr46437.2021.00366"},{"key":"4232_CR25","doi-asserted-by":"crossref","unstructured":"Zhu, H., Wu, W., Zhu, W., Jiang, L., Tang, S., Zhang, L., Liu, Z., Loy, C.C.: CelebV-HQ: a Large-Scale Video Facial Attributes Dataset. arXiv (Cornell University) (2022) arxiv:2207.12393","DOI":"10.1007\/978-3-031-20071-7_38"},{"key":"4232_CR26","unstructured":"Terven, J., Cordova-Esparza, D.M., Ramirez-Pedraza, A., Chavez-Urbiola, E.A.: Loss functions and metrics in deep learning. 
arXiv (Cornell University) (2023) arxiv:2307.02694"},{"key":"4232_CR27","doi-asserted-by":"crossref","unstructured":"Chen, L., Cui, G., Liu, C., Li, Z., Kou, Z., Xu, Y., Xu, C.: Talking-head Generation with Rhythmic Head Motion. arXiv (Cornell University) (2020) arxiv:2007.08547","DOI":"10.1007\/978-3-030-58545-7_3"},{"key":"4232_CR28","doi-asserted-by":"publisher","unstructured":"Zhou, Y., Zhang, Z., Sun, W., Liu, X., Min, X., Wang, Z., Zhang, X.-P., Zhai, G.: Thqa: A perceptual quality assessment database for talking heads. In: 2024 IEEE International Conference on Image Processing (ICIP), pp. 15\u201321 (2024). https:\/\/doi.org\/10.1109\/ICIP51287.2024.10647507","DOI":"10.1109\/ICIP51287.2024.10647507"},{"key":"4232_CR29","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: LRS3-TED: a large-scale dataset for visual speech recognition. arXiv (Cornell University) (2018) arxiv:1809.00496"},{"key":"4232_CR30","doi-asserted-by":"crossref","unstructured":"Waibel, A., Behr, M., Eyiokur, F.I., Yaman, D., Nguyen, T.-N., Mullov, C., Demirtas, M.A., Kantarc\u0131, A., Constantin, S., Ekenel, H.K.: Face-dubbing++: Lip-synchronous, voice preserving translation of videos. arXiv preprint arXiv:2206.04523 (2022)","DOI":"10.1109\/ICASSPW59220.2023.10193719"},{"issue":"1","key":"4232_CR31","doi-asserted-by":"publisher","first-page":"134","DOI":"10.54254\/2755-2721\/102\/20241156","volume":"102","author":"Z Liu","year":"2024","unstructured":"Liu, Z.: Review of talking head synthesis for driving mechanisms and portrait rendering. Appl. Comput. Eng. 102(1), 134\u2013140 (2024). https:\/\/doi.org\/10.54254\/2755-2721\/102\/20241156","journal-title":"Appl. Comput. Eng."},{"issue":"8","key":"4232_CR32","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997). 
https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput."},{"key":"4232_CR33","doi-asserted-by":"publisher","unstructured":"Prajwal, K.R., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.V.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 484\u2013492 (2020) https:\/\/doi.org\/10.1145\/3394171.3413532","DOI":"10.1145\/3394171.3413532"},{"key":"4232_CR34","doi-asserted-by":"crossref","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: neural voice puppetry: audio-driven facial reenactment. arXiv (Cornell University) (2019) arxiv:1912.05566","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"4232_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, B., Qi, C., Zhang, P., Zhang, B., Wu, H., Chen, D., Chen, Q., Wang, Y., Wen, F.: Metaportrait: identity-preserving talking head generation with fast personalized adaptation. arXiv preprint arXiv:2212.08062 (2023)","DOI":"10.1109\/CVPR52729.2023.02116"},{"key":"4232_CR36","doi-asserted-by":"publisher","unstructured":"Drobyshev, N., Chelishev, J., Khakhulin, T., Ivakhnenko, A., Lempitsky, V., Zakharov, E.: Megaportraits: one-shot megapixel neural head avatars. In: Proceedings of the 30th ACM International Conference on Multimedia. MM\u201922, pp. 2663\u20132671. Association for Computing Machinery, New York, NY, USA (2022). https:\/\/doi.org\/10.1145\/3503161.3547838","DOI":"10.1145\/3503161.3547838"},{"key":"4232_CR37","doi-asserted-by":"crossref","unstructured":"Viazovetskyi, Y., Ivashkin, V., Kashin, E.: StyleGAN2 distillation for feed-forward image manipulation. arXiv (Cornell University) (2020) arxiv:2003.03581","DOI":"10.1007\/978-3-030-58542-6_11"},{"key":"4232_CR38","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. 
In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"4232_CR39","unstructured":"Cao, Q., Shen, L., Xie, W., Parkhi, O.M., Zisserman, A.: VGGFace2: a dataset for recognising faces across pose and age. arXiv (Cornell University) (2017) arxiv:1710.08092"},{"key":"4232_CR40","doi-asserted-by":"crossref","unstructured":"Nguyen-Le, H.-H., Tran, V.-T., Nguyen, D.-T., Le-Khac, N.-A.: Passive deepfake detection across multi-modalities: a comprehensive survey. arXiv (Cornell University) (2024) arxiv:2411.17911","DOI":"10.36227\/techrxiv.173273016.63626046\/v1"},{"key":"4232_CR41","unstructured":"Li, X., Zhang, Q., Kang, D., Cheng, W., Gao, Y., Zhang, J., Liang, Z., Liao, J., Cao, Y.-P., Shan, Y.: Advances in 3D generation: a survey. arXiv (Cornell University) (2024) arxiv:2401.17807"},{"key":"4232_CR42","doi-asserted-by":"publisher","unstructured":"Gandhi, K., Kulkarni, P., Shah, T., Chaudhari, P., Narvekar, M., Ghag, K.: A multimodal framework for deepfake detection. J. Electr. Syst. (2024) https:\/\/doi.org\/10.53555\/jes.v20i10s.6126","DOI":"10.53555\/jes.v20i10s.6126"},{"issue":"4","key":"4232_CR43","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1007\/bf02478259","volume":"5","author":"WS McCulloch","year":"1943","unstructured":"McCulloch, W.S., Pitts, W.: A logical calculus of the ideas immanent in nervous activity. Bull. Math. Biophys. 5(4), 115\u2013133 (1943). https:\/\/doi.org\/10.1007\/bf02478259","journal-title":"Bull. Math. Biophys."},{"issue":"12","key":"4232_CR44","doi-asserted-by":"publisher","first-page":"5873","DOI":"10.1109\/tai.2024.3444742","volume":"5","author":"DH Hagos","year":"2024","unstructured":"Hagos, D.H., Battle, R., Rawat, D.B.: Recent advances in generative AI and large language models: current status, challenges, and perspectives. IEEE Trans. Artif. Intell. 5(12), 5873\u20135893 (2024). 
https:\/\/doi.org\/10.1109\/tai.2024.3444742","journal-title":"IEEE Trans. Artif. Intell."},{"key":"4232_CR45","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., Sebe, N.: First order motion model for image animation. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS), pp. 7137\u20137147. Curran Associates Inc., Red Hook, NY, USA (2019). Chap. 641"},{"key":"4232_CR46","unstructured":"Ko, J., Cho, K., Lee, J., Yoon, H., Lee, S., Ahn, S., Kim, S.: Talk3D: high-fidelity talking portrait synthesis via personalized 3D generative Prior. arXiv (Cornell University) (2024) arxiv:2403.20153"},{"key":"4232_CR47","unstructured":"Xu, E.Z., Zhang, J., Liew, J.H., Zhang, W., Bai, S., Feng, J., Shou, M.Z.: PV3D: a 3D generative model for portrait video generation. arXiv (Cornell University) (2022) arxiv:2212.06384"},{"key":"4232_CR48","doi-asserted-by":"crossref","unstructured":"Li, W., Zhang, L., Wang, D., Zhao, B., Wang, Z., Chen, M., Zhang, B., Wang, Z., Bo, L., Li, X.: One-shot high-fidelity talking-head synthesis with deformable neural radiance field. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 17969\u201317978 (2023)","DOI":"10.1109\/CVPR52729.2023.01723"},{"key":"4232_CR49","unstructured":"Yao, S., Zhong, R., Yan, Y., Zhai, G., Yang, X.: DFA-NERF: personalized talking head generation via disentangled face attributes neural rendering. arXiv (Cornell University) (2022) arxiv:2201.00791"},{"key":"4232_CR50","unstructured":"Zhang, Y., Zhong, Z., Liu, M., Chen, Z., Wu, B., Zeng, Y., Zhan, C., He, Y., Huang, J., Zhou, W.: Musetalk: real-time high-fidelity video dubbing via spatio-temporal sampling. 
arXiv preprint arXiv:2410.10122 (2025)"},{"issue":"1","key":"4232_CR51","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1007\/s00371-022-02764-z","volume":"40","author":"X Lu","year":"2023","unstructured":"Lu, X., Xie, X., Ye, C., Xing, H., Liu, Z., Cai, C.: A lightweight generative adversarial network for single image super-resolution. Vis. Comput. 40(1), 41\u201352 (2023). https:\/\/doi.org\/10.1007\/s00371-022-02764-z","journal-title":"Vis. Comput."},{"issue":"1","key":"4232_CR52","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1007\/s00371-023-02784-3","volume":"40","author":"SS Aldin","year":"2023","unstructured":"Aldin, S.S., Aldin, N.B., Ayka\u00e7, M.: Enhanced image classification using edge CNN (E-CNN). Vis. Comput. 40(1), 319\u2013332 (2023). https:\/\/doi.org\/10.1007\/s00371-023-02784-3","journal-title":"Vis. Comput."},{"key":"4232_CR53","doi-asserted-by":"publisher","unstructured":"Deng, Y., Yang, J., Chen, D., Wen, F., Tong, X.: Disentangled and controllable face image generation via 3D imitative-contrastive learning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5153\u20135162 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00520","DOI":"10.1109\/CVPR42600.2020.00520"},{"key":"4232_CR54","doi-asserted-by":"crossref","unstructured":"Zhao, S., Hong, F.-T., Huang, X., Xu, D.: Synergizing motion and appearance: multi-scale compensatory codebooks for talking head video generation. arXiv preprint arXiv:2412.00719 (2025)","DOI":"10.1109\/CVPR52734.2025.02443"},{"key":"4232_CR55","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Xie, W., Zisserman, A.: Voxceleb: Large-Scale Speaker Verification in the W. 
Technical report, Visual Geometry Group (October (2019)","DOI":"10.1016\/j.csl.2019.101027"},{"key":"4232_CR56","doi-asserted-by":"publisher","unstructured":"Ni, H., Liu, Y., Huang, S.X., Xue, Y.: Cross-identity video motion retargeting with joint transformation and synthesis. In: 2023 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 412\u2013422 (2023). https:\/\/doi.org\/10.1109\/WACV56688.2023.00049","DOI":"10.1109\/WACV56688.2023.00049"},{"issue":"6","key":"4232_CR57","doi-asserted-by":"publisher","first-page":"523","DOI":"10.1121\/1.5042758","volume":"143","author":"N Alghamdi","year":"2018","unstructured":"Alghamdi, N., Maddock, S., Marxer, R., Barker, J., Brown, G.J.: A corpus of audio-visual Lombard speech with frontal and profile views. J. Acoust. Soc. Am. 143(6), 523\u2013529 (2018). https:\/\/doi.org\/10.1121\/1.5042758","journal-title":"J. Acoust. Soc. Am."},{"key":"4232_CR58","doi-asserted-by":"publisher","unstructured":"Grassal, P.-W., Prinzler, M., Leistner, T., Rother, C., Nie\u00dfner, M., Thies, J.: Neural head avatars from monocular RGB videos. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18632\u201318643 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01810","DOI":"10.1109\/CVPR52688.2022.01810"},{"key":"4232_CR59","doi-asserted-by":"publisher","first-page":"2226","DOI":"10.1109\/tmm.2022.3144890","volume":"25","author":"N Jiang","year":"2022","unstructured":"Jiang, N., Sheng, B., Li, P., Lee, T.-Y.: PhotoHelper: portrait Photographing guidance via deep feature retrieval and fusion. IEEE Trans. Multimedia 25, 2226\u20132238 (2022). https:\/\/doi.org\/10.1109\/tmm.2022.3144890","journal-title":"IEEE Trans. Multimedia"},{"key":"4232_CR60","doi-asserted-by":"crossref","unstructured":"Hong, F.-T., Zhang, L., Shen, L., Xu, D.: Depth-aware generative adversarial network for talking head video generation. 
arXiv (Cornell University) (2022) arxiv:2203.06605","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"4232_CR61","doi-asserted-by":"publisher","unstructured":"Wang, Q., Zhang, L., Li, B.: SAFA: structure aware face animation. In: 2021 International Conference on 3D Vision (3DV), pp. 679\u2013688. IEEE Computer Society, Los Alamitos, CA, USA (2021). https:\/\/doi.org\/10.1109\/3DV53792.2021.00077 . https:\/\/doi.ieeecomputersociety.org\/10.1109\/3DV53792.2021.00077","DOI":"10.1109\/3DV53792.2021.00077"},{"key":"4232_CR62","doi-asserted-by":"publisher","unstructured":"Huang, P.-H., Yang, F.-E., Wang, Y.-C.F.: Learning identity-invariant motion representations for cross-id face reenactment. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7082\u20137090 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00711","DOI":"10.1109\/CVPR42600.2020.00711"},{"key":"4232_CR63","doi-asserted-by":"publisher","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., Sebe, N.: Animating arbitrary objects via deep motion transfer. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2372\u20132381 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00248","DOI":"10.1109\/CVPR.2019.00248"},{"key":"4232_CR64","doi-asserted-by":"publisher","unstructured":"Yao, G., Yuan, Y., Shao, T., Zhou, K.: Mesh guided one-shot face reenactment using graph convolutional networks. In: Proceedings of the 28th ACM International Conference on Multimedia. MM\u201920, pp. 1773\u20131781. Association for Computing Machinery, New York, NY, USA (2020). https:\/\/doi.org\/10.1145\/3394171.3413865","DOI":"10.1145\/3394171.3413865"},{"issue":"3","key":"4232_CR65","doi-asserted-by":"publisher","first-page":"413","DOI":"10.1109\/TVCG.2013.249","volume":"20","author":"C Cao","year":"2014","unstructured":"Cao, C., Weng, Y., Zhou, S., Tong, Y., Zhou, K.: Facewarehouse: a 3d facial expression database for visual computing. IEEE Trans. 
Visual Comput. Graph. 20(3), 413\u2013425 (2014). https:\/\/doi.org\/10.1109\/TVCG.2013.249","journal-title":"IEEE Trans. Visual Comput. Graph."},{"key":"4232_CR66","doi-asserted-by":"publisher","unstructured":"K\u00f6stinger, M., Wohlhart, P., Roth, P.M., Bischof, H.: Annotated facial landmarks in the wild: a large-scale, real-world database for facial landmark localization. In: 2011 IEEE International Conference on Computer Vision Workshops (ICCV Workshops), pp. 2144\u20132151 (2011). https:\/\/doi.org\/10.1109\/ICCVW.2011.6130513","DOI":"10.1109\/ICCVW.2011.6130513"},{"key":"4232_CR67","unstructured":"Ha, S., Kersner, M., Kim, B., Seo, S., Kim, D.: MarioNETte: few-shot face reenactment Preserving identity of unseen targets. arXiv (Cornell University) (2019) arxiv:1911.08139"},{"key":"4232_CR68","unstructured":"Wayne, W., Yunxuan, Z., Cheng, L., Chen, Q., Change, L.C.: ReenactGAN: learning to reenact faces via boundary transfer. arXiv (Cornell University) (2018) arxiv:1807.11079"},{"key":"4232_CR69","doi-asserted-by":"crossref","unstructured":"Wiles, O., Koepke, A.S., Zisserman, A.: X2Face: a network for controlling face generation by using images, audio, and pose codes. arXiv (Cornell University) (2018) arxiv:1807.10550","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"4232_CR70","unstructured":"Pham, H.X., Wang, Y., Pavlovic, V.: Generative adversarial talking head: bringing portraits to life with a weakly supervised neural network. arXiv (Cornell University) (2018) arxiv:1803.07716"},{"key":"4232_CR71","doi-asserted-by":"crossref","unstructured":"Xu, Z., Yu, Z., Zhou, Z., Zhou, J., Jin, X., Hong, F.-T., Ji, X., Zhu, J., Cai, C., Tang, S., Lin, Q., Li, X., Lu, Q.: Hunyuanportrait: implicit condition control for enhanced portrait animation. 
arXiv preprint arXiv:2503.18860 (2025)","DOI":"10.1109\/CVPR52734.2025.01483"},{"key":"4232_CR72","doi-asserted-by":"crossref","unstructured":"Xie, Y., Xu, H., Song, G., Wang, C., Shi, Y., Luo, L.: X-Portrait: expressive portrait animation with hierarchical motion attention. arXiv (Cornell University) (2024) arxiv:2403.15931","DOI":"10.1145\/3641519.3657459"},{"key":"4232_CR73","doi-asserted-by":"publisher","unstructured":"Agarwal, M., Mukhopadhyay, R., Namboodiri, V., Jawahar, C.V.: Audio-visual face reenactment. In: 2023 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 5167\u20135176 (2023). https:\/\/doi.org\/10.1109\/WACV56688.2023.00515","DOI":"10.1109\/WACV56688.2023.00515"},{"key":"4232_CR74","unstructured":"Guo, J., Zhang, D., Liu, X., Zhong, Z., Zhang, Y., Wan, P., Zhang, D.: Liveportrait: efficient portrait animation with stitching and retargeting control. arXiv preprint arXiv:2407.03168 (2025)"},{"key":"4232_CR75","doi-asserted-by":"crossref","unstructured":"Hong, F.-T., Xu, D.: Implicit identity representation conditioned memory compensation network for talking head video generation. arXiv (Cornell University) (2023) arxiv:2307.09906","DOI":"10.1109\/ICCV51070.2023.02108"},{"key":"4232_CR76","doi-asserted-by":"publisher","unstructured":"Zhao, J., Zhang, H.: Thin-plate spline motion model for image animation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3647\u20133656 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00364","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"4232_CR77","doi-asserted-by":"crossref","unstructured":"Yin, F., Zhang, Y., Cun, X., Cao, M., Fan, Y., Wang, X., Bai, Q., Wu, B., Wang, J., Yang, Y.: Styleheat: one-shot high-resolution editable talking face generation via pre-trained stylegan. 
arXiv preprint arXiv:2203.04036 (2022)","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"4232_CR78","unstructured":"Wu, Y., Deng, Y., Yang, J., Wei, F., Chen, Q., Tong, X.: Anifacegan: animatable 3d-aware face image generation for video avatars. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. NIPS\u201922. Curran Associates Inc., Red Hook, NY, USA (2022)"},{"key":"4232_CR79","unstructured":"Wang, Y., Yang, D., Bremond, F., Dantcheva, A.: Latent Image Animator: learning to animate images via latent space navigation. arXiv (Cornell University) (2022) arxiv:2203.09043"},{"key":"4232_CR80","unstructured":"Lin, G., Jiang, J., Yang, J., Zheng, Z., Liang, C.: OmniHuman-1: rethinking the scaling-up of one-stage conditioned human animation models. arXiv (Cornell University) (2025) arxiv:2502.01061"},{"key":"4232_CR81","unstructured":"Livingstone, S.R., Russo, F.A.: The Ryerson audio-visual database of emotional speech and song (RAVDESS). Zenodo (2018)"},{"key":"4232_CR82","doi-asserted-by":"crossref","unstructured":"Zhang, C., Li, Z., Xu, H., Xie, Y., Zhao, X., Gu, T., Song, G., Chen, X., Liang, C., Jiang, J., Luo, L.: X-Actor: emotional and expressive long-range portrait acting from audio. arXiv (Cornell University) (2025) arXiv:2508.02944","DOI":"10.1145\/3757377.3763985"},{"key":"4232_CR83","unstructured":"Ji, X., Hu, X., Xu, Z., Zhu, J., Lin, C., He, Q., Zhang, J., Luo, D., Chen, Y., Lin, Q., Lu, Q., Wang, C.: Sonic: shifting focus to global audio perception in Portrait Animation. arXiv (Cornell University) (2024) arxiv:2411.16331"},{"key":"4232_CR84","unstructured":"Gu, B., Yu, Y., Fan, H., Zhang, L.: Flow-guided diffusion for video inpainting. arXiv (Cornell University) (2023) arxiv:2311.15368"},{"key":"4232_CR85","doi-asserted-by":"crossref","unstructured":"Tian, L., Wang, Q., Zhang, B., Bo, L.: EMO: Emote Portrait Alive\u2013generating expressive portrait videos with Audio2Video Diffusion Model under Weak Conditions. 
arXiv (Cornell University) (2024) arxiv:2402.17485","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"4232_CR86","unstructured":"Wang, C., Tian, K., Zhang, J., Guan, Y., Luo, F., Shen, F., Jiang, Z., Gu, Q., Han, X., Yang, W.: V-Express: conditional dropout for progressive training of portrait video generation. arXiv (Cornell University) (2024) arxiv:2406.02511"},{"key":"4232_CR87","doi-asserted-by":"publisher","unstructured":"Wang, L.X.X., Zhang, H., Dong, C., Shan, Y.: VFHQ: a high-quality dataset and benchmark for video face super-resolution. arXiv (Cornell University) (2022) https:\/\/doi.org\/10.48550\/arXiv.2205.03409","DOI":"10.48550\/arXiv.2205.03409"},{"key":"4232_CR88","doi-asserted-by":"publisher","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y.-J., Bao, H., Zhang, J.: Ad-nerf: Audio driven neural radiance fields for talking head synthesis. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 5764\u20135774 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00573","DOI":"10.1109\/ICCV48922.2021.00573"},{"issue":"1","key":"4232_CR89","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1016\/j.vrih.2023.08.006","volume":"6","author":"L Niu","year":"2024","unstructured":"Niu, L., Xie, W., Wang, D., Cao, Z., Liu, X.: Audio2AB: audio-driven collaborative generation of virtual character animation. Virtual Real. Intell. Hardw. 6(1), 56\u201370 (2024). https:\/\/doi.org\/10.1016\/j.vrih.2023.08.006","journal-title":"Virtual Real. Intell. Hardw."},{"key":"4232_CR90","doi-asserted-by":"publisher","unstructured":"Liu, H., Zhu, Z., Iwamoto, N., Peng, Y., Li, Z., Zhou, Y., Bozkurt, E., Zheng, B.: BEAT: a large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: Computer Vision\u2014ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part VII, pp. 612\u2013630. Springer, Berlin, Heidelberg (2022). 
https:\/\/doi.org\/10.1007\/978-3-031-20071-7_36","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"4232_CR91","doi-asserted-by":"crossref","unstructured":"Sicheng, X., Guojun, C., Yu-Xiao, G., Jiaolong, Y., Chong, L., Zhenyu, Z., Yizhong, Z., Xin, T.: Vasa-1: Lifelike audio-driven talking faces generated in real time. In: Conference on Neural Information Processing Systems (NeurIPS 2024), vol. 38 (2024)","DOI":"10.52202\/079017-0021"},{"key":"4232_CR92","unstructured":"Cao, X., Shi, S., Zhao, J., Yao, Y., Fei, J., Gao, M., Wang, G.: JoyVASA: portrait and animal image animation with diffusion-based audio-driven facial dynamics and head motion generation. arXiv (Cornell University) (2024) arxiv:2411.09209"},{"key":"4232_CR93","unstructured":"Xu, M., Li, H., Su, Q., Shang, H., Zhang, L., Liu, C., Wang, J., Luc, V.G., Yao, Y., Zhu, S.: Hallo: hierarchical audio-driven visual synthesis for portrait image animation. arXiv (Cornell University) (2024) arxiv:2406.08801"},{"key":"4232_CR94","doi-asserted-by":"crossref","unstructured":"Liu, Y., Lin, L., Yu, F., Zhou, C., Li, Y.: MODA: mapping-once audio-driven portrait animation with dual attentions. arXiv (Cornell University) (2023) arxiv:2307.10008","DOI":"10.1109\/ICCV51070.2023.02104"},{"key":"4232_CR95","doi-asserted-by":"crossref","unstructured":"Lu, Y., Chai, J., Cao, X.: Live speech portraits: real-time photorealistic talking-head animation. arXiv (Cornell University) (2021) arxiv:2109.10595","DOI":"10.1145\/3478513.3480484"},{"key":"4232_CR96","doi-asserted-by":"publisher","DOI":"10.1002\/cav.2226","author":"C Liang","year":"2023","unstructured":"Liang, C., Wang, Q., Chen, Y., Tang, M.: Wav2Lip-HR: synthesising clear high-resolution talking head in the wild. Comput Anim Virtual Worlds (2023). 
https:\/\/doi.org\/10.1002\/cav.2226","journal-title":"Comput Anim Virtual Worlds"},{"key":"4232_CR97","doi-asserted-by":"crossref","unstructured":"Huang, R., Lai, P., Qin, Y., Li, G.: Parametric implicit face representation for Audio-Driven facial reenactment. arXiv (Cornell University) (2023) arxiv:2306.07579","DOI":"10.1109\/CVPR52729.2023.01227"},{"key":"4232_CR98","doi-asserted-by":"publisher","unstructured":"Liang, B., Pan, Y., Guo, Z., Zhou, H., Hong, Z., Han, X., Han, J., Liu, J., Ding, E., Wang, J.: Expressive talking head generation with granular audio-visual control. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3377\u20133386 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00338","DOI":"10.1109\/CVPR52688.2022.00338"},{"key":"4232_CR99","doi-asserted-by":"crossref","unstructured":"Zhang, C., Zhao, Y., Huang, Y., Zeng, M., Ni, S., Budagavi, M., Guo, X.: FACIAL: Synthesizing dynamic talking face with implicit attribute learning. arXiv (Cornell University) (2021) arxiv:2108.07938","DOI":"10.1109\/ICCV48922.2021.00384"},{"key":"4232_CR100","doi-asserted-by":"publisher","unstructured":"Wang, S., Li, L., Ding, Y., Fan, C., Yu, X.: Audio2head: audio-driven one-shot talking-head generation with natural head motion. In: Zhou, Z.-H. (ed.) Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence, IJCAI-21, pp. 1098\u20131105. International Joint Conferences on Artificial Intelligence Organization, ??? (2021). https:\/\/doi.org\/10.24963\/ijcai.2021\/152 . Main Track","DOI":"10.24963\/ijcai.2021\/152"},{"key":"4232_CR101","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Y., Liu, Z., Luo, P., Wang, X.: Talking face generation by adversarially disentangled audio-visual representation. 
arXiv preprint arXiv:1807.07860 (2019)","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"4232_CR102","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, Z., Maddox, R.K., Duan, Z., Xu, C.: Lip movements generation at a glance. arXiv (Cornell University) (2018) arxiv:1803.10404","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"4232_CR103","doi-asserted-by":"publisher","unstructured":"Zhou, Y., Xu, Z., Landreth, C., Kalogerakis, E., Maji, S., Singh, K.: Visemenet: audio-driven animator-centric speech animation. ACM Trans. Graph. 37(4) (2018) https:\/\/doi.org\/10.1145\/3197517.3201292","DOI":"10.1145\/3197517.3201292"},{"issue":"10","key":"4232_CR104","doi-asserted-by":"publisher","first-page":"50","DOI":"10.4236\/jcc.2017.510006","volume":"05","author":"K Sato","year":"2017","unstructured":"Sato, K., Nose, T., Ito, A.: HMM-based photo-realistic talking face synthesis using facial expression parameter mapping with deep neural networks. J. Comput. Commun. 05(10), 50\u201365 (2017). https:\/\/doi.org\/10.4236\/jcc.2017.510006","journal-title":"J. Comput. Commun."},{"key":"4232_CR105","unstructured":"Hong, F.-T., Xu, Z., Zhou, Z., Zhou, J., Li, X., Lin, Q., Lu, Q., Xu, D.: Audio-visual controlled video diffusion with masked selective state spaces modeling for natural talking head generation. arXiv (Cornell University) (2025)"},{"key":"4232_CR106","unstructured":"Wei, H., Yang, Z., Wang, Z.: AniPortrait: audio-driven synthesis of photorealistic portrait animation. arXiv (Cornell University) (2024) arxiv:2403.17694"},{"key":"4232_CR107","doi-asserted-by":"crossref","unstructured":"Tan, S., Ji, B., Bi, M., Pan, Y.: EDTalk: efficient disentanglement for Emotional Talking head synthesis. 
arXiv (Cornell University) (2024) arxiv:2404.01647","DOI":"10.1007\/978-3-031-72658-3_23"},{"issue":"3","key":"4232_CR108","doi-asserted-by":"publisher","first-page":"2403","DOI":"10.1609\/aaai.v39i3.32241","volume":"39","author":"Z Chen","year":"2025","unstructured":"Chen, Z., Cao, J., Chen, Z., Li, Y., Ma, C.: EchoMimic: lifelike audio-driven portrait animations through editable landmark conditions. Proc. AAAI Conf. Artif. Intell 39(3), 2403\u20132410 (2025). https:\/\/doi.org\/10.1609\/aaai.v39i3.32241","journal-title":"Proc. AAAI Conf. Artif. Intell"},{"key":"4232_CR109","doi-asserted-by":"crossref","unstructured":"Tan, S., Ji, B., Pan, Y.: Flowvqtalker: high-quality emotional talking face generation through normalizing flow and quantization. arXiv (Cornell University) (2024)","DOI":"10.1109\/CVPR52733.2024.02486"},{"key":"4232_CR110","doi-asserted-by":"crossref","unstructured":"Guan, J., Xu, Z., Zhou, H., Wang, K., He, S., Zhang, Z., Liang, B., Feng, H., Ding, E., Liu, J., Wang, J., Zhao, Y., Liu, Z.: Resyncer: rewiring style-based generator for unified audio-visually synced facial performer. arXiv preprint arXiv:2408.03284 (2024)","DOI":"10.1007\/978-3-031-72940-9_20"},{"key":"4232_CR111","unstructured":"Ye, Z., Zhong, T., Ren, Y., Yang, J., Li, W., Huang, J., Jiang, Z., He, J., Huang, R., Liu, J., Zhang, C., Yin, X., Ma, Z., Zhao, Z.: Real3d-portrait: one-shot realistic 3d talking portrait synthesis. arXiv preprint arXiv:2401.08503 (2024)"},{"issue":"7","key":"4232_CR112","doi-asserted-by":"publisher","first-page":"4913","DOI":"10.1007\/s00371-024-03490-4","volume":"40","author":"H Fang","year":"2024","unstructured":"Fang, H., Weng, D., Tian, Z., Ma, Y.: Manitalk: manipulable talking head generation from single image in the wild. Vis. Comput. 40(7), 4913\u20134925 (2024). https:\/\/doi.org\/10.1007\/s00371-024-03490-4","journal-title":"Vis. 
Comput."},{"key":"4232_CR113","doi-asserted-by":"crossref","unstructured":"Hwang, G., Hong, S., Lee, S., Park, S., Chae, G.: DISCoHEAD: audio-and-video-driven talking head generation by disentangled control of head pose and facial expressions. arXiv (Cornell University) (2023) arxiv:2303.07697","DOI":"10.1109\/ICASSP49357.2023.10095670"},{"key":"4232_CR114","unstructured":"Zhao, X., Xu, H., Song, G., Xie, Y., Zhang, C., Li, X., Luo, L., Suo, J., Liu, Y.: X-NEMO: expressive neural motion reenactment via disentangled latent attention. arXiv (Cornell University) (2025) arXiv:2507.23143"},{"key":"4232_CR115","doi-asserted-by":"crossref","unstructured":"Ma, Y., Liu, H., Wang, H., Pan, H., He, Y., Yuan, J., Zeng, A., Cai, C., Shum, H.-Y., Liu, W., Chen, Q.: Follow-Your-Emoji: fine-Controllable and expressive freestyle portrait animation. arXiv (Cornell University) (2024) arxiv:2406.01900","DOI":"10.1145\/3680528.3687587"},{"key":"4232_CR116","unstructured":"Lin, K., Ahmed, F., Li, L., Lin, C.-C., Azarnasab, E., Yang, Z., Wang, J., Liang, L., Liu, Z., Lu, Y., Liu, C., Wang, L.: MM-VID: advancing video understanding with GPT-4(Vision). arXiv (Cornell University) (2023) arxiv:2310.19773"},{"key":"4232_CR117","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild. In: IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.367"},{"key":"4232_CR118","doi-asserted-by":"crossref","unstructured":"Khakhulin, T., Sklyarova, V., Lempitsky, V., Zakharov, E.: Realistic one-shot mesh-based head avatars. arXiv (Cornell University) (2022) arxiv:2206.08343","DOI":"10.1007\/978-3-031-20086-1_20"},{"key":"4232_CR119","unstructured":"Wang, T.-C., Mallya, A., Liu, M.-Y.: One-Shot free-view neural talking-head synthesis for video conferencing. 
arXiv (Cornell University) (2020) arxiv:2011.15126"},{"key":"4232_CR120","doi-asserted-by":"publisher","unstructured":"Zakharov, E., Shysheya, A., Burkov, E., Lempitsky, V.: Few-shot adversarial learning of realistic neural talking head models. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9458\u20139467 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00955","DOI":"10.1109\/ICCV.2019.00955"},{"key":"4232_CR121","unstructured":"Wang, T.-C., Liu, M.-Y., Tao, A., Liu, G., Kautz, J., Catanzaro, B.: Few-shot video-to-video synthesis. arXiv (Cornell University) (2019) arxiv:1910.12713"},{"issue":"11\u201312","key":"4232_CR122","doi-asserted-by":"publisher","first-page":"1767","DOI":"10.1007\/s11263-019-01150-y","volume":"127","author":"A Jamaludin","year":"2019","unstructured":"Jamaludin, A., Chung, J.S., Zisserman, A.: You said that? Synthesising talking faces from audio. Int. J. Comput. Vision 127(11\u201312), 1767\u20131779 (2019). https:\/\/doi.org\/10.1007\/s11263-019-01150-y","journal-title":"Int. J. Comput. Vision"},{"key":"4232_CR123","doi-asserted-by":"publisher","unstructured":"Huang, X., Belongie, S.: Arbitrary style transfer in real-time with adaptive instance normalization. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 1510\u20131519 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.167","DOI":"10.1109\/ICCV.2017.167"},{"key":"4232_CR124","unstructured":"Wang, Y., Guo, J., Bai, J., Yu, R., He, T., Tan, X., Sun, X., Bian, J.: InstructAvatar: text-guided emotion and motion control for avatar generation. arXiv (Cornell University) (2024) arxiv:2405.15758"},{"key":"4232_CR125","unstructured":"Ma, Y., Wang, S., Ding, Y., Ma, B., Lv, T., Fan, C., Hu, Z., Deng, Z., Yu, X.: TalkCLIP: Talking Head Generation with Text-Guided Expressive Speaking Styles. 
arXiv (Cornell University) (2023) arxiv:2304.00334"},{"key":"4232_CR126","unstructured":"Diao, X., Cheng, M., Barrios, W., Jin, S.: FT2TF: first-person statement text-to-talking face generation. arXiv (Cornell University) (2023) arxiv:2312.05430"},{"key":"4232_CR127","doi-asserted-by":"crossref","unstructured":"Ma, Y., Wang, S., Hu, Z., Fan, C., Lv, T., Ding, Y., Deng, Z., Yu, X.: StyleTalk: one-shot talking head generation with controllable speaking styles. arXiv (Cornell University) (2023) arxiv:2301.01081","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"4232_CR128","doi-asserted-by":"publisher","unstructured":"Zhang, S., Yuan, J., Liao, M., Zhang, L.: Text2video: text-driven talking-head video synthesis with personalized phoneme\u2014pose dictionary. In: ICASSP 2022\u20142022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2659\u20132663 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747380","DOI":"10.1109\/ICASSP43922.2022.9747380"},{"key":"4232_CR129","unstructured":"Li, L., Wang, S., Zhang, Z., Ding, Y., Zheng, Y., Yu, X., Fan, C.: Write-a-Speaker: Text-based emotional and rhythmic talking-head generation. arXiv (Cornell University) (2021) arxiv:2104.07995"},{"key":"4232_CR130","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., Liu, Z.: Pose-controllable talking face generation by implicitly modularized audio-visual representation. arXiv preprint arXiv:2104.11116 (2021)","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"4232_CR131","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: MakeItTalk: speaker-aware talking-head animation. ACM Trans. Graph. (TOG) (2020). https:\/\/doi.org\/10.1145\/3414685.3417774","journal-title":"ACM Trans. Graph. 
(TOG)"},{"key":"4232_CR132","unstructured":"Tian, Q., Chen, Y., Zhang, Z., Lu, H., Chen, L., Xie, L., Liu, S.: TFGAN: time and frequency domain based generative adversarial network for high-fidelity speech synthesis. arXiv (Cornell University) (2020) arxiv:2011.12206"},{"key":"4232_CR133","doi-asserted-by":"crossref","unstructured":"Shen, J., Pang, R., Weiss, R.J., Schuster, M., Jaitly, N., Yang, Z., Chen, Z., Zhang, Y., Wang, Y., Skerry-Ryan, R., Saurous, R.A., Agiomyrgiannakis, Y., Wu, Y.: Natural TTS synthesis by conditioning WaveNet on MEL spectrogram predictions. arXiv (Cornell University) (2017) arxiv:1712.05884","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"4232_CR134","unstructured":"Jalalifar, S.A., Hasani, H., Aghajan, H.: Speech-driven facial reenactment using conditional generative adversarial networks. arXiv (Cornell University) (2018) arxiv:1803.07461"},{"key":"4232_CR135","unstructured":"Kumar, R., Sotelo, J., Kumar, K., De\u00a0Br\u00e9bisson, A., Bengio, Y.: ObamaNet: photo-realistic lip-sync from text. arXiv (Cornell University) (2018) arxiv:1801.01442"},{"key":"4232_CR136","unstructured":"Sun, K., Jourabloo, A., Bhalodia, R., Meshry, M., Rong, Y., Yang, Z., Nguyen-Phuoc, T., Haene, C., Xu, J., Johnson, S., Li, H., Bouaziz, S.: GENCA: a text-conditioned generative model for realistic and drivable CODEC avatars. arXiv (Cornell University) (2024) arxiv:2408.13674"},{"key":"4232_CR137","doi-asserted-by":"crossref","unstructured":"Peng, W., Zhang, K., Zhang, S.Q.: T3m: Text guided 3d human motion synthesis from speech. arXiv (Cornell University) (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.74"},{"key":"4232_CR138","doi-asserted-by":"crossref","unstructured":"Chai, Z., Tang, C., Wong, Y., Kankanhalli, M.: STAR: Skeleton-aware text-based 4D Avatar generation with in-network motion retargeting. 
arXiv (Cornell University) (2024) arxiv:2406.04629","DOI":"10.1109\/TVCG.2025.3559988"},{"key":"4232_CR139","unstructured":"Wang, Z., Dai, M., Lundgaard, K.: Text-to-video: a two-stage framework for zero-shot identity-agnostic talking-head generation. arXiv (Cornell University) (2023) arxiv:2308.06457"},{"key":"4232_CR140","doi-asserted-by":"publisher","unstructured":"Pham, T.T., Do, T., Le, N., Le, N., Nguyen, H., Tjiputra, E., Tran, Q., Nguyen, A.: Style transfer for 2d talking head generation. In: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), pp. 7500\u20137509 (2024). https:\/\/doi.org\/10.1109\/CVPRW63382.2024.00745","DOI":"10.1109\/CVPRW63382.2024.00745"},{"key":"4232_CR141","doi-asserted-by":"crossref","unstructured":"Stypu\u0142kowski, M., Vougioukas, K., He, S., Zi\u0119ba, M., Petridis, S., Pantic, M.: Diffused heads: diffusion models beat GANs on talking-face generation. arXiv (Cornell University) (2023) arxiv:2301.03396","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"4232_CR142","doi-asserted-by":"publisher","unstructured":"Bregler, C., Covell, M., Slaney, M.: Video rewrite: driving visual speech with audio. In: Addison-Wesley Publishing Co. ACM Press, USA (1997). https:\/\/doi.org\/10.1145\/258734.258880","DOI":"10.1145\/258734.258880"},{"key":"4232_CR143","doi-asserted-by":"publisher","DOI":"10.1016\/j.pacs.2025.100698","volume":"43","author":"H-K Huang","year":"2025","unstructured":"Huang, H.-K., Kuo, J., Zhang, Y., Aborahama, Y., Cui, M., Sastry, K., Park, S., Villa, U., Wang, L.V., Anastasio, M.A.: Fast aberration correction in 3D transcranial photoacoustic computed tomography via a learning-based image reconstruction method. Photoacoustics 43, 100698 (2025). 
https:\/\/doi.org\/10.1016\/j.pacs.2025.100698","journal-title":"Photoacoustics"},{"key":"4232_CR144","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2025.129706","volume":"630","author":"S Li","year":"2025","unstructured":"Li, S., Pan, Y.: Aniartavatar: animatable 3d art avatar from a single image. Neurocomputing 630, 129706 (2025). https:\/\/doi.org\/10.1016\/j.neucom.2025.129706","journal-title":"Neurocomputing"},{"key":"4232_CR145","unstructured":"Jafari, F., Berretti, S., Basu, A.: JambaTalk: speech-driven 3D talking head generation based on hybrid transformer-mamba model. arXiv (Cornell University) (2024) arxiv:2408.01627"},{"key":"4232_CR146","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.: Capture, learning, and synthesis of 3D speaking styles. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10101\u201310111 (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"4232_CR147","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2024.103925","volume":"120","author":"H Fang","year":"2024","unstructured":"Fang, H., Weng, D., Tian, Z., Ma, Y., Lu, X.: Audio-to-deep-lip: speaking lip synthesis based on 3d landmarks. Comput. Graph. 120, 103925 (2024). https:\/\/doi.org\/10.1016\/j.cag.2024.103925","journal-title":"Comput. Graph."},{"key":"4232_CR148","doi-asserted-by":"publisher","first-page":"55476","DOI":"10.2196\/55476","volume":"8","author":"H Lee","year":"2024","unstructured":"Lee, H., Oh, B., Kim, S.-C.: Recognition of forward head posture through 3d human pose estimation with a graph convolutional network: development and feasibility study. JMIR Form Res. 8, 55476 (2024). 
https:\/\/doi.org\/10.2196\/55476","journal-title":"JMIR Form Res."},{"key":"4232_CR149","doi-asserted-by":"crossref","unstructured":"Jang, Y., Kim, J.-H., Ahn, J., Kwak, D., Yang, H.-S., Ju, Y.-C., Kim, I.-H., Kim, B.-Y., Chung, J.S.: Faces that speak: jointly synthesising talking face and speech from text. arXiv (Cornell University) (2024) arxiv:2405.10272","DOI":"10.1109\/CVPR52733.2024.00842"},{"key":"4232_CR150","doi-asserted-by":"crossref","unstructured":"Sung-Bin, K., Hyun, L., Hong, D.H., Nam, S., Ju, J., Oh, T.-H.: LaughTalk: expressive 3D talking head generation with laughter. arXiv (Cornell University) (2023) arxiv:2311.00994","DOI":"10.1109\/WACV57701.2024.00628"},{"key":"4232_CR151","unstructured":"Li, D., Zhao, K., Wang, W., Peng, B., Zhang, Y., Dong, J., Tan, T.: AE-NERF: audio enhanced neural radiance field for few shot talking head synthesis. arXiv (Cornell University) (2023) arxiv:2312.10921"},{"key":"4232_CR152","doi-asserted-by":"crossref","unstructured":"Kim, H., Garrido, P., Tewari, A., Xu, W., Thies, J., Nie\u00dfner, M., P\u00e9rez, P., Richardt, C., Zollh\u00f6fer, M., Theobalt, C.: Deep video portraits. arXiv (Cornell University) (2018) arxiv:1805.11714","DOI":"10.1145\/3197517.3201283"},{"key":"4232_CR153","doi-asserted-by":"publisher","unstructured":"Stan, S., Haque, K.I., Yumak, Z.: Facediffuser: speech-driven 3d facial animation synthesis using diffusion. In: Proceedings of the 16th ACM SIGGRAPH Conference on Motion, Interaction and Games. MIG \u201923. Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3623264.3624447","DOI":"10.1145\/3623264.3624447"},{"issue":"6","key":"4232_CR154","first-page":"194","volume":"36","author":"T Li","year":"2017","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., Romero, J.: Learning a model of facial shape and expression from 4D scans. ACM Trans. Graph. (Proc. SIGGRAPH Asia) 36(6), 194\u2013119417 (2017)","journal-title":"ACM Trans. Graph. (Proc. 
SIGGRAPH Asia)"},{"issue":"8","key":"4232_CR155","doi-asserted-by":"publisher","first-page":"3635","DOI":"10.1007\/s00371-023-03035-1","volume":"39","author":"MM Rashid","year":"2023","unstructured":"Rashid, M.M., Wu, S., Nie, Y., Li, G.: High-fidelity facial expression transfer using part-based local-global conditional GANs. Vis. Comput. 39(8), 3635\u20133646 (2023). https:\/\/doi.org\/10.1007\/s00371-023-03035-1","journal-title":"Vis. Comput."},{"key":"4232_CR156","doi-asserted-by":"crossref","unstructured":"Ma, H., Zhang, T., Sun, S., Yan, X., Han, K., Xie, X.: CVTHead: one-shot controllable head avatar with vertex-feature transformer. arXiv (Cornell University) (2023) arxiv:2311.06443","DOI":"10.1109\/WACV57701.2024.00602"},{"key":"4232_CR157","doi-asserted-by":"publisher","unstructured":"Ni, H., Liu, J., Xue, Y., Huang, S.X.: 3d-aware talking-head video motion transfer. In: 2024 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 4942\u20134952 (2024). https:\/\/doi.org\/10.1109\/WACV57701.2024.00488","DOI":"10.1109\/WACV57701.2024.00488"},{"key":"4232_CR158","doi-asserted-by":"crossref","unstructured":"Liu, X., Xu, Y., Wu, Q., Zhou, H., Wu, W., Zhou, B.: Semantic-aware implicit neural audio-driven video portrait generation. arXiv (Cornell University) (2022) arxiv:2201.07786","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"4232_CR159","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Fern\u00e1ndez\u00a0Abrevaya, V., B\u00fchler, M.C., Chen, X., Black, M.J., Hilliges, O.: I m avatar: implicit morphable head avatars from videos. arXiv preprint arXiv:2112.07471 (2022)","DOI":"10.1109\/CVPR52688.2022.01318"},{"key":"4232_CR160","unstructured":"Zeng, B., Liu, B., Li, H., Liu, X., Liu, J., Chen, D., Peng, W., Zhang, B.: FNEVR: neural volume rendering for face animation. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. NIPS\u201922. 
Curran Associates Inc., Red Hook, NY, USA (2022)"},{"key":"4232_CR161","doi-asserted-by":"publisher","unstructured":"Gafni, G., Thies, J., Zollh\u00f6fer, M., Nie\u00dfner, M.: Dynamic neural radiance fields for monocular 4d facial avatar reconstruction. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8645\u20138654 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.00854","DOI":"10.1109\/CVPR46437.2021.00854"},{"key":"4232_CR162","doi-asserted-by":"publisher","unstructured":"Yin, Y., Ghasedi, K., Wu, H., Yang, J., Tong, X., Fu, Y.: Nerfinvertor: high fidelity NERF-GAN inversion for single-shot real image animation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8539\u20138548 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00825","DOI":"10.1109\/CVPR52729.2023.00825"},{"key":"4232_CR163","doi-asserted-by":"publisher","first-page":"666","DOI":"10.1007\/978-3-031-19775-8_39","volume-title":"Computer vision\u2014ECCV 2022","author":"S Shen","year":"2022","unstructured":"Shen, S., Li, W., Zhu, Z., Duan, Y., Zhou, J., Lu, J.: Learning dynamic facial radiance fields for few-shot talking head synthesis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer vision\u2014ECCV 2022, pp. 666\u2013682. Springer, Cham (2022)"},{"issue":"9","key":"4232_CR164","doi-asserted-by":"publisher","first-page":"6020","DOI":"10.1109\/TVCG.2023.3323578","volume":"30","author":"J Tang","year":"2024","unstructured":"Tang, J., Zhang, B., Yang, B., Zhang, T., Chen, D., Ma, L., Wen, F.: 3DFaceShop: explicitly controllable 3d-aware portrait generation. IEEE Trans. Visual Comput. Graph. 30(9), 6020\u20136037 (2024). https:\/\/doi.org\/10.1109\/TVCG.2023.3323578","journal-title":"IEEE Trans. Visual Comput. 
Graph."},{"key":"4232_CR165","doi-asserted-by":"publisher","unstructured":"Sun, J., Wang, X., Wang, L., Li, X., Zhang, Y., Zhang, H., Liu, Y.: Next3d: generative neural texture rasterization for 3d-aware head avatars. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 20991\u201321002 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.02011","DOI":"10.1109\/CVPR52729.2023.02011"},{"key":"4232_CR166","unstructured":"Kim, S., Jin, S., Park, J., Kim, K., Kim, J., Nam, J., Kim, S.: MODiTalker: motion-disentangled diffusion model for high-fidelity talking head generation. arXiv (Cornell University) (2024) arxiv:2403.19144"},{"key":"4232_CR167","unstructured":"Ma, Y., Zhang, S., Wang, J., Wang, X., Zhang, Y., Deng, Z.: DreamTalk: when expressive talking head generation meets diffusion probabilistic Models. arXiv (Cornell University) (2023) arxiv:2312.09767"},{"key":"4232_CR168","doi-asserted-by":"publisher","unstructured":"Zhang, B., Zhang, X., Cheng, N., Yu, J., Xiao, J., Wang, J.: Emotalker: emotionally editable talking face generation via diffusion model. In: ICASSP 2024\u20132024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8276\u20138280 (2024). https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10447505","DOI":"10.1109\/ICASSP48485.2024.10447505"},{"key":"4232_CR169","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.104911","volume":"142","author":"D Bigioi","year":"2024","unstructured":"Bigioi, D., Basak, S., Stypu\u0142kowski, M., Zieba, M., Jordan, H., McDonnell, R., Corcoran, P.: Speech driven video editing via an audio-conditioned diffusion model. Image Vis. Comput. 142, 104911 (2024). https:\/\/doi.org\/10.1016\/j.imavis.2024.104911","journal-title":"Image Vis. 
Comput."},{"key":"4232_CR170","doi-asserted-by":"publisher","unstructured":"Du, C., Chen, Q., He, T., Tan, X., Chen, X., Yu, K., Zhao, S., Bian, J.: Dae-talker: high fidelity speech-driven talking face generation with diffusion autoencoder. In: Proceedings of the 31st ACM International Conference on Multimedia. MM\u201923, pp. 4281\u20134289. ACM (2023). https:\/\/doi.org\/10.1145\/3581783.3613753","DOI":"10.1145\/3581783.3613753"},{"key":"4232_CR171","doi-asserted-by":"publisher","unstructured":"Mir, A., Alonso, E., Mondrag\u00f3n, E.: DiT-head: high resolution talking head synthesis using diffusion transformers. In: Proceedings of the 16th International Conference on Agents and Artificial Intelligence\u2014Volume 3: ICAART, pp. 159\u2013169. SciTePress, ??? (2024). https:\/\/doi.org\/10.5220\/0012312200003636 . INSTICC","DOI":"10.5220\/0012312200003636"},{"key":"4232_CR172","doi-asserted-by":"publisher","unstructured":"Yu, Z., Yin, Z., Zhou, D., Wang, D., Wong, F., Wang, B.: Talking head generation with probabilistic audio-to-visual diffusion priors. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7611\u20137621 (2023). https:\/\/doi.org\/10.1109\/ICCV51070.2023.00703","DOI":"10.1109\/ICCV51070.2023.00703"},{"key":"4232_CR173","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-025-03907-8","author":"T Ying","year":"2025","unstructured":"Ying, T., Yazhi, L., Xiong, L., Wei, L.: Adaptive diffusion landmark dynamic rendering for realistic talking face video generation. Vis. Comput. (2025). https:\/\/doi.org\/10.1007\/s00371-025-03907-8","journal-title":"Vis. Comput."},{"key":"4232_CR174","doi-asserted-by":"crossref","unstructured":"Sung-Bin, K., Chae-Yeon, L., Son, G., Hyun-Bin, O., Ju, J., Nam, S., Oh, T.-H.: MultiTalk: enhancing 3D talking head generation across languages with multilingual video dataset. 
arXiv (Cornell University) (2024) arxiv:2406.14272","DOI":"10.21437\/Interspeech.2024-1794"},{"key":"4232_CR175","unstructured":"Lin, G., Jiang, J., Yang, J., Zheng, Z., Liang, C.: OmniHuman-1: Rethinking the scaling-up of one-stage conditioned human animation models. arXiv (Cornell University) (2025) arxiv:2502.01061"},{"key":"4232_CR176","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2024.3476275","author":"Y Zhuang","year":"2024","unstructured":"Zhuang, Y., Cheng, B., Cheng, Y., Jin, Y., Liu, R., Li, C., Cheng, X., Liao, J., Lin, J.: Learn2talk: 3D talking face learns from 2d talking face. IEEE Trans. Visual. Comput. Graph. (2024). https:\/\/doi.org\/10.1109\/TVCG.2024.3476275","journal-title":"IEEE Trans. Visual. Comput. Graph."},{"key":"4232_CR177","doi-asserted-by":"crossref","unstructured":"Ma, Z., Zhu, X., Qi, G., Qian, C., Zhang, Z., Lei, Z.: DiffSpeaker: speech-driven 3D facial animation with diffusion transformer. arXiv (Cornell University) (2024) arxiv:2402.05712","DOI":"10.1109\/IJCB65343.2025.11411575"},{"key":"4232_CR178","doi-asserted-by":"crossref","unstructured":"Zhao, Q., Long, P., Zhang, Q., Qin, D., Liang, H., Zhang, L., Zhang, Y., Yu, J., Xu, L.: Media2face: co-speech facial animation generation with multi-modality guidance. arXiv preprint arXiv:2401.15687 (2024)","DOI":"10.1145\/3641519.3657413"},{"key":"4232_CR179","unstructured":"Han, T., Gui, S., Huang, Y., Li, B., Liu, L., Zhou, B., Jiang, N., Lu, Q., Zhi, R., Liang, Y., Zhang, D., Wan, J.: PMMTalk: speech-driven 3D facial animation from complementary pseudo multi-modal features. arXiv (Cornell University) (2023) arxiv:2312.02781"},{"key":"4232_CR180","doi-asserted-by":"crossref","unstructured":"Thambiraja, B., Aliakbarian, S., Cosker, D., Thies, J.: 3DIFACE: diffusion-based speech-driven 3D facial animation and editing. 
arXiv (Cornell University) (2023) arxiv:2312.00870","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"4232_CR181","doi-asserted-by":"publisher","unstructured":"He, S., He, H., Yang, S., Wu, X., Xia, P., Yin, B., Liu, C., Dai, L., Xu, C.: Speech4mesh: speech-assisted monocular 3d facial reconstruction for speech-driven 3d facial animation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 14146\u201314156 (2023). https:\/\/doi.org\/10.1109\/ICCV51070.2023.01305","DOI":"10.1109\/ICCV51070.2023.01305"},{"key":"4232_CR182","unstructured":"Wang, Q., Fan, Z., Xia, S.: 3D-TalkEMO: learning to synthesize 3D emotional talking head. arXiv (Cornell University) (2021) arxiv:2104.12051"},{"key":"4232_CR183","unstructured":"Li, T., Zheng, R., Yang, M., Chen, J., Yang, M.: Ditto: motion-space diffusion for controllable realtime talking head synthesis. arXiv (Cornell University) (2024) arxiv:2411.19509"},{"key":"4232_CR184","unstructured":"Wang, Z., Zhang, P., Qi, J., Xu, G.W.S., Zhang, B., Bo, L.: OmniTalker: real-time text-driven talking head generation with in-context audio-visual style replication. arXiv (Cornell University) (2025) arxiv:2504.02433"},{"key":"4232_CR185","unstructured":"Anastassiou, P., Chen, J., Chen, J., Chen, Y., Chen, Z., Chen, Z., Cong, J., Deng, L., Ding, C., Gao, L., Gong, M., Huang, P., Huang, Q., Huang, Z., Huo, Y., Jia, D., Li, C., Li, F., Li, H., Li, J., Li, X., Li, X., Liu, L., Liu, S., Liu, S., Liu, X., Liu, Y., Liu, Z., Lu, L., Pan, J., Wang, X., Wang, Y., Wang, Y., Wei, Z., Wu, J., Yao, C., Yang, Y., Yi, Y., Zhang, J., Zhang, Q., Zhang, S., Zhang, W., Zhang, Y., Zhao, Z., Zhong, D., Zhuang, X.: Seed-TTS: a family of high-quality versatile speech generation models. arXiv (Cornell University) (2024) arxiv:2406.02430"},{"key":"4232_CR186","doi-asserted-by":"publisher","unstructured":"Ye, Z., Zhang, L.-G., Zeng, D., Lu, Q., Jiang, N.: Realistic real-time talking head synthesis with grid encoding and progressive conditioning. 
In: ICASSP 2025\u20142025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2025). https:\/\/doi.org\/10.1109\/ICASSP49660.2025.10887890","DOI":"10.1109\/ICASSP49660.2025.10887890"},{"key":"4232_CR187","unstructured":"Ye, Z., Jiang, Z., Ren, Y., Liu, J., He, J., Zhao, Z.: GeneFace: generalized and high-fidelity audio-driven 3d talking face synthesis. arXiv (Cornell University) (2023) arxiv:2301.13430"},{"key":"4232_CR188","doi-asserted-by":"crossref","unstructured":"Liu, Y., Xu, S., Guo, J., Wang, D., Wang, Z., Tan, X., Liu, X.: SyncAnimation: a real-time end-to-end framework for audio-driven human pose and talking head animation. arXiv (Cornell University) (2025) arxiv:2501.14646","DOI":"10.24963\/ijcai.2025\/185"},{"key":"4232_CR189","unstructured":"Qi, J., Ji, C., Xu, S., Zhang, P., Zhang, B., Bo, L.: ChatAnyone: stylized real-time portrait video generation with hierarchical motion diffusion model. arXiv (Cornell University) (2025) arXiv:2503.21144"},{"key":"4232_CR190","unstructured":"Aneja, S., Sevastopolsky, A., Kirschstein, T., Thies, J., Dai, A., Nie\u00dfner, M.: GaussianSpeech: audio-driven gaussian avatars. arXiv (Cornell University) (2024) arxiv:2411.18675"},{"key":"4232_CR191","doi-asserted-by":"publisher","unstructured":"Cho, K., Lee, J., Yoon, H., Hong, Y., Ko, J., Ahn, S., Kim, S.: Gaussiantalker: real-time talking head synthesis with 3d gaussian splatting. In: Proceedings of the 32nd ACM International Conference on Multimedia. MM \u201924, pp. 10985\u201310994. Association for Computing Machinery, New York, NY, USA (2024). https:\/\/doi.org\/10.1145\/3664647.3681627","DOI":"10.1145\/3664647.3681627"},{"key":"4232_CR192","unstructured":"Cui, J., Li, H., Yao, Y., Zhu, H., Shang, H., Cheng, K., Zhou, H., Zhu, S., Wang, J.: Hallo2: long-duration and high-resolution audio-driven portrait image animation. 
arXiv (Cornell University) (2024) arxiv:2410.07718"},{"key":"4232_CR193","doi-asserted-by":"publisher","unstructured":"Chen, Z., Cao, J., Chen, Z., Li, Y., Ma, C.: Echomimic: lifelike audio-driven portrait animations through editable landmark conditions. In: Proceedings of the Thirty-Ninth AAAI Conference on Artificial Intelligence and Thirty-Seventh Conference on Innovative Applications of Artificial Intelligence and Fifteenth Symposium on Educational Advances in Artificial Intelligence. AAAI\u201925\/IAAI\u201925\/EAAI\u201925. AAAI Press, ??? (2025). https:\/\/doi.org\/10.1609\/aaai.v39i3.32241","DOI":"10.1609\/aaai.v39i3.32241"},{"key":"4232_CR194","doi-asserted-by":"publisher","DOI":"10.1145\/3592433","author":"B Kerbl","year":"2023","unstructured":"Kerbl, B., Kopanas, G., Leimkuehler, T., Drettakis, G.: 3d gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. (2023). https:\/\/doi.org\/10.1145\/3592433","journal-title":"ACM Trans. Graph."},{"key":"4232_CR195","doi-asserted-by":"publisher","unstructured":"Barron, J.T., Mildenhall, B., Verbin, D., Srinivasan, P.P., Hedman, P.: MIP-nerf 360: unbounded anti-aliased neural radiance fields. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5460\u20135469 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00539","DOI":"10.1109\/CVPR52688.2022.00539"},{"key":"4232_CR196","doi-asserted-by":"crossref","unstructured":"Eskander, G.S., Sabourin, R., Granger, E.: Offline signature-based fuzzy vault (OSFV): review and new results. arXiv (Cornell University) (2014) arxiv:1408.3985","DOI":"10.1109\/CIBIM.2014.7015442"},{"key":"4232_CR197","unstructured":"Wayne, W., Yunxuan, Z., Cheng, L., Chen, Q., Change, L.C.: ReenactGAN: learning to reenact faces via boundary transfer. arXiv (Cornell University) (2018) arxiv:1807.11079"},{"key":"4232_CR198","unstructured":"Zhang, C., Zhang, C., Zhang, M., Kweon, I.S.: Text-to-image diffusion models in generative AI: a survey. 
arXiv (Cornell University) (2023) arxiv:2303.07909"},{"key":"4232_CR199","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. arXiv (Cornell University) (2016) arxiv:1609.02612"},{"key":"4232_CR200","doi-asserted-by":"crossref","unstructured":"Zakharov, E., Ivakhnenko, A., Shysheya, A., Lempitsky, V.: Fast bi-layer neural synthesis of one-shot realistic head avatars. arXiv (Cornell University) (2020) arxiv:2008.10174","DOI":"10.1007\/978-3-030-58610-2_31"},{"key":"4232_CR201","doi-asserted-by":"publisher","unstructured":"Li, S., Pan, Y.: AniArtAvatar: animatable 3D art avatar from a single image. Neurocomputing 129706 (2025) https:\/\/doi.org\/10.1016\/j.neucom.2025.129706","DOI":"10.1016\/j.neucom.2025.129706"},{"key":"4232_CR202","doi-asserted-by":"crossref","unstructured":"Ji, X., Zhou, H., Wang, K., Wu, Q., Wu, W., Xu, F., Cao, X.: EAMM: one-shot emotional talking face via audio-based emotion-aware motion model. arXiv (Cornell University) (2022) arxiv:2205.15278","DOI":"10.1145\/3528233.3530745"},{"key":"4232_CR203","doi-asserted-by":"publisher","unstructured":"Agustsson, E., Minnen, D., Johnston, N., Balle, J., Hwang, S.J., Toderici, G.: Scale-space flow for end-to-end optimized video compression. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8500\u20138509 (2020) https:\/\/doi.org\/10.1109\/cvpr42600.2020.00853","DOI":"10.1109\/cvpr42600.2020.00853"},{"key":"4232_CR204","doi-asserted-by":"crossref","unstructured":"Bansal, A., Ma, S., Ramanan, D., Sheikh, Y.: Recycle-GAN: unsupervised video retargeting. arXiv (Cornell University) (2018) arxiv:1808.05174","DOI":"10.1007\/978-3-030-01228-1_8"},{"key":"4232_CR205","doi-asserted-by":"publisher","unstructured":"Provine, J.A., Bruton, L.T.: Lip synchronization in 3-D model based coding for video-conferencing. In: Proceedings of ISCAS\u201995\u2014International Symposium on Circuits and Systems. IEEE (1995). 
https:\/\/doi.org\/10.1109\/iscas.1995.521548","DOI":"10.1109\/iscas.1995.521548"},{"issue":"5","key":"4232_CR206","doi-asserted-by":"publisher","first-page":"0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5), 0196391 (2018). https:\/\/doi.org\/10.1371\/journal.pone.0196391","journal-title":"PLoS ONE"},{"key":"4232_CR207","doi-asserted-by":"publisher","unstructured":"Yu, J., Zhu, H., Jiang, L., Loy, C.C., Cai, W., Wu, W.: Celebv-text: a large-scale facial text-video dataset. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14805\u201314814 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.01422","DOI":"10.1109\/CVPR52729.2023.01422"},{"key":"4232_CR208","doi-asserted-by":"crossref","unstructured":"Han, L., Ren, J., Lee, H.-Y., Barbieri, F., Olszewski, K., Minaee, S., Metaxas, D., Tulyakov, S.: Show me what and tell me how: Video synthesis via multimodal conditioning. arXiv (Cornell University) (2022) arXiv:2203.02573 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.00360"},{"key":"4232_CR209","doi-asserted-by":"crossref","unstructured":"Zhang, W., Cun, X., Wang, X., Zhang, Y., Shen, X., Guo, Y., Shan, Y., Wang, F.: Sadtalker: learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. arXiv preprint arXiv:2211.12194 (2023)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"4232_CR210","unstructured":"Zhou, M., Bai, Y., Zhang, W., Yao, T., Zhao, T., Mei, T.: Responsive listening head generation: a benchmark dataset and baseline. 
arXiv (Cornell University) (2021) arxiv:2112.13548"},{"key":"4232_CR211","unstructured":"Gu, J., Liu, L., Wang, P., Theobalt, C.: Stylenerf: a style-based 3d-aware generator for high-resolution image synthesis. arXiv (Cornell University) (2021)"},{"key":"4232_CR212","doi-asserted-by":"crossref","unstructured":"Chan, E.R., Lin, C.Z., Chan, M.A., Nagano, K., Pan, B., Mello, S.D., Gallo, O., Guibas, L., Tremblay, J., Khamis, S., Karras, T., Wetzstein, G.: Efficient geometry-aware 3d generative adversarial networks. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"4232_CR213","unstructured":"Bahmani, S., Park, J.J., Paschalidou, D., Tang, H., Wetzstein, G., Guibas, L., Luc, V.G., Timofte, R.: 3D-aware video generation. arXiv (Cornell University) (2022) arxiv:2206.14797"},{"key":"4232_CR214","doi-asserted-by":"publisher","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7824\u20137833 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00802","DOI":"10.1109\/CVPR.2019.00802"},{"key":"4232_CR215","doi-asserted-by":"publisher","unstructured":"Zakharov, E., Ivakhnenko, A., Shysheya, A., Lempitsky, V.: Fast bi-layer neural synthesis of one-shot realistic head avatars. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision\u2014ECCV 2020. Lecture Notes in Computer Science, vol. 12357, pp. 524\u2013540. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_31","DOI":"10.1007\/978-3-030-58610-2_31"},{"key":"4232_CR216","doi-asserted-by":"publisher","unstructured":"Ren, Y., Li, G., Chen, Y., Li, T.H., Liu, S.: PIRenderer: controllable portrait image generation via semantic neural rendering. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13739\u201313748. 
IEEE Computer Society, Los Alamitos, CA, USA (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.01350 . https:\/\/doi.ieeecomputersociety.org\/10.1109\/ICCV48922.2021.01350","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"4232_CR217","doi-asserted-by":"publisher","unstructured":"Ma, S., Weng, Y., Shao, T., Zhou, K.: 3d Gaussian blendshapes for head avatar animation. In: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers \u201924. SIGGRAPH\u201924, pp. 1\u201310. ACM (2024). https:\/\/doi.org\/10.1145\/3641519.3657462","DOI":"10.1145\/3641519.3657462"},{"key":"4232_CR218","doi-asserted-by":"crossref","unstructured":"Li, J., Zhang, J., Bai, X., Zheng, J., Ning, X., Zhou, J., Gu, L.: TalkingGaussian: structure-persistent 3D talking head synthesis via gaussian splatting. arXiv (Cornell University) (2024) arxiv:2404.15264","DOI":"10.1007\/978-3-031-72684-2_8"},{"key":"4232_CR219","doi-asserted-by":"publisher","unstructured":"Yu, H., Qu, Z., Yu, Q., Chen, J., Jiang, Z., Chen, Z., Zhang, S., Xu, J., Wu, F., Lv, C., Yu, G.: Gaussiantalker: speaker-specific talking head synthesis via 3D gaussian splatting. In: Proceedings of the 32nd ACM International Conference on Multimedia. MM \u201924, pp. 3548\u20133557. ACM (2024). https:\/\/doi.org\/10.1145\/3664647.3681675","DOI":"10.1145\/3664647.3681675"},{"key":"4232_CR220","doi-asserted-by":"publisher","unstructured":"Gerogiannis, D., Papantoniou, F.P., Potamias, R.A., Lattas, A., Moschoglou, S., Ploumpis, S., Zafeiriou, S.: Animateme: 4d facial expressions via diffusion models. In: Lecture Notes in Computer Science, pp. 270\u2013287. Springer, Berlin (2024). https:\/\/doi.org\/10.1007\/978-3-031-72980-5_16","DOI":"10.1007\/978-3-031-72980-5_16"},{"key":"4232_CR221","doi-asserted-by":"crossref","unstructured":"Aneja, S., Thies, J., Dai, A., Nie\u00dfner, M.: Facetalk: audio-driven motion diffusion for neural parametric head models. 
In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02009"},{"key":"4232_CR222","doi-asserted-by":"crossref","unstructured":"Sun, Y., Chu, W., Zhou, H., Wang, K., Koike, H.: AVI-talking: learning audio-visual instructions for expressive 3D talking face Generation. arXiv (Cornell University) (2024) arxiv:2402.16124","DOI":"10.1109\/ACCESS.2024.3390182"},{"key":"4232_CR223","doi-asserted-by":"publisher","unstructured":"Danecek, R., Chhatre, K., Tripathi, S., Wen, Y., Black, M., Bolkart, T.: Emotional speech-driven animation with content-emotion disentanglement. In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201313. ACM (2023). https:\/\/doi.org\/10.1145\/3610548.3618183","DOI":"10.1145\/3610548.3618183"},{"key":"4232_CR224","doi-asserted-by":"crossref","unstructured":"Sun, Z., Lv, T., Ye, S., Lin, M.G., Sheng, J., Wen, Y.-H., Yu, M., Liu, Y.-J.: DiffPoseTalk: speech-driven stylistic 3D facial animation and head pose generation via diffusion models. arXiv (Cornell University) (2023) arxiv:2310.00434","DOI":"10.1145\/3658221"},{"key":"4232_CR225","doi-asserted-by":"crossref","unstructured":"Thambiraja, B., Habibie, I., Aliakbarian, S., Cosker, D., Theobalt, C., Thies, J.: Imitator: personalized speech-driven 3D facial animation. arXiv (Cornell University) (2023) arxiv:2301.00023","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"4232_CR226","doi-asserted-by":"crossref","unstructured":"Haque, K.I., Yumak, Z.: FaceXHuBERT: text-less speech-driven E(X)pressive 3D facial animation synthesis using self-supervised speech representation learning. 
arXiv (Cornell University) (2023) arxiv:2303.05416","DOI":"10.1145\/3577190.3614157"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-04232-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-04232-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-04232-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T13:00:47Z","timestamp":1772629247000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-04232-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,4]]},"references-count":226,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["4232"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-04232-w","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-6917039\/v1","asserted-by":"object"}]},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,4]]},"assertion":[{"value":"17 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of 
interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"9"}}