{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T20:27:59Z","timestamp":1773260879417,"version":"3.50.1"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T00:00:00Z","timestamp":1745107200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T00:00:00Z","timestamp":1745107200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"The Hunan Provincial Natural Science Foundation of China","award":["2025JJ80334"],"award-info":[{"award-number":["2025JJ80334"]}]},{"name":"The Hunan Provincial Natural Science Foundation of China","award":["2024JJ5059, 2023JJ50095"],"award-info":[{"award-number":["2024JJ5059, 2023JJ50095"]}]},{"name":"The Hunan Provincial Natural Science Foundation of China","award":["2025JJ80334"],"award-info":[{"award-number":["2025JJ80334"]}]},{"DOI":"10.13039\/501100001809","name":"The National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61772179,12442056"],"award-info":[{"award-number":["61772179,12442056"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"he Science and Technology Innovation Program of Hunan Province","award":["2016TP1020"],"award-info":[{"award-number":["2016TP1020"]}]},{"name":"The Science and Technology Innovation Project of Hengyang","award":["202250045231"],"award-info":[{"award-number":["202250045231"]}]},{"name":"The Industry University Research Innovation Foundation of Ministry of Education Science and Technology Development Center","award":["2020QT09"],"award-info":[{"award-number":["2020QT09"]}]},{"name":"The \"14th Five-Year Plan\" Key Disciplines and Application-oriented Special Disciplines of Hunan Province","award":["Xiangjiaotong [2022] 351"],"award-info":[{"award-number":["Xiangjiaotong [2022] 351"]}]},{"name":"The Open Research Fund of The State Key Laboratory of Multimodal Artificial Intelligence Systems","award":["MAIS-2023-09"],"award-info":[{"award-number":["MAIS-2023-09"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s00530-025-01759-9","type":"journal-article","created":{"date-parts":[[2025,4,20]],"date-time":"2025-04-20T14:14:41Z","timestamp":1745158481000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Expressive talking face generation via audio visual control"],"prefix":"10.1007","volume":"31","author":[{"given":"Pengfei","family":"Li","sequence":"first","affiliation":[]},{"given":"Huihuang","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Mugang","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Qingyun","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Yangfan","family":"Zhou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,20]]},"reference":[{"key":"1759_CR1","unstructured":"Kumar R, Sotelo J, Kumar K, De Brebisson A, Bengio Y. 
Obamanet: Photo-realistic lip-sync from text. 2017, arXiv preprint arXiv:1801.01442."},{"issue":"4","key":"1759_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing obama: learning lip sync from audio. ACM Trans Graph (ToG) 36(4), 1\u201313 (2017)","journal-title":"ACM Trans Graph (ToG)"},{"key":"1759_CR3","doi-asserted-by":"crossref","unstructured":"KR P, Mukhopadhyay R, Philip J, Jha A, Namboodiri V, Jawahar C. Towards automatic face-to-face translation. In: Proceedings of the 27th ACM international conference on multimedia. 2019: 1428\u201336.","DOI":"10.1145\/3343031.3351066"},{"key":"1759_CR4","doi-asserted-by":"crossref","unstructured":"Prajwal K, Mukhopadhyay R, Namboodiri VP, Jawahar C. A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM international conference on multimedia. 2020: 484\u201392.","DOI":"10.1145\/3394171.3413532"},{"issue":"6","key":"1759_CR5","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: Makelttalk: speaker-aware talking-head animation. ACM Trans Graph 39(6), 1\u201315 (2020)","journal-title":"ACM Trans Graph"},{"key":"1759_CR6","doi-asserted-by":"crossref","unstructured":"Zhang W, Cun X, Wang X, Zhang Y, Shen X, Guo Y, Shan Y, Wang F. SadTalker: Learning realistic 3D motion coefficients for stylized audio-driven single image talking face animation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023: 8652\u201361.","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"1759_CR7","doi-asserted-by":"crossref","unstructured":"Zhou H, Sun Y, Wu W, Loy CC, Wang X, Liu Z. Pose-controllable talking face generation by implicitly modularized audio-visual representation. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2021: 4176\u20134186.","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"1759_CR8","doi-asserted-by":"crossref","unstructured":"Guo Y, Chen K, Liang S, Liu Y-J, Bao H, Zhang J. Ad-nerf: Audio driven neural radiance fields for talking head synthesis. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2021: 5784\u201394.","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"1759_CR9","doi-asserted-by":"crossref","unstructured":"Li D, Zhao K, Wang W, Ma Y, Peng B, Zhang Y. S3D-NeRF: Single-Shot Speech-Driven Neural Radiance Field for High Fidelity Talking Head Synthesis. European Conference on Computer Vision. Springer, Cham, 2025: 365\u2013382.","DOI":"10.1007\/978-3-031-72684-2_21"},{"key":"1759_CR10","doi-asserted-by":"crossref","unstructured":"Hu J, Shen L, Sun G. Squeeze-and-excitation networks. Proceedings of the IEEE conference on computer vision and pattern recognition. 2018: 7132\u20137141.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1759_CR11","doi-asserted-by":"crossref","unstructured":"Dai J, Qi H, Xiong Y, Li Y, Zhang G. Deformable convolutional networks.Proceedings of the IEEE international conference on computer vision. 2017: 764\u2013773.","DOI":"10.1109\/ICCV.2017.89"},{"key":"1759_CR12","doi-asserted-by":"crossref","unstructured":"Geng Z, Cao C, Tulyakov S. 3d guided fine-grained face manipulation. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 
{"key":"1759_CR13","doi-asserted-by":"crossref","unstructured":"Wiles O, Koepke A, Zisserman A. X2face: A network for controlling face generation using images, audio, and pose codes. In: Proceedings of the European conference on computer vision (ECCV). 2018: 670\u201386.","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"1759_CR14","doi-asserted-by":"crossref","unstructured":"Siarohin A, Lathuili\u00e8re S, Tulyakov S, Ricci E, Sebe N. Animating arbitrary objects via deep motion transfer. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2019: 2377\u201386.","DOI":"10.1109\/CVPR.2019.00248"},{"key":"1759_CR15","unstructured":"Siarohin A, Lathuili\u00e8re S, Tulyakov S, Ricci E, Sebe N. First order motion model for image animation. Adv Neural Inf Process Syst 2019;32."},{"key":"1759_CR16","first-page":"36188","volume":"35","author":"Y Wu","year":"2022","unstructured":"Wu, Y., Deng, Y., Yang, J., Wei, F., Chen, Q., Tong, X.: Anifacegan: Animatable 3d-aware face image generation for video avatars. Adv. Neural. Inf. Process. Syst. 35, 36188\u201336201 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1759_CR17","doi-asserted-by":"crossref","unstructured":"Rochow A, Schwarz M, Behnke S. FSRT: Facial Scene Representation Transformer for Face Reenactment from Factorized Appearance Head-pose and Facial Expression Features. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024: 7716\u20137726.","DOI":"10.1109\/CVPR52733.2024.00737"},{"key":"1759_CR18","doi-asserted-by":"crossref","unstructured":"Su J, Liu K, Chen L, Yao J, Liu Q, Lv D. Audio-driven High-resolution Seamless Talking Head Video Editing via StyleGAN. In: 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 2024: 1\u20136.","DOI":"10.1109\/ICME57554.2024.10688257"},{"key":"1759_CR19","doi-asserted-by":"crossref","unstructured":"Wang S, Li L, Ding Y, Fan C, Yu X. Audio2Head: Audio-driven one-shot talking-head generation with natural head motion. In: International joint conference on artificial intelligence. IJCAI; 2021.","DOI":"10.24963\/ijcai.2021\/152"},{"key":"1759_CR20","doi-asserted-by":"crossref","unstructured":"Su Y, Wang S, Wang H. DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields For High-Fidelity Talking Portrait Synthesis. In: ICASSP 2024\u20132024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2024: 3975\u20133979.","DOI":"10.1109\/ICASSP48485.2024.10448446"},{"key":"1759_CR21","unstructured":"Yao M, Huo Y, Ran Y, Tian Q, Wang R. Efficient region-aware neural radiance fields for high-fidelity talking portrait synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2023: 7568\u20137578."},{"key":"1759_CR22","doi-asserted-by":"crossref","unstructured":"Chung JS, Zisserman A. Out of time: automated lip sync in the wild. In: Computer vision\u2013ACCV 2016 Workshops: ACCV 2016 international workshops, Taipei, Taiwan, November 20\u201324, 2016, revised selected papers, Part II 13. Springer; 2017, p. 251\u201363.","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"1759_CR23","doi-asserted-by":"crossref","unstructured":"Tan S, Ji B, Bi M, Pan Y. Edtalk: Efficient disentanglement for emotional talking head synthesis. In: European Conference on Computer Vision. Springer, Cham, 2025: 398\u2013416.","DOI":"10.1007\/978-3-031-72658-3_23"},{"key":"1759_CR24","unstructured":"Li C, Zhang C, Xu W, Xie J, Feng W, Peng B, Xing W. LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync. 2024, arXiv preprint arXiv:2412.09262."},{"key":"1759_CR25","unstructured":"Tan W, Lin C, Xu C, Ji X, Zhu J, Wang C. SVP: Style-Enhanced Vivid Portrait Talking Head Diffusion Model. 2024, arXiv preprint arXiv:2409.03270."},{"key":"1759_CR26","doi-asserted-by":"crossref","unstructured":"Stan S, Haque KI, Yumak Z. Facediffuser: Speech-driven 3d face animation synthesis using diffusion. In: Proceedings of the 16th ACM SIGGRAPH Conference on Motion, Interaction and Games. 2023: 1\u201311.","DOI":"10.1145\/3623264.3624447"},{"key":"1759_CR27","doi-asserted-by":"crossref","unstructured":"Ignatov A, Kobyshev N, Timofte R. Dslr-quality photos on mobile devices with deep convolutional networks. In: Proceedings of the IEEE international conference on computer vision. 2017: 3277\u20133285.","DOI":"10.1109\/ICCV.2017.355"},{"key":"1759_CR28","doi-asserted-by":"crossref","unstructured":"Gatys LA, Ecker AS, Bethge M. Image style transfer using convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 2414\u20132423.","DOI":"10.1109\/CVPR.2016.265"},{"key":"1759_CR29","doi-asserted-by":"crossref","unstructured":"Yang T, Ren P, Xie X, Zhang L. Gan prior embedded network for blind face restoration in the wild. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2021: 672\u2013681.","DOI":"10.1109\/CVPR46437.2021.00073"},{"key":"1759_CR30","doi-asserted-by":"crossref","unstructured":"Wang X, Li Y, Zhang H, Shan Y. Towards real-world blind face restoration with generative facial prior. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2021: 9168\u20139178.","DOI":"10.1109\/CVPR46437.2021.00905"},{"issue":"12","key":"1759_CR31","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1759_CR32","doi-asserted-by":"crossref","unstructured":"Wang K, Wu Q, Song L, Yang Z, Wu W, Qian C, He R, Qiao Y, Loy CC. Mead: A large-scale audio-visual dataset for emotional talking-face generation. In: European conference on computer vision. Springer; 2020: 700\u201317.","DOI":"10.1007\/978-3-030-58589-1_42"},{"issue":"5","key":"1759_CR33","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. Am. 120(5), 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. Am."},{"key":"1759_CR34","first-page":"1086","volume":"2018","author":"JS Chung","year":"2018","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: Deep speaker recognition. Proceedings of the annual conference of the international speech communication association, INTERSPEECH. 2018, 1086\u20131090 (2018)","journal-title":"Proceedings of the annual conference of the international speech communication association, INTERSPEECH."},
{"key":"1759_CR35","doi-asserted-by":"crossref","unstructured":"Liang B, Pan Y, Guo Z, Zhou H, Hong Z, Han X, Han J. Expressive talking head generation with granular audio-visual control. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022: 3387\u20133396.","DOI":"10.1109\/CVPR52688.2022.00338"},{"key":"1759_CR36","doi-asserted-by":"crossref","unstructured":"Shen S, Zhao W, Meng Z, Li W, Zhu Z, Zhou J, Lu J. Difftalk: Crafting diffusion models for generalized audio-driven portraits animation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023: 1982\u20131991.","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"1759_CR37","unstructured":"Cao X, Shi S, Zhao J, Yao Y, Fei J, Gao M. JoyVASA: portrait and animal image animation with diffusion-based audio-driven facial dynamics and head motion generation. 2024, arXiv preprint arXiv:2411.09209."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01759-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01759-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01759-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T15:05:00Z","timestamp":1756998300000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01759-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,20]]},"references-count":37,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1759"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01759-9","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,20]]},"assertion":[{"value":"19 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"198"}}