{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:28:42Z","timestamp":1761388122862,"version":"build-2065373602"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T00:00:00Z","timestamp":1755734400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s00530-025-01980-6","type":"journal-article","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T12:46:30Z","timestamp":1755780390000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Pmfs: Progressive mouth-to-face synthesis for realistic talking face 
generation"],"prefix":"10.1007","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3614-7982","authenticated-orcid":false,"given":"Xuan-Nam","family":"Cao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7612-6986","authenticated-orcid":false,"given":"Nhat-Tan","family":"Vo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,21]]},"reference":[{"issue":"6","key":"1980_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417774","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: Makelttalk. ACM Trans. on Gr. 39(6), 1\u201315 (2020). https:\/\/doi.org\/10.1145\/3414685.3417774","journal-title":"ACM Trans. on Gr."},{"key":"1980_CR2","doi-asserted-by":"publisher","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3d speaking styles. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10093\u201310103 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.01034","DOI":"10.1109\/CVPR.2019.01034"},{"key":"1980_CR3","doi-asserted-by":"publisher","unstructured":"Prajwal, K.R., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.V.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia. MM \u201920, pp. 484\u2013492 (2020). https:\/\/doi.org\/10.1145\/3394171.3413532","DOI":"10.1145\/3394171.3413532"},{"key":"1980_CR4","doi-asserted-by":"publisher","unstructured":"Ji, X., Zhou, H., Wang, K., Wu, W., Loy, C.C., Cao, X., Xu, F.: Audio-driven emotional video portraits. 
In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14075\u201314084 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01386","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"1980_CR5","doi-asserted-by":"publisher","unstructured":"Zhong, W., Fang, C., Cai, Y., Wei, P., Zhao, G., Lin, L., Li, G.: Identity-preserving talking face generation with landmark and appearance priors. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9729\u20139738 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00938","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"1980_CR6","doi-asserted-by":"publisher","unstructured":"Ma, Y., Wang, S., Hu, Z., Fan, C., Lv, T., Ding, Y., Deng, Z., Yu, X.: StyleTalk: One-Shot talking head generation with controllable speaking styles (2023). https:\/\/doi.org\/10.1609\/aaai.v37i2.25280","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"1980_CR7","doi-asserted-by":"publisher","unstructured":"Wang, S., Li, L., Ding, Y., Fan, C., Yu, X.: Audio2Head: Audio-driven One-shot talking-head generation with natural head motion (2021). https:\/\/doi.org\/10.48550\/arXiv.2107.09293","DOI":"10.48550\/arXiv.2107.09293"},{"key":"1980_CR8","doi-asserted-by":"publisher","unstructured":"Shen, S., Zhao, W., Meng, Z., Li, W., Zhu, Z., Zhou, J., Lu, J.: Difftalk: Crafting diffusion models for generalized audio-driven portraits animation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1982\u20131991 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00197","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"1980_CR9","doi-asserted-by":"publisher","unstructured":"Du, C., Chen, Q., He, T., Tan, X., Chen, X., Yu, K., Zhao, S., Bian, J.: Dae-talker: High fidelity speech-driven talking face generation with diffusion autoencoder. In: Proceedings of the 31st ACM International Conference on Multimedia. Association for Computing Machinery, New York. MM \u201923, pp. 4281\u20134289. (2023). 
https:\/\/doi.org\/10.1145\/3581783.3613753","DOI":"10.1145\/3581783.3613753"},{"key":"1980_CR10","doi-asserted-by":"publisher","unstructured":"Stan, S., Haque, K.I., Yumak, Z.: Facediffuser: Speech-driven 3d facial animation synthesis using diffusion. In: Proceedings of the 16th ACM SIGGRAPH Conference on Motion, Interaction and Games. MIG \u201923. Association for Computing Machinery, New York. (2023). https:\/\/doi.org\/10.1145\/3623264.3624447","DOI":"10.1145\/3623264.3624447"},{"key":"1980_CR11","doi-asserted-by":"publisher","unstructured":"Li, C., Zhang, C., Xu, W., Xie, J., Feng, W., Peng, B., Xing, W.: LatentSync: audio conditioned latent diffusion models for lip sync (2024). https:\/\/doi.org\/10.48550\/arXiv.2412.09262","DOI":"10.48550\/arXiv.2412.09262"},{"issue":"4","key":"1980_CR12","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1561\/2200000056","volume":"12","author":"DP Kingma","year":"2019","unstructured":"Kingma, D.P., Welling, M.: An introduction to variational autoencoders. Found. Trends\u00ae Mach. Learn. 12(4), 307\u2013392 (2019). https:\/\/doi.org\/10.1561\/2200000056","journal-title":"Found. Trends\u00ae Mach. Learn."},{"key":"1980_CR13","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1312.6114","author":"DP Kingma","year":"2022","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational Bayes (2022). https:\/\/doi.org\/10.48550\/arXiv.1312.6114","journal-title":"Auto-encoding variational Bayes"},{"issue":"4","key":"1980_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201292","volume":"37","author":"Y Zhou","year":"2018","unstructured":"Zhou, Y., Xu, Z., Landreth, C., Kalogerakis, E., Maji, S., Singh, K.: Visemenet: audio-driven animator-centric speech animation. ACM Trans. Graph. 37(4), 1\u201310 (2018). https:\/\/doi.org\/10.1145\/3197517.3201292","journal-title":"ACM Trans. 
Graph."},{"key":"1980_CR15","doi-asserted-by":"publisher","unstructured":"Morrone, G., Bergamaschi, S., Pasa, L., Fadiga, L., Tikhanoff, V., Badino, L.: Face landmark-based speaker-independent audio-visual speech enhancement in multi-talker environments. In: ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6900\u20136904 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8682061","DOI":"10.1109\/ICASSP.2019.8682061"},{"key":"1980_CR16","doi-asserted-by":"publisher","unstructured":"Wang, K., Wu, Q., Song, L., Yang, Z., Wu, W., Qian, C., He, R., Qiao, Y., Loy, C.C.: Mead: a large-scale audio-visual dataset for emotional talking-face generation. In: ECCV (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_42","DOI":"10.1007\/978-3-030-58589-1_42"},{"issue":"4","key":"1980_CR17","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao, H., Cooper, D.G., Keutmann, M.K., Gur, R.C., Nenkova, A., Verma, R.: Crema-d: Crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014). https:\/\/doi.org\/10.1109\/TAFFC.2014.2336244","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1980_CR18","doi-asserted-by":"publisher","unstructured":"Cao, X.-N., Trinh, Q.-H., Ho, V.-S., Tran, M.-T.: Speechsyncnet: Speech to talking landmark via the fusion of prior frame landmark and the audio. In: 2023 IEEE International Conference on Visual Communications and Image Processing (VCIP), pp. 1\u20135 (2023). https:\/\/doi.org\/10.1109\/VCIP59821.2023.10402739","DOI":"10.1109\/VCIP59821.2023.10402739"},{"key":"1980_CR19","doi-asserted-by":"publisher","unstructured":"Cao, X.-N., Trinh, Q.-H., Tran, M.-T.: Trans-apl: transformer model for audio and prior landmark fusion for talking landmark generation. In: ICMV (2024). 
https:\/\/doi.org\/10.1117\/12.3055071","DOI":"10.1117\/12.3055071"},{"key":"1980_CR20","doi-asserted-by":"publisher","unstructured":"Xu, M., Li, H., Su, Q., Shang, H., Zhang, L., Liu, C., Wang, J., Yao, Y., Zhu, S.: Hallo: hierarchical audio-driven visual synthesis for portrait image animation (2024). https:\/\/doi.org\/10.48550\/arXiv.2406.08801","DOI":"10.48550\/arXiv.2406.08801"},{"key":"1980_CR21","doi-asserted-by":"publisher","unstructured":"Wang, H., Weng, Y., Li, Y., Guo, Z., Du, J., Niu, S., Ma, J., He, S., Wu, X., Hu, Q., Yin, B., Liu, C., Liu, Q.: EmotiveTalk: expressive talking head generation through audio information decoupling and emotional video diffusion (2024). https:\/\/doi.org\/10.48550\/arXiv.2411.16726","DOI":"10.48550\/arXiv.2411.16726"},{"key":"1980_CR22","doi-asserted-by":"publisher","unstructured":"Zhong, T., Liang, C., Jiang, J., Lin, G., Yang, J., Zhao, Z.: FADA: Fast diffusion avatar synthesis with mixed-supervised multi-cfg distillation. CoRR abs\/2412.16915 (2024) https:\/\/doi.org\/10.48550\/ARXIV.2412.16915","DOI":"10.48550\/ARXIV.2412.16915"},{"key":"1980_CR23","doi-asserted-by":"publisher","unstructured":"Liu, H., Sun, W., Di, D., Sun, S., Yang, J., Zou, C., Bao, H.: MoEE: Mixture of emotion experts for audio-driven portrait animation (2025). https:\/\/doi.org\/10.48550\/arXiv.2501.01808","DOI":"10.48550\/arXiv.2501.01808"},{"key":"1980_CR24","doi-asserted-by":"publisher","unstructured":"Wang, Q., Wu, D., Xu, Z., Huang, J., Lv, J.: JoyGen: audio-driven 3D depth-aware talking-face video editing (2025). https:\/\/doi.org\/10.48550\/arXiv.2501.01798","DOI":"10.48550\/arXiv.2501.01798"},{"key":"1980_CR25","doi-asserted-by":"publisher","unstructured":"Sun, W., Li, X., Di, D., Liang, Z., Zhang, Q., Li, H., Chen, W., Cui, J.: UniAvatar: Taming lifelike audio-driven talking head generation with comprehensive motion and lighting control (2024). 
https:\/\/doi.org\/10.48550\/arXiv.2412.19860","DOI":"10.48550\/arXiv.2412.19860"},{"issue":"6","key":"1980_CR26","doi-asserted-by":"publisher","first-page":"194-1","DOI":"10.1145\/3130800.3130813","volume":"36","author":"T Li","year":"2017","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., Romero, J.: Learning a model of facial shape and expression from 4d scans. ACM Trans. Graph. 36(6), 194\u20131 (2017). https:\/\/doi.org\/10.1145\/3130800.3130813","journal-title":"ACM Trans. Graph."},{"key":"1980_CR27","doi-asserted-by":"publisher","unstructured":"Xuan-Nam Cao, M.-T.T.: Spear: Spade-net and hubert for enhanced audio-to-facial reconstruction. ICAART (2025) https:\/\/doi.org\/10.5220\/0013349100003890","DOI":"10.5220\/0013349100003890"},{"key":"1980_CR28","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"W-N Hsu","year":"2021","unstructured":"Hsu, W.-N., Bolte, B., Tsai, Y.-H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio, Speech Lang. Proc. 29, 3451\u20133460 (2021). https:\/\/doi.org\/10.1109\/TASLP.2021.3122291","journal-title":"IEEE\/ACM Trans. Audio, Speech Lang. Proc."},{"key":"1980_CR29","doi-asserted-by":"publisher","unstructured":"Woo, S., Park, J., Lee, J.-Y., Kweon, I.S.: Cbam: Convolutional block attention module. In: Computer Vision \u2013 ECCV 2018: 15th European Conference, Munich, Germany, September 8\u201314, 2018, Proceedings, Part VII, pp. 3\u201319. Springer, Berlin, Heidelberg (2018). 
https:\/\/doi.org\/10.1007\/978-3-030-01234-2_1","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"1980_CR30","doi-asserted-by":"publisher","unstructured":"Lugaresi, C., Tang, J., Nash, H., McClanahan, C., Uboweja, E., Hays, M., Zhang, F., Chang, C.-L., Yong, M.G., Lee, J., Chang, W.-T., Hua, W., Georg, M., Grundmann, M.: MediaPipe: A framework for building perception pipelines (2019). https:\/\/doi.org\/10.48550\/arXiv.1906.08172","DOI":"10.48550\/arXiv.1906.08172"},{"key":"1980_CR31","doi-asserted-by":"publisher","unstructured":"Larkin, K.G.: Structural Similarity Index SSIMplified: Is there really a simpler concept at the heart of image quality measurement? (2015). https:\/\/doi.org\/10.48550\/arXiv.1503.06680","DOI":"10.48550\/arXiv.1503.06680"},{"key":"1980_CR32","doi-asserted-by":"publisher","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: Proceedings of the 31st International Conference on Neural Information Processing Systems. NIPS\u201917, pp. 6629\u20136640. Curran Associates Inc., Red Hook (2017). https:\/\/doi.org\/10.5555\/3295222.3295408","DOI":"10.5555\/3295222.3295408"},{"key":"1980_CR33","doi-asserted-by":"publisher","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2818\u20132826 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.308","DOI":"10.1109\/CVPR.2016.308"},{"key":"1980_CR34","doi-asserted-by":"publisher","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018). 
https:\/\/doi.org\/10.1109\/CVPR.2018.00068","DOI":"10.1109\/CVPR.2018.00068"},{"key":"1980_CR35","doi-asserted-by":"publisher","unstructured":"Iandola, F.N., Han, S., Moskewicz, M.W., Ashraf, K., Dally, W.J., Keutzer, K.: SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and $$<$$0.5MB model size (2016). https:\/\/doi.org\/10.48550\/arXiv.1602.07360","DOI":"10.48550\/arXiv.1602.07360"},{"key":"1980_CR36","doi-asserted-by":"publisher","unstructured":"Chen, L., Li, Z., Maddox, R.K., Duan, Z., Xu, C.: Lip movements generation at a glance. In: Computer Vision \u2013 ECCV 2018: 15th European Conference, Munich, Germany, September 8\u201314, 2018, Proceedings, Part VII, pp. 538\u2013553. Springer, Berlin, Heidelberg (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_32","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"1980_CR37","doi-asserted-by":"publisher","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: Additive angular margin loss for deep face recognition. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4685\u20134694 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00482","DOI":"10.1109\/CVPR.2019.00482"},{"key":"1980_CR38","doi-asserted-by":"publisher","unstructured":"Sinha, S., Biswas, S., Yadav, R., Bhowmick, B.: Emotion-controllable generalized talking face generation. In: Raedt, L.D. (ed.) Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, IJCAI-22, pp. 1320\u20131327 (2022). https:\/\/doi.org\/10.24963\/ijcai.2022\/184","DOI":"10.24963\/ijcai.2022\/184"},{"issue":"5","key":"1980_CR39","doi-asserted-by":"publisher","first-page":"1398","DOI":"10.1007\/s11263-019-01251-8","volume":"128","author":"K Vougioukas","year":"2019","unstructured":"Vougioukas, K., Petridis, S., Pantic, M.: Realistic speech-driven facial animation with gans. Int. J. Comput. Vision 128(5), 1398\u20131413 (2019). https:\/\/doi.org\/10.1007\/s11263-019-01251-8","journal-title":"Int. J. 
Comput. Vision"},{"key":"1980_CR40","doi-asserted-by":"publisher","first-page":"3480","DOI":"10.1109\/TMM.2021.3099900","volume":"24","author":"SE Eskimez","year":"2021","unstructured":"Eskimez, S.E., Zhang, Y., Duan, Z.: Speech driven talking face generation from a single image and an emotion condition. IEEE Trans. Multimed. 24, 3480\u20133490 (2021). https:\/\/doi.org\/10.1109\/TMM.2021.3099900","journal-title":"IEEE Trans. Multimed."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01980-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01980-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01980-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T10:24:17Z","timestamp":1761387857000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01980-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,21]]},"references-count":40,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["1980"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01980-6","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,8,21]]},"assertion":[{"value":"18 March 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article 
History"}},{"value":"21 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to participate"}},{"value":"Not applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}],"article-number":"389"}}