{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T22:48:30Z","timestamp":1777675710460,"version":"3.51.4"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726545","type":"print"},{"value":"9783031726552","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72655-2_24","type":"book-chapter","created":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T10:12:12Z","timestamp":1733393532000},"page":"417-435","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Audio-Driven Talking Face Generation with\u00a0Stabilized Synchronization Loss"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5047-295X","authenticated-orcid":false,"given":"Dogucan","family":"Yaman","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5754-5405","authenticated-orcid":false,"given":"Fevziye Irem","family":"Eyiokur","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3092-8726","authenticated-orcid":false,"given":"Leonard","family":"B\u00e4rmann","sequence":"additional","affiliation":[]},{"given":"Haz\u0131m Kemal","family":"Ekenel","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Waibel","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,6]]},"reference":[{"issue":"12","key":"24_CR1","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"24_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/978-3-030-58523-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Afouras","year":"2020","unstructured":"Afouras, T., Owens, A., Chung, J.S., Zisserman, A.: Self-supervised learning of audio-visual objects from video. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12363, pp. 208\u2013224. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58523-5_13"},{"key":"24_CR3","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Blanz, V., Vetter, T.: A morphable model for the synthesis of 3d faces. In: Proceedings of the 26th Annual Conference on Computer Graphics and Interactive Techniques, pp. 187\u2013194 (1999)","DOI":"10.1145\/311535.311556"},{"issue":"2","key":"24_CR5","doi-asserted-by":"publisher","first-page":"233","DOI":"10.1007\/s11263-017-1009-7","volume":"126","author":"J Booth","year":"2018","unstructured":"Booth, J., Roussos, A., Ponniah, A., Dunaway, D., Zafeiriou, S.: Large scale 3d morphable models. Int. J. Comput. Vision 126(2), 233\u2013254 (2018)","journal-title":"Int. J. Comput. Vision"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Brand, M.: Voice puppetry. In: Proceedings of the 26th Annual Conference on Computer Graphics and Interactive Techniques, pp. 21\u201328 (1999)","DOI":"10.1145\/311535.311537"},{"key":"24_CR7","doi-asserted-by":"crossref","unstructured":"Bulat, A., Tzimiropoulos, G.: How far are we from solving the 2d & 3d face alignment problem? (and a dataset of 230,000 3d facial landmarks). In: International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.116"},{"key":"24_CR8","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Audio-visual synchronisation in the wild. arXiv preprint arXiv:2112.04432 (2021)"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"24_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, K., et al.: Videoretalking: audio-based lip synchronization for talking head video editing in the wild. In: SIGGRAPH Asia 2022 Conference Papers, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3550469.3555399"},{"key":"24_CR11","unstructured":"Chung, J.S., et al.: In defence of metric learning for speaker recognition. arXiv preprint arXiv:2003.11982 (2020)"},{"key":"24_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/978-3-319-54184-6_6","volume-title":"Computer Vision \u2013 ACCV 2016","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in the wild. In: Lai, S.-H., Lepetit, V., Nishino, K., Sato, Y. (eds.) ACCV 2016. LNCS, vol. 10112, pp. 87\u2013103. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54184-6_6"},{"key":"24_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"Chung, S.W., Chung, J.S., Kang, H.G.: Perfect match: improved cross-modal embeddings for audio-visual synchronisation. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3965\u20133969. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"24_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1007\/978-3-030-58577-8_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D Das","year":"2020","unstructured":"Das, D., Biswas, S., Sinha, S., Bhowmick, B.: Speech-driven facial animation using cascaded GANs for learning of motion and texture. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 408\u2013424. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_25"},{"key":"24_CR16","doi-asserted-by":"crossref","unstructured":"Eskimez, S.E., Maddox, R.K., Xu, C., Duan, Z.: End-to-end generation of talking faces from noisy speech. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1948\u20131952. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9054103"},{"key":"24_CR17","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. Adv. Neural Inf. Process. Syst. 27 (2014)"},{"key":"24_CR18","doi-asserted-by":"publisher","unstructured":"Gu, Y., et al.: Vqfr: blind face restoration with vector-quantized dictionary and parallel decoder. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part XVIII, pp. 126\u2013143. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19797-0_8","DOI":"10.1007\/978-3-031-19797-0_8"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Guan, J., et\u00a0al.: Stylesync: high-fidelity generalized and personalized lip sync in style-based generator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1505\u20131515 (2023)","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y.J., Bao, H., Zhang, J.: Ad-nerf: audio driven neural radiance fields for talking head synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5784\u20135794 (2021)","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"24_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"24_CR22","unstructured":"Hershey, J., Movellan, J.: Audio vision: Using audio-visual synchrony to locate sounds. Adv. Neural Inf. Process. Syst. 12 (1999)"},{"key":"24_CR23","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"24_CR24","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"24_CR25","unstructured":"Iashin, V., Xie, W., Rahtu, E., Zisserman, A.: Sparse in space and time: audio-visual synchronisation with trainable selectors. arXiv preprint arXiv:2210.07055 (2022)"},{"key":"24_CR26","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: International Conference on Machine Learning. pp. 448\u2013456. PMLR (2015)"},{"key":"24_CR27","doi-asserted-by":"publisher","first-page":"1767","DOI":"10.1007\/s11263-019-01150-y","volume":"127","author":"A Jamaludin","year":"2019","unstructured":"Jamaludin, A., Chung, J.S., Zisserman, A.: You said that?: synthesising talking faces from audio. Int. J. Comput. Vision 127, 1767\u20131779 (2019)","journal-title":"Int. J. Comput. Vision"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Ji, X., et al.: Eamm: one-shot emotional talking face via audio-based emotion-aware motion model. In: ACM SIGGRAPH 2022 Conference Proceedings, pp. 1\u201310 (2022)","DOI":"10.1145\/3528233.3530745"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Ji, X., et al.: Audio-driven emotional video portraits. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14080\u201314089 (2021)","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"24_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"694","DOI":"10.1007\/978-3-319-46475-6_43","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Johnson","year":"2016","unstructured":"Johnson, J., Alahi, A., Fei-Fei, L.: Perceptual losses for real-time style transfer and super-resolution. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 694\u2013711. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_43"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Kadandale, V.S., Montesinos, J.F., Haro, G.: Vocalist: an audio-visual synchronisation model for lips and voices. arXiv preprint arXiv:2204.02090 (2022)","DOI":"10.21437\/Interspeech.2022-10861"},{"key":"24_CR32","doi-asserted-by":"crossref","unstructured":"Kim, Y.J., Heo, H.S., Chung, S.W., Lee, B.J.: End-to-end lip synchronisation based on pattern classification. In: 2021 IEEE Spoken Language Technology Workshop (SLT), pp. 598\u2013605. IEEE (2021)","DOI":"10.1109\/SLT48900.2021.9383616"},{"key":"24_CR33","unstructured":"KR, P., Mukhopadhyay, R., Philip, J., Jha, A., Namboodiri, V., Jawahar, C.: Towards automatic face-to-face translation. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 1428\u20131436 (2019)"},{"key":"24_CR34","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural Inf. Process. Syst. 25 (2012)"},{"key":"24_CR35","doi-asserted-by":"crossref","unstructured":"Liang, B., et al.: Expressive talking head generation with granular audio-visual control. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3387\u20133396 (2022)","DOI":"10.1109\/CVPR52688.2022.00338"},{"key":"24_CR36","doi-asserted-by":"publisher","unstructured":"Liu, X., Xu, Y., Wu, Q., Zhou, H., Wu, W., Zhou, B.: Semantic-aware implicit neural audio-driven video portrait generation. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part XXXVII, pp. 106\u2013125. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_7","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"24_CR37","unstructured":"Miyato, T., Kataoka, T., Koyama, M., Yoshida, Y.: Spectral normalization for generative adversarial networks. arXiv preprint arXiv:1802.05957 (2018)"},{"key":"24_CR38","doi-asserted-by":"crossref","unstructured":"Muaz, U., et al.: Sidgan: high-resolution dubbed video generation via shift-invariant learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7833\u20137842 (2023)","DOI":"10.1109\/ICCV51070.2023.00720"},{"key":"24_CR39","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted Boltzmann machines. In: Proceedings of the 27th International Conference on Machine Learning (ICML-10), pp. 807\u2013814 (2010)"},{"key":"24_CR40","doi-asserted-by":"crossref","unstructured":"Nayak, S., Schuler, C., Saha, D., Baumann, T.: A deep dive into neural synchrony evaluation for audio-visual translation. In: Proceedings of the 2022 International Conference on Multimodal Interaction, pp. 642\u2013647 (2022)","DOI":"10.1145\/3536221.3556621"},{"key":"24_CR41","unstructured":"Oord, A.V.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"24_CR42","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 631\u2013648 (2018)","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"24_CR43","doi-asserted-by":"crossref","unstructured":"Papantoniou, F.P., Filntisis, P.P., Maragos, P., Roussos, A.: Neural emotion director: speech-preserving semantic control of facial expressions in \u201cin-the-wild\" videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18781\u201318790 (2022)","DOI":"10.1109\/CVPR52688.2022.01822"},{"key":"24_CR44","doi-asserted-by":"crossref","unstructured":"Park, S.J., Kim, M., Hong, J., Choi, J., Ro, Y.M.: Synctalkface: talking face generation with precise lip-syncing via audio-lip memory. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 2062\u20132070 (2022)","DOI":"10.1609\/aaai.v36i2.20102"},{"key":"24_CR45","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492 (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"24_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"24_CR47","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: Facenet: a unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 815\u2013823 (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"24_CR48","doi-asserted-by":"publisher","unstructured":"Shen, S., Li, W., Zhu, Z., Duan, Y., Zhou, J., Lu, J.: Learning dynamic facial radiance fields for few-shot talking head synthesis. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part XII, pp. 666\u2013682. Springer, Heidelberg (2022). DOI: https:\/\/doi.org\/10.1007\/978-3-031-19775-8_39","DOI":"10.1007\/978-3-031-19775-8_39"},{"key":"24_CR49","doi-asserted-by":"crossref","unstructured":"Shen, S., et al.: Difftalk: crafting diffusion models for generalized audio-driven portraits animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1982\u20131991 (2023)","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"24_CR50","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"24_CR51","unstructured":"Slaney, M., Covell, M.: Facesync: a linear operator for measuring synchronization of video facial images and audio tracks. Adv. Neural Inf. Process. Syst. 13 (2000)"},{"key":"24_CR52","doi-asserted-by":"publisher","first-page":"585","DOI":"10.1109\/TIFS.2022.3146783","volume":"17","author":"L Song","year":"2022","unstructured":"Song, L., Wu, W., Qian, C., He, R., Loy, C.C.: Everybody\u2019s talkin\u2019: let me talk as you want. IEEE Trans. Inf. Forensics Secur. 17, 585\u2013598 (2022)","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"24_CR53","doi-asserted-by":"crossref","unstructured":"Song, Y., Zhu, J., Li, D., Wang, X., Qi, H.: Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786 (2018)","DOI":"10.24963\/ijcai.2019\/129"},{"key":"24_CR54","doi-asserted-by":"crossref","unstructured":"Stypu\u0142kowski, M., Vougioukas, K., He, S., Zi\u0229ba, M., Petridis, S., Pantic, M.: Diffused heads: diffusion models beat gans on talking-face generation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5091\u20135100 (2024)","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"24_CR55","doi-asserted-by":"crossref","unstructured":"Sun, Y., et al.: Masked lip-sync prediction by audio-visual contextual exploitation in transformers. In: SIGGRAPH Asia 2022 Conference Papers, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3550469.3555393"},{"issue":"4","key":"24_CR56","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing obama: learning lip sync from audio. ACM Trans. Graph. (ToG) 36(4), 1\u201313 (2017)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"24_CR57","unstructured":"Tang, J., et al.: Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368 (2022)"},{"key":"24_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"716","DOI":"10.1007\/978-3-030-58517-4_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Thies","year":"2020","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12361, pp. 716\u2013731. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_42"},{"key":"24_CR59","doi-asserted-by":"publisher","first-page":"1398","DOI":"10.1007\/s11263-019-01251-8","volume":"128","author":"K Vougioukas","year":"2020","unstructured":"Vougioukas, K., Petridis, S., Pantic, M.: Realistic speech-driven facial animation with gans. Int. J. Comput. Vision 128, 1398\u20131413 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"24_CR60","doi-asserted-by":"crossref","unstructured":"Waibel, A., et al.: Face-dubbing++: lip-synchronous, voice preserving translation of videos. In: 2023 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSPW59220.2023.10193719"},{"key":"24_CR61","doi-asserted-by":"crossref","unstructured":"Wang, D., Deng, Y., Yin, Z., Shum, H.Y., Wang, B.: Progressive disentangled representation learning for fine-grained controllable talking head synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17979\u201317989 (2023)","DOI":"10.1109\/CVPR52729.2023.01724"},{"key":"24_CR62","doi-asserted-by":"crossref","unstructured":"Wang, J., Qian, X., Zhang, M., Tan, R.T., Li, H.: Seeing what you said: talking face generation guided by a lip reading expert. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14653\u201314662 (2023)","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"24_CR63","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Lipformer: high-fidelity and generalizable talking face generation with a pre-learned facial codebook. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13844\u201313853 (2023)","DOI":"10.1109\/CVPR52729.2023.01330"},{"issue":"4","key":"24_CR64","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004)","journal-title":"IEEE Trans. Image Process."},{"key":"24_CR65","doi-asserted-by":"crossref","unstructured":"Wu, H., Jia, J., Wang, H., Dou, Y., Duan, C., Deng, Q.: Imitating arbitrary talking style for realistic audio-driven talking face synthesis. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1478\u20131486 (2021)","DOI":"10.1145\/3474085.3475280"},{"key":"24_CR66","unstructured":"Yao, S., Zhong, R., Yan, Y., Zhai, G., Yang, X.: Dfa-nerf: personalized talking head generation via disentangled face attributes neural rendering. arXiv preprint arXiv:2201.00791 (2022)"},{"key":"24_CR67","unstructured":"Ye, Z., Jiang, Z., Ren, Y., Liu, J., He, J., Zhao, Z.: Geneface: generalized and high-fidelity audio-driven 3d talking face synthesis. arXiv preprint arXiv:2301.13430 (2023)"},{"issue":"1\u20132","key":"24_CR68","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1016\/S0167-6393(98)00048-X","volume":"26","author":"H Yehia","year":"1998","unstructured":"Yehia, H., Rubin, P., Vatikiotis-Bateson, E.: Quantitative association of vocal-tract and facial behavior. Speech Commun. 26(1\u20132), 23\u201343 (1998)","journal-title":"Speech Commun."},{"key":"24_CR69","doi-asserted-by":"publisher","unstructured":"Yin, F., et al.: Styleheat: one-shot high-resolution editable talking face generation via pre-trained stylegan. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, 23\u201327 October 2022, Proceedings, Part XVII, pp. 85\u2013101. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_6","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"24_CR70","unstructured":"Zhan, F., Yu, Y., Wu, R., Zhang, J., Lu, S.: Multimodal image synthesis and editing: a survey. arXiv preprint arXiv:2112.13592 (2021)"},{"key":"24_CR71","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Facial: synthesizing dynamic talking face with implicit attribute learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3867\u20133876 (2021)","DOI":"10.1109\/ICCV48922.2021.00384"},{"key":"24_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Hu, Z., Deng, W., Fan, C., Lv, T., Ding, Y.: Dinet: deformation inpainting network for realistic face visually dubbing on high resolution video. arXiv preprint arXiv:2303.03988 (2023)","DOI":"10.1609\/aaai.v37i3.25464"},{"key":"24_CR73","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"issue":"1","key":"24_CR74","doi-asserted-by":"publisher","first-page":"218","DOI":"10.3390\/electronics12010218","volume":"12","author":"R Zhen","year":"2023","unstructured":"Zhen, R., Song, W., He, Q., Cao, J., Shi, L., Luo, J.: Human-computer interaction system: a survey of talking-head generation. Electronics 12(1), 218 (2023)","journal-title":"Electronics"},{"key":"24_CR75","doi-asserted-by":"crossref","unstructured":"Zhong, W., et al.: Identity-preserving talking face generation with landmark and appearance priors. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2023)","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"24_CR76","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Y., Liu, Z., Luo, P., Wang, X.: Talking face generation by adversarially disentangled audio-visual representation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9299\u20139306 (2019)","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"24_CR77","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., Liu, Z.: Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4176\u20134186 (2021)","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"24_CR78","doi-asserted-by":"crossref","unstructured":"Zhou, M., Bai, Y., Zhang, W., Yao, T., Zhao, T., Mei, T.: Responsive listening head generation: a benchmark dataset and baseline (2022)","DOI":"10.1007\/978-3-031-19839-7_8"},{"issue":"6","key":"24_CR79","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: Makelttalk: speaker-aware talking-head animation. ACM Trans. Graph. (TOG) 39(6), 1\u201315 (2020)","journal-title":"ACM Trans. Graph. (TOG)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72655-2_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T11:34:32Z","timestamp":1733398472000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72655-2_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"ISBN":["9783031726545","9783031726552"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72655-2_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"6 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"We believe that generating lip-synchronized faces holds significant benefits across a broad spectrum of applications. However, we acknowledge its vulnerability to potential misuse, particularly deepfake generation. We will utilize Watermarking and prevent uncontrolled usage of our model.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics & Social Impact"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}