{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T16:25:09Z","timestamp":1782318309928,"version":"3.54.5"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730092","type":"print"},{"value":"9783031730108","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73010-8_15","type":"book-chapter","created":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T13:11:36Z","timestamp":1731157896000},"page":"244-260","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":98,"title":["EMO: Emote Portrait Alive Generating Expressive Portrait Videos with\u00a0Audio2Video Diffusion Model Under Weak Conditions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1202-6040","authenticated-orcid":false,"given":"Linrui","family":"Tian","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qi","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liefeng","family":"Bo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,10]]},"reference":[{"key":"15_CR1","unstructured":"Bar-Tal, O., et al.: Lumiere: a space-time diffusion model for video generation (2024)"},{"issue":"4","key":"15_CR2","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao, H., Cooper, D.G., Keutmann, M.K., Gur, R.C., Nenkova, A., Verma, R.: Crema-d: crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"15_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Xu, S., Chen, D., Jia, Y., Tong, X.: Accurate 3d face reconstruction with weakly-supervised learning: from single image to image set. In: IEEE Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"15_CR5","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis (2021)"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: Faceformer: speech-driven 3d facial animation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Guan, J., et\u00a0al.: Stylesync: high-fidelity generalized and personalized lip sync in style-based generator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1505\u20131515 (2023)","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"15_CR8","unstructured":"Guo, Y., et al.: Animatediff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)"},{"key":"15_CR9","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"15_CR10","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"15_CR11","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models (2022)"},{"key":"15_CR12","unstructured":"Hu, L., Gao, X., Zhang, P., Sun, K., Zhang, B., Bo, L.: Animate anyone: consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:2311.17117 (2023)"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"15_CR14","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. In: 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings (2014)"},{"key":"15_CR15","unstructured":"Lin, C.H., et al.: Magic3d: high-resolution text-to-3d content creation"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Y., Lin, L., Yu, F., Zhou, C., Li, Y.: Moda: mapping-once audio-driven portrait animation with dual attentions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 23020\u201323029 (2023)","DOI":"10.1109\/ICCV51070.2023.02104"},{"key":"15_CR17","unstructured":"Lugaresi, C., et al.: Mediapipe: a framework for building perception pipelines (2019)"},{"key":"15_CR18","unstructured":"Ma, X., et al.: Latte: latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)"},{"key":"15_CR19","unstructured":"Ma, Y., Zhang, S., Wang, J., Wang, X., Zhang, Y., Deng, Z.: Dreamtalk: when expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767 (2023)"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Mukhopadhyay, S., Suri, S., Gadde, R.T., Shrivastava, A.: Diff2lip: audio conditioned diffusion models for lip-synchronization. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 5292\u20135302 (2024)","DOI":"10.1109\/WACV57701.2024.00521"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. arXiv preprint arXiv:2212.09748 (2022)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"15_CR22","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: Dreamfusion: text-to-3D using 2D diffusion. arXiv preprint arXiv:2209.14988 (2022)"},{"key":"15_CR23","doi-asserted-by":"publisher","unstructured":"Prajwal, K.R., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492. MM 2020, Association for Computing Machinery, New York, NY, USA (2020). https:\/\/doi.org\/10.1145\/3394171.3413532","DOI":"10.1145\/3394171.3413532"},{"key":"15_CR24","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"15_CR26","series-title":"LNCS","first-page":"234","volume-title":"MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, pp. 234\u2013241. Springer, Cham (2015)"},{"key":"15_CR27","doi-asserted-by":"publisher","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: unsupervised pre-training for speech recognition, pp. 3465\u20133469 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1873","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"15_CR28","unstructured":"Schuhmann, C., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models (2022)"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Shen, S., et al.: Difftalk: crafting diffusion models for generalized audio-driven portraits animation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Shi, Y., Xue, C., Pan, J., Zhang, W., Tan, V.Y., Bai, S.: Dragdiffusion: harnessing diffusion models for interactive point-based image editing. arXiv preprint arXiv:2306.14435 (2023)","DOI":"10.1109\/CVPR52733.2024.00844"},{"key":"15_CR31","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"15_CR32","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=St1giarCHLP"},{"key":"15_CR33","unstructured":"Stypu\u0142kowski, M., Vougioukas, K., He, S., Zieba, M., Petridis, S., Pantic, M.: Diffused heads: diffusion models beat GANs on talking-face generation (2023). https:\/\/arxiv.org\/abs\/2301.03396 (2023)"},{"key":"15_CR34","unstructured":"Sun, X., et al.: Vividtalk: one-shot audio-driven talking head generation based on 3D hybrid prior. arXiv preprint arXiv:2312.01841 (2023)"},{"key":"15_CR35","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: FVD: a new metric for video generation (2019)"},{"issue":"12","key":"15_CR36","doi-asserted-by":"publisher","first-page":"3457","DOI":"10.1109\/TVCG.2020.3023573","volume":"26","author":"X Wen","year":"2020","unstructured":"Wen, X., Wang, M., Richardt, C., Chen, Z.Y., Hu, S.M.: Photorealistic audio-driven video portraits. IEEE Trans. Visual Comput. Graph. 26(12), 3457\u20133466 (2020). https:\/\/doi.org\/10.1109\/TVCG.2020.3023573","journal-title":"IEEE Trans. Visual Comput. Graph."},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Xie, L., Wang, X., Zhang, H., Dong, C., Shan, Y.: VFHQ: a high-quality dataset and benchmark for video face super-resolution. In: The IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW) (2022)","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"15_CR38","unstructured":"Ye, Z., et\u00a0al.: Real3d-portrait: one-shot realistic 3d talking portrait synthesis. arXiv preprint arXiv:2401.08503 (2024)"},{"key":"15_CR39","unstructured":"Zhang, S., et al.: I2vgen-xl: high-quality image-to-video synthesis via cascaded diffusion models (2023)"},{"key":"15_CR40","doi-asserted-by":"publisher","unstructured":"Zhang, W., et al.: Sadtalker: learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8652\u20138661. IEEE Computer Society, Los Alamitos, CA, USA (2023).https:\/\/doi.org\/10.1109\/CVPR52729.2023.00836","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"15_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: Makeittalk: speaker-aware talking-head animation. ACM Trans. Graph. 39(6) (2020)","DOI":"10.1145\/3414685.3417774"},{"key":"15_CR43","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"650","DOI":"10.1007\/978-3-031-20071-7_38","volume-title":"ECCV 2022","author":"H Zhu","year":"2022","unstructured":"Zhu, H., et al.: CelebV-HQ: a large-scale video facial attributes dataset. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13667, pp. 650\u2013667. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_38"},{"key":"15_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, L., et al.: Tryondiffusion: a tale of two unets. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4606\u20134615 (2023)","DOI":"10.1109\/CVPR52729.2023.00447"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73010-8_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T14:04:17Z","timestamp":1731161057000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73010-8_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,10]]},"ISBN":["9783031730092","9783031730108"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73010-8_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,10]]},"assertion":[{"value":"10 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}