{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T21:40:05Z","timestamp":1767994805605,"version":"3.49.0"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729393","type":"print"},{"value":"9783031729409","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72940-9_12","type":"book-chapter","created":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T20:41:35Z","timestamp":1731789695000},"page":"204-221","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["UniTalker: Scaling up\u00a0Audio-Driven 3D Facial Animation Through A Unified Model"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3446-524X","authenticated-orcid":false,"given":"Xiangyu","family":"Fan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0058-0266","authenticated-orcid":false,"given":"Jiaqi","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5971-1928","authenticated-orcid":false,"given":"Zhiqian","family":"Lin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3015-3609","authenticated-orcid":false,"given":"Weiye","family":"Xiao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0571-5924","authenticated-orcid":false,"given":"Lei","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,17]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Amberg, B., Romdhani, S., Vetter, T.: Optimal step nonrigid ICP algorithms for surface registration. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20138. IEEE (2007)","DOI":"10.1109\/CVPR.2007.383165"},{"key":"12_CR2","unstructured":"Anyi, R., et al.: Dynamic storyboard generation in an engine-based virtual environment for video production. arXiv preprint arXiv:2301.12688 (2023)"},{"key":"12_CR3","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: Wav2Vec2-Base-960h. https:\/\/huggingface.co\/facebook\/wav2vec2-base-960h"},{"key":"12_CR4","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"12_CR5","unstructured":"Bao, L., et al.: Learning audio-driven viseme dynamics for 3D face animation. arXiv preprint arXiv:2301.06059 (2023)"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Black, M.J., Patel, P., Tesch, J., Yang, J.: BEDLAM: a synthetic dataset of bodies exhibiting detailed lifelike animated motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8726\u20138737 (2023)","DOI":"10.1109\/CVPR52729.2023.00843"},{"key":"12_CR7","unstructured":"Cai, Z., et\u00a0al.: Digital life project: autonomous 3D characters with social intelligence. arXiv preprint arXiv:2312.04547 (2023)"},{"key":"12_CR8","unstructured":"Cai, Z., et al.: SMPler-x: scaling up expressive human pose and shape estimation. In: Oh, A., Neumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a036, pp. 11454\u201311468. Curran Associates, Inc. (2023)"},{"key":"12_CR9","unstructured":"Chen, S., et\u00a0al.: WavLM-Base. https:\/\/huggingface.co\/microsoft\/wavlm-base"},{"key":"12_CR10","unstructured":"Chen, S., et\u00a0al.: WavLM-Base-Plus. https:\/\/huggingface.co\/microsoft\/wavlm-base-plus"},{"issue":"6","key":"12_CR11","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"16","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Top. Sig. Process. 16(6), 1505\u20131518 (2022)","journal-title":"IEEE J. Sel. Top. Sig. Process."},{"key":"12_CR12","unstructured":"Conneau, A., Baevski, A., Collobert, R., Mohamed, A., Auli, M.: Wav2Vec2-XLSR-53. https:\/\/huggingface.co\/facebook\/wav2vec2-large-xlsr-53"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Conneau, A., Baevski, A., Collobert, R., Mohamed, A., Auli, M.: Unsupervised cross-lingual representation learning for speech recognition. arXiv preprint arXiv:2006.13979 (2020)","DOI":"10.21437\/Interspeech.2021-329"},{"key":"12_CR14","unstructured":"Contributors, X.: OpenXRLab synthetic data rendering toolbox (2023). https:\/\/github.com\/openxrlab\/xrfeitoria"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3D speaking styles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10101\u201310111 (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: FaceFormer: speech-driven 3D facial animation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18770\u201318780 (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"issue":"6","key":"12_CR17","doi-asserted-by":"publisher","first-page":"591","DOI":"10.1109\/TMM.2010.2052239","volume":"12","author":"G Fanelli","year":"2010","unstructured":"Fanelli, G., Gall, J., Romsdorfer, H., Weise, T., Van Gool, L.: A 3-D audio-visual corpus of affective communication. IEEE Trans. Multimedia 12(6), 591\u2013598 (2010)","journal-title":"IEEE Trans. Multimedia"},{"key":"12_CR18","unstructured":"Grosman, J.: Fine-tuned XLSR-53 large model for speech recognition in English (2021). https:\/\/huggingface.co\/jonatasgrosman\/wav2vec2-large-xlsr-53-english"},{"key":"12_CR19","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"12_CR20","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: facebook\/hubert-base-ls960. https:\/\/huggingface.co\/facebook\/hubert-base-ls960"},{"key":"12_CR21","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"12_CR22","doi-asserted-by":"crossref","unstructured":"Iwase, S., Kato, T., Yamaguchi, S., Yukitaka, T., Morishima, S.: Song2Face: synthesizing singing facial animation from audio. In: SIGGRAPH Asia 2020 Technical Communications, pp.\u00a01\u20134 (2020)","DOI":"10.1145\/3410700.3425435"},{"issue":"4","key":"12_CR23","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. (TOG) 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., Romero, J.: Learning a model of facial shape and expression from 4D scans. ACM Trans. Graph. 36(6), 1\u201317 (2017). 194","DOI":"10.1145\/3130800.3130813"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Lin, Z., Lin, J., Li, L., Yuan, Y., Zou, Z.: High-quality 3D face reconstruction with affine convolutional networks. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 2495\u20132503 (2022)","DOI":"10.1145\/3503161.3548421"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: EMAGE: towards unified holistic co-speech gesture generation via masked audio gesture modeling. arXiv preprint arXiv:2401.00374 (2023)","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"12_CR27","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"612","DOI":"10.1007\/978-3-031-20071-7_36","volume-title":"ECCV 2022","author":"H Liu","year":"2022","unstructured":"Liu, H., et al.: BEAT: a large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13667, pp. 612\u2013630. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_36"},{"issue":"5","key":"12_CR28","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in north American English. PLoS ONE 13(5), e0196391 (2018)","journal-title":"PLoS ONE"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Martyniuk, T., Kupyn, O., Kurlyak, Y., Krashenyi, I., Matas, J., Sharmanska, V.: DAD-3Dheads: a large-scale dense, accurate and diverse dataset for 3D head alignment from a single image. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.02027"},{"key":"12_CR30","unstructured":"Pan, D., et\u00a0al.: RenderMe-360: a large digital asset library and benchmarks towards high-fidelity head avatars. In: Thirty-Seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2023)"},{"key":"12_CR31","doi-asserted-by":"crossref","unstructured":"Peng, Z., et al.: SelfTalk: a self-supervised commutative training diagram to comprehend 3D talking faces. arXiv preprint arXiv:2306.10799 (2023)","DOI":"10.1145\/3581783.3611734"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Peng, Z., et al.: EmoTalk: speech-driven emotional disentanglement for 3D face animation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20687\u201320697 (2023)","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Qing, Z., Cai, Z., Yang, Z., Yang, L.: Story-to-motion: synthesizing infinite and controllable character animation from long text. In: SIGGRAPH Asia 2023 Technical Communications, pp.\u00a01\u20134 (2023)","DOI":"10.1145\/3610543.3626176"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Qiu, H., et al.: ReliTalk: relightable talking portrait generation from a single video. Int. J. Comput. Vis., 1\u201316 (2024)","DOI":"10.1007\/s11263-024-02007-9"},{"key":"12_CR35","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1007\/s11263-007-0075-7","volume":"77","author":"DA Ross","year":"2008","unstructured":"Ross, D.A., Lim, J., Lin, R.S., Yang, M.H.: Incremental learning for robust visual tracking. Int. J. Comput. Vis. 77, 125\u2013141 (2008)","journal-title":"Int. J. Comput. Vis."},{"key":"12_CR36","unstructured":"Rossler, A., et al.: Learning to detect manipulated facial images. arxiv 2019. arXiv preprint arXiv:1901.08971"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Shimba, T., Sakurai, R., Yamazoe, H., Lee, J.H.: Talking heads synthesis from audio with deep neural networks. In: 2015 IEEE\/SICE International Symposium on System Integration (SII), pp. 100\u2013105. IEEE (2015)","DOI":"10.1109\/SII.2015.7404961"},{"key":"12_CR38","unstructured":"Siyao, L., et al.: Duolando: follower GPT with off-policy reinforcement learning for dance accompaniment. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"12_CR39","doi-asserted-by":"crossref","unstructured":"Stan, S., Haque, K.I., Yumak, Z.: FaceDiffuser: speech-driven 3D facial animation synthesis using diffusion. In: Proceedings of the 16th ACM SIGGRAPH Conference on Motion, Interaction and Games, pp. 1\u201311 (2023)","DOI":"10.1145\/3623264.3624447"},{"issue":"4","key":"12_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. (ToG) 36(4), 1\u201313 (2017)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Tian, L., Wang, Q., Zhang, B., Bo, L.: EMO: emote portrait alive-generating expressive portrait videos with Audio2Video diffusion model under weak conditions. arXiv preprint arXiv:2402.17485 (2024)","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"12_CR42","unstructured":"Wang, L., Han, W., Soong, F.K., Huo, Q.: Text driven 3D photo-realistic talking head. In: Twelfth Annual Conference of the International Speech Communication Association (2011)"},{"key":"12_CR43","unstructured":"Wu, H., Jia, J., Xing, J., Xu, H., Wang, X., Wang, J.: MMFace4D: a large-scale multi-modal 4D face dataset for audio-driven 3D face animation. arXiv preprint arXiv:2303.09797 (2023)"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Wu, H., Zhou, S., Jia, J., Xing, J., Wen, Q., Wen, X.: Speech-driven 3D face animation with composite and regional facial movements. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 6822\u20136830 (2023)","DOI":"10.1145\/3581783.3611775"},{"key":"12_CR45","unstructured":"Wuu, C.H., et\u00a0al.: Multiface: a dataset for neural face rendering. arXiv preprint arXiv:2207.11243 (2022)"},{"key":"12_CR46","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: CodeTalker: speech-driven 3D facial animation with discrete motion prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12780\u201312790 (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"12_CR47","unstructured":"Xu, S., et al.: VASA-1: lifelike audio-driven talking faces generated in real time. arXiv preprint arXiv:2404.10667 (2024)"},{"key":"12_CR48","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: SynBody: synthetic dataset with layered human models for 3D human perception and modeling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 20282\u201320292, October 2023","DOI":"10.1109\/ICCV51070.2023.01855"},{"key":"12_CR49","doi-asserted-by":"crossref","unstructured":"Yi, H., et al.: Generating holistic 3D human motion from speech. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 469\u2013480 (2023)","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"12_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, M., et\u00a0al.: Large motion model for unified multi-modal motion generation. arXiv preprint arXiv:2404.01284 (2024)","DOI":"10.1007\/978-3-031-72624-8_23"},{"key":"12_CR51","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"12_CR52","doi-asserted-by":"crossref","unstructured":"Zhao, Q., et al.: Media2Face: co-speech facial animation generation with multi-modality guidance. arXiv preprint arXiv:2401.15687 (2024)","DOI":"10.1145\/3641519.3657413"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72940-9_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T21:33:38Z","timestamp":1731792818000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72940-9_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,17]]},"ISBN":["9783031729393","9783031729409"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72940-9_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,17]]},"assertion":[{"value":"17 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}