{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T15:37:38Z","timestamp":1743089858803,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620609"},{"type":"electronic","value":"9789819620616"}],"license":[{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2061-6_23","type":"book-chapter","created":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T05:46:33Z","timestamp":1735537593000},"page":"310-323","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MambaTalk: Speech-Driven 3D Facial Animation with\u00a0Mamba"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2386-4533","authenticated-orcid":false,"given":"Deli","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Zhao","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6597-2968","authenticated-orcid":false,"given":"Yunong","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,31]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"He, S., et al.: Speech4Mesh: speech-assisted monocular 3D facial reconstruction for speech-driven 3D facial animation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE, October 2023","DOI":"10.1109\/ICCV51070.2023.01305"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Edwards, P., et al.: JALI-driven expressive facial animation and multilingual speech in cyberpunk 2077. In: ACM SIGGRAPH 2020 Talks. ACM, August 2020","DOI":"10.1145\/3388767.3407339"},{"issue":"4","key":"23_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073699","volume":"36","author":"S Taylor","year":"2017","unstructured":"Taylor, S., et al.: A deep learning approach for generalized speech animation. ACM Trans. Graph. 36(4), 1\u201311 (2017)","journal-title":"ACM Trans. Graph."},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Yang, K.D., Ranjan, A., Chang, J.H.R., Vemulapalli, R., Tuzel, O.: Probabilistic speech-driven 3D facial motion synthesis: new benchmarks methods and applications. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 27294\u201327303 (2024)","DOI":"10.1109\/CVPR52733.2024.02577"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Wu, H., Zhou, S., Jia, J., Xing, J., Wen, Q., Wen, X.: Speech-driven 3D face animation with composite and regional facial movements. In: Proceedings of the 31st ACM International Conference on Multimedia. 
ACM (2023)","DOI":"10.1145\/3581783.3611775"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: VoxCeleb: a large-scale speaker identification dataset. In: Interspeech 2017. ISCA (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"issue":"12","key":"23_CR7","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR8","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Prajwal, K.R., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia. ACM (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Zhang, W., et al.: SadTalker: learning realistic 3D motion coefficients for stylized audio-driven single image talking face animation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"Cheng, K., et al.: VideoReTalking: audio-based lip synchronization for talking head video editing in the wild. In: SIGGRAPH Asia 2022 Conference Papers. ACM (2022)","DOI":"10.1145\/3550469.3555399"},{"key":"23_CR12","doi-asserted-by":"publisher","first-page":"3480","DOI":"10.1109\/TMM.2021.3099900","volume":"24","author":"SE Eskimez","year":"2022","unstructured":"Eskimez, S.E., Zhang, Y., Duan, Z.: Speech driven talking face generation from a single image and an emotion condition. IEEE Trans. Multimedia 24, 3480\u20133490 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"23_CR13","doi-asserted-by":"publisher","first-page":"2033","DOI":"10.1109\/TMM.2022.3142387","volume":"25","author":"Z Ye","year":"2023","unstructured":"Ye, Z., et al.: Audio-driven talking face video generation with dynamic convolution kernels. IEEE Trans. Multimedia 25, 2033\u20132046 (2023)","journal-title":"IEEE Trans. Multimedia"},{"key":"23_CR14","unstructured":"Taylor, S.L., Mahler, M., Theobald, B.J., Matthews, I.: Dynamic units of visual speech. In: Proceedings of the 11th ACM SIGGRAPH\/Eurographics Conference on Computer Animation, pp. 275\u2013284 (2012)"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Xu, Y., Feng, A.W., Marsella, S., Shapiro, A.: A practical and configurable lip sync method for games. In: Proceedings of Motion on Games, pp. 131\u2013140. Association for Computing Machinery (2013)","DOI":"10.1145\/2522628.2522904"},{"issue":"4","key":"23_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2897824.2925984","volume":"35","author":"P Edwards","year":"2016","unstructured":"Edwards, P., Landreth, C., Fiume, E., Singh, K.: JALI: an animator-centric viseme model for expressive lip synchronization. ACM Trans. Graph. (TOG) 35(4), 1\u201311 (2016)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR17","unstructured":"Bao, L., et al.: Learning audio-driven viseme dynamics for 3D face animation. 
arXiv preprint arXiv:2301.06059 (2023)"},{"issue":"4","key":"23_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"23_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201292","volume":"37","author":"Y Zhou","year":"2018","unstructured":"Zhou, Y., Xu, Z., Landreth, C., Kalogerakis, E., Maji, S., Singh, K.: VisemeNet: audio-driven animator-centric speech animation. ACM Trans. Graph. 37(4), 1\u201310 (2018)","journal-title":"ACM Trans. Graph."},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3D speaking styles. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"23_CR21","unstructured":"Hannun, A., et\u00a0al.: Deep speech: scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567 (2014)"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: FaceFormer: speech-driven 3D facial animation with transformers. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"23_CR23","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Richard, A., Zollhofer, M., Wen, Y., de\u00a0la Torre, F., Sheikh, Y.: MeshTalk: 3D face animation from speech using cross-modality disentanglement. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00121"},{"issue":"6","key":"23_CR25","doi-asserted-by":"publisher","first-page":"591","DOI":"10.1109\/TMM.2010.2052239","volume":"12","author":"G Fanelli","year":"2010","unstructured":"Fanelli, G., Gall, J., Romsdorfer, H., Weise, T., Van Gool, L.: A 3-D audio-visual corpus of affective communication. IEEE Trans. Multimedia 12(6), 591\u2013598 (2010)","journal-title":"IEEE Trans. Multimedia"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: CodeTalker: speech-driven 3D facial animation with discrete motion prior. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Peng, Z., et al.: SelfTalk: a self-supervised commutative training diagram to comprehend 3D talking faces. In: Proceedings of the 31st ACM International Conference on Multimedia. ACM (2023)","DOI":"10.1145\/3581783.3611734"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Stan, S., Haque, K.I., Yumak, Z.: FaceDiffuser: speech-driven 3D facial animation synthesis using diffusion. In: ACM SIGGRAPH Conference on Motion, Interaction and Games. ACM (2023)","DOI":"10.1145\/3623264.3624447"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Peng, Z., et al.: EmoTalk: speech-driven emotional disentanglement for 3D face animation. 
In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"23_CR30","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"issue":"2","key":"23_CR31","first-page":"1770","volume":"38","author":"H Fu","year":"2024","unstructured":"Fu, H., et al.: Mimic: speaking style disentanglement for speech-driven 3D facial animation. Proc. AAAI Conf. Artif. Intell. 38(2), 1770\u20131777 (2024)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"issue":"6","key":"23_CR32","first-page":"1","volume":"36","author":"T Li","year":"2017","unstructured":"Li, T., Bolkart, T., Black, M.J., Li, H., Romero, J.: Learning a model of facial shape and expression from 4D scans. ACM Trans. Graph. 36(6), 1\u201317 (2017)","journal-title":"ACM Trans. Graph."}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2061-6_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T06:06:29Z","timestamp":1735538789000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2061-6_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,31]]},"ISBN":["9789819620609","9789819620616"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2061-6_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,31]]},"assertion":[{"value":"31 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference 
URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}