{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T00:26:51Z","timestamp":1760056011335,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031723377"},{"type":"electronic","value":"9783031723384"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72338-4_24","type":"book-chapter","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T10:03:01Z","timestamp":1726480981000},"page":"349-360","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Make Audio Solely Drive Lip in\u00a0Talking Face Video Synthesis"],"prefix":"10.1007","author":[{"given":"Xing","family":"Bai","sequence":"first","affiliation":[]},{"given":"Jun","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Pengyuan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ruipeng","family":"Hao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,17]]},"reference":[{"key":"24_CR1","doi-asserted-by":"crossref","unstructured":"Cao, C., Weng, Y., Zhou, S., Tong, Y., Zhou, K.: FaceWarehouse: a 3D facial expression database for visual computing. IEEE Trans. Visual. Comput. Graph. 20(3), 413\u2013425 (2013) publisher: IEEE","DOI":"10.1109\/TVCG.2013.249"},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Cao, Y., Tien, W.C., Faloutsos, P., Pighin, F.: Expressive speech-driven facial animation. ACM Trans. Graph. (TOG) 24(4), 1283\u20131302 (2005) publisher: ACM New York, NY, USA","DOI":"10.1145\/1095878.1095881"},{"key":"24_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-030-58545-7_3","volume-title":"Computer Vision \u2013 ECCV 2020","author":"L Chen","year":"2020","unstructured":"Chen, L., et al.: Talking-head generation with rhythmic head motion. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 35\u201351. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_3"},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, K., et al.: VideoReTalking: audio-based lip synchronization for talking head video editing in the wild (2022). arXiv: 2211.14758 [cs.CV]","DOI":"10.1145\/3550469.3555399"},{"key":"24_CR6","unstructured":"Chung, J.S., Jamaludin, A., Zisserman, A.: You said that? arXiv preprint arXiv:1705.02966 (2017)"},{"key":"24_CR7","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) Computer Vision \u2013 ACCV 2016 Workshops, pp. 251\u2013263. Springer International Publishing, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"24_CR8","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1007\/978-3-030-58577-8_25","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX","author":"D Das","year":"2020","unstructured":"Das, D., Biswas, S., Sinha, S., Bhowmick, B.: Speech-driven facial animation using cascaded GANs for learning of motion and texture. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX, pp. 408\u2013424. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_25"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Xu, S., Chen, D., Jia, Y., Tong, X.: Accurate 3D face reconstruction with weakly-supervised learning: From single image to image set. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp.\u00a00\u20130 (2019)","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"24_CR10","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. J. mach. learn. res. 12(7) (2011)"},{"key":"24_CR11","doi-asserted-by":"crossref","unstructured":"Edwards, P., Landreth, C., Fiume, E., Singh, K.: JALI: an animator-centric viseme model for expressive lip synchronization. ACM Trans. graph. (TOG) 35(4), 1\u201311 (2016) publisher: ACM New York, NY, USA","DOI":"10.1145\/2897824.2925984"},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Gafni, G., Thies, J., Zollhofer, M., Nie\u00dfner, M.: Dynamic neural radiance fields for monocular 4D facial avatar reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8649\u20138658 (2021)","DOI":"10.1109\/CVPR46437.2021.00854"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Guo, Y., Cai, J., Jiang, B., Zheng, J., et al: CNN-based real-time dense face reconstruction with inverse-rendered photo-realistic face images. IEEE Trans. Pattern Anal. Mach. Intell. 41(6), 1294\u20131307 (2018). publisher: IEEE","DOI":"10.1109\/TPAMI.2018.2837742"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y.J., Bao, H., Zhang, J.: AD-NeRF: audio driven neural radiance fields for talking head synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5784\u20135794 (2021)","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"24_CR15","doi-asserted-by":"crossref","unstructured":"He, S., et al.: Speech4Mesh: speech-assisted monocular 3D facial reconstruction for speech-driven 3D facial animation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14192\u201314202 (2023)","DOI":"10.1109\/ICCV51070.2023.01305"},{"key":"24_CR16","unstructured":"Kumar, R., Sotelo, J., Kumar, K., de\u00a0Br\u00e9bisson, A., Bengio, Y.: Obamanet: photo-realistic lip-sync from text. arXiv preprint arXiv:1801.01442 (2017)"},{"key":"24_CR17","unstructured":"Ling, J., et al.: StableFace: analyzing and improving motion stability for talking face generation. arXiv preprint arXiv:2208.13717 (2022)"},{"key":"24_CR18","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-031-19836-6_7","volume-title":"Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXVII","author":"X Liu","year":"2022","unstructured":"Liu, X., Xu, Y., Wu, Q., Zhou, H., Wu, W., Zhou, B.: Semantic-aware implicit neural audio-driven video portrait generation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXVII, pp. 106\u2013125. Springer Nature Switzerland, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_7"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Lu, Y., Chai, J., Cao, X.: Live speech portraits: real-time photorealistic talking-head animation. ACM Trans. Graph. (TOG) 40(6), 1\u201317 (2021) publisher: ACM New York, NY, USA","DOI":"10.1145\/3478513.3480484"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021) publisher: ACM New York, NY, USA","DOI":"10.1145\/3503250"},{"key":"24_CR21","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492 (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Richard, A., Zollh\u00f6fer, M., Wen, Y., De\u00a0la Torre, F., Sheikh, Y.: MeshTalk: 3D face animation from speech using cross-modality disentanglement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1173\u20131182 (2021)","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"24_CR23","doi-asserted-by":"crossref","unstructured":"Song, L., Wu, W., Qian, C., He, R., Loy, C.C.: Everybody\u2019s talkin\u2019: let me talk as you want. IEEE Trans. Inf. Forensics Secur. 17, 585\u2013598 (2022) publisher: IEEE","DOI":"10.1109\/TIFS.2022.3146783"},{"issue":"4","key":"24_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. 36(4), 1\u201313 (2017). https:\/\/doi.org\/10.1145\/3072959.3073640","journal-title":"ACM Trans. Graph."},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Thambiraja, B., Aliakbarian, S., Cosker, D., Thies, J.: 3DiFACE: diffusion-based speech-driven 3D facial animation and editing (2023). arXiv: 2312.00870 [cs.CV]","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"24_CR26","doi-asserted-by":"publisher","first-page":"716","DOI":"10.1007\/978-3-030-58517-4_42","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVI","author":"J Thies","year":"2020","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVI, pp. 716\u2013731. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_42"},{"key":"24_CR27","unstructured":"Thies, J., Zollh\u00f6fer, M., Stamminger, M., Theobalt, C., Nie\u00dfner, M.: Facevr: real-time facial reenactment and eye gaze control in virtual reality. arXiv preprint arXiv:1610.03151 (2016)"},{"key":"24_CR28","unstructured":"Wang, Q., Fan, Z., Xia, S.: 3D-TalkEmo: learning to synthesize 3D emotional talking head. arXiv preprint arXiv:2104.12051 (2021)"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Wang, X., Guo, Y., Deng, B., Zhang, J.: Lightweight photometric stereo for facial details recovery. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 740\u2013749 (2020)","DOI":"10.1109\/CVPR42600.2020.00082"},{"key":"24_CR30","doi-asserted-by":"crossref","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Proc. 13(4), 600\u2013612 (2004) publisher: IEEE","DOI":"10.1109\/TIP.2003.819861"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Williams, L.: Performance-driven facial animation. In: ACM SIGGRAPH 2006 courses, pp. 16\u2013es (2006)","DOI":"10.1145\/1185657.1185856"},{"key":"24_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: FACIAL: synthesizing dynamic talking face with implicit attribute learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 3867\u20133876 (2021)","DOI":"10.1109\/ICCV48922.2021.00384"},{"key":"24_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, W., et al.: SadTalker: learning realistic 3D motion coefficients for stylized audio-driven single image talking face animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8652\u20138661 (2023)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Zhong, W., et al.: Identity-preserving talking face generation with landmark and appearance priors. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9729\u20139738 (2023)","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"24_CR35","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Xu, Z., Landreth, C., Kalogerakis, E., Maji, S., Singh, K.: Visemenet: audio-driven animator-centric speech animation. ACM Trans. Graph. (TOG) 37(4), 1\u201310 (2018) publisher: ACM New York, NY, USA","DOI":"10.1145\/3197517.3201292"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72338-4_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T07:49:27Z","timestamp":1759996167000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72338-4_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031723377","9783031723384"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72338-4_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"17 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lugano","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"33","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}