{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:01:52Z","timestamp":1761894112477,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":31,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819527243","type":"print"},{"value":"9789819527250","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-2725-0_6","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:19:28Z","timestamp":1761887968000},"page":"76-88","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["HFSD-V2C: Zero-Shot Visual Voice Cloning Via Hierarchical Face-Styled Diffusion Model"],"prefix":"10.1007","author":[{"given":"Yaping","family":"Liu","sequence":"first","affiliation":[]},{"given":"Linqin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shengxiang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Zhengtao","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Ling","family":"Dong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"Chen, Q., Tan, M., Qi, Y., Zhou, J., Li, Y., Wu, Q.: V2c: visual voice cloning. In: Proceedings of CVPR Conference, pp. 21242\u201321251 (2022)","DOI":"10.1109\/CVPR52688.2022.02056"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Hegde, S.B., Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: Lip-to-speech synthesis for arbitrary speakers in the wild. In: Proceedings of ACM MM Conference, pp. 6250\u20136258 (2022)","DOI":"10.1145\/3503161.3548081"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Lu, J., Sisman, B., Liu, R., Zhang, M., Li, H.: Visualtts: Tts with accurate lip-speech synchronization for automatic voice over. In: Proceedings of International Conference on Acoustics, Speech and Signal Processing Conference, pp. 8032\u20138036 (2022)","DOI":"10.1109\/ICASSP43922.2022.9746421"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Wang, Y., Zhao, Z.: Fastlts: Non-autoregressive end-to-end unconstrained lip-to-speech synthesis. In: Proceedings of ACM MM Conference, pp. 5678\u20135687 (2022)","DOI":"10.1145\/3503161.3548194"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Hassid, M., Ramanovich, M.T., Shillingford, B., Wang, M., Jia, Y., Remez, T.: More than words: in-the-wild visually-driven prosody for text-to-speech. In: Proceedings of Computer Vision and Pattern Recognition Conference, pp. 10587\u201310597 (2022)","DOI":"10.1109\/CVPR52688.2022.01033"},{"key":"6_CR6","first-page":"16582","volume":"34","author":"C Hu","year":"2021","unstructured":"Hu, C., Tian, Q., Li, T., Yuping, W., Wang, Y., Zhao, H.: Neural dubber: dubbing for videos according to scripts. Adv. Neural. Inf. Process. Syst. 34, 16582\u201316595 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Lee, J., Chung, J.S., Chung, S.W.: Imaginary voice: face-styled diffusion model for text-to-speech. In: Proceedings of International Conference on Acoustics, Speech and Signal Processing Conference, pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10094745"},{"key":"6_CR8","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: Proceedings of International conference on machine learning, pp. 2256\u20132265 (2015)"},{"key":"6_CR9","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"6_CR10","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Huang, R., Ren, Y., Jiang, Z., Cui, C., Liu, J., Zhao, Z.: Fastdiff 2: revisiting and incorporating GANS and diffusion models in high-fidelity speech synthesis. In: Proceedings of Association for Computational Linguistics Conference, pp. 6994\u20137009 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.437"},{"issue":"6","key":"6_CR12","doi-asserted-by":"publisher","first-page":"4234","DOI":"10.1109\/TPAMI.2024.3356232","volume":"46","author":"X Tan","year":"2024","unstructured":"Tan, X., et al.: Naturalspeech: end-to-end text-to-speech synthesis with human-level quality. IEEE Trans. Pattern Anal. Mach. Intell. 46(6), 4234\u20134245 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"Wang, Y., et\u00a0al.: Tacotron: towards end-to-end speech synthesis. arXiv preprint arXiv:1703.10135 (2017)","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"6_CR14","unstructured":"Ren, Y., et al.: Fastspeech 2: fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)"},{"key":"6_CR15","unstructured":"Kim, J., Kong, J., Son, J.: Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In: Proceedings of International Conference on Machine Learning, pp. 5530\u20135540 (2021)"},{"key":"6_CR16","unstructured":"Casanova, E., Weber, J., Shulby, C.D., Junior, A.C., G\u00f6lge, E., Ponti, M.A.: Yourtts: towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone. In: Proceedings of International Conference on Machine Learning, pp. 2709\u20132720 (2022)"},{"key":"6_CR17","unstructured":"Popov, V., Vovk, I., Gogoryan, V., Sadekova, T., Kudinov, M.: Grad-TTS: a diffusion probabilistic model for text-to-speech. In: Proceedings of International Conference on Machine Learning, pp. 8599\u20138608 (2021)"},{"key":"6_CR18","doi-asserted-by":"crossref","unstructured":"Cong, G., Li, L., Qi, Y., Zha, Z.J., Wu, Q., Wang, W., Jiang, B., Yang, M.H., Huang, Q.: Learning to dub movies via hierarchical prosody models. In: Proceedings of CVPR Conference. pp. 14687\u201314697 (2023)","DOI":"10.1109\/CVPR52729.2023.01411"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Goto, S., Onishi, K., Saito, Y., Tachibana, K., Mori, K.: Face2speech: towards multi-speaker text-to-speech synthesis using an embedding vector predicted from a face image. In: Proceedings of INTERSPEECH Conference, pp. 1321\u20131325 (2020)","DOI":"10.21437\/Interspeech.2020-2136"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Wang, J., Wang, Z., Hu, X., Li, X., Fang, Q., Liu, L.: Residual-guided personalized speech synthesis based on face image. In: Proceedings of INTERSPEECH Conference, pp. 4743\u20134747 (2022)","DOI":"10.1109\/ICASSP43922.2022.9746808"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Learnable pins: cross-modal embeddings for person identity. In: Proceedings of the European conference on computer vision, pp. 71\u201388 (2018)","DOI":"10.1007\/978-3-030-01261-8_5"},{"key":"6_CR22","unstructured":"Vaswani, A.: Attention is all you need. Adv. Neural Inf. Proc. Syst. (2017)"},{"issue":"1","key":"6_CR23","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1038\/s42256-020-00280-0","volume":"3","author":"A Toisoul","year":"2021","unstructured":"Toisoul, A., Kossaifi, J., Bulat, A., Tzimiropoulos, G., Pantic, M.: Estimation of continuous valence and arousal levels from faces in naturalistic conditions. Nature Mach. Intell. 3(1), 42\u201350 (2021)","journal-title":"Nature Mach. Intell."},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"McAuliffe, M., Socolof, M., Mihuc, S., Wagner, M., Sonderegger, M.: Montreal forced aligner: trainable text-speech alignment using kaldi. In: Interspeech. vol.\u00a02017, pp. 498\u2013502 (2017)","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Yu, J., et al.: Audio-visual recognition of overlapped speech for the lrs2 dataset. In: Proceedings of International Conference on Acoustics, Speech and Signal Processing, pp. 6984\u20136988 (2020)","DOI":"10.1109\/ICASSP40776.2020.9054127"},{"key":"6_CR26","doi-asserted-by":"crossref","unstructured":"Zhang, S., Zhu, X., Lei, Z., Shi, H., Wang, X., Li, S.Z.: S3fd: Single shot scale-invariant face detector. In: Proceedings of International Conference On Computer Vision, pp. 192\u2013201 (2017)","DOI":"10.1109\/ICCV.2017.30"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Kubichek, R.: Mel-cepstral distance measure for objective speech quality assessment. In: Proceedings of IEEE pacific rim conference on communications computers and signal processing, pp. 125\u2013128 (1993)","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"6_CR28","unstructured":"Kinga, D., et\u00a0al.: A method for stochastic optimization. In: Proceedings of International conference on learning representations. vol.\u00a05, p.\u00a06 (2015)"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Cao, Q., Shen, L., Xie, W., Parkhi, O.M., Zisserman, A.: Vggface2: a dataset for recognising faces across pose and age. In: Proceedings of FG Conference, pp. 67\u201374 (2018)","DOI":"10.1109\/FG.2018.00020"},{"key":"6_CR30","first-page":"17022","volume":"33","author":"J Kong","year":"2020","unstructured":"Kong, J., Kim, J., Bae, J.: Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Adv. Neural. Inf. Process. Syst. 33, 17022\u201317033 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"6_CR31","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-sne. J. Mach. Learn. Res. 9(11) (2008)"}],"container-title":["Lecture Notes in Computer Science","Chinese Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-2725-0_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:19:38Z","timestamp":1761887978000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-2725-0_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9789819527243","9789819527250"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-2725-0_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China National Conference on Chinese Computational Linguistics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Jinan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cncl2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/link.springer.com\/conference\/cncl","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}