{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T01:32:17Z","timestamp":1772674337715,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2024,6,8]],"date-time":"2024-06-08T00:00:00Z","timestamp":1717804800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,6,8]],"date-time":"2024-06-08T00:00:00Z","timestamp":1717804800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFF0902303"],"award-info":[{"award-number":["2022YFF0902303"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072036"],"award-info":[{"award-number":["62072036"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"2022 major science and technology project \"Yuelu\u00b7Multimodal Graph-Text-Sound-Semantic Gesture Big Model Research and Demonstration Application\" in Changsha","award":["kh2301019"],"award-info":[{"award-number":["kh2301019"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1007\/s00371-024-03490-4","type":"journal-article","created":{"date-parts":[[2024,6,8]],"date-time":"2024-06-08T12:01:39Z","timestamp":1717848099000},"page":"4913-4925","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Manitalk: manipulable talking head generation from single image in the wild"],"prefix":"10.1007","volume":"40","author":[{"given":"Hui","family":"Fang","sequence":"first","affiliation":[]},{"given":"Dongdong","family":"Weng","sequence":"additional","affiliation":[]},{"given":"Zeyu","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Yin","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,6,8]]},"reference":[{"key":"3490_CR1","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"3490_CR2","doi-asserted-by":"crossref","unstructured":"Baltrusaitis, T., Zadeh, A., Lim, Y.C., Morency, L.P.: Openface 2.0: facial behavior analysis toolkit. In: 2018 13th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2018), pp. 59\u201366. IEEE (2018)","DOI":"10.1109\/FG.2018.00019"},{"issue":"6","key":"3490_CR3","doi-asserted-by":"publisher","first-page":"567","DOI":"10.1109\/34.24792","volume":"11","author":"FL Bookstein","year":"1989","unstructured":"Bookstein, F.L.: Principal warps: thin-plate splines and the decomposition of deformations. IEEE Trans. Pattern Anal. Mach. Intell. 11(6), 567\u2013585 (1989)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3490_CR4","doi-asserted-by":"crossref","unstructured":"Chatziagapi, A., Athar, S., Jain, A., Rohith, M., Bhat, V., Samaras, D.: Lipnerf: what is the right feature space to lip-sync a nerf? In: 2023 IEEE 17th International Conference on Automatic Face and Gesture Recognition (FG), pp. 1\u20138. IEEE (2023)","DOI":"10.1109\/FG57933.2023.10042567"},{"key":"3490_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, K., Cun, X., Zhang, Y., Xia, M., Yin, F., Zhu, M., Wang, X., Wang, J., Wang, N.: Videoretalking: audio-based lip synchronization for talking head video editing in the wild. In: SIGGRAPH Asia 2022 Conference Papers (2022)","DOI":"10.1145\/3550469.3555399"},{"key":"3490_CR6","unstructured":"Chenxu, Z., Chao, W., Jianfeng, Z., Hongyi, X., Guoxian, S., You, X., Linjie, L., Yapeng, T., Xiaohu, G., Jiashi, F.: Dream-talk: diffusion-based realistic emotional audio-driven method for single image talking face generation. arXiv preprint arXiv:2312.13578 (2023)"},{"key":"3490_CR7","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3d speaking styles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10,101\u201310,111 (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"3490_CR8","doi-asserted-by":"crossref","unstructured":"Deng, H., Han, C., Cai, H., Han, G., He, S.: Spatially-invariant style-codes controlled makeup transfer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6549\u20136557 (2021)","DOI":"10.1109\/CVPR46437.2021.00648"},{"key":"3490_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: Additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"3490_CR10","doi-asserted-by":"crossref","unstructured":"Doukas, M.C., Ververas, E., Sharmanska, V., Zafeiriou, S.: Free-headgan: neural talking head synthesis with explicit gaze control. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3253243"},{"key":"3490_CR11","doi-asserted-by":"publisher","first-page":"3480","DOI":"10.1109\/TMM.2021.3099900","volume":"24","author":"SE Eskimez","year":"2021","unstructured":"Eskimez, S.E., Zhang, Y., Duan, Z.: Speech driven talking face generation from a single image and an emotion condition. IEEE Trans. Multimed. 24, 3480\u20133490 (2021)","journal-title":"IEEE Trans. Multimed."},{"key":"3490_CR12","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: Faceformer: speech-driven 3d facial animation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18,770\u201318,780 (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"3490_CR13","doi-asserted-by":"crossref","unstructured":"Ganin, Y., Kononenko, D., Sungatullina, D., Lempitsky, V.: Deepwarp: photorealistic image resynthesis for gaze manipulation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part II 14, pp. 311\u2013326. Springer (2016)","DOI":"10.1007\/978-3-319-46475-6_20"},{"key":"3490_CR14","doi-asserted-by":"crossref","unstructured":"He, Z., Spurr, A., Zhang, X., Hilliges, O.: Photo-realistic monocular gaze redirection using generative adversarial networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6932\u20136941 (2019)","DOI":"10.1109\/ICCV.2019.00703"},{"issue":"4","key":"3490_CR15","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"C Houwei","year":"2014","unstructured":"Houwei, C., David, G.C., Michael, K.K., Ruben, C.G., Ani, N., Ragini, V.: Crema-d: crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014)","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"4","key":"3490_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph."},{"key":"3490_CR17","doi-asserted-by":"crossref","unstructured":"Lahiri, A., Kwatra, V., Frueh, C., Lewis, J., Bregler, C.: Lipsync3d: data-efficient learning of personalized 3d talking faces from video using pose and lighting normalization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2755\u20132764 (2021)","DOI":"10.1109\/CVPR46437.2021.00278"},{"issue":"6","key":"3490_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3478513.3480484","volume":"40","author":"Y Lu","year":"2021","unstructured":"Lu, Y., Chai, J., Cao, X.: Live speech portraits: real-time photorealistic talking-head animation. ACM Trans. Graph. 40(6), 1\u201317 (2021)","journal-title":"ACM Trans. Graph."},{"key":"3490_CR19","unstructured":"Lugaresi, C., Tang, J., Nash, H., McClanahan, C., Uboweja, E., Hays, M., Zhang, F., Chang, C.L., Yong, M.G., Lee, J., et\u00a0al.: Mediapipe: a framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)"},{"key":"3490_CR20","doi-asserted-by":"crossref","unstructured":"Mao, X., Li, Q., Xie, H., Lau, R.Y., Wang, Z., Paul\u00a0Smolley, S.: Least squares generative adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2794\u20132802 (2017)","DOI":"10.1109\/ICCV.2017.304"},{"key":"3490_CR21","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Chung, J.S., Zisserman, A.: Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"issue":"9","key":"3490_CR22","doi-asserted-by":"publisher","first-page":"2678","DOI":"10.1109\/TIP.2011.2131660","volume":"20","author":"ND Narvekar","year":"2011","unstructured":"Narvekar, N.D., Karam, L.J.: A no-reference image blur metric based on the cumulative probability of blur detection (cpbd). IEEE Trans. Image Process. 20(9), 2678\u20132683 (2011)","journal-title":"IEEE Trans. Image Process."},{"key":"3490_CR23","unstructured":"Oord, A.v.d., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., Kalchbrenner, N., Senior, A., Kavukcuoglu, K.: Wavenet: a generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)"},{"key":"3490_CR24","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492 (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"3490_CR25","doi-asserted-by":"crossref","unstructured":"Ruzzi, A., Shi, X., Wang, X., Li, G., De\u00a0Mello, S., Chang, H.J., Zhang, X., Hilliges, O.: Gazenerf: 3d-aware gaze redirection with neural radiance fields. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9676\u20139685 (2023)","DOI":"10.1109\/CVPR52729.2023.00933"},{"key":"3490_CR26","doi-asserted-by":"crossref","unstructured":"Siarohin, A., Woodford, O.J., Ren, J., Chai, M., Tulyakov, S.: Motion representations for articulated animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13,653\u201313,662 (2021)","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"3490_CR27","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"3490_CR28","doi-asserted-by":"publisher","first-page":"585","DOI":"10.1109\/TIFS.2022.3146783","volume":"17","author":"L Song","year":"2022","unstructured":"Song, L., Wu, W., Qian, C., He, R., Loy, C.C.: Everybody\u2019s talkin\u2019: let me talk as you want. IEEE Trans. Inf. Forensics Secur. 17, 585\u2013598 (2022)","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"issue":"4","key":"3490_CR29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. 36(4), 1\u201313 (2017)","journal-title":"ACM Trans. Graph."},{"key":"3490_CR30","doi-asserted-by":"crossref","unstructured":"Suzhen, W., Lincheng, L., Yu, D., Xin, Y.: One-shot talking face generation from single-speaker audio-visual correlation learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 2531\u20132539 (2022)","DOI":"10.1609\/aaai.v36i3.20154"},{"key":"3490_CR31","doi-asserted-by":"crossref","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVI 16, pp. 716\u2013731. Springer (2020)","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"3490_CR32","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"3490_CR33","doi-asserted-by":"crossref","unstructured":"Wang, S., Li, L., Ding, Y., Fan, C., Yu, X.: Audio2head: audio-driven one-shot talking-head generation with natural head motion. arXiv preprint arXiv:2107.09293 (2021)","DOI":"10.24963\/ijcai.2021\/152"},{"key":"3490_CR34","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Liu, M.Y., Zhu, J.Y., Tao, A., Kautz, J., Catanzaro, B.: High-resolution image synthesis and semantic manipulation with conditional gans. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8798\u20138807 (2018)","DOI":"10.1109\/CVPR.2018.00917"},{"key":"3490_CR35","doi-asserted-by":"crossref","unstructured":"Wang, X., Li, Y., Zhang, H., Shan, Y.: Towards real-world blind face restoration with generative facial prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9168\u20139178 (2021)","DOI":"10.1109\/CVPR46437.2021.00905"},{"issue":"12","key":"3490_CR36","doi-asserted-by":"publisher","first-page":"3457","DOI":"10.1109\/TVCG.2020.3023573","volume":"26","author":"X Wen","year":"2020","unstructured":"Wen, X., Wang, M., Richardt, C., Chen, Z.Y., Hu, S.M.: Photorealistic audio-driven video portraits. IEEE Trans. Visual Comput. Graph. 26(12), 3457\u20133466 (2020)","journal-title":"IEEE Trans. Visual Comput. Graph."},{"key":"3490_CR37","doi-asserted-by":"crossref","unstructured":"Wolf, L., Freund, Z., Avidan, S.: An eye for an eye: a single camera gaze-replacement method. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 817\u2013824. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5540133"},{"key":"3490_CR38","doi-asserted-by":"crossref","unstructured":"Xinya, J., Hang, Z., Kaisiyuan, W., Qianyi, W., Wayne, W., Feng, X., Xun, C.: Eamm: One-shot emotional talking face via audio-based emotion-aware motion model. In: ACM SIGGRAPH 2022 Conference Proceedings, pp. 1\u201310 (2022)","DOI":"10.1145\/3528233.3530745"},{"key":"3490_CR39","unstructured":"Yi, R., Ye, Z., Zhang, J., Bao, H., Liu, Y.J.: Audio-driven talking face video generation with learning-based personalized head pose. arXiv preprint arXiv:2002.10137 (2020)"},{"key":"3490_CR40","doi-asserted-by":"crossref","unstructured":"Yu, Y., Odobez, J.M.: Unsupervised representation learning for gaze estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7314\u20137324 (2020)","DOI":"10.1109\/CVPR42600.2020.00734"},{"issue":"2","key":"3490_CR41","doi-asserted-by":"publisher","first-page":"1438","DOI":"10.1109\/TVCG.2021.3117484","volume":"29","author":"C Zhang","year":"2023","unstructured":"Zhang, C., Ni, S., Fan, Z., Li, H., Zeng, M., Budagavi, M., Guo, X.: 3d talking face with personalized pose dynamics. IEEE Trans. Visual Comput. Graph. 29(2), 1438\u20131449 (2023)","journal-title":"IEEE Trans. Visual Comput. Graph."},{"key":"3490_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, C., Zhao, Y., Huang, Y., Zeng, M., Ni, S., Budagavi, M., Guo, X.: Facial: synthesizing dynamic talking face with implicit attribute learning. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 3847\u20133856 (2021)","DOI":"10.1109\/ICCV48922.2021.00384"},{"key":"3490_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"3490_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, W., Cun, X., Wang, X., Zhang, Y., Shen, X., Guo, Y., Shan, Y., Wang, F.: Sadtalker: learning realistic 3d motion coefficients for stylized audio-driven single image talking face animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8652\u20138661 (2023)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"3490_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, Y., He, W., Li, M., Tian, K., Zhang, Z., Cheng, J., Wang, Y., Liao, J.: Meta talk: learning to data-efficiently generate audio-driven lip-synchronized talking face with high definition. In: ICASSP 2022\u20142022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4848\u20134852 (2022)","DOI":"10.1109\/ICASSP43922.2022.9747284"},{"key":"3490_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"3490_CR47","doi-asserted-by":"crossref","unstructured":"Zhao, J., Zhang, H.: Thin-plate spline motion model for image animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3657\u20133666 (2022)","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"3490_CR48","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: Makelttalk: speaker-aware talking-head animation. ACM Trans. Graph. 39(6), 1\u201315 (2020)","DOI":"10.1145\/3414685.3417774"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03490-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03490-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03490-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T12:01:40Z","timestamp":1732190500000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03490-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,8]]},"references-count":48,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["3490"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03490-4","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,6,8]]},"assertion":[{"value":"13 May 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 June 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}