{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T08:29:33Z","timestamp":1750408173953,"version":"3.40.3"},"publisher-location":"Cham","reference-count":66,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031732225"},{"type":"electronic","value":"9783031732232"}],"license":[{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73223-2_23","type":"book-chapter","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T18:48:24Z","timestamp":1731005304000},"page":"416-433","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["3D-Aware Text-Driven Talking Avatar Generation"],"prefix":"10.1007","author":[{"given":"Xiuzhe","family":"Wu","sequence":"first","affiliation":[]},{"given":"Yang-Tian","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Handi","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Jingdong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhengzhe","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xiaojuan","family":"Qi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,8]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Barron, J.T., Mildenhall, B., Tancik, M., Hedman, P., Martin-Brualla, R., Srinivasan, P.P.: Mip-NeRF: a multiscale representation for anti-aliasing neural radiance fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5855\u20135864 (2021)","DOI":"10.1109\/ICCV48922.2021.00580"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Blanz, V., Vetter, T.: A morphable model for the synthesis of 3D faces. In: Proceedings of the 26th Annual Conference on Computer Graphics and Interactive Techniques, pp. 187\u2013194 (1999)","DOI":"10.1145\/311535.311556"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Bulat, A., Tzimiropoulos, G.: How far are we from solving the 2D & 3D face alignment problem? (and a dataset of 230,000 3d facial landmarks). In: International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.116"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Chan, E.R., et\u00a0al.: Efficient geometry-aware 3D generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16123\u201316133 (2022)","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"23_CR6","unstructured":"Chung, J.S., Jamaludin, A., Zisserman, A.: You said that? arXiv preprint arXiv:1705.02966 (2017)"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Computer Vision\u2013ACCV 2016 Workshops: ACCV 2016 International Workshops, Taipei, Taiwan, November 20-24, 2016, Revised Selected Papers, Part II 13. pp. 251\u2013263. Springer (2017)","DOI":"10.1007\/978-3-319-54427-4_19"},{"issue":"1","key":"23_CR8","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1109\/MSP.2017.2765202","volume":"35","author":"A Creswell","year":"2018","unstructured":"Creswell, A., et al.: Generative adversarial networks: an overview. IEEE Signal Process. Mag. 35(1), 53\u201365 (2018)","journal-title":"IEEE Signal Process. Mag."},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Croitoru, F.A., Hondru, V., Ionescu, R.T., Shah, M.: Diffusion models in vision: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 45, 10850-10869 (2023)","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"23_CR10","doi-asserted-by":"publisher","unstructured":"Das, D., Biswas, S., Sinha, S., Bhowmick, B.: Speech-driven facial animation using cascaded GANs for learning of motion and texture. In: European Conference on Computer Vision, pp. 408\u2013424. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_25","DOI":"10.1007\/978-3-030-58577-8_25"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Deng, K., Liu, A., Zhu, J.Y., Ramanan, D.: Depth-supervised nerf: fewer views and faster training for free. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12882\u201312891 (2022)","DOI":"10.1109\/CVPR52688.2022.01254"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Xu, S., Chen, D., Jia, Y., Tong, X.: Accurate 3D face reconstruction with weakly-supervised learning: from single image to image set. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp.\u00a00\u20130 (2019)","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Doukas, M.C., Zafeiriou, S., Sharmanska, V.: HeadGAN: one-shot neural head synthesis and editing. In: Proceedings of the IEEE\/CVF International conference on Computer Vision, pp. 14398\u201314407 (2021)","DOI":"10.1109\/ICCV48922.2021.01413"},{"issue":"4","key":"23_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459936","volume":"40","author":"Y Feng","year":"2021","unstructured":"Feng, Y., Feng, H., Black, M.J., Bolkart, T.: Learning an animatable detailed 3D face model from in-the-wild images. ACM Trans. Graph. (TOG) 40(4), 1\u201313 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"3","key":"23_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2890493","volume":"35","author":"P Garrido","year":"2016","unstructured":"Garrido, P., et al.: Reconstruction of personalized 3d face rigs from monocular video. ACM Trans. Graph. (TOG) 35(3), 1\u201315 (2016)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac\u00a0Aodha, O., Firman, M., Brostow, G.J.: Digging into self-supervised monocular depth estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3828\u20133838 (2019)","DOI":"10.1109\/ICCV.2019.00393"},{"key":"23_CR18","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. Advances in neural information processing systems 27 (2014)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Guan, J., et\u00a0al.: StyleSync: high-fidelity generalized and personalized lip sync in style-based generator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1505\u20131515 (2023)","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Guo, Y., Chen, K., Liang, S., Liu, Y.J., Bao, H., Zhang, J.: AD-NeRF: audio driven neural radiance fields for talking head synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5784\u20135794 (2021)","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"23_CR21","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)"},{"key":"23_CR22","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Hong, F.T., Zhang, L., Shen, L., Xu, D.: Depth-aware generative adversarial network for talking head video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3397\u20133406 (2022)","DOI":"10.1109\/CVPR52688.2022.00339"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4401\u20134410 (2019)","DOI":"10.1109\/CVPR.2019.00453"},{"key":"23_CR25","unstructured":"Kumar, R., Sotelo, J., Kumar, K., de\u00a0Br\u00e9bisson, A., Bengio, Y.: ObamaNet: photo-realistic lip-sync from text. arXiv preprint arXiv:1801.01442 (2017)"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Liu, J., Chang, C., Liu, J., Wu, X., Ma, L., Qi, X.: MarS3D: a plug-and-play motion-aware model for semantic segmentation on multi-scan 3D point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9372\u20139381 (2023)","DOI":"10.1109\/CVPR52729.2023.00904"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Liu, X., Xu, Y., Wu, Q., Zhou, H., Wu, W., Zhou, B.: Semantic-aware implicit neural audio-driven video portrait generation. arXiv preprint arXiv:2201.07786 (2022)","DOI":"10.1007\/978-3-031-19836-6_7"},{"issue":"6","key":"23_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3478513.3480484","volume":"40","author":"Y Lu","year":"2021","unstructured":"Lu, Y., Chai, J., Cao, X.: Live speech portraits: real-time photorealistic talking-head animation. ACM Trans. Graph. (TOG) 40(6), 1\u201317 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR29","unstructured":"Ma, Y., Zhang, S., Wang, J., Wang, X., Zhang, Y., Deng, Z.: DreamTalk: when expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767 (2023)"},{"key":"23_CR30","doi-asserted-by":"crossref","unstructured":"Ma, Z., Zhu, X., Qi, G.J., Lei, Z., Zhang, L.: OTAvatar: one-shot talking face avatar with controllable tri-plane rendering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16901\u201316910 (2023)","DOI":"10.1109\/CVPR52729.2023.01621"},{"key":"23_CR31","unstructured":"Mallya, A., Wang, T.C., Liu, M.Y.: Implicit Warping for Animation with Image Sets. In: NeurIPS (2022)"},{"issue":"1","key":"23_CR32","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., et al.: NeRF: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"23_CR33","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2020.12.089","volume":"438","author":"Y Ming","year":"2021","unstructured":"Ming, Y., Meng, X., Fan, C., Yu, H.: Deep learning for monocular depth estimation: a review. Neurocomputing 438, 14\u201333 (2021)","journal-title":"Neurocomputing"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Paysan, P., Knothe, R., Amberg, B., Romdhani, S., Vetter, T.: A 3D face model for pose and illumination invariant face recognition. In: 2009 sixth IEEE International Conference on Advanced Video and Signal Based Surveillance, pp. 296\u2013301 (2009)","DOI":"10.1109\/AVSS.2009.58"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492 (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"23_CR36","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Ren, Y., Li, G., Chen, Y., Li, T.H., Liu, S.: PIRenderer: controllable portrait image generation via semantic neural rendering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13759\u201313768 (2021)","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"23_CR38","doi-asserted-by":"crossref","unstructured":"Richard, A., Zollh\u00f6fer, M., Wen, Y., De\u00a0la Torre, F., Sheikh, Y.: MeshTalk: 3D face animation from speech using cross-modality disentanglement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1173\u20131182 (2021)","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Sanyal, S., Bolkart, T., Feng, H., Black, M.J.: Learning to regress 3D face shape and expression from an image without 3D supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7763\u20137772 (2019)","DOI":"10.1109\/CVPR.2019.00795"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Shen, S., Li, W., Zhu, Z., Duan, Y., Zhou, J., Lu, J.: Learning dynamic facial radiance fields for few-shot talking head synthesis. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19775-8_39"},{"key":"23_CR41","doi-asserted-by":"crossref","unstructured":"Shen, S., Zhao, W., Meng, Z., Li, W., Zhu, Z., Zhou, J., Lu, J.: DiffTalk: crafting diffusion models for generalized audio-driven portraits animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1982\u20131991 (2023)","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"23_CR42","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., Sebe, N.: First order motion model for image animation. In: Advances in neural information processing systems, vol. 32 (2019)"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Song, L., Wu, W., Qian, C., He, R., Loy, C.C.: Everybody\u2019s talkin\u2019: let me talk as you want. IEEE Trans. Inf. Forensics Secur. 17, 585\u2013598 (2022)","DOI":"10.1109\/TIFS.2022.3146783"},{"key":"23_CR44","doi-asserted-by":"crossref","unstructured":"Sun, J., Deng, Q., Li, Q., Sun, M., Ren, M., Sun, Z.: AnyFace: free-style text-to-face synthesis and manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18687\u201318696 (2022)","DOI":"10.1109\/CVPR52688.2022.01813"},{"issue":"4","key":"23_CR45","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. (ToG) 36(4), 1\u201313 (2017)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"23_CR46","doi-asserted-by":"crossref","unstructured":"Tan, H.R., Wang, C., Wu, S.T., Wang, T.Q., Zhang, X.Y., Liu, C.L.: Proxy graph matching with proximal matching networks. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 9808\u20139815 (2021)","DOI":"10.1609\/aaai.v35i11.17179"},{"key":"23_CR47","doi-asserted-by":"crossref","unstructured":"Tan, H., Wang, C., Wu, S., Zhang, X.Y., Yin, F., Liu, C.L.: Ensemble quadratic assignment network for graph matching. Int. J. Comput. Vis. 1\u201323 (2024)","DOI":"10.1007\/s11263-024-02040-8"},{"key":"23_CR48","doi-asserted-by":"crossref","unstructured":"Tewari, A., et al.: FML: face model learning from videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10812\u201310822 (2019)","DOI":"10.1109\/CVPR.2019.01107"},{"key":"23_CR49","doi-asserted-by":"publisher","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: European conference on computer vision, pp. 716\u2013731. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_42","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Wang, T., et\u00a0al.: RODIN: a generative model for sculpting 3D digital avatars using diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4563\u20134573 (2023)","DOI":"10.1109\/CVPR52729.2023.00443"},{"key":"23_CR51","unstructured":"Wang, T.C., Liu, M.Y., Tao, A., Liu, G., Kautz, J., Catanzaro, B.: Few-shot video-to-video synthesis. arXiv preprint arXiv:1910.12713 (2019)"},{"key":"23_CR52","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Mallya, A., Liu, M.Y.: One-shot free-view neural talking-head synthesis for video conferencing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10039\u201310049 (2021)","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"23_CR53","unstructured":"Wu, X., et al.: CL-NeRF: continual learning of neural radiance fields for evolving scene representation. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR54","doi-asserted-by":"crossref","unstructured":"Wu, X., et al.: Speech2Lip: high-fidelity speech to lip generation by learning from a short video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22168\u201322177 (2023)","DOI":"10.1109\/ICCV51070.2023.02026"},{"key":"23_CR55","unstructured":"Wu, X., et al.: DO3D: self-supervised learning of decomposed object-aware 3d motion and depth from monocular videos. arXiv preprint arXiv:2403.05895 (2024)"},{"key":"23_CR56","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: Codetalker: Speech-driven 3D facial animation with discrete motion prior. arXiv preprint arXiv:2301.02379 (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"23_CR57","unstructured":"Yao, S., Zhong, R., Yan, Y., Zhai, G., Yang, X.: DFA-NeRF: personalized talking head generation via disentangled face attributes neural rendering. arXiv preprint arXiv:2201.00791 (2022)"},{"key":"23_CR58","doi-asserted-by":"publisher","unstructured":"Yin, F., et al.: StyleHEAT: one-shot high-resolution editable talking face generation via pre-trained styleGAN. In: European Conference on Computer Vision, pp. 85\u2013101. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_6","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"23_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"23_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, W., et al.: SadTalker: learning realistic 3D motion coefficients for stylized audio-driven single image talking face animation. arXiv preprint arXiv:2211.12194 (2022)","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"23_CR61","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"23_CR62","doi-asserted-by":"crossref","unstructured":"Zhao, J., Zhang, H.: Thin-plate spline motion model for image animation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3657\u20133666 (2022)","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"23_CR63","unstructured":"Zhao, S., Qi, X.: Prototypical votenet for few-shot 3D point cloud object detection. In: Advances in Neural Information Processing Systems (2022)"},{"key":"23_CR64","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Y., Liu, Z., Luo, P., Wang, X.: Talking face generation by adversarially disentangled audio-visual representation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9299\u20139306 (2019)","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"23_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., Liu, Z.: Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4176\u20134186 (2021)","DOI":"10.1109\/CVPR46437.2021.00416"},{"issue":"6","key":"23_CR66","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., et al.: MakeltTalk: speaker-aware talking-head animation. ACM Trans. Graph. (TOG) 39(6), 1\u201315 (2020)","journal-title":"ACM Trans. Graph. (TOG)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73223-2_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:07:22Z","timestamp":1731006442000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73223-2_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,8]]},"ISBN":["9783031732225","9783031732232"],"references-count":66,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73223-2_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,8]]},"assertion":[{"value":"8 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}