{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:55:48Z","timestamp":1776117348148,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613753","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"4281-4289","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":32,"title":["DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with Diffusion Autoencoder"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5329-0847","authenticated-orcid":false,"given":"Chenpeng","family":"Du","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8606-8273","authenticated-orcid":false,"given":"Qi","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4828-3228","authenticated-orcid":false,"given":"Tianyu","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5631-0639","authenticated-orcid":false,"given":"Xu","family":"Tan","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7423-617X","authenticated-orcid":false,"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9624-5381","authenticated-orcid":false,"given":"Sheng","family":"Zhao","sequence":"additional","affiliation":[{"name":"Microsoft Cloud+AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9472-600X","authenticated-orcid":false,"given":"Jiang","family":"Bian","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548101"},{"key":"e_1_3_2_1_2_1","volume-title":"vq-wav2vec: Self-supervised learning of discrete speech representations. arXiv preprint arXiv:1910.05453","author":"Baevski Alexei","year":"2019","unstructured":"Alexei Baevski, Steffen Schneider, and Michael Auli. 2019. vq-wav2vec: Self-supervised learning of discrete speech representations. arXiv preprint arXiv:1910.05453 (2019)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proc. NeurIPS.","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Proc. NeurIPS."},{"key":"e_1_3_2_1_4_1","volume-title":"Munich, Germany","volume":"553","author":"Chen Lele","year":"2018","unstructured":"Lele Chen, Zhiheng Li, Ross K. Maddox, Zhiyao Duan, and Chenliang Xu. [n.,d.]. Lip Movements Generation at a Glance. In Computer Vision - ECCV 2018 - 15th European Conference, Munich, Germany, September 8-14, 2018, Proceedings, Part VII (Lecture Notes in Computer Science, Vol. 11211). 538--553."},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. NeurIPS. 8780--8794","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Quinn Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Proc. NeurIPS. 8780--8794."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-489"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Yudong Guo Keyu Chen Sen Liang Yong-Jin Liu Hujun Bao and Juyong Zhang. 2021. AD-NeRF: Audio Driven Neural Radiance Fields for Talking Head Synthesis. 5764--5774.","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_2_1_9_1","volume-title":"SPACEx: Speech-driven Portrait Animation with Controllable Expression. arXiv preprint arXiv:2211.09809","author":"Gururani Siddharth","year":"2022","unstructured":"Siddharth Gururani, Arun Mallya, Ting-Chun Wang, Rafael Valle, and Ming-Yu Liu. 2022. SPACEx: Speech-driven Portrait Animation with Controllable Expression. arXiv preprint arXiv:2211.09809 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"Latent Video Diffusion Models for High-Fidelity Video Generation with Arbitrary Lengths. arXiv preprint arXiv:2211.13221","author":"He Yingqing","year":"2022","unstructured":"Yingqing He, Tianyu Yang, Yong Zhang, Ying Shan, and Qifeng Chen. 2022. Latent Video Diffusion Models for High-Fidelity Video Generation with Arbitrary Lengths. arXiv preprint arXiv:2211.13221 (2022)."},{"key":"e_1_3_2_1_11_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022a. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proc. NeurIPS.","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Proc. NeurIPS."},{"key":"e_1_3_2_1_13_1","article-title":"Cascaded Diffusion Models for High Fidelity Image Generation","volume":"23","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Chitwan Saharia, William Chan, David J. Fleet, Mohammad Norouzi, and Tim Salimans. 2022b. Cascaded Diffusion Models for High Fidelity Image Generation. J. Mach. Learn. Res., Vol. 23 (2022), 47:1--47:33.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_14_1","volume-title":"Fleet","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Tim Salimans, Alexey A. Gritsenko, William Chan, Mohammad Norouzi, and David J. Fleet. 2022c. Video Diffusion Models. CoRR, Vol. abs\/2204.03458 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_1_16_1","volume-title":"Diffusion Video Autoencoders: Toward Temporally Consistent Face Video Editing via Disentangled Video Encoding. arXiv preprint arXiv:2212.02802","author":"Kim Gyeongman","year":"2022","unstructured":"Gyeongman Kim, Hajin Shim, Hyunsu Kim, Yunjey Choi, Junho Kim, and Eunho Yang. 2022. Diffusion Video Autoencoders: Toward Temporally Consistent Face Video Editing via Disentangled Video Encoding. arXiv preprint arXiv:2212.02802 (2022)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00278"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01036"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_39"},{"key":"e_1_3_2_1_21_1","volume-title":"Proc. ICML","volume":"37","author":"Sohl-Dickstein Jascha","unstructured":"Jascha Sohl-Dickstein, Eric A. Weiss, Niru Maheswaranathan, and Surya Ganguli. [n.,d.]. Deep Unsupervised Learning using Nonequilibrium Thermodynamics. In Proc. ICML, Vol. 37. 2256--2265."},{"key":"e_1_3_2_1_22_1","volume-title":"Proc. ICLR.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021a. Denoising Diffusion Implicit Models. In Proc. ICLR."},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. ICLR.","author":"Song Yang","year":"2021","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2021b. Score-Based Generative Modeling through Stochastic Differential Equations. In Proc. ICLR."},{"key":"e_1_3_2_1_24_1","volume-title":"Stavros Petridis, and Maja Pantic.","author":"Stypu\u0142kowski Micha\u0142","year":"2023","unstructured":"Micha\u0142 Stypu\u0142kowski, Konstantinos Vougioukas, Sen He, Maciej Zike ba, Stavros Petridis, and Maja Pantic. 2023. Diffused Heads: Diffusion Models Beat GANs on Talking-Face Generation. arXiv preprint arXiv:2301.03396 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01251-8"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_30_1","volume-title":"DFA-NERF: personalized talking head generation via disentangled face attributes neural rendering. arXiv preprint arXiv:2201.00791","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, RuiZhe Zhong, Yichao Yan, Guangtao Zhai, and Xiaokang Yang. 2022. DFA-NERF: personalized talking head generation via disentangled face attributes neural rendering. arXiv preprint arXiv:2201.00791 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"Audio-driven talking face video generation with learning-based personalized head pose. arXiv preprint arXiv:2002.10137","author":"Yi Ran","year":"2020","unstructured":"Ran Yi, Zipeng Ye, Juyong Zhang, Hujun Bao, and Yong-Jin Liu. 2020. Audio-driven talking face video generation with learning-based personalized head pose. arXiv preprint arXiv:2002.10137 (2020)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_34_1","volume-title":"Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018","author":"Zhou Daquan","year":"2022","unstructured":"Daquan Zhou, Weimin Wang, Hanshu Yan, Weiwei Lv, Yizhe Zhu, and Jiashi Feng. 2022. Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG), Vol. 39, 6 (2020), 1--15.","journal-title":"ACM Transactions On Graphics (TOG)"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613753","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613753","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:04:35Z","timestamp":1755821075000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613753"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":35,"alternative-id":["10.1145\/3581783.3613753","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613753","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}