{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T16:25:12Z","timestamp":1782318312297,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681198","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"6696-6705","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":26,"title":["AniTalker: Animate Vivid and Diverse Talking Faces through Identity-Decoupled Facial Motion Encoding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4169-4160","authenticated-orcid":false,"given":"Tao","family":"Liu","sequence":"first","affiliation":[{"name":"X-LANCE Lab, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7162-8379","authenticated-orcid":false,"given":"Feilong","family":"Chen","sequence":"additional","affiliation":[{"name":"AISpeech Ltd, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0260-6080","authenticated-orcid":false,"given":"Shuai","family":"Fan","sequence":"additional","affiliation":[{"name":"AISpeech Ltd, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5329-0847","authenticated-orcid":false,"given":"Chenpeng","family":"Du","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8606-8273","authenticated-orcid":false,"given":"Qi","family":"Chen","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7423-617X","authenticated-orcid":false,"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mutual Information Neural Estimation. In International Conference on Machine Learning (ICML) (Proceedings of Machine Learning Research","volume":"540","author":"Belghazi Mohamed Ishmael","year":"2018","unstructured":"Mohamed Ishmael Belghazi, Aristide Baratin, Sai Rajeswar, Sherjil Ozair, Yoshua Bengio, Aaron Courville, and R Devon Hjelm. 2018. Mutual Information Neural Estimation. In International Conference on Machine Learning (ICML) (Proceedings of Machine Learning Research, Vol. 80), Jennifer Dy and Andreas Krause (Eds.). PMLR, 531--540."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.104911"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094777"},{"key":"e_1_3_2_1_4_1","volume-title":"International Conference on Machine Learning (ICML). PMLR, 1779--1788","author":"Cheng Pengyu","year":"2020","unstructured":"Pengyu Cheng, Weituo Hao, Shuyang Dai, Jiachang Liu, Zhe Gan, and Lawrence Carin. 2020. Club: A contrastive log-ratio upper bound of mutual information. In International Conference on Machine Learning (ICML). PMLR, 1779--1788."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 20311--20322","author":"Danvevcek Radek","year":"2022","unstructured":"Radek Danvevcek, Michael J Black, and Timo Bolkart. 2022. Emoca: Emotion driven monocular face capture and animation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 20311--20322."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Brecht Desplanques Jenthe Thienpondt and Kris Demuynck. 2020. ECAPA-TDNN: Emphasized Channel Attention Propagation and Aggregation in TDNN Based Speaker Verification. (2020).","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_28"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613753"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29747"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3450626.3459936","article-title":"Learning an animatable detailed 3D face model from in-the-wild images","volume":"40","author":"Feng Yao","year":"2021","unstructured":"Yao Feng, Haiwen Feng, Michael J Black, and Timo Bolkart. 2021. Learning an animatable detailed 3D face model from in-the-wild images. ACM Transactions on Graphics (ToG), Vol. 40, 4 (2021), 1--13.","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02069"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2"},{"key":"e_1_3_2_1_17_1","volume-title":"Conference of the International Speech Communication Association (InterSpeech)","author":"Anmol","year":"2020","unstructured":"Anmol Gulati et al. 2020. Conformer: Convolution-augmented transformer for speech recognition. Conference of the International Speech Communication Association (InterSpeech) (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"The International Conference on Learning Representations (ICLR)","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. The International Conference on Learning Representations (ICLR) (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"GAIA: Zero-shot Talking Avatar Generation.","author":"He Tianyu","year":"2024","unstructured":"Tianyu He, Junliang Guo, Runyi Yu, Yuchi Wang, Jialiang Zhu, Kaikai An, Leyi Li, Xu Tan, Chunyu Wang, Han Hu, HsiangTao Wu, Sheng Zhao, and Jiang Bian. 2024. GAIA: Zero-shot Talking Avatar Generation."},{"key":"e_1_3_2_1_20_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems (2020)."},{"key":"e_1_3_2_1_21_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (TASLP) (2021)."},{"key":"e_1_3_2_1_22_1","volume-title":"Animate anyone: Consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:2311.17117","author":"Hu Li","year":"2023","unstructured":"Li Hu, Xin Gao, Peng Zhang, Ke Sun, Bang Zhang, and Liefeng Bo. 2023. Animate anyone: Consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:2311.17117 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01521-4"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_1_25_1","volume-title":"ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Liu Tao","unstructured":"Tao Liu, Chenpeng Du, Shuai Fan, Feilong Chen, and Kai Yu. 2024. DiffDub: Person-Generic Visual Dubbing Using Inpainting Renderer with Diffusion Auto-Encoder. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 3630--3634."},{"key":"e_1_3_2_1_26_1","volume-title":"Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767","author":"Ma Yifeng","year":"2023","unstructured":"Yifeng Ma, Shiwei Zhang, Jiayu Wang, Xiang Wang, Yingya Zhang, and Zhidong Deng. 2023. Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"DiffSpeaker: Speech-Driven 3D Facial Animation with Diffusion Transformer. arXiv preprint arXiv:2402.05712","author":"Ma Zhiyuan","year":"2024","unstructured":"Zhiyuan Ma, Xiangyu Zhu, Guojun Qi, Chen Qian, Zhaoxiang Zhang, and Zhen Lei. 2024. DiffSpeaker: Speech-Driven 3D Facial Animation with Diffusion Transformer. arXiv preprint arXiv:2402.05712 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00521"},{"key":"e_1_3_2_1_29_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Nagrani Arsha","year":"2017","unstructured":"Arsha Nagrani, Joon Son Chung, and Andrew Zisserman. 2017. Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00049"},{"key":"e_1_3_2_1_31_1","volume-title":"SAiD: Speech-driven Blendshape Facial Animation with Diffusion. arXiv preprint arXiv:2401.08655","author":"Park Inkyu","year":"2023","unstructured":"Inkyu Park and Jaewoong Cho. 2023. SAiD: Speech-driven Blendshape Facial Animation with Diffusion. arXiv preprint arXiv:2401.08655 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Exploring Phonetic Context-Aware Lip-Sync for Talking Face Generation. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4325--4329","author":"Park Se Jin","year":"2024","unstructured":"Se Jin Park, Minsu Kim, Jeongsoo Choi, and Yong Man Ro. 2024. Exploring Phonetic Context-Aware Lip-Sync for Talking Face Generation. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4325--4329."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"e_1_3_2_1_34_1","first-page":"43","article-title":"The importance of non-verbal communication","volume":"9","author":"Phutela Deepika","year":"2015","unstructured":"Deepika Phutela. 2015. The importance of non-verbal communication. IUP Journal of Soft Skills, Vol. 9, 4 (2015), 43.","journal-title":"IUP Journal of Soft Skills"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 28th ACM international conference on multimedia (ACM MM).","author":"KR","unstructured":"KR Prajwal et al. 2020. A lip sync expert is all you need for speech to lip generation in the wild. In Proceedings of the 28th ACM international conference on multimedia (ACM MM)."},{"key":"e_1_3_2_1_36_1","volume-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"e_1_3_2_1_39_1","volume-title":"First order motion model for image animation. Advances in neural information processing systems","author":"Siarohin Aliaksandr","year":"2019","unstructured":"Aliaksandr Siarohin, St\u00e9phane Lathuili\u00e8re, Sergey Tulyakov, Elisa Ricci, and Nicu Sebe. 2019. First order motion model for image animation. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ILCR).","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ILCR)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Shuai Tan Bin Ji Mengxiao Bi and Ye Pan. 2024. EDTalk: Efficient Disentanglement for Emotional Talking Head Synthesis. (2024).","DOI":"10.1007\/978-3-031-72658-3_23"},{"key":"e_1_3_2_1_43_1","volume-title":"Motion Transformer for Unsupervised Image Animation. In European Conference on Computer Vision. Springer, 702--719","author":"Tao Jiale","year":"2022","unstructured":"Jiale Tao, Biao Wang, Tiezheng Ge, Yuning Jiang, Wen Li, and Lixin Duan. 2022. Motion Transformer for Unsupervised Image Animation. In European Conference on Computer Vision. Springer, 702--719."},{"key":"e_1_3_2_1_44_1","volume-title":"EMO: Emote Portrait Alive - Generating Expressive Portrait Videos with Audio2Video Diffusion Model under Weak Conditions. arxiv: 2402.17485 [cs.CV]","author":"Tian Linrui","year":"2024","unstructured":"Linrui Tian, Qi Wang, Bang Zhang, and Liefeng Bo. 2024. EMO: Emote Portrait Alive - Generating Expressive Portrait Videos with Audio2Video Diffusion Model under Weak Conditions. arxiv: 2402.17485 [cs.CV]"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01724"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2822810"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/152"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Wang Yaohui","year":"2022","unstructured":"Yaohui Wang, Di Yang, Francois Bremond, and Antitza Dantcheva. 2022. Latent image animator: Learning to animate images via latent space navigation. Proceedings of the International Conference on Learning Representations (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing, Vol. 13, 4 (2004), 600--612."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"e_1_3_2_1_53_1","volume-title":"Conference of the International Speech Communication Association (InterSpeech)","author":"Chi Po-Han","year":"2021","unstructured":"Shu-wen Yang, Po-Han Chi, Yung-Sung Chuang, Cheng-I Jeff Lai, Kushal Lakhotia, Yist Y Lin, Andy T Liu, Jiatong Shi, Xuankai Chang, Guan-Ting Lin, et al. 2021. Superb: Speech processing universal performance benchmark. Conference of the International Speech Communication Association (InterSpeech) (2021)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00070"},{"key":"e_1_3_2_1_55_1","volume-title":"DREAM-Talk: Diffusion-based Realistic Emotional Audio-driven Method for Single Image Talking Face Generation. arXiv preprint arXiv:2312.13578","author":"Zhang Chenxu","year":"2023","unstructured":"Chenxu Zhang, Chao Wang, Jianfeng Zhang, Hongyi Xu, Guoxian Song, You Xie, Linjie Luo, Yapeng Tian, Xiaohu Guo, and Jiashi Feng. 2023. DREAM-Talk: Diffusion-based Realistic Emotional Audio-driven Method for Single Image Talking Face Generation. arXiv preprint arXiv:2312.13578 (2023)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_2_1_58_1","volume-title":"DINet: Deformation Inpainting Network for Realistic Face Visually Dubbing on High Resolution Video. Thirty-Seventh AAAI Conference on Artificial Intelligence (AAAI)","author":"Zhimeng","year":"2023","unstructured":"Zhimeng Zhang et al. 2023. DINet: Deformation Inpainting Network for Realistic Face Visually Dubbing on High Resolution Video. Thirty-Seventh AAAI Conference on Artificial Intelligence (AAAI) (2023)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681198","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681198"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":62,"alternative-id":["10.1145\/3664647.3681198","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681198","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}