{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:37:10Z","timestamp":1776883030410,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687618","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["PersonaTalk: Bring Attention to Your Persona in Visual Dubbing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2787-3664","authenticated-orcid":false,"given":"Longhao","family":"Zhang","sequence":"first","affiliation":[{"name":"Bytedance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5966-352X","authenticated-orcid":false,"given":"Shuang","family":"Liang","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7574-6838","authenticated-orcid":false,"given":"Zhipeng","family":"Ge","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9881-0673","authenticated-orcid":false,"given":"Tianshu","family":"Hu","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3490035.3490305"},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311556"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555399"},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_3_2_6_1","volume-title":"Workshop on Multi-view Lip-reading, ACCV","author":"Chung J.\u00a0S.","year":"2016","unstructured":"J.\u00a0S. Chung and A. Zisserman. 2016. Out of time: automated lip sync in the wild. In Workshop on Multi-view Lip-reading, ACCV."},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"crossref","unstructured":"Daniel Cudeiro Timo Bolkart Cassidy Laidlaw Anurag Ranjan and Michael Black. 2019. Capture Learning and Synthesis of 3D Speaking Styles. Computer Vision and Pattern Recognition (CVPR) (2019) 10101\u201310111.","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01413"},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"publisher","unstructured":"Yao Feng Haiwen Feng Michael\u00a0J. Black and Timo Bolkart. 2021. Learning an Animatable Detailed 3D Face Model from In-The-Wild Images. ACM Transactions on Graphics (Proc. SIGGRAPH) 40 8. 10.1145\/3450626.3459936https:\/\/dl.acm.org\/doi\/10.1145\/3450626.3459936","DOI":"10.1145\/3450626.3459936"},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00125"},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_3_2_17_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Transactions on Audio Speech and Language Processing 29 (2021) 3451\u20133460.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"crossref","unstructured":"Ziyao Huang Fan Tang Yong Zhang Xiaodong Cun Juan Cao Jintao Li and Tong-Yee Lee. 2024. Make-Your-Anchor: A Diffusion-based 2D Avatar Generation Framework. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.16510 (2024).","DOI":"10.1109\/CVPR52733.2024.00668"},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_3_2_22_1","first-page":"1428","volume-title":"Proceedings of the 27th ACM international conference on multimedia","author":"KR Prajwal","year":"2019","unstructured":"Prajwal KR, Rudrabha Mukhopadhyay, Jerin Philip, Abhishek Jha, Vinay Namboodiri, and CV Jawahar. 2019. Towards automatic face-to-face translation. In Proceedings of the 27th ACM international conference on multimedia. 1428\u20131436."},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00696"},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"publisher","unstructured":"Yuanxun Lu Jinxiang Chai and Xun Cao. 2021. Live Speech Portraits: Real-Time Photorealistic Talking-Head Animation. ACM Transactions on Graphics 40 6 (2021) 17\u00a0pages. 10.1145\/3478513.3480484https:\/\/dl.acm.org\/doi\/10.1145\/3478513.3480484","DOI":"10.1145\/3478513.3480484"},{"key":"e_1_3_3_2_25_1","volume-title":"CVPR 2019","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming Yong, Juhyun Lee, Wan-Teh Chang, Wei Hua, Manfred Georg, and Matthias Grundmann. 2019. MediaPipe: A Framework for Perceiving and Processing Reality. In CVPR 2019."},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25280"},{"key":"e_1_3_3_2_27_1","unstructured":"Yifeng Ma Shiwei Zhang Jiayu Wang Xiang Wang Yingya Zhang and Zhidong Deng. 2023b. DreamTalk: When Expressive Talking Head Generation Meets Diffusion Probabilistic Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.09767 (2023)."},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul\u00a0P Srinivasan Matthew Tancik Jonathan\u00a0T Barron Ravi Ramamoorthi and Ren Ng. 2021. Nerf: Representing scenes as neural radiance fields for view synthesis. Commun. ACM 65 1 (2021) 99\u2013106.","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00070"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"e_1_3_3_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00372"},{"key":"e_1_3_3_2_35_1","unstructured":"Zhiyao Sun Tian Lv Sheng Ye Matthieu\u00a0Gaetan Lin Jenny Sheng Yu-Hui Wen Minjing Yu and Yong-jin Liu. 2023. Diffposetalk: Speech-driven stylistic 3d facial animation and head pose generation via diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.00434 (2023)."},{"key":"e_1_3_3_2_36_1","doi-asserted-by":"publisher","unstructured":"Supasorn Suwajanakorn Steven\u00a0M. Seitz and Ira Kemelmacher-Shlizerman. 2017. Synthesizing Obama: learning lip sync from audio. ACM Trans. Graph. 36 4 Article 95 (jul 2017) 13\u00a0pages. 10.1145\/3072959.3073640https:\/\/dl.acm.org\/doi\/10.1145\/3072959.3073640","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_3_2_37_1","unstructured":"Jiaxiang Tang Kaisiyuan Wang Hang Zhou Xiaokang Chen Dongliang He Tianshu Hu Jingtuo Liu Gang Zeng and Jingdong Wang. 2022. Real-time Neural Radiance Talking Portrait Synthesis via Audio-spatial Decomposition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.12368 (2022)."},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"e_1_3_3_2_39_1","unstructured":"Laurens Van\u00a0der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9 11 (2008)."},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"e_1_3_3_2_41_1","volume-title":"NIPS","author":"Waswani A","year":"2017","unstructured":"A Waswani, N Shazeer, N Parmar, J Uszkoreit, L Jones, A Gomez, L Kaiser, and I Polosukhin. 2017. Attention is all you need. In NIPS."},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475280"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611775"},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00081"},{"key":"e_1_3_3_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_3_2_46_1","doi-asserted-by":"publisher","unstructured":"Xinwei Yao Ohad Fried Kayvon Fatahalian and Maneesh Agrawala. 2021. Iterative Text-Based Editing of Talking-Heads Using Neural Retargeting. ACM Trans. Graph. 40 3 Article 20 (aug 2021) 14\u00a0pages. 10.1145\/3449063https:\/\/dl.acm.org\/doi\/10.1145\/3449063","DOI":"10.1145\/3449063"},{"key":"e_1_3_3_2_47_1","unstructured":"Zhenhui Ye Ziyue Jiang Yi Ren Jinglin Liu Jinzheng He and Zhou Zhao. 2023. GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.13430 (2023)."},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"crossref","unstructured":"Zhimeng Zhang Zhipeng Hu Wenjin Deng Changjie Fan Tangjie Lv and Yu Ding. 2023. DINet: Deformation Inpainting Network for Realistic Face Visually Dubbing on High Resolution Video. arxiv:https:\/\/arXiv.org\/abs\/2303.03988\u00a0[cs.CV]","DOI":"10.1609\/aaai.v37i3.25464"},{"key":"e_1_3_3_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_3_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"e_1_3_3_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"publisher","unstructured":"Yang Zhou Xintong Han Eli Shechtman Jose Echevarria Evangelos Kalogerakis and Dingzeyu Li. 2020. MakeltTalk: speaker-aware talking-head animation. ACM Transactions on Graphics 39 6 (Nov. 2020) 1\u201315. 10.1145\/3414685.3417774https:\/\/dl.acm.org\/doi\/10.1145\/3414685.3417774","DOI":"10.1145\/3414685.3417774"},{"key":"e_1_3_3_2_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_38"},{"key":"e_1_3_3_2_55_1","doi-asserted-by":"crossref","unstructured":"Xiangyu Zhu Xiaoming Liu Zhen Lei and Stan\u00a0Z Li. 2017. Face alignment in full pose range: A 3d total solution. IEEE transactions on pattern analysis and machine intelligence 41 1 (2017) 78\u201392.","DOI":"10.1109\/TPAMI.2017.2778152"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687618","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687618","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:27Z","timestamp":1750294707000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687618"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":54,"alternative-id":["10.1145\/3680528.3687618","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687618","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}