{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:08:59Z","timestamp":1765343339416,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"name":"The Beijing Natural Science Foundation","award":["JQ23016"],"award-info":[{"award-number":["JQ23016"]}]},{"name":"The Chinese National Natural Science Foundation Projects","award":["62476273 and 62406320"],"award-info":[{"award-number":["62476273 and 62406320"]}]},{"name":"The Science and Technology Development Fund of Macau Project","award":["0123\/2022\/A3, 0044\/2024\/AGJ, 0140\/2024\/AGJ, and 0084\/2024\/RIB2"],"award-info":[{"award-number":["0123\/2022\/A3, 0044\/2024\/AGJ, 0140\/2024\/AGJ, and 0084\/2024\/RIB2"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755190","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"7893-7901","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PESTalk: Speech-Driven 3D Facial Animation with Personalized Emotional Styles"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3393-1597","authenticated-orcid":false,"given":"Tianshun","family":"Han","sequence":"first","affiliation":[{"name":"Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4883-5552","authenticated-orcid":false,"given":"Benjia","family":"Zhou","sequence":"additional","affiliation":[{"name":"Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7788-9368","authenticated-orcid":false,"given":"Ajian","family":"Liu","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5780-8540","authenticated-orcid":false,"given":"Yanyan","family":"Liang","sequence":"additional","affiliation":[{"name":"Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8301-2706","authenticated-orcid":false,"given":"Du","family":"Zhang","sequence":"additional","affiliation":[{"name":"Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0791-189X","authenticated-orcid":false,"given":"Zhen","family":"Lei","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4735-2885","authenticated-orcid":false,"given":"Jun","family":"Wan","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and Macau University of Science and Technology, Macau, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 
33 (2020), 12449-12460."},{"key":"e_1_3_2_1_2_1","volume-title":"ICASSP 2020-2020 IEEE International conference on acoustics, speech and signal processing (ICASSP). IEEE, 7124-7128","author":"Bredin Herv\u00e9","year":"2020","unstructured":"Herv\u00e9 Bredin, Ruiqing Yin, Juan Manuel Coria, Gregory Gelly, Pavel Korshunov, Marvin Lavechin, Diego Fustes, Hadrien Titeux, Wassim Bouaziz, and Marie-Philippe Gill. 2020. Pyannote. audio: neural building blocks for speaker diarization. In ICASSP 2020-2020 IEEE International conference on acoustics, speech and signal processing (ICASSP). IEEE, 7124-7128."},{"key":"e_1_3_2_1_3_1","volume-title":"Crema-d: Crowd-sourced emotional multimodal actors dataset","author":"Cao Houwei","year":"2014","unstructured":"Houwei Cao, David G Cooper, Michael K Keutmann, Ruben C Gur, Ani Nenkova, and Ragini Verma. 2014. Crema-d: Crowd-sourced emotional multimodal actors dataset. IEEE transactions on affective computing, Vol. 5, 4 (2014), 377-390."},{"key":"e_1_3_2_1_4_1","volume-title":"DiffusionTalker: Personalization and Acceleration for Speech-Driven 3D Face Diffuser. arXiv preprint arXiv:2311.16565","author":"Chen Peng","year":"2023","unstructured":"Peng Chen, Xiaobao Wei, Ming Lu, Yitong Zhu, Naiming Yao, Xingyu Xiao, and Hui Chen. 2023. DiffusionTalker: Personalization and Acceleration for Speech-Driven 3D Face Diffuser. arXiv preprint arXiv:2311.16565 (2023)."},{"volume-title":"A Comparison of Metric Learning Loss Functions for End-To-End Speaker Verification","author":"Coria Juan M.","key":"e_1_3_2_1_5_1","unstructured":"Juan M. Coria, Herv\u00e9 Bredin, Sahar Ghannay, and Sophie Rosset. 2020. A Comparison of Metric Learning Loss Functions for End-To-End Speaker Verification. In Statistical Language and Speech Processing, Luis Espinosa-Anke, Carlos Mart\u00edn-Vide, and Irena Spasi\u00e7 (Eds.). Springer International Publishing, 137-148."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01967"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27945"},{"key":"e_1_3_2_1_11_1","unstructured":"Tianshun Han Shengnan Gui Yiqing Huang Baihui Li Lijian Liu Benjia Zhou Ning Jiang Quan Lu Ruicong Zhi Yanyan Liang et al. 2024. PMMTalk : Speech-Driven 3D Facial Animation from Complementary Pseudo Multi-modal Features. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"FaceXHuBERT: Text-less Speech-driven E (X) pressive 3D Facial Animation Synthesis Using Self-Supervised Speech Representation Learning. arXiv preprint arXiv:2303.05416","author":"Haque Kazi Injamamul","year":"2023","unstructured":"Kazi Injamamul Haque and Zerrin Yumak. 2023. FaceXHuBERT: Text-less Speech-driven E (X) pressive 3D Facial Animation Synthesis Using Self-Supervised Speech Representation Learning. arXiv preprint arXiv:2303.05416 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_1_14_1","volume-title":"Geon Kim, and Youngjae Yu.","author":"Kim Jisoo","year":"2024","unstructured":"Jisoo Kim, Jungbin Cho, Joonho Park, Soonmin Hwang, Da Eun Kim, Geon Kim, and Youngjae Yu. 2024. DEEPTalk: Dynamic Emotion Embedding for Probabilistic Speech-Driven 3D Face Animation. 
arXiv preprint arXiv:2408.06010 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3205170"},{"key":"e_1_3_2_1_16_1","volume-title":"Glditalker: Speech-driven 3d facial animation with graph latent diffusion transformer. arXiv preprint arXiv:2408.01826","author":"Lin Yihong","year":"2024","unstructured":"Yihong Lin, Zhaoxin Fan, Xianjia Wu, Lingyu Xiong, Liang Peng, Xiandong Li, Wenxiong Kang, Songju Lei, and Huang Xu. 2024. Glditalker: Speech-driven 3d facial animation with graph latent diffusion transformer. arXiv preprint arXiv:2408.01826 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Kan: Kolmogorov-arnold networks. arXiv preprint arXiv:2404.19756","author":"Liu Ziming","year":"2024","unstructured":"Ziming Liu, Yixuan Wang, Sachin Vaidya, Fabian Ruehle, James Halverson, Marin Solja\u010di\u0107, Thomas Y Hou, and Max Tegmark. 2024. Kan: Kolmogorov-arnold networks. arXiv preprint arXiv:2404.19756 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391"},{"key":"e_1_3_2_1_19_1","volume-title":"Juhyun Lee, et al.","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming Guang Yong, Juhyun Lee, et al., 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00602"},{"key":"e_1_3_2_1_21_1","volume-title":"Culture and emotional expression. Understanding Culture","author":"Matsumoto David","year":"2013","unstructured":"David Matsumoto. 2013. Culture and emotional expression. Understanding Culture (2013), 271-287."},{"key":"e_1_3_2_1_22_1","volume-title":"PortraitTalk: Towards Customizable One-Shot Audio-to-Talking Face Generation. arXiv preprint arXiv:2412.07754","author":"Nazarieh Fatemeh","year":"2024","unstructured":"Fatemeh Nazarieh, Zhenhua Feng, Diptesh Kanojia, Muhammad Awais, and Josef Kittler. 2024. PortraitTalk: Towards Customizable One-Shot Audio-to-Talking Face Generation. arXiv preprint arXiv:2412.07754 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"SAiD: Speech-driven Blendshape Facial Animation with Diffusion. arXiv preprint arXiv:2401.08655","author":"Park Inkyu","year":"2023","unstructured":"Inkyu Park and Jaewoong Cho. 2023. SAiD: Speech-driven Blendshape Facial Animation with Diffusion. arXiv preprint arXiv:2401.08655 (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611734"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"e_1_3_2_1_26_1","volume-title":"Train short, test long: Attention with linear biases enables input length extrapolation. arXiv preprint arXiv:2108.12409","author":"Press Ofir","year":"2021","unstructured":"Ofir Press, Noah A Smith, and Mike Lewis. 2021. Train short, test long: Attention with linear biases enables input length extrapolation. arXiv preprint arXiv:2108.12409 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Speaker verification using adapted Gaussian mixture models. Digital signal processing","author":"Reynolds Douglas A","year":"2000","unstructured":"Douglas A Reynolds, Thomas F Quatieri, and Robert B Dunn. 2000. Speaker verification using adapted Gaussian mixture models. Digital signal processing, Vol. 
10, 1-3 (2000), 19-41."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.measurement.2018.10.057"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681359"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3323452"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623264.3624447"},{"key":"e_1_3_2_1_34_1","volume-title":"Deformation transfer for triangle meshes. ACM Transactions on graphics (TOG)","author":"Sumner Robert W","year":"2004","unstructured":"Robert W Sumner and Jovan Popovi\u0107. 2004. Deformation transfer for triangle meshes. ACM Transactions on graphics (TOG), Vol. 23, 3 (2004), 399-405."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658221"},{"key":"e_1_3_2_1_36_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"e_1_3_2_1_38_1","volume-title":"3d-talkemo: Learning to synthesize 3d emotional talking head. arXiv preprint arXiv:2104.12051","author":"Wang Qianyun","year":"2021","unstructured":"Qianyun Wang, Zhenfeng Fan, and Shihong Xia. 2021. 3d-talkemo: Learning to synthesize 3d emotional talking head. arXiv preprint arXiv:2104.12051 (2021)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_41_1","volume-title":"MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation. arXiv preprint arXiv:2412.04448","author":"Zheng Longtao","year":"2024","unstructured":"Longtao Zheng, Yifan Zhang, Hanzhong Guo, Jiachun Pan, Zhenxiong Tan, Jiahao Lu, Chuanxin Tang, Bo An, and Shuicheng Yan. 2024. MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation. 
arXiv preprint arXiv:2412.04448 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00938"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755190","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:03Z","timestamp":1765343043000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755190"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3755190","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755190","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
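The record above is a standard Crossref REST API "work" envelope ({"status":"ok","message-type":"work","message":{...}}). Below is a minimal sketch of retrieving and reading it, assuming network access, the public api.crossref.org endpoint, and the third-party requests library; the mailto address is a placeholder of your choosing that opts into Crossref's polite pool, not a value taken from the record.

```python
import requests

# DOI taken from the record above.
DOI = "10.1145/3746027.3755190"

# Crossref's public REST API returns the same envelope shown above.
# The mailto parameter is optional and only identifies the caller to
# Crossref's polite pool (placeholder address, replace with your own).
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]

# Read a few of the fields present in the record above.
title = work["title"][0]
authors = ", ".join(f"{a['given']} {a['family']}" for a in work["author"])
print(title)
print(f"Authors: {authors}")
print(f"Pages: {work['page']}, references: {work['references-count']}")
print(f"Venue: {work['event']['name']} ({work['event']['acronym']})")
```

Note that list-valued fields such as title and author keep their list form even for single items, and hyphenated keys like references-count require bracket access rather than attribute access.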