{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T03:08:01Z","timestamp":1767928081785,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"name":"Swiss National Science Foundation","award":["PZ00P2_216294"],"award-info":[{"award-number":["PZ00P2_216294"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,3]]},"DOI":"10.1145\/3769047.3769049","type":"proceedings-article","created":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T12:23:26Z","timestamp":1764332606000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PhonemeNet: A Transformer Pipeline for Text-Driven Facial Animation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-9782-8118","authenticated-orcid":false,"given":"Philine","family":"Witzig","sequence":"first","affiliation":[{"name":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7494-8660","authenticated-orcid":false,"given":"Barbara","family":"Solenthaler","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9324-779X","authenticated-orcid":false,"given":"Markus","family":"Gross","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0158-1305","authenticated-orcid":false,"given":"Rafael","family":"Wampfler","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Z\u00fcrich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,2]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02009"},{"key":"e_1_3_3_2_3_1","series-title":"(NIPS \u201920)","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: a framework for self-supervised learning of speech representations. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS \u201920). Curran Associates Inc., Red Hook, NY, USA, Article 1044, 12\u00a0pages."},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","unstructured":"Thabo Beeler Fabian Hahn Derek Bradley Bernd Bickel Paul Beardsley Craig Gotsman Robert\u00a0W. Sumner and Markus Gross. 2011. High-quality passive facial performance capture using anchor frames. ACM Trans. Graph. 30 4 Article 75 (July 2011) 10\u00a0pages. 10.1145\/2010324.1964970","DOI":"10.1145\/2010324.1964970"},{"key":"e_1_3_3_2_5_1","unstructured":"Aggelina Chatziagapi Louis-Philippe Morency Hongyu Gong Michael Zollhoefer Dimitris Samaras and Alexander Richard. 2025. AV-Flow: Transforming Text to Audio-Visual Human-like Interactions. arxiv:https:\/\/arXiv.org\/abs\/2502.13133\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2502.13133"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"crossref","unstructured":"Pif Edwards Chris Landreth Eugene Fiume and Karan Singh. 2016. Jali: an animator-centric viseme model for expressive lip synchronization. ACM Transactions on graphics (TOG) 35 4 (2016) 1\u201311.","DOI":"10.1145\/2897824.2925984"},{"key":"e_1_3_3_2_9_1","first-page":"204","volume-title":"European Conference on Computer Vision","author":"Fan Xiangyu","year":"2024","unstructured":"Xiangyu Fan, Jiaqi Li, Zhiqian Lin, Weiye Xiao, and Lei Yang. 2024. UniTalker: Scaling up Audio-Driven 3D Facial Animation through A Unified Model. In European Conference on Computer Vision. Springer, Springer, Cham, Switzerland, 204\u2013221."},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"crossref","unstructured":"Yingruo Fan Zhaojiang Lin Jun Saito Wenping Wang and Taku Komura. 2022b. Joint audio-text model for expressive speech-driven 3d facial animation. Proceedings of the ACM on Computer Graphics and Interactive Techniques 5 1 (2022) 1\u201315.","DOI":"10.1145\/3522615"},{"key":"e_1_3_3_2_12_1","first-page":"e70073","volume-title":"Computer Graphics Forum","author":"Haque Kazi\u00a0Injamamul","year":"2025","unstructured":"Kazi\u00a0Injamamul Haque, Alkiviadis Pavlou, and Zerrin Yumak. 2025. \u201cWild West\u201d of Evaluating Speech-Driven 3D Facial Animation Synthesis: A Benchmark Study. In Computer Graphics Forum. Wiley Online Library, John Wiley & Sons, Hoboken, NJ, USA, e70073."},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00842"},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"crossref","unstructured":"Tero Karras Timo Aila Samuli Laine Antti Herva and Jaakko Lehtinen. 2017. Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Transactions on Graphics (TOG) 36 4 (2017) 1\u201312.","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_3_2_15_1","unstructured":"Rithesh Kumar Jose Sotelo Kundan Kumar Alexandre de Brebisson and Yoshua Bengio. 2017. ObamaNet: Photo-realistic Lip-sync from Text. arxiv:https:\/\/arXiv.org\/abs\/1801.01442\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1801.01442"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16286"},{"key":"e_1_3_3_2_17_1","unstructured":"Liying Lu Tianke Zhang Yunfei Liu Xuangeng Chu and Yu Li. 2023. Audio-Driven 3D Facial Animation from In-the-Wild Videos. arxiv:https:\/\/arXiv.org\/abs\/2306.11541\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2306.11541"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1067"},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01975"},{"key":"e_1_3_3_2_21_1","unstructured":"NVIDIA : Chaeyeon Chung Ilya Fedorov Michael Huang Aleksey Karmanov Dmitry Korobchenko Roger Ribera and Yeongho Seol. 2025. Audio2Face-3D: Audio-driven Realistic Facial Animation For Digital Avatars. arxiv:https:\/\/arXiv.org\/abs\/2508.16401\u00a0[cs.GR] https:\/\/arxiv.org\/abs\/2508.16401"},{"key":"e_1_3_3_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"e_1_3_3_2_23_1","unstructured":"Yi Ren Chenxu Hu Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2022. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. arxiv:https:\/\/arXiv.org\/abs\/2006.04558\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2006.04558"},{"key":"e_1_3_3_2_24_1","volume-title":"Advances in Neural Information Processing Systems","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. FastSpeech: Fast, Robust and Controllable Text to Speech. In Advances in Neural Information Processing Systems , H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.), Vol.\u00a032. Curran Associates, Inc., Red Hook, NY, USA. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/f63f65b503e22cb970527f23c9ad7db1-Paper.pdf"},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096441"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00009"},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623264.3624447"},{"key":"e_1_3_3_2_29_1","unstructured":"Guinan Su Yanwu Yang and Zhifeng Li. 2023. DualTalker: A Cross-Modal Dual Learning Approach for Speech-Driven 3D Facial Animation. arxiv:https:\/\/arXiv.org\/abs\/2311.04766\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2311.04766"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","unstructured":"Zhiyao Sun Tian Lv Sheng Ye Matthieu Lin Jenny Sheng Yu-Hui Wen Minjing Yu and Yong-Jin Liu. 2024. DiffPoseTalk: Speech-Driven Stylistic 3D Facial Animation and Head Pose Generation via Diffusion Models. ACM Trans. Graph. 43 4 Article 46 (July 2024) 9\u00a0pages. 10.1145\/3658221","DOI":"10.1145\/3658221"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"crossref","unstructured":"Sarah Taylor Taehwan Kim Yisong Yue Moshe Mahler James Krahe Anastasio\u00a0Garcia Rodriguez Jessica Hodgins and Iain Matthews. 2017. A deep learning approach for generalized speech animation. ACM Transactions On Graphics (TOG) 36 4 (2017) 1\u201311.","DOI":"10.1145\/3072959.3073699"},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3721238.3730711"},{"key":"e_1_3_3_2_33_1","first-page":"275","volume-title":"Proceedings of the 11th ACM SIGGRAPH\/Eurographics conference on Computer Animation","author":"Taylor Sarah\u00a0L","year":"2012","unstructured":"Sarah\u00a0L Taylor, Moshe Mahler, Barry-John Theobald, and Iain Matthews. 2012. Dynamic units of visual speech. In Proceedings of the 11th ACM SIGGRAPH\/Eurographics conference on Computer Animation. Association for Computing Machinery, New York, NY, USA, 275\u2013284."},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3681755.3688939"},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3721238.3730762"},{"key":"e_1_3_3_2_36_1","unstructured":"Zhongjian Wang Peng Zhang Jinwei Qi Guangyuan Wang Chaonan Ji Sheng Xu Bang Zhang and Liefeng Bo. 2025. OmniTalker: One-shot Real-time Text-Driven Talking Audio-Video Generation With Multimodal Style Mimicking. arxiv:https:\/\/arXiv.org\/abs\/2504.02433\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2504.02433"},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640794.3665541"},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3677388.3696336"},{"key":"e_1_3_3_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3677388.3696320"},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_3_2_41_1","first-page":"236","volume-title":"European Conference on Computer Vision","author":"Xu Zhihao","year":"2024","unstructured":"Zhihao Xu, Shengjie Gong, Jiapeng Tang, Lingyu Liang, Yining Huang, Haojie Li, and Shuangping Huang. 2024. KMTalk: Speech-Driven 3D Facial Animation with Key Motion Embedding. In European Conference on Computer Vision. Springer, Springer, Cham, Switzerland, 236\u2013253."},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02577"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657413"}],"event":{"name":"MIG '25: The 18th ACM SIGGRAPH Conference on Motion, Interaction, and Games","location":"Zurich Switzerland","acronym":"MIG '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the 2025 18th ACM SIGGRAPH Conference on Motion, Interaction, and Games"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3769047.3769049","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T20:56:31Z","timestamp":1767905791000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3769047.3769049"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,2]]},"references-count":42,"alternative-id":["10.1145\/3769047.3769049","10.1145\/3769047"],"URL":"https:\/\/doi.org\/10.1145\/3769047.3769049","relation":{},"subject":[],"published":{"date-parts":[[2025,12,2]]},"assertion":[{"value":"2025-12-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}