{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T18:27:13Z","timestamp":1780424833542,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730672","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Model See Model Do: Speech-Driven Facial Animation with Style Control"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5194-0359","authenticated-orcid":false,"given":"Yifang","family":"Pan","sequence":"first","affiliation":[{"name":"Department of Computer Science, Dynamic Graphics Project, University of Toronto, North York, Ontario, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2232-7480","authenticated-orcid":false,"given":"Karan","family":"Singh","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Dynamic Graphics Project, University of Toronto, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8311-0728","authenticated-orcid":false,"given":"Luiz Gustavo","family":"Hafemann","sequence":"additional","affiliation":[{"name":"La Forge, Ubisoft, Montreal, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-4-431-66911-1_13"},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"publisher","unstructured":"Daniel Cudeiro Timo Bolkart Cassidy Laidlaw Anurag Ranjan and Michael\u00a0J. Black. 2019. Capture Learning and Synthesis of 3D Speaking Styles. 10.48550\/arXiv.1905.03079arXiv:https:\/\/arXiv.org\/abs\/1905.03079 [cs] version: 1.","DOI":"10.48550\/arXiv.1905.03079"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"publisher","unstructured":"Radek Dan\u011b\u010dek Michael\u00a0J. Black and Timo Bolkart. 2022. EMOCA: Emotion Driven Monocular Face Capture and Animation. 10.48550\/arXiv.2204.11312arXiv:https:\/\/arXiv.org\/abs\/2204.11312 [cs].","DOI":"10.48550\/arXiv.2204.11312"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","unstructured":"Pif Edwards Chris Landreth Eugene Fiume and Karan Singh. 2016. JALI: an animator-centric viseme model for expressive lip synchronization. ACM Transactions on Graphics 35 4 (July 2016) 1\u201311. 10.1145\/2897824.2925984","DOI":"10.1145\/2897824.2925984"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","unstructured":"Bernhard Egger William A.\u00a0P. Smith Ayush Tewari Stefanie Wuhrer Michael Zollhoefer Thabo Beeler Florian Bernard Timo Bolkart Adam Kortylewski Sami Romdhani Christian Theobalt Volker Blanz and Thomas Vetter. 2020. 3D Morphable Face Models\u2014Past Present and Future. ACM Transactions on Graphics 39 5 (June 2020) 157:1\u2013157:38. 10.1145\/3395208","DOI":"10.1145\/3395208"},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"publisher","unstructured":"Yingruo Fan Zhaojiang Lin Jun Saito Wenping Wang and Taku Komura. 2022. FaceFormer: Speech-Driven 3D Facial Animation with Transformers. 10.48550\/arXiv.2112.05329arXiv:https:\/\/arXiv.org\/abs\/2112.05329 [cs].","DOI":"10.48550\/arXiv.2112.05329"},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","unstructured":"Gabriele Fanelli Juergen Gall Harald Romsdorfer Thibaut Weise and Luc Van\u00a0Gool. 2010. A 3-D Audio-Visual Corpus of Affective Communication. IEEE Transactions on Multimedia 12 6 (Oct. 2010) 591\u2013598. 10.1109\/TMM.2010.2052239Conference Name: IEEE Transactions on Multimedia.","DOI":"10.1109\/TMM.2010.2052239"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","unstructured":"Yao Feng Haiwen Feng Michael\u00a0J. Black and Timo Bolkart. 2021. Learning an Animatable Detailed 3D Face Model from In-The-Wild Images. 10.48550\/arXiv.2012.04012arXiv:https:\/\/arXiv.org\/abs\/2012.04012 [cs].","DOI":"10.48550\/arXiv.2012.04012"},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"publisher","unstructured":"Cletus\u00a0G. Fisher. 1968. Confusions Among Visually Perceived Consonants. Journal of Speech and Hearing Research 11 4 (Dec. 1968) 796\u2013804. 10.1044\/jshr.1104.796Publisher: American Speech-Language-Hearing Association.","DOI":"10.1044\/jshr.1104.796"},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"publisher","unstructured":"Saeed Ghorbani Ylva Ferstl Daniel Holden Nikolaus\u00a0F. Troje and Marc-Andr\u00e9 Carbonneau. 2022. ZeroEGGS: Zero-shot Example-based Gesture Generation from Speech. 10.48550\/arXiv.2209.07556arXiv:https:\/\/arXiv.org\/abs\/2209.07556 [cs].","DOI":"10.48550\/arXiv.2209.07556"},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"publisher","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. 10.48550\/arXiv.2006.11239arXiv:https:\/\/arXiv.org\/abs\/2006.11239 [cs].","DOI":"10.48550\/arXiv.2006.11239"},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"publisher","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-Free Diffusion Guidance. 10.48550\/arXiv.2207.12598arXiv:https:\/\/arXiv.org\/abs\/2207.12598 [cs].","DOI":"10.48550\/arXiv.2207.12598"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. 10.48550\/arXiv.2106.07447arXiv:https:\/\/arXiv.org\/abs\/2106.07447 [cs].","DOI":"10.48550\/arXiv.2106.07447"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","unstructured":"Arthur Josi Luiz\u00a0Gustavo Hafemann Abdallah Dib Emeline Got Rafael M.\u00a0O. Cruz and Marc-Andre Carbonneau. 2024. SEREP: Semantic Facial Expression Representation for Robust In-the-Wild Capture and Retargeting. 10.48550\/arXiv.2412.14371arXiv:https:\/\/arXiv.org\/abs\/2412.14371 [cs].","DOI":"10.48550\/arXiv.2412.14371"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"publisher","unstructured":"Tero Karras Timo Aila Samuli Laine Antti Herva and Jaakko Lehtinen. 2017. Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. 36 4 (July 2017) 94:1\u201394:12. 10.1145\/3072959.3073658","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"publisher","unstructured":"Korrawe Karunratanakul Konpat Preechakul Emre Aksan Thabo Beeler Supasorn Suwajanakorn and Siyu Tang. 2024. Optimizing Diffusion Noise Can Serve As Universal Motion Priors. 10.48550\/arXiv.2312.11994arXiv:https:\/\/arXiv.org\/abs\/2312.11994 [cs].","DOI":"10.48550\/arXiv.2312.11994"},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","unstructured":"Tianye Li Timo Bolkart Michael\u00a0J. Black Hao Li and Javier Romero. 2017. Learning a model of facial shape and expression from 4D scans. ACM Transactions on Graphics 36 6 (Dec. 2017) 1\u201317. 10.1145\/3130800.3130813","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"publisher","unstructured":"Steven\u00a0R. Livingstone and Frank\u00a0A. Russo. 2018. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic multimodal set of facial and vocal expressions in North American English. PLOS ONE 13 5 (May 2018) e0196391. 10.1371\/journal.pone.0196391Publisher: Public Library of Science.","DOI":"10.1371\/journal.pone.0196391"},{"key":"e_1_3_3_2_22_1","doi-asserted-by":"publisher","unstructured":"Camillo Lugaresi Jiuqiang Tang Hadon Nash Chris McClanahan Esha Uboweja Michael Hays Fan Zhang Chuo-Ling Chang Ming\u00a0Guang Yong Juhyun Lee Wan-Teh Chang Wei Hua Manfred Georg and Matthias Grundmann. 2019. MediaPipe: A Framework for Building Perception Pipelines. 10.48550\/arXiv.1906.08172arXiv:https:\/\/arXiv.org\/abs\/1906.08172 [cs].","DOI":"10.48550\/arXiv.1906.08172"},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","unstructured":"Yifeng Ma Suzhen Wang Yu Ding Bowen Ma Tangjie Lv Changjie Fan Zhipeng Hu Zhidong Deng and Xin Yu. 2024. TalkCLIP: Talking Head Generation with Text-Guided Expressive Speaking Styles. 10.48550\/arXiv.2304.00334arXiv:https:\/\/arXiv.org\/abs\/2304.00334 [cs].","DOI":"10.48550\/arXiv.2304.00334"},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511843891.014"},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555408"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"publisher","unstructured":"Ziqiao Peng Haoyu Wu Zhenbo Song Hao Xu Xiangyu Zhu Jun He Hongyan Liu and Zhaoxin Fan. 2023. EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation. 10.48550\/arXiv.2303.11089arXiv:https:\/\/arXiv.org\/abs\/2303.11089 [cs].","DOI":"10.48550\/arXiv.2303.11089"},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"publisher","unstructured":"Alexander Richard Michael Zollhoefer Yandong Wen Fernando de\u00a0la Torre and Yaser Sheikh. 2022. MeshTalk: 3D Face Animation from Speech using Cross-Modality Disentanglement. 10.48550\/arXiv.2104.08223arXiv:https:\/\/arXiv.org\/abs\/2104.08223 [cs].","DOI":"10.48550\/arXiv.2104.08223"},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"publisher","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. 10.48550\/arXiv.2112.10752arXiv:https:\/\/arXiv.org\/abs\/2112.10752 [cs].","DOI":"10.48550\/arXiv.2112.10752"},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"crossref","unstructured":"Zhiyao Sun Tian Lv Sheng Ye Matthieu Lin Jenny Sheng Yu-Hui Wen Minjing Yu and Yong-jin Liu. 2024. Diffposetalk: Speech-driven stylistic 3d facial animation and head pose generation via diffusion models. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u20139.","DOI":"10.1145\/3658221"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","unstructured":"Guy Tevet Sigal Raab Brian Gordon Yonatan Shafir Daniel Cohen-Or and Amit\u00a0H. Bermano. 2022. Human Motion Diffusion Model. 10.48550\/arXiv.2209.14916arXiv:https:\/\/arXiv.org\/abs\/2209.14916 [cs].","DOI":"10.48550\/arXiv.2209.14916"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","unstructured":"Balamurugan Thambiraja Ikhsanul Habibie Sadegh Aliakbarian Darren Cosker Christian Theobalt and Justus Thies. 2022. Imitator: Personalized Speech-driven 3D Facial Animation. 10.48550\/arXiv.2301.00023arXiv:https:\/\/arXiv.org\/abs\/2301.00023 [cs].","DOI":"10.48550\/arXiv.2301.00023"},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"publisher","unstructured":"Guillermo Valle-P\u00e9rez Gustav\u00a0Eje Henter Jonas Beskow Andr\u00e9 Holzapfel Pierre-Yves Oudeyer and Simon Alexanderson. 2021. Transflower: probabilistic autoregressive dance generation with multimodal attention. ACM Transactions on Graphics 40 6 (Dec. 2021) 1\u201314. 10.1145\/3478513.3480570arXiv:https:\/\/arXiv.org\/abs\/2106.13871 [cs].","DOI":"10.1145\/3478513.3480570"},{"key":"e_1_3_3_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","unstructured":"Yuxuan Wang Daisy Stanton Yu Zhang R.\u00a0J. Skerry-Ryan Eric Battenberg Joel Shor Ying Xiao Fei Ren Ye Jia and Rif\u00a0A. Saurous. 2018. Style Tokens: Unsupervised Style Modeling Control and Transfer in End-to-End Speech Synthesis. 10.48550\/arXiv.1803.09017arXiv:https:\/\/arXiv.org\/abs\/1803.09017 [cs].","DOI":"10.48550\/arXiv.1803.09017"},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_3_2_36_1","doi-asserted-by":"publisher","unstructured":"Sicheng Xu Guojun Chen Yu-Xiao Guo Jiaolong Yang Chong Li Zhenyu Zang Yizhong Zhang Xin Tong and Baining Guo. 2024. VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time. 10.48550\/arXiv.2404.10667arXiv:https:\/\/arXiv.org\/abs\/2404.10667 [cs].","DOI":"10.48550\/arXiv.2404.10667"},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","unstructured":"Hongdi Yang Chengyang Li Zhenxuan Wu Gaozheng Li Jingya Wang Jingyi Yu Zhuo Su and Lan Xu. 2024. SMGDiff: Soccer Motion Generation using diffusion probabilistic models. 10.48550\/arXiv.2411.16216arXiv:https:\/\/arXiv.org\/abs\/2411.16216 [cs].","DOI":"10.48550\/arXiv.2411.16216"},{"key":"e_1_3_3_2_38_1","doi-asserted-by":"publisher","unstructured":"Jianhui Yu Hao Zhu Liming Jiang Chen\u00a0Change Loy Weidong Cai and Wayne Wu. 2023. CelebV-Text: A Large-Scale Facial Text-Video Dataset. 10.48550\/arXiv.2303.14717arXiv:https:\/\/arXiv.org\/abs\/2303.14717 [cs].","DOI":"10.48550\/arXiv.2303.14717"},{"key":"e_1_3_3_2_39_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10761"},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","unstructured":"Wenxuan Zhang Xiaodong Cun Xuan Wang Yong Zhang Xi Shen Yu Guo Ying Shan and Fei Wang. 2023. SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation. 10.48550\/arXiv.2211.12194arXiv:https:\/\/arXiv.org\/abs\/2211.12194 [cs].","DOI":"10.48550\/arXiv.2211.12194"},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","unstructured":"Ya-Jie Zhang Shifeng Pan Lei He and Zhen-Hua Ling. 2019. Learning Latent Representations for Style Control and Transfer in End-to-end Speech Synthesis. 10.48550\/arXiv.1812.04342arXiv:https:\/\/arXiv.org\/abs\/1812.04342 [cs].","DOI":"10.48550\/arXiv.1812.04342"},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","unstructured":"Qingcheng Zhao Pengyu Long Qixuan Zhang Dafei Qin Han Liang Longwen Zhang Yingliang Zhang Jingyi Yu and Lan Xu. 2024. Media2Face: Co-speech Facial Animation Generation With Multi-Modality Guidance. 10.48550\/arXiv.2401.15687arXiv:https:\/\/arXiv.org\/abs\/2401.15687 [cs].","DOI":"10.48550\/arXiv.2401.15687"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","unstructured":"Yang Zhou Xintong Han Eli Shechtman Jose Echevarria Evangelos Kalogerakis and Dingzeyu Li. 2020. MakeItTalk: Speaker-Aware Talking-Head Animation. ACM Transactions on Graphics 39 6 (Dec. 2020) 1\u201315. 10.1145\/3414685.3417774arXiv:https:\/\/arXiv.org\/abs\/2004.12992 [cs].","DOI":"10.1145\/3414685.3417774"},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"publisher","unstructured":"Hao Zhu Wayne Wu Wentao Zhu Liming Jiang Siwei Tang Li Zhang Ziwei Liu and Chen\u00a0Change Loy. 2022. CelebV-HQ: A Large-Scale Video Facial Attributes Dataset. 10.48550\/arXiv.2207.12393arXiv:https:\/\/arXiv.org\/abs\/2207.12393 [cs].","DOI":"10.48550\/arXiv.2207.12393"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730672","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:54:14Z","timestamp":1774018454000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730672"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":43,"alternative-id":["10.1145\/3721238.3730672","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730672","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}