{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:47:58Z","timestamp":1774021678992,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730711","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["xADA: Controllable and Expressive Audio-Driven Animation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1299-0446","authenticated-orcid":false,"given":"Sarah","family":"Taylor","sequence":"first","affiliation":[{"name":"Epic Games, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8350-7651","authenticated-orcid":false,"given":"Salvador","family":"Medina","sequence":"additional","affiliation":[{"name":"Epic Games, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4985-2282","authenticated-orcid":false,"given":"Jonathan","family":"Windle","sequence":"additional","affiliation":[{"name":"Epic Games, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0175-0257","authenticated-orcid":false,"given":"Erica","family":"Alcusa S\u00e1ez","sequence":"additional","affiliation":[{"name":"Epic Games, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5004-2397","authenticated-orcid":false,"given":"Iain","family":"Matthews","sequence":"additional","affiliation":[{"name":"Epic Games, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_2_2_1","doi-asserted-by":"crossref","unstructured":"Simon Alexanderson Rajmund Nagy Jonas Beskow and Gustav\u00a0Eje Henter. 2023. Listen denoise action! audio-driven motion synthesis with diffusion models. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201320.","DOI":"10.1145\/3592458"},{"key":"e_1_3_3_2_3_1","unstructured":"MetaHuman Animator. 2024. http:\/\/www.unrealengine.com\/en-US\/metahuman."},{"key":"e_1_3_3_2_4_1","unstructured":"Alexei Baevski Yuhao Zhou Abdelrahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020) 12449\u201312460."},{"key":"e_1_3_3_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311537"},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596787"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"crossref","unstructured":"Yong Cao Wen\u00a0C Tien Petros Faloutsos and Fr\u00e9d\u00e9ric Pighin. 2005. Expressive speech-driven facial animation. ACM Transactions on Graphics (TOG) 24 4 (2005) 1283\u20131302.","DOI":"10.1145\/1095878.1095881"},{"key":"e_1_3_3_2_8_1","unstructured":"Antoni\u00a0Bigata Casademunt Rodrigo Mira Nikita Drobyshev Konstantinos Vougioukas Stavros Petridis and Maja Pantic. 2023. Laughing matters: Introducing laughing-face generation using diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.08854 (2023)."},{"key":"e_1_3_3_2_9_1","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao et\u00a0al. 2022. Wavlm: Large-scale self-supervised pre-training for full stack speech processing. IEEE Journal of Selected Topics in Signal Processing 16 6 (2022) 1505\u20131518.","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW59549.2023.00073"},{"key":"e_1_3_3_2_11_1","unstructured":"Kyunghyun Cho Bart Van\u00a0Merri\u00ebnboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1406.1078 (2014)."},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555398"},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"e_1_3_3_2_14_1","doi-asserted-by":"crossref","unstructured":"Alan\u00a0S Cowen and Dacher Keltner. 2017. Self-report captures 27 distinct categories of emotion bridged by continuous gradients. Proceedings of the national academy of sciences 114 38 (2017) E7900\u2013E7909.","DOI":"10.1073\/pnas.1702247114"},{"key":"e_1_3_3_2_15_1","doi-asserted-by":"crossref","unstructured":"Alan\u00a0S Cowen and Dacher Keltner. 2020. What the face displays: Mapping 28 emotions conveyed by naturalistic expression. American Psychologist 75 3 (2020) 349.","DOI":"10.1037\/amp0000488"},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_3_2_19_1","doi-asserted-by":"crossref","unstructured":"Dorottya Demszky Dana Movshovitz-Attias Jeongwoo Ko Alan Cowen Gaurav Nemade and Sujith Ravi. 2020. GoEmotions: A dataset of fine-grained emotions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.00547 (2020).","DOI":"10.18653\/v1\/2020.acl-main.372"},{"key":"e_1_3_3_2_20_1","volume-title":"Statistical Shape Analysis","author":"Dryden Ian\u00a0L.","year":"1998","unstructured":"Ian\u00a0L. Dryden and Kanti\u00a0V. Mardia. 1998. Statistical Shape Analysis. Wiley."},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"crossref","unstructured":"Pif Edwards Chris Landreth Eugene Fiume and Karan Singh. 2016. Jali: an animator-centric viseme model for expressive lip synchronization. ACM Transactions on graphics (TOG) 35 4 (2016) 1\u201311.","DOI":"10.1145\/2897824.2925984"},{"key":"e_1_3_3_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93764-9_35"},{"key":"e_1_3_3_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CA.1998.681913"},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"crossref","unstructured":"Tony Ezzat and Tomaso Poggio. 2000. Visual speech synthesis by morphing visemes. International Journal of Computer Vision 38 (2000) 45\u201357.","DOI":"10.1023\/A:1008166717597"},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_3_2_26_1","doi-asserted-by":"crossref","unstructured":"Gabriele Fanelli Juergen Gall Harald Romsdorfer Thibaut Weise and Luc Van\u00a0Gool. 2010. A 3-d audio-visual corpus of affective communication. IEEE Transactions on Multimedia 12 6 (2010) 591\u2013598.","DOI":"10.1109\/TMM.2010.2052239"},{"key":"e_1_3_3_2_27_1","unstructured":"Leo Feng Frederick Tung Mohamed\u00a0Osama Ahmed Yoshua Bengio and Hossein Hajimirsadegh. 2024. Were rnns all we needed? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.01201 (2024)."},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"crossref","unstructured":"David Greenwood Iain Matthews and Stephen Laycock. 2018. Joint learning of facial expression and head pose from speech. Interspeech.","DOI":"10.21437\/Interspeech.2018-2587"},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"crossref","unstructured":"Aparna\u00a0R Gullapalli Nathaniel\u00a0E Anderson Rohit Yerramsetty Carla\u00a0L Harenski and Kent\u00a0A Kiehl. 2021. In the blink of an eye: Quantitative blink dynamics predict deceptive personality traits in forensic interviews. Personality and Individual Differences 176 (2021) 110764.","DOI":"10.1016\/j.paid.2021.110764"},{"key":"e_1_3_3_2_30_1","unstructured":"A Hannun. 2014. Deep Speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.5567 (2014)."},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3614157"},{"key":"e_1_3_3_2_32_1","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio speech and language processing 29 (2021) 3451\u20133460.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_2_33_1","unstructured":"Sunjin Jung Sewhan Chun and Junyong Noh. 2024. Audio-Driven Speech Animation with Text-Guided Expression. (2024)."},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CA.2001.982373"},{"key":"e_1_3_3_2_35_1","doi-asserted-by":"crossref","unstructured":"Tero Karras Timo Aila Samuli Laine Antti Herva and Jaakko Lehtinen. 2017. Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Transactions on Graphics (ToG) 36 4 (2017) 1\u201312.","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_3_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3616120"},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","unstructured":"Tianye Li Timo Bolkart Michael.\u00a0J. Black Hao Li and Javier Romero. 2017. Learning a model of facial shape and expression from 4D scans. ACM Transactions on Graphics (Proc. SIGGRAPH Asia) 36 6 (2017) 194:1\u2013194:17. 10.1145\/3130800.3130813","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_3_2_38_1","unstructured":"Gaojie Lin Jianwen Jiang Jiaqi Yang Zerong Zheng and Chao Liang. 2025. OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.01061 (2025)."},{"key":"e_1_3_3_2_39_1","volume-title":"Perceiving talking faces: From speech perception to a behavioral principle","author":"Massaro Dominic\u00a0W","year":"1998","unstructured":"Dominic\u00a0W Massaro. 1998. Perceiving talking faces: From speech perception to a behavioral principle. Mit Press."},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"publisher","unstructured":"Salvador Medina. 2024. Talking us into the Metaverse: Towards Realistic Streaming Speech-to-Face Animation. (4 2024). 10.1184\/R1\/25453366.v1","DOI":"10.1184\/R1\/25453366.v1"},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448411"},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01976"},{"key":"e_1_3_3_2_43_1","unstructured":"Autodesk Motion\u00a0Builder. 2024. http:\/\/www.autodesk.com\/products\/motionbuilder\/."},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"crossref","unstructured":"Helen O\u2019Reilly Delia Pigat Shimrit Fridenson Steve Berggren Shahar Tal Ofer Golan Sven B\u00f6lte Simon Baron-Cohen and Daniel Lundqvist. 2016. The EU-emotion stimulus set: a validation study. Behavior research methods 48 (2016) 567\u2013576.","DOI":"10.3758\/s13428-015-0601-4"},{"key":"e_1_3_3_2_45_1","doi-asserted-by":"crossref","unstructured":"Yifang Pan Rishabh Agrawal and Karan Singh. 2024. S3: Speech Script and Scene driven Head and Eye Animation. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u201312.","DOI":"10.1145\/3658172"},{"key":"e_1_3_3_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555408"},{"key":"e_1_3_3_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611734"},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"e_1_3_3_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3243017"},{"key":"e_1_3_3_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_3_2_52_1","first-page":"28492","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492\u201328518."},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00009"},{"key":"e_1_3_3_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_3_2_55_1","doi-asserted-by":"crossref","unstructured":"Ernst\u00a0H Rothauser. 1969. IEEE recommended practice for speech quality measurements. IEEE Transactions on Audio and Electroacoustics 17 3 (1969) 225\u2013246.","DOI":"10.1109\/TAU.1969.1162058"},{"key":"e_1_3_3_2_56_1","unstructured":"Vicon Shogun\u00a0Post. 2024. http:\/\/www.vicon.com\/software\/shogun\/."},{"key":"e_1_3_3_2_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623264.3624447"},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"crossref","unstructured":"Zhiyao Sun Tian Lv Sheng Ye Matthieu Lin Jenny Sheng Yu-Hui Wen Minjing Yu and Yong-jin Liu. 2024. Diffposetalk: Speech-driven stylistic 3d facial animation and head pose generation via diffusion models. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u20139.","DOI":"10.1145\/3658221"},{"key":"e_1_3_3_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00628"},{"key":"e_1_3_3_2_60_1","doi-asserted-by":"crossref","unstructured":"Supasorn Suwajanakorn Steven\u00a0M Seitz and Ira Kemelmacher-Shlizerman. 2017. Synthesizing obama: learning lip sync from audio. ACM Transactions on Graphics (ToG) 36 4 (2017) 1\u201313.","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_3_2_61_1","doi-asserted-by":"crossref","unstructured":"Sarah Taylor Taehwan Kim Yisong Yue Moshe Mahler James Krahe Anastasio\u00a0Garcia Rodriguez Jessica Hodgins and Iain Matthews. 2017. A deep learning approach for generalized speech animation. ACM Transactions On Graphics (TOG) 36 4 (2017) 1\u201311.","DOI":"10.1145\/3072959.3073699"},{"key":"e_1_3_3_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"e_1_3_3_2_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"e_1_3_3_2_64_1","unstructured":"5.4 Unreal\u00a0Engine. 2024. http:\/\/www.unrealengine.com\/."},{"key":"e_1_3_3_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2003.1200072"},{"key":"e_1_3_3_2_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_3_2_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611775"},{"key":"e_1_3_3_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_3_2_69_1","doi-asserted-by":"crossref","unstructured":"Sicheng Xu Guojun Chen Yu-Xiao Guo Jiaolong Yang Chong Li Zhenyu Zang Yizhong Zhang Xin Tong and Baining Guo. 2024. Vasa-1: Lifelike audio-driven talking faces generated in real time. Advances in Neural Information Processing Systems 37 (2024) 660\u2013684.","DOI":"10.52202\/079017-0021"},{"key":"e_1_3_3_2_70_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28594"},{"key":"e_1_3_3_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00589"},{"key":"e_1_3_3_2_72_1","doi-asserted-by":"crossref","unstructured":"Yang Zhou Zhan Xu Chris Landreth Evangelos Kalogerakis Subhransu Maji and Karan Singh. 2018. Visemenet: Audio-driven animator-centric speech animation. ACM Transactions on Graphics (TOG) 37 4 (2018) 1\u201310.","DOI":"10.1145\/3197517.3201292"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730711","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:54:24Z","timestamp":1774018464000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730711"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":71,"alternative-id":["10.1145\/3721238.3730711","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730711","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}