{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T08:15:06Z","timestamp":1774685706337,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,11,15]],"date-time":"2023-11-15T00:00:00Z","timestamp":1700006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,11,15]]},"DOI":"10.1145\/3623264.3624451","type":"proceedings-article","created":{"date-parts":[[2023,10,29]],"date-time":"2023-10-29T21:02:31Z","timestamp":1698613351000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Audiovisual Inputs for Learning Robust, Real-time Facial Animation with Lip Sync"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8763-4134","authenticated-orcid":false,"given":"I\u00f1aki","family":"Navarro","sequence":"first","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8678-3767","authenticated-orcid":false,"given":"Dario","family":"Kneubuehler","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8451-6541","authenticated-orcid":false,"given":"Tijmen","family":"Verhulsdonck","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7169-3370","authenticated-orcid":false,"given":"Eloi","family":"Du Bois","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9706-5881","authenticated-orcid":false,"given":"William","family":"Welch","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6444-2241","authenticated-orcid":false,"given":"Charles","family":"Shang","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6206-0009","authenticated-orcid":false,"given":"Ian","family":"Sachs","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1074-0953","authenticated-orcid":false,"given":"Morgan","family":"Mcguire","sequence":"additional","affiliation":[{"name":"Roblox, USA and University of Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7309-7013","authenticated-orcid":false,"given":"Victor","family":"Zordan","sequence":"additional","affiliation":[{"name":"Roblox, USA and Clemson University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1221-7256","authenticated-orcid":false,"given":"Kiran","family":"Bhat","sequence":"additional","affiliation":[{"name":"Roblox, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,11,15]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2009-2017. \"Your Weekly Address\". https:\/\/obamawhitehouse.archives.gov\/briefing-room\/weekly-address."},{"key":"e_1_3_2_2_2_1","unstructured":"2022. Tensorflow Graph Transform. https:\/\/github.com\/tensorflow\/tensorflow\/tree\/master\/tensorflow\/tools\/graph_transforms."},{"key":"e_1_3_2_2_3_1","unstructured":"Apple. 2021. ARKit Developer Documentation. https:\/\/developer.apple.com\/documentation\/arkit\/arfaceanchor"},{"key":"e_1_3_2_2_4_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020), 12449\u201312460."},{"key":"e_1_3_2_2_5_1","volume-title":"Subpixel heatmap regression for facial landmark localization. arXiv preprint arXiv:2111.02360","author":"Bulat Adrian","year":"2021","unstructured":"Adrian Bulat, Enrique Sanchez, and Georgios Tzimiropoulos. 2021. Subpixel heatmap regression for facial landmark localization. arXiv preprint arXiv:2111.02360 (2021)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00709"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461502"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_2_2_10_1","volume-title":"Facial action coding system. Environmental Psychology & Nonverbal Behavior","author":"Ekman Paul","year":"1978","unstructured":"Paul Ekman and Wallace\u00a0V Friesen. 1978. Facial action coding system. Environmental Psychology & Nonverbal Behavior (1978)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459936"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_33"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_2_17_1","volume-title":"Attention Mesh: High-fidelity Face Mesh Prediction in Real-time. arxiv:2006.10962\u00a0[cs.CV]","author":"Grishchenko Ivan","year":"2020","unstructured":"Ivan Grishchenko, Artsiom Ablavatski, Yury Kartynnik, Karthik Raveendran, and Matthias Grundmann. 2020. Attention Mesh: High-fidelity Face Mesh Prediction in Real-time. arxiv:2006.10962\u00a0[cs.CV]"},{"key":"e_1_3_2_2_18_1","volume-title":"PFLD: A Practical Facial Landmark Detector. CoRR abs\/1902.10859","author":"Guo Xiaojie","year":"2019","unstructured":"Xiaojie Guo, Siyuan Li, Jiawan Zhang, Jiayi Ma, Lin Ma, Wei Liu, and Haibin Ling. 2019. PFLD: A Practical Facial Landmark Detector. CoRR abs\/1902.10859 (2019). arxiv:1902.10859http:\/\/arxiv.org\/abs\/1902.10859"},{"key":"e_1_3_2_2_19_1","volume-title":"Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567","author":"Hannun Awni","year":"2014","unstructured":"Awni Hannun, Carl Case, Jared Casper, Bryan Catanzaro, Greg Diamos, Erich Elsen, Ryan Prenger, Sanjeev Satheesh, Shubho Sengupta, Adam Coates, 2014. Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567 (2014)."},{"key":"e_1_3_2_2_20_1","volume-title":"FaceXHuBERT: Text-less Speech-driven E (X) pressive 3D Facial Animation Synthesis Using Self-Supervised Speech Representation Learning. arXiv preprint arXiv:2303.05416","author":"Haque Kazi\u00a0Injamamul","year":"2023","unstructured":"Kazi\u00a0Injamamul Haque and Zerrin Yumak. 2023. FaceXHuBERT: Text-less Speech-driven E (X) pressive 3D Facial Animation Synthesis Using Self-Supervised Speech Representation Learning. arXiv preprint arXiv:2303.05416 (2023)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Sina Honari Pavlo Molchanov Stephen Tyree Pascal Vincent Christopher Pal and Jan Kautz. 2018. Improving Landmark Localization with Semi-Supervised Learning. arxiv:1709.01591\u00a0[cs.CV]","DOI":"10.1109\/CVPR.2018.00167"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418840"},{"key":"e_1_3_2_2_24_1","volume-title":"International conference on machine learning. PMLR, 448\u2013456","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. In International conference on machine learning. PMLR, 448\u2013456."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00826"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3099564.3099581"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_2_2_29_1","volume-title":"Proc. icml, Vol.\u00a030","author":"Maas L","year":"2013","unstructured":"Andrew\u00a0L Maas, Awni\u00a0Y Hannun, Andrew\u00a0Y Ng, 2013. Rectifier nonlinearities improve neural network acoustic models. In Proc. icml, Vol.\u00a030. Atlanta, Georgia, USA, 3."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Michael McAuliffe Michaela Socolof Sarah Mihuc Michael Wagner and Morgan Sonderegger. 2017. Montreal Forced Aligner: Trainable Text-Speech Alignment Using Kaldi.. In Interspeech Vol.\u00a02017. 498\u2013502.","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1016\/S1090-5138(99)00014-8"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"e_1_3_2_2_35_1","volume-title":"Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation. CoRR abs\/1801.04381","author":"Sandler Mark","year":"2018","unstructured":"Mark Sandler, Andrew\u00a0G. Howard, Menglong Zhu, Andrey Zhmoginov, and Liang-Chieh Chen. 2018. Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation. CoRR abs\/1801.04381 (2018). arXiv:1801.04381http:\/\/arxiv.org\/abs\/1801.04381"},{"key":"e_1_3_2_2_36_1","unstructured":"Tencent. 2023. NCNN. https:\/\/github.com\/Tencent\/ncnn."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_10"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_2_2_39_1","unstructured":"Yufei Xu Jing Zhang Qiming Zhang and Dacheng Tao. 2022. ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_2_40_1","volume-title":"High Performance Zero-Memory Overhead Direct Convolutions. CoRR abs\/1809.10170","author":"Zhang Jiyuan","year":"2018","unstructured":"Jiyuan Zhang, Franz Franchetti, and Tze\u00a0Meng Low. 2018. High Performance Zero-Memory Overhead Direct Convolutions. CoRR abs\/1809.10170 (2018). arXiv:1809.10170http:\/\/arxiv.org\/abs\/1809.10170"},{"key":"e_1_3_2_2_41_1","volume-title":"Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks. CoRR abs\/1604.02878","author":"Zhang Kaipeng","year":"2016","unstructured":"Kaipeng Zhang, Zhanpeng Zhang, Zhifeng Li, and Yu Qiao. 2016. Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Networks. CoRR abs\/1604.02878 (2016). arxiv:1604.02878http:\/\/arxiv.org\/abs\/1604.02878"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3197517.3201292","article-title":"Visemenet: Audio-driven animator-centric speech animation","volume":"37","author":"Zhou Yang","year":"2018","unstructured":"Yang Zhou, Zhan Xu, Chris Landreth, Evangelos Kalogerakis, Subhransu Maji, and Karan Singh. 2018. Visemenet: Audio-driven animator-centric speech animation. ACM Transactions on Graphics (TOG) 37, 4 (2018), 1\u201310.","journal-title":"ACM Transactions on Graphics (TOG)"}],"event":{"name":"MIG '23: The 16th ACM SIGGRAPH Conference on Motion, Interaction and Games","location":"Rennes France","acronym":"MIG '23","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["ACM SIGGRAPH Conference on Motion Interaction and Games"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623264.3624451","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3623264.3624451","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T16:23:29Z","timestamp":1756484609000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623264.3624451"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,15]]},"references-count":42,"alternative-id":["10.1145\/3623264.3624451","10.1145\/3623264"],"URL":"https:\/\/doi.org\/10.1145\/3623264.3624451","relation":{},"subject":[],"published":{"date-parts":[[2023,11,15]]},"assertion":[{"value":"2023-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}