{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:38:20Z","timestamp":1770917900379,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,2]],"date-time":"2018-10-02T00:00:00Z","timestamp":1538438400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,2]]},"DOI":"10.1145\/3242969.3243017","type":"proceedings-article","created":{"date-parts":[[2018,10,2]],"date-time":"2018-10-02T08:09:29Z","timestamp":1538467769000},"page":"361-365","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":48,"title":["End-to-end Learning for 3D Facial Animation from Speech"],"prefix":"10.1145","author":[{"given":"Hai Xuan","family":"Pham","sequence":"first","affiliation":[{"name":"Rutgers University, Piscataway, NJ, USA"}]},{"given":"Yuting","family":"Wang","sequence":"additional","affiliation":[{"name":"Rutgers University, Piscataway, NJ, USA"}]},{"given":"Vladimir","family":"Pavlovic","sequence":"additional","affiliation":[{"name":"Rutgers University, Piscataway, NJ, USA"}]}],"member":"320","published-online":{"date-parts":[[2018,10,2]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2339736"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288864"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"V. Blanz C. Basso T. Poggio and T. Vetter . 1999. Reanimating faces in images and video. In SIGGAPH. 187--194. V. Blanz C. Basso T. Poggio and T. Vetter . 1999. Reanimating faces in images and video. In SIGGAPH. 187--194.","DOI":"10.1145\/311535.311556"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"V. Blanz and T. Vetter . 2003. A morphable model for the synthesis of 3d faces. In Eurographics. 641--650. V. Blanz and T. Vetter . 2003. A morphable model for the synthesis of 3d faces. In Eurographics. 641--650.","DOI":"10.1111\/1467-8659.t01-1-00712"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/258734.258880"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2013.249"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1095878.1095881"},{"key":"e_1_3_2_2_8_1","volume-title":"Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling NIPS 2014 Deep Learning and Representation Learning Workshop.","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung , Caglar Gulcehre , KyungHyun Cho , and Yoshua Bengio . 2014 . Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling NIPS 2014 Deep Learning and Representation Learning Workshop. Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio . 2014. Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling NIPS 2014 Deep Learning and Representation Learning Workshop."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2003.817141"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Li Deng Ossama Abdel-Hamid and Dong Yu . 2013. A deep convolutional neural network using heterogeneous pooling for trading acoustic invariance with phonetic confusion IEEE International Conference on Acoustics Speech and Signal Processing. Li Deng Ossama Abdel-Hamid and Dong Yu . 2013. A deep convolutional neural network using heterogeneous pooling for trading acoustic invariance with phonetic confusion IEEE International Conference on Acoustics Speech and Signal Processing.","DOI":"10.1109\/ICASSP.2013.6638952"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-014-2156-2"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/566570.566594"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-015-2944-3"},{"key":"e_1_3_2_2_14_1","volume-title":"Int. Conf. on Auditory-Visual Speech Processing (AVSP'08)","author":"Haq S.","year":"2008","unstructured":"S. Haq , P.J.B. Jackson , and J. Edge . 2008 . Audio-visual feature selection and reduction for emotion classification Proc . Int. Conf. on Auditory-Visual Speech Processing (AVSP'08) , Tangalooma, Australia. S. Haq, P.J.B. Jackson, and J. Edge . 2008. Audio-visual feature selection and reduction for emotion classification Proc. Int. Conf. on Auditory-Visual Speech Processing (AVSP'08), Tangalooma, Australia."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_16_1","volume-title":"Wilson","author":"Hoshen Yedid","year":"2015","unstructured":"Yedid Hoshen , Ron J. Weiss , and Kevin W . Wilson . 2015 . Speech acoustic modeling from raw multichannel waveforms IEEE International Conference on Acoustics, Speech and Signal Processing . Yedid Hoshen, Ron J. Weiss, and Kevin W. Wilson . 2015. Speech acoustic modeling from raw multichannel waveforms IEEE International Conference on Acoustics, Speech and Signal Processing."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_2_18_1","volume-title":"Adam: A Method for Stochastic Optimization. In 3rd International Conference for Learning Representations.","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba . 2015 . Adam: A Method for Stochastic Optimization. In 3rd International Conference for Learning Representations. Diederik P. Kingma and Jimmy Ba . 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference for Learning Representations."},{"key":"e_1_3_2_2_19_1","unstructured":"Yann LeCun and Yoshua Bengio . 1998. Convolutional Networks for Images Speech and Time-Series. (1998) 255--258. Yann LeCun and Yoshua Bengio . 1998. Convolutional Networks for Images Speech and Time-Series. (1998) 255--258."},{"key":"e_1_3_2_2_20_1","volume-title":"RAVDESS: The Ryerson Audio-Visual Database of Emotional Speech and Song 22nd Annual Meeting of the Canadian Society for Brain, Behaviour and Cognitive Science (CSBBCS).","author":"Livingstone S. R.","year":"2012","unstructured":"S. R. Livingstone , K. Peck , and F. A. Russo . 2012 . RAVDESS: The Ryerson Audio-Visual Database of Emotional Speech and Song 22nd Annual Meeting of the Canadian Society for Brain, Behaviour and Cognitive Science (CSBBCS). S. R. Livingstone, K. Peck, and F. A. Russo . 2012. RAVDESS: The Ryerson Audio-Visual Database of Emotional Speech and Song 22nd Annual Meeting of the Canadian Society for Brain, Behaviour and Cognitive Science (CSBBCS)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Dimitri Palaz Ronan Collobert and Mathew Magimai-Doss . 2013. Estimating phoneme class conditional probabilities from raw speech signal using convolutional neural networks. In Interspeech. Dimitri Palaz Ronan Collobert and Mathew Magimai-Doss . 2013. Estimating phoneme class conditional probabilities from raw speech signal using convolutional neural networks. In Interspeech.","DOI":"10.21437\/Interspeech.2013-438"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.287"},{"key":"e_1_3_2_2_23_1","volume-title":"Pham and Vladimir Pavlovic","author":"Hai","year":"2016","unstructured":"Hai X. Pham and Vladimir Pavlovic . 2016 . Robust Real-time 3D Face Tracking from RGBD Videos under Extreme Pose, Depth , and Expression Variations. In 3DV. Hai X. Pham and Vladimir Pavlovic . 2016. Robust Real-time 3D Face Tracking from RGBD Videos under Extreme Pose, Depth, and Expression Variations. In 3DV."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Hai X. Pham Vladimir Pavlovic Jianfei Cai and Tat jen Cham . 2016. Robust Real-time Performance-driven 3D Face Tracking ICPR. Hai X. Pham Vladimir Pavlovic Jianfei Cai and Tat jen Cham . 2016. Robust Real-time Performance-driven 3D Face Tracking ICPR.","DOI":"10.1109\/ICPR.2016.7899906"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Y. Qian Y. Fan and F. K. Soong . 2014. On the training aspects of deep neural network (DNN) for parametric TTS synthesis ICASSP. 3829--3833. Y. Qian Y. Fan and F. K. Soong . 2014. On the training aspects of deep neural network (DNN) for parametric TTS synthesis ICASSP. 3829--3833.","DOI":"10.1109\/ICASSP.2014.6854318"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2014.08.005"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Tara N. Sainath Oriol Vinyals Andrew Senior and Hasim Sak . 2015 b. Convolutional long short-term memory fully connected deep neural networks IEEE International Conference on Acoustics Speech and Signal Processing. Tara N. Sainath Oriol Vinyals Andrew Senior and Hasim Sak . 2015 b. Convolutional long short-term memory fully connected deep neural networks IEEE International Conference on Acoustics Speech and Signal Processing.","DOI":"10.1109\/ICASSP.2015.7178838"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Tara N. Sainath Ron J. Weiss Andrew Senior Kevin W. Wilson and Oriol Vinyals . 2015 c. Learning the speech front-end with raw waveforms CLDNNs Interspeech. Tara N. Sainath Ron J. Weiss Andrew Senior Kevin W. Wilson and Oriol Vinyals . 2015 c. Learning the speech front-end with raw waveforms CLDNNs Interspeech.","DOI":"10.21437\/Interspeech.2015-1"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"S. Sako K. Tokuda T. Masuko T. Kobayashi and T. Kitamura . 2000. HMM-based text-to-audio-visual speech synthesis. In ICSLP. 25--28. S. Sako K. Tokuda T. Masuko T. Kobayashi and T. Kitamura . 2000. HMM-based text-to-audio-visual speech synthesis. In ICSLP. 25--28.","DOI":"10.21437\/ICSLP.2000-469"},{"key":"e_1_3_2_2_30_1","volume-title":"Synface: speech-driven facial animation for virtual speech-reading support. URASIP journal on Audio, speech, and music processing","author":"Salvi G.","year":"2009","unstructured":"G. Salvi , J. Beskow , S.A. Moubayed , and B. Granstrom . 2009. Synface: speech-driven facial animation for virtual speech-reading support. URASIP journal on Audio, speech, and music processing ( 2009 ). G. Salvi, J. Beskow, S.A. Moubayed, and B. Granstrom . 2009. Synface: speech-driven facial animation for virtual speech-reading support. URASIP journal on Audio, speech, and music processing (2009)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-01793-3_21"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"George Trigeorgis Fabien Ringeval Raymond Brueckner Erik Marchi Mihalis A. Nicolaou Bj\u00f6rn Schuller and Stefanos Zafeiriou . 2016. Adieu features? End-to-end speech emotion recognition using a deep convolutional recurrent network. In Interspeech. George Trigeorgis Fabien Ringeval Raymond Brueckner Erik Marchi Mihalis A. Nicolaou Bj\u00f6rn Schuller and Stefanos Zafeiriou . 2016. Adieu features? End-to-end speech emotion recognition using a deep convolutional recurrent network. In Interspeech.","DOI":"10.1109\/ICASSP.2016.7472669"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1274940.1274947"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"L. Wang X. Qian W. Han and F. K. Soong . 2010. Synthesizing photo-real talking head via trajectoryguided sample selection. In Interspeech. 446--449. L. Wang X. Qian W. Han and F. K. Soong . 2010. Synthesizing photo-real talking head via trajectoryguided sample selection. In Interspeech. 446--449.","DOI":"10.21437\/Interspeech.2010-194"},{"key":"e_1_3_2_2_37_1","unstructured":"L. Wang X. Qian F. K. Soong and Q. Huo . 2011. Text driven 3D Photo-realistic talking head. In Interspeech. 3307--3310. L. Wang X. Qian F. K. Soong and Q. Huo . 2011. Text driven 3D Photo-realistic talking head. In Interspeech. 3307--3310."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Z. Wu S. Zhang L. Cai and H. Meng . 2006. Real-time synthesis of chinese visual speech and facial expressions using mpeg-4 fap features in a three-dimensional avatar. In Interspeech. 1802--1805. Z. Wu S. Zhang L. Cai and H. Meng . 2006. Real-time synthesis of chinese visual speech and facial expressions using mpeg-4 fap features in a three-dimensional avatar. In Interspeech. 1802--1805.","DOI":"10.21437\/Interspeech.2006-498"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2006.888009"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"crossref","unstructured":"H. Zen A. Senior and M. Schuster . 2013. Statistical parametric speech synthesis using deep neural networks ICASSP. 7962--7966. H. Zen A. Senior and M. Schuster . 2013. Statistical parametric speech synthesis using deep neural networks ICASSP. 7962--7966.","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"X. Zhang L. Wang G. Li F. Seide and F. K. Soong . 2013. A new language independent photo realistic talking head driven by voice only Interspeech. X. Zhang L. Wang G. Li F. Seide and F. K. Soong . 2013. A new language independent photo realistic talking head driven by voice only Interspeech.","DOI":"10.21437\/Interspeech.2013-629"}],"event":{"name":"ICMI '18: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Boulder CO USA","acronym":"ICMI '18","sponsor":["SIGCHI Specialist Interest Group in Computer-Human Interaction of the ACM"]},"container-title":["Proceedings of the 20th ACM International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3242969.3243017","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3242969.3243017","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:45:18Z","timestamp":1761093918000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3242969.3243017"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,2]]},"references-count":41,"alternative-id":["10.1145\/3242969.3243017","10.1145\/3242969"],"URL":"https:\/\/doi.org\/10.1145\/3242969.3243017","relation":{},"subject":[],"published":{"date-parts":[[2018,10,2]]},"assertion":[{"value":"2018-10-02","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}