{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T18:39:31Z","timestamp":1780425571746,"version":"3.54.1"},"reference-count":31,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1109\/icassp.2018.8461368","type":"proceedings-article","created":{"date-parts":[[2018,9,21]],"date-time":"2018-09-21T22:24:48Z","timestamp":1537568688000},"page":"4779-4783","source":"Crossref","is-referenced-by-count":1650,"title":["Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions"],"prefix":"10.1109","author":[{"given":"Jonathan","family":"Shen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ruoming","family":"Pang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ron J.","family":"Weiss","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mike","family":"Schuster","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Navdeep","family":"Jaitly","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zongheng","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhifeng","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuxuan","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rj","family":"Skerrv-Ryan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rif A.","family":"Saurous","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yannis","family":"Agiomvrgiannakis","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yonghui","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-522"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-264"},{"key":"ref10","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","volume":"abs 1705 8947","author":"arik","year":"2017","journal-title":"CoRR"},{"key":"ref11","article-title":"Deep voice 3: 2000-speaker neural text-to-speech","volume":"abs 1710 7654","author":"ping","year":"2017","journal-title":"CoRR"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref13","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Proc NIPS"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref15","doi-asserted-by":"crossref","first-page":"1118","DOI":"10.21437\/Interspeech.2017-314","article-title":"Speaker-dependent WaveNet vocoder","author":"tamamori","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref16","article-title":"Char2Wav: End-to-end speech synthesis","author":"sotelo","year":"2017","journal-title":"Proc ICLR"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"ref18","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc ICML"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"ref28","article-title":"Parallel WaveNet: Fast High-Fidelity Speech Synthesis","volume":"abs 1711 10433","author":"van den oord","year":"2017","journal-title":"CoRR"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"ref27","article-title":"PixeICNN++: Improving the PixelCNN with discretized logistic mixture likelihood and other modifications","author":"salimans","year":"2017","journal-title":"Proc ICLR"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"601","DOI":"10.21437\/Eurospeech.1997-219","article-title":"Automatically clustering similar units for unit selection in speech synthesis","author":"black","year":"1997","journal-title":"Proc EUROSPEECH"},{"key":"ref6","first-page":"7962","article-title":"Statistical parametric speech synthesis using deep neural networks","author":"zen","year":"2013","journal-title":"Proc ICASSP"},{"key":"ref29","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"Proc ICLR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.04.004"},{"key":"ref8","article-title":"WaveNet: A generative model for raw audio","volume":"abs 1609 3499","author":"van den oord","year":"2016","journal-title":"CoRR"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2013.2251852"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"ref9","article-title":"Deep voice: Real-time neural text-to-speech","volume":"abs 1702 7825","author":"arik","year":"2017","journal-title":"CoRR"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511816338"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref22","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc ICLR"},{"key":"ref21","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Proc NIPS"},{"key":"ref24","author":"schuster","year":"1999","journal-title":"On Supervised Learning from Sequential Data with Applications for Speech Recognition"},{"key":"ref23","article-title":"Mixture density networks","author":"bishop","year":"1994","journal-title":"Tech Rep"},{"key":"ref26","article-title":"Zone out: Regularizing RNNs by randomly preserving hidden activations","author":"krueger","year":"2017","journal-title":"Proc ICLR"},{"key":"ref25","first-page":"1929","article-title":"Dropout: a simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"Journal of Machine Learning Research"}],"event":{"name":"ICASSP 2018 - 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Calgary, AB","start":{"date-parts":[[2018,4,15]]},"end":{"date-parts":[[2018,4,20]]}},"container-title":["2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8450881\/8461260\/08461368.pdf?arnumber=8461368","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,8]],"date-time":"2025-07-08T18:14:14Z","timestamp":1751998454000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8461368\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,4]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/icassp.2018.8461368","relation":{},"subject":[],"published":{"date-parts":[[2018,4]]}}}