{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:28:13Z","timestamp":1775230093235,"version":"3.50.1"},"reference-count":27,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1109\/icassp.2018.8461452","type":"proceedings-article","created":{"date-parts":[[2018,9,21]],"date-time":"2018-09-21T22:24:48Z","timestamp":1537568688000},"page":"4804-4808","source":"Crossref","is-referenced-by-count":36,"title":["A Comparison of Recent Waveform Generation and Acoustic Modeling Methods for Neural-Network-Based Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Xin","family":"Wang","sequence":"first","affiliation":[]},{"given":"Jaime","family":"Lorenzo-Trueba","sequence":"additional","affiliation":[]},{"given":"Shinji","family":"Takaki","sequence":"additional","affiliation":[]},{"given":"Lauri","family":"Juvela","sequence":"additional","affiliation":[]},{"given":"Junichi","family":"Yamagishi","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","first-page":"1877","article-title":"WORLD: A vocoder-based high-quality speech synthesis system for real-time applications","volume":"99","author":"masanori","year":"2016","journal-title":"IEICE Trans on Information and Systems"},{"key":"ref11","article-title":"A log domain pulse model for parametric speech synthesis","author":"degottex","year":"2017","journal-title":"IEEE\/ACM Transactions on Audio Speech and Language Processing"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref13","doi-asserted-by":"crossref","first-page":"1059","DOI":"10.21437\/Interspeech.2017-246","article-title":"An RNN-based quantized F0 model with multi-tier feedback links for text-to-speech synthesis","author":"wang","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref14","first-page":"936","article-title":"Speech parameter generation algorithms for HMM-based speech synthesis","author":"keiichi","year":"2000","journal-title":"Proc ICASSP"},{"key":"ref15","first-page":"4581","article-title":"Complex cepstrum as phase information in statistical parametric speech synthesis","author":"ranniery","year":"2012","journal-title":"Proc ICASSP"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-014-0038-1"},{"key":"ref17","first-page":"5630","article-title":"Initial investigation of speech synthesis based on complex-valued neural networks","author":"qiong","year":"2016","journal-title":"Proc ICASSP"},{"key":"ref18","doi-asserted-by":"crossref","first-page":"4021","DOI":"10.21437\/Interspeech.2017-584","article-title":"Complex-valued restricted Boltzmann machine for direct learning of frequency spectra","author":"nakashika","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"ref4","first-page":"1383","article-title":"Direct modelling of magnitude and phase spectra for statistical parametric speech synthesis","author":"felipe","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref27","first-page":"1530","article-title":"Variational inference with normalizing flows","author":"rezende","year":"2015","journal-title":"Proc ICML"},{"key":"ref3","author":"van den oord","year":"2016","journal-title":"WaveNet A Generative Model for Raw Audio"},{"key":"ref6","first-page":"1133","article-title":"A hierarchical encoder-decoder model for statistical parametric speech synthesis","author":"srikanth","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"1128","DOI":"10.21437\/Interspeech.2017-488","article-title":"Direct modeling of frequency spectra and waveform generation based on phase recovery for DNN-based speech synthesis","author":"takaki","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953087"},{"key":"ref7","first-page":"4910","article-title":"Generative adversarial network-based postfilter for statistical parametric speech synthesis","author":"takuhiro","year":"2017","journal-title":"Proc ICASSP"},{"key":"ref2","article-title":"Char2wav: End-to-end speech synthesis","author":"sotelo","year":"2017","journal-title":"ICLR (workshop track)"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"1118","DOI":"10.21437\/Interspeech.2017-314","article-title":"Speaker-dependent WaveNet vocoder","author":"tamamori","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref1","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"yuxuan","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref20","first-page":"3389","article-title":"Generative adversarial network-based postfilter for STFT spectrograms","author":"takuhiro","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/5.135378"},{"key":"ref21","article-title":"The USTC system for Blizzard Challenge 2017","author":"yajun","year":"2017","journal-title":"Proc Blizzard Challenge Workshop"},{"key":"ref24","first-page":"179","article-title":"XIMERA: A new TTS from ATR based on corpus-based technologies","author":"hisashi","year":"2004","journal-title":"Proc SSW5"},{"key":"ref23","first-page":"1068","article-title":"Neural audio synthesis of musical notes with WaveNet autoencoders","author":"jesse","year":"2017","journal-title":"Proc ICML"},{"key":"ref26","first-page":"547","article-title":"Introducing CURRENT: The Munich open-source CUDA recurrent neural network toolkit","volume":"16","author":"weninger","year":"2015","journal-title":"The Journal of Machine Learning Research"},{"key":"ref25","article-title":"The Japanese TTS System &#x2018;Open JTalk&#x2019;","year":"2015","journal-title":"HTS Working Group"}],"event":{"name":"ICASSP 2018 - 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Calgary, AB","start":{"date-parts":[[2018,4,15]]},"end":{"date-parts":[[2018,4,20]]}},"container-title":["2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8450881\/8461260\/08461452.pdf?arnumber=8461452","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T06:21:48Z","timestamp":1598250108000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8461452\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,4]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/icassp.2018.8461452","relation":{},"subject":[],"published":{"date-parts":[[2018,4]]}}}