{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T20:45:21Z","timestamp":1754599521292,"version":"3.28.0"},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9054466","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T16:21:13Z","timestamp":1586449273000},"page":"7644-7648","source":"Crossref","is-referenced-by-count":3,"title":["Semi-Supervised Learning Based on Hierarchical Generative Models for End-to-End Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Takato","family":"Fujimoto","sequence":"first","affiliation":[]},{"given":"Shinji","family":"Takaki","sequence":"additional","affiliation":[]},{"given":"Kei","family":"Hashimoto","sequence":"additional","affiliation":[]},{"given":"Keiichiro","family":"Oura","sequence":"additional","affiliation":[]},{"given":"Yoshihiko","family":"Nankaku","sequence":"additional","affiliation":[]},{"given":"Keiichi","family":"Tokuda","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","article-title":"Deep Voice 3: Scaling text-to-speech with convolutional sequence learning","author":"ping","year":"2018","journal-title":"Proc ICLR"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682880"},{"key":"ref12","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2014","journal-title":"Proc ICLR"},{"key":"ref13","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"hsu","year":"2018","journal-title":"arXiv 1810 07217"},{"key":"ref14","article-title":"Effective use of variational embedding capacity in expressive end-to-end speech synthesis","author":"battenberg","year":"2019","journal-title":"arXiv 1906 03402"},{"key":"ref15","article-title":"Semi-supervised generative modeling for controllable speech synthesis","author":"habib","year":"2019","journal-title":"arXiv 1910 01709"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1118"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3233"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref19","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Proc NIPS"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref27","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"Proc ICLR"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref6","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"arXiv 1609 03499"},{"key":"ref5","article-title":"Close to human quality TTS with transformer","author":"li","year":"2018","journal-title":"arXiv 1809 08895"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682674"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"1118","DOI":"10.21437\/Interspeech.2017-314","article-title":"Speaker-dependent WaveNet vocoder","author":"tamamori","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref2","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc NIPS"},{"key":"ref9","first-page":"6905","article-title":"Investigation of enhanced Tacotron text-to-speech synthesis systems with self-attention for pitch accent language","author":"yasuda","year":"2019","journal-title":"Proc ICASSP"},{"key":"ref1","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc ICLR"},{"key":"ref20","doi-asserted-by":"crossref","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","article-title":"Statistical parametric speech synthesis","volume":"51","author":"zen","year":"2009","journal-title":"Speech Communication"},{"key":"ref22","doi-asserted-by":"crossref","first-page":"1045","DOI":"10.21437\/Interspeech.2010-343","article-title":"Recurrent neural network based language model","author":"mikolov","year":"2010","journal-title":"Proc INTERSPEECH"},{"key":"ref21","article-title":"Almost unsupervised text to speech and automatic speech recognition","author":"ren","year":"2019","journal-title":"Proc ICML"},{"key":"ref24","first-page":"47974805","article-title":"Conditional image generation with PixelCNN decoders","author":"van den oord","year":"2016","journal-title":"Proc NIPS"},{"key":"ref23","first-page":"881","article-title":"MADE: Masked autoencoder for distribution estimation","volume":"37","author":"germain","year":"2015","journal-title":"Proc ICML"},{"year":"0","key":"ref26","article-title":"SKK dictionary"},{"key":"ref25","first-page":"179","article-title":"XIMERA: A new TTS from ATR based on corpus-based technologies","author":"kawai","year":"2004","journal-title":"Proc 5th ISCA Speech Synthesis Workshop"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2020,5,4]]},"location":"Barcelona, Spain","end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09054466.pdf?arnumber=9054466","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,29]],"date-time":"2023-09-29T15:30:52Z","timestamp":1696001452000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9054466\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9054466","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}