{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:28:28Z","timestamp":1775230108027,"version":"3.50.1"},"reference-count":57,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9053512","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T16:21:13Z","timestamp":1586449273000},"page":"7654-7658","source":"Crossref","is-referenced-by-count":104,"title":["Espnet-TTS: Unified, Reproducible, and Integratable Open Source End-to-End Text-to-Speech Toolkit"],
"prefix":"10.1109","author":[{"given":"Tomoki","family":"Hayashi","sequence":"first","affiliation":[]},{"given":"Ryuichi","family":"Yamamoto","sequence":"additional","affiliation":[]},{"given":"Katsuki","family":"Inoue","sequence":"additional","affiliation":[]},{"given":"Takenori","family":"Yoshimura","sequence":"additional","affiliation":[]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[]},{"given":"Tomoki","family":"Toda","sequence":"additional","affiliation":[]},{"given":"Kazuya","family":"Takeda","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xu","family":"Tan","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","author":"hayashi","year":"2019","journal-title":"kan-bayashi\/PytorchWaveNetVocoder"},{"key":"ref38","author":"yamamoto","year":"2019","journal-title":"r9y9\/wavenet_vocoder"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462020"},{"key":"ref32","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"NIPS"},{"key":"ref31","article-title":"Automatic differentiation in PyTorch","author":"paszke","year":"2017","journal-title":"Proc NIPS Autodiff Workshop"},{"key":"ref30","first-page":"5410","article-title":"Almost unsupervised text to speech and automatic speech recognition","author":"ren","year":"2019","journal-title":"ICML"},
{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2013.6701851"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682890"},{"key":"ref27","first-page":"426","article-title":"Back-translation-style data augmentation for end-to-end ASR","author":"hayashi","year":"2018","journal-title":"Proc SLT"},{"key":"ref29","article-title":"Self-supervised sequence-to-sequence ASR using unpaired speech and text","author":"baskar","year":"2019","journal-title":"arXiv preprint arXiv 1905 00571"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2016-33"},{"key":"ref1","year":"2016","journal-title":"The HMM-Based Speech Synthesis System (HTS)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472657"},{"key":"ref22","author":"park","year":"2019","journal-title":"Kyubyong\/tacotron"},{"key":"ref21","author":"yamamoto","year":"2019","journal-title":"r9y9\/deepvoice3_pytorch"},{"key":"ref24","author":"valle","year":"2019","journal-title":"NVIDIA\/tacotron2"},{"key":"ref23","author":"mama","year":"2019","journal-title":"Rayhane-mamah\/Tacotron-2"},{"key":"ref26","article-title":"Mixed-precision training for NLP and speech recognition with OpenSeq2Seq","author":"kuchaiev","year":"2018","journal-title":"arXiv preprint arXiv 1805 10387"},{"key":"ref25","author":"g\u00f6lge","year":"2019","journal-title":"The Mozilla"},{"key":"ref50","author":"solak","year":"2019","journal-title":"The M-AILABS speech dataset"},{"key":"ref51","author":"park","year":"2017","journal-title":"The World English Bible A large single-speaker speech dataset in English"},
{"key":"ref57","author":"yamamoto","year":"2019","journal-title":"r9y9\/nnmnkwii"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref54","first-page":"5206","article-title":"LibriSpeech: An ASR corpus based on public domain audio books","author":"panayotov","year":"2015","journal-title":"ICASSP"},{"key":"ref53","author":"park","year":"2019","journal-title":"G2pe"},{"key":"ref52","author":"do","year":"2017","journal-title":"VAIS-1000 A Vietnamese speech synthesis corpus"},{"key":"ref10","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"arXiv preprint arXiv 1609 04802"},{"key":"ref11","doi-asserted-by":"crossref","first-page":"1118","DOI":"10.21437\/Interspeech.2017-314","article-title":"Speaker-dependent WaveNet vocoder","author":"tamamori","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461332"},{"key":"ref12","first-page":"2415","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"2018","journal-title":"ICML"},{"key":"ref13","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"hsu","year":"2019","journal-title":"ICLR"},{"key":"ref14","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","author":"wang","year":"2018","journal-title":"arXiv preprint arXiv 1803 09017"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref17","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc ASRU"},
{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854321"},{"key":"ref19","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"chung","year":"2014","journal-title":"arXiv preprint arXiv 1412 3555"},{"key":"ref4","first-page":"7962","article-title":"Statistical parametric speech synthesis using deep neural networks","author":"zen","year":"2013","journal-title":"ICASSP"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2013.2251852"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref5","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref8","article-title":"Close to human quality TTS with Transformer","author":"li","year":"2018","journal-title":"arXiv preprint arXiv 1809 08895"},{"key":"ref7","article-title":"Deep Voice 3: Scaling text-to-speech with convolutional sequence learning","author":"ping","year":"2018","journal-title":"ICLR"},{"key":"ref49","author":"ito","year":"2017","journal-title":"The LJ speech dataset"},{"key":"ref9","first-page":"3165","article-title":"Fastspeech: Fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"NIPS"},{"key":"ref46","article-title":"JSUT corpus: Free large-scale Japanese speech corpus for end-to-end speech synthesis","author":"sonobe","year":"2017","journal-title":"arXiv preprint arXiv 1711 00540"},{"key":"ref45","first-page":"199","article-title":"JNAS: Japanese speech corpus for large vocabulary continuous speech recognition research","volume":"20","author":"itou","year":"1999","journal-title":"Acoustical Science and Technology"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref47","article-title":"JVS corpus: Free Japanese multi-speaker voice corpus","author":"takamichi","year":"2019","journal-title":"arXiv preprint arXiv 1908 00310"},
{"key":"ref42","article-title":"The Blizzard Challenge 2017","author":"king","year":"2017","journal-title":"Proc Blizzard Challenge Workshop"},{"key":"ref41","author":"hayashi","year":"2019","journal-title":"kan-bayashi\/ParallelWaveGAN"},{"key":"ref44","author":"baker","year":"2017","journal-title":"Chinese Standard Mandarin Speech Corpus"},{"key":"ref43","article-title":"The CMU Arctic speech databases","author":"kominek","year":"2004","journal-title":"Proc of SSW6"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Barcelona, Spain","start":{"date-parts":[[2020,5,4]]},"end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09053512.pdf?arnumber=9053512","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,27]],"date-time":"2022-06-27T20:18:29Z","timestamp":1656361109000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9053512\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":57,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9053512","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}