{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T07:53:07Z","timestamp":1776412387961,"version":"3.51.2"},"reference-count":80,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JST CREST VoicePersonae","award":["JPMJCR18A6"],"award-info":[{"award-number":["JPMJCR18A6"]}]},{"name":"MEXT KAKENHI","award":["16H06302"],"award-info":[{"award-number":["16H06302"]}]},{"name":"MEXT KAKENHI","award":["17H04687"],"award-info":[{"award-number":["17H04687"]}]},{"name":"MEXT KAKENHI","award":["18H04120"],"award-info":[{"award-number":["18H04120"]}]},{"name":"MEXT KAKENHI","award":["18H04112"],"award-info":[{"award-number":["18H04112"]}]},{"name":"MEXT KAKENHI","award":["18KT0051"],"award-info":[{"award-number":["18KT0051"]}]},{"name":"SOKENDAI"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2020.3034994","type":"journal-article","created":{"date-parts":[[2020,10,30]],"date-time":"2020-10-30T20:43:06Z","timestamp":1604090586000},"page":"2967-2981","source":"Crossref","is-referenced-by-count":51,"title":["NAUTILUS: A Versatile Voice Cloning System"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4772-5995","authenticated-orcid":false,"given":"Hieu-Thi","family":"Luong","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2752-3955","authenticated-orcid":false,"given":"Junichi","family":"Yamagishi","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855134"},{"key":"ref72","first-page":"1121","article-title":"Native and non-native speaker judgements on the quality of synthesized speech","author":"janska","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.29007\/bw9p"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref77","article-title":"The LJ speech dataset","author":"ito","year":"2017"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2016EDP7231"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054596"},{"key":"ref75","article-title":"The EMIME mandarin bilingual database","author":"wester","year":"2011"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953089"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1121\/1.402284"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683746"},{"key":"ref33","first-page":"2413","article-title":"Voice conversion with smoothed GMM and MAP adaptation","author":"chen","year":"0","journal-title":"Proc EUROSPEECH"},{"key":"ref32","first-page":"266","article-title":"StarGAN-VC: Non-parallel many-to-many voice conversion using star generative adversarial networks","author":"kameoka","year":"2018","journal-title":"Proc IEEE\/ACL Workshop Spoken Lang Technol"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-28"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1093\/ietisy\/e90-d.2.533"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref34","first-page":"2446","article-title":"Eigenvoice conversion based on Gaussian mixture model","author":"toda","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref60","first-page":"1","article-title":"Quasi-recurrent neural networks","author":"bradbury","year":"0","journal-title":"Proc ICLR"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178816"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-39"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953087"},{"key":"ref28","article-title":"Sample efficient adaptive text-to-speech","author":"chen","year":"0","journal-title":"Proc ICLR"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269007"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref65","first-page":"1295","article-title":"Robust LTS rules with the combilex speech technology lexicon","author":"richmond","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref66","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit","author":"veaux","year":"2017"},{"key":"ref29","first-page":"447","article-title":"Statistical methods for voice quality transformation","author":"stylianou","year":"0","journal-title":"Proc EUROSPEECH"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref68","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"0","journal-title":"Proc Automatic Speech Recognition and Understanding"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674423"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-32"},{"key":"ref22","article-title":"A unified speaker adaptation method for speech synthesis using transcribed and untranscribed speech with backpropagation","author":"luong","year":"2019","journal-title":"arXiv 1906 07414"},{"key":"ref21","first-page":"10 040","article-title":"Neural voice cloning with a few samples","author":"arik","year":"0","journal-title":"Proc Neural Inf Process Syst"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2668"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004008"},{"key":"ref26","first-page":"2005","article-title":"TTS for low resource languages: A Bangla synthesizer","author":"gutkin","year":"0","journal-title":"Proc LREC"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-34"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1791"},{"key":"ref51","first-page":"195","volume":"1","author":"gales","year":"2008","journal-title":"The Application of Hidden Markov Models in Speech Recognition Foundations Trends Signal Process"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2019.8902651"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2872060"},{"key":"ref57","first-page":"1","article-title":"Auto-encoding variational bayes","author":"kingma","year":"0","journal-title":"Proc ICLR"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-172"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682589"},{"key":"ref54","article-title":"Voice transformer network: Sequence-to-sequence voice conversion using transformer with text-to-speech pretraining","author":"huang","year":"2019","journal-title":"arXiv 1912 06813"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682890"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-247"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1357"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2960721"},{"key":"ref14","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"0","journal-title":"arXiv 1609 03499"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682298"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1190"},{"key":"ref18","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053371"},{"key":"ref80","first-page":"1","article-title":"SampleRNN: An unconditional end-to-end neural audio generation model","author":"mehri","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1026"},{"key":"ref3","first-page":"3355","article-title":"Reconstructing intelligible audio speech from visual speech features","author":"cornu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2017.2784878"},{"key":"ref5","first-page":"1760","article-title":"Text-informed speech enhancement with deep neural networks","author":"kinoshita","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053340"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPA.2018.8659621"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960573"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1038"},{"key":"ref47","first-page":"879","article-title":"A study of speaker adaptation for DNN-based speech synthesis","author":"wu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462373"},{"key":"ref41","first-page":"1","article-title":"ClariNet: Parallel wave generation in end-to-end text-to-speech","author":"ping","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639659"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178817"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/09246264.pdf?arnumber=9246264","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T01:07:38Z","timestamp":1641949658000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9246264\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":80,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.3034994","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}