{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T06:56:02Z","timestamp":1774162562094,"version":"3.50.1"},"reference-count":35,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9053371","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T16:21:13Z","timestamp":1586449273000},"page":"7634-7638","source":"Crossref","is-referenced-by-count":12,"title":["Semi-Supervised Speaker Adaptation for End-to-End Speech Synthesis with Pretrained Models"],"prefix":"10.1109","author":[{"given":"Katsuki","family":"Inoue","sequence":"first","affiliation":[]},{"given":"Sunao","family":"Hara","sequence":"additional","affiliation":[]},{"given":"Masanobu","family":"Abe","sequence":"additional","affiliation":[]},{"given":"Tomoki","family":"Hayashi","sequence":"additional","affiliation":[]},{"given":"Ryuichi","family":"Yamamoto","sequence":"additional","affiliation":[]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref32","article-title":"Automatic differentiation in pytorch","author":"paszke","year":"2017"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"1118","DOI":"10.21437\/Interspeech.2017-314","article-title":"Speaker-dependent WaveNet vocoder","author":"tamamori","year":"2017","journal-title":"Proc of Interspeech"},{"key":"ref30","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"arXiv preprint arXiv 1609 04802"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1789"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref10","first-page":"10019","article-title":"Neural voice cloning with a few samples","author":"arik","year":"2018","journal-title":"Proc of NIPS"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2730"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2904"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1558"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683307"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3167"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"540","DOI":"10.1109\/TASLP.2019.2960721","article-title":"Non-parallel sequence-to- sequence voice conversion with disentangled linguistic and speaker representations","volume":"28","author":"zhang","year":"2019","journal-title":"IEEE\/ACM Transactions on Audio Speech and Language Processing"},{"key":"ref18","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"graves","year":"2014","journal-title":"Proc of ICML"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"ref28","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc of ASRU"},{"key":"ref4","article-title":"Close to human quality TTS with Transformer","author":"li","year":"2018","journal-title":"arXiv preprint arXiv 1809 08895"},{"key":"ref27","first-page":"5206","article-title":"Librispeech: an ASR corpus based on public domain audio books","author":"panayotov","year":"2015","journal-title":"Proc of ICASSP"},{"key":"ref3","article-title":"Deep voice 3: Scaling text-to-speech with convolutional sequence learning","author":"ping","year":"2018","journal-title":"Proc of ICLR"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2013.2251852"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2012"},{"key":"ref5","article-title":"FastSpeech: fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"arXiv preprint arXiv 1905 00571"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref9","article-title":"The LJ speech dataset","author":"ito","year":"2017"},{"key":"ref1","article-title":"Tacotron: towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"arXiv preprint arXiv 1703 10593"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref21","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc of NIPS"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref23","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2014","journal-title":"arXiv preprint arXiv 1409 0473"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref25","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"2018","journal-title":"Proc of NIPS"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Barcelona, Spain","start":{"date-parts":[[2020,5,4]]},"end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09053371.pdf?arnumber=9053371","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,27]],"date-time":"2022-06-27T20:14:57Z","timestamp":1656360897000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9053371\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9053371","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}