{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T18:51:07Z","timestamp":1774551067631,"version":"3.50.1"},"reference-count":67,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JSPS KAKENHI","award":["17H01763"],"award-info":[{"award-number":["17H01763"]}]},{"name":"JST CREST","award":["JPMJCR19A3"],"award-info":[{"award-number":["JPMJCR19A3"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2020.3001456","type":"journal-article","created":{"date-parts":[[2020,6,10]],"date-time":"2020-06-10T20:53:00Z","timestamp":1591822380000},"page":"1849-1863","source":"Crossref","is-referenced-by-count":48,"title":["ConvS2S-VC: Fully Convolutional Sequence-to-Sequence Voice Conversion"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3102-0162","authenticated-orcid":false,"given":"Hirokazu","family":"Kameoka","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kou","family":"Tanaka","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2276-5288","authenticated-orcid":false,"given":"Damian","family":"Kwasny","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Takuhiro","family":"Kaneko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nobukatsu","family":"Hojo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1789"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1357"},{"key":"ref33","article-title":"Convolutional sequence to sequence learning","author":"gehring","year":"0","journal-title":"Proc ICML"},{"key":"ref32","article-title":"WaveNet: A generative model for raw audio","author":"oord","year":"2016","journal-title":"arXiv 1609 03499 [cs SD]"},{"key":"ref31","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"dauphin","year":"0","journal-title":"Proc ICML"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-247"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref34","article-title":"ConvS2S-VC: Fully convolutional sequence-to-sequence voice conversion","author":"kameoka","year":"2018","journal-title":"ArXiv"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1044\/jslhr.4101.73"},{"key":"ref62","year":"0"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-29"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-28"},{"key":"ref28","article-title":"Deep Voice 3: Scaling text-to-speech with convolutional sequence learning","author":"ping","year":"0","journal-title":"Proc ICLR"},{"key":"ref64","article-title":"Instance normalization: The missing ingredient for fast stylization","author":"ulyanov","year":"2016","journal-title":"arXiv 1607 08022 [cs CV]"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref65","first-page":"901","article-title":"Weight normalization: A simple reparameterization to accelerate training of deep neural networks","author":"salimans","year":"0","journal-title":"Adv NIPS"},{"key":"ref66","first-page":"448","article-title":"Batch normalization: accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc ICML"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref67","year":"0"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.05.001"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674423"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref22","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Adv NIPS"},{"key":"ref21","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Adv NIPS"},{"key":"ref24","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","author":"ar?k","year":"0","journal-title":"Proc ICML"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref26","article-title":"Char2Wav: End-to-end speech synthesis","author":"sotelo","year":"0","journal-title":"Proc ICLR"},{"key":"ref25","first-page":"2962","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","author":"ar?k","year":"0","journal-title":"Proc NIPS"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639636"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225953"},{"key":"ref59","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"0","journal-title":"Proc ICLR"},{"key":"ref58","year":"0"},{"key":"ref57","first-page":"223","article-title":"The CMU Arctic speech databases","author":"kominek","year":"2004","journal-title":"Proc of SSW6"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2236"},{"key":"ref55","article-title":"A learned representation for artistic style","author":"dumoulin","year":"0","journal-title":"Proc ICLR"},{"key":"ref54","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Adv NIPS"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2041699"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2047683"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-38"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078543"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2017EDL8034"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-970"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-63"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.09.006"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2011.07.007"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2205241"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2041113"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.11.004"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682298"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref46","article-title":"ClariNet: Parallel wave generation in end-to-end text-to-speech","author":"ping","year":"0","journal-title":"Proc ICLR"},{"key":"ref45","first-page":"3918","article-title":"Parallel WaveNet: Fast high-fidelity speech synthesis","author":"oord","year":"2018","journal-title":"Proc MLRE"},{"key":"ref48","first-page":"3370","article-title":"FloWaveNet: A generative flow for raw audio","author":"kim","year":"2019","journal-title":"Proc MLRE"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref42","first-page":"2410","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"2018","journal-title":"Proc MLRE"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-314"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462431"},{"key":"ref43","article-title":"SampleRNN: An unconditional end-to-end neural audio generation model","author":"mehri","year":"0","journal-title":"Proc ICLR"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/09113442.pdf?arnumber=9113442","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T01:07:34Z","timestamp":1641949654000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9113442\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":67,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.3001456","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}