{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T00:20:04Z","timestamp":1773361204000,"version":"3.50.1"},"reference-count":77,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"JSPS KAKENHI","award":["17H01763"],"award-info":[{"award-number":["17H01763"]}]},{"name":"JST CREST","award":["JPMJCR19A3"],"award-info":[{"award-number":["JPMJCR19A3"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2020.3047262","type":"journal-article","created":{"date-parts":[[2020,12,24]],"date-time":"2020-12-24T22:26:26Z","timestamp":1608848786000},"page":"656-670","source":"Crossref","is-referenced-by-count":26,"title":["Many-to-Many Voice Transformer Network"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3102-0162","authenticated-orcid":false,"given":"Hirokazu","family":"Kameoka","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3172-3335","authenticated-orcid":false,"given":"Wen-Chin","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kou","family":"Tanaka","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Takuhiro","family":"Kaneko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nobukatsu","family":"Hojo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tomoki","family":"Toda","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1044\/jslhr.4101.73"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90011-W"},{"key":"ref71","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref70","first-page":"901","article-title":"Weight normalization: A simple reparameterization to accelerate training of deep neural networks","author":"salimans","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref76","first-page":"5210","article-title":"AutoVC: Zero-shot voice style transfer with only autoencoder loss","author":"qian","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.29007\/s4t1"},{"key":"ref39","article-title":"Deep voice 3: Scaling text-to-speech with convolutional sequence learning","author":"ping","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref75","first-page":"195","article-title":"The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods","author":"lorenzo-trueba","year":"0","journal-title":"Proc Odyssey Speaker Lang Recog Workshop"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref33","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref32","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2910637"},{"key":"ref30","first-page":"3399","article-title":"Emotional voice conversion with adaptive scales F0 based on wavelet transform using limited amount of emotional data","author":"luo","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref37","article-title":"Char2Wav: End-to-end speech synthesis","author":"sotelo","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref36","first-page":"2962","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","author":"ar?k","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref35","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","author":"ar?k","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref34","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref60","article-title":"ClariNet: Parallel wave generation in end-to-end text-to-speech","author":"ping","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref62","first-page":"3370","article-title":"FloWaveNet: A generative flow for raw audio","author":"kim","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2956145"},{"key":"ref28","first-page":"2318","article-title":"Hierarchical modeling of F0 contours for voice conversion","author":"sanchez","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639636"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"ref65","first-page":"910","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","volume":"14","author":"kumar","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472664"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref68","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref69","first-page":"223","article-title":"The CMU arctic speech databases","author":"kominek","year":"0","journal-title":"Proc ISCA Speech Synth Workshop"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674423"},{"key":"ref1","article-title":"Many-to-many voice transformer network","author":"kameoka","year":"2020"},{"key":"ref20","first-page":"3364","article-title":"Voice conversion from unaligned corpora using variational autoencoding wasserstein generative adversarial networks","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2307"},{"key":"ref26","first-page":"6793","article-title":"Blow: A single-scale hyperconditioned flow for non-parallel raw-audio voice conversion","author":"serr\u00e1","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3036784"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1176"},{"key":"ref59","first-page":"3918","article-title":"Parallel WaveNet: Fast high-fidelity speech synthesis","author":"van den oord","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462431"},{"key":"ref57","article-title":"SampleRNN: An unconditional end-to-end neural audio generation model","author":"mehri","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref56","first-page":"2410","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref55","first-page":"1118","article-title":"Speaker-dependent WaveNet vocoder","author":"tamamori","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref54","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225953"},{"key":"ref52","first-page":"503","article-title":"On layer normalization in the transformer architecture","author":"xiong","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2041699"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2165944"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2723721"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424242"},{"key":"ref14","first-page":"677","article-title":"Sparse representation of phonetic features for voice conversion with and without parallel data","author":"sisman","year":"0","journal-title":"Proc IEEE Workshop on Automatic Speech Recognition and Understanding"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2047683"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078543"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"ref18","first-page":"2453","article-title":"Deep bidirectional LSTM modeling of timbre and prosody for emotional voice conversion","author":"ming","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2011.07.007"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.05.001"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2205241"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.09.006"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.11.004"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1066"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1789"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1357"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3001456"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref42","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref41","first-page":"1243","article-title":"Convolutional sequence to sequence learning","author":"gehring","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref43","first-page":"1268","article-title":"Voice conversion using sequence-to-sequence learning of context posterior probabilities","author":"miyoshi","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09306875.pdf?arnumber=9306875","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:53:57Z","timestamp":1652194437000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9306875\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":77,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.3047262","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}