{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T15:57:05Z","timestamp":1776182225693,"version":"3.50.1"},"reference-count":308,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"SUTD Start-up Grant Artificial Intelligence for Human Voice Conversion","award":["SRG ISTD 2020 158"],"award-info":[{"award-number":["SRG ISTD 2020 158"]}]},{"name":"SUTD AI"},{"name":"The Understanding and Synthesis of Expressive Speech by AI","award":["PIE-SGP-AI-2020-02"],"award-info":[{"award-number":["PIE-SGP-AI-2020-02"]}]},{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","award":["AISG-GC-2019-002"],"award-info":[{"award-number":["AISG-GC-2019-002"]}],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","award":["AISG-100E-2018-006"],"award-info":[{"award-number":["AISG-100E-2018-006"]}],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. 
Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2020.3038524","type":"journal-article","created":{"date-parts":[[2020,11,17]],"date-time":"2020-11-17T21:51:43Z","timestamp":1605649903000},"page":"132-157","source":"Crossref","is-referenced-by-count":289,"title":["An Overview of Voice Conversion and Its Challenges: From Statistical Modeling to Deep Learning"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8078-3305","authenticated-orcid":false,"given":"Berrak","family":"Sisman","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2752-3955","authenticated-orcid":false,"given":"Junichi","family":"Yamagishi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2694-2843","authenticated-orcid":false,"given":"Simon","family":"King","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4518682"},{"key":"ref274","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461960"},{"key":"ref277","article-title":"1534-1, Method for the subjective assessment of intermediate sound quality (MUSHRA)","author":"recommendation","year":"2001","journal-title":"Int Telecommun Union"},{"key":"ref276","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(94)00052-C"},{"key":"ref271","article-title":"Are we using enough listeners? 
No!\u2013an empirically-supported critique of interspeech 2014 TTS evaluations","author":"wester","year":"2015","journal-title":"Proc 16th Annu Conf Int Speech Commun Assoc"},{"key":"ref270","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.883250"},{"key":"ref273","article-title":"Voice conversion using GMM with enhanced global variance","author":"benisty","year":"2011","journal-title":"Proc 12th Annu Conf Int Speech Commun Assoc"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"ref272","article-title":"Potential biases in MUSHRA listening tests","author":"zielinski","year":"2007","journal-title":"Proc 123rd Audio Eng Soc Convers"},{"key":"ref172","first-page":"2278","article-title":"High-order sequence modeling using speaker-dependent recurrent temporal restricted Boltzmann machines for voice conversion","author":"nakashika","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc INTERSPEECH"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2353991"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1049\/cp:19991218"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1053"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2582924"},{"key":"ref178","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref177","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2014","journal-title":"arXiv 1409 0473"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078543"},{"key":"ref169","article-title":"Sequence error (SE) minimization training of neural network for voice conversion","author":"xie","year":"0","journal-title":"Proc 15th Annu Conf Int Speech Commun 
Assoc"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-015-3039-x"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2012.05.027"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2009.5372889"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref31","first-page":"2308","article-title":"Text-independent voice conversion using speaker model alignment method from non-parallel speech","author":"song","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc INTERSPEECH"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"806","DOI":"10.1109\/TASL.2011.2165944","article-title":"Voice conversion for non-parallel datasets using dynamic kernel partial least squares regression","volume":"20","author":"silen","year":"2012","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2005.06.001"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/0893-6080(89)90020-8"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2047683"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(94)00058-I"},{"key":"ref269","article-title":"Speaker adaptation for HMM-based speech synthesis system using MLLR","author":"tamura","year":"0","journal-title":"Proc 3rd ESCA\/COCOSDA Workshop (ETRW) Speech Synth"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"1905","DOI":"10.21437\/Interspeech.2011-354","article-title":"Event selection from phone posteriorgrams using matched filters","author":"kintzley","year":"2011","journal-title":"Proc INTERSPEECH"},{"key":"ref288","first-page":"6626","article-title":"GANs trained by a two time-scale update rule converge to a local Nash equilibrium","author":"heusel","year":"2017","journal-title":"Proc Adv Neural Inf Process 
Syst"},{"key":"ref287","article-title":"High fidelity speech synthesis with adversarial networks","author":"bi\u0144kowski","year":"2019","journal-title":"arXiv 1909 11646"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-32"},{"key":"ref285","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2003"},{"key":"ref284","article-title":"Objective evaluation measures for speaker-adaptive HMM-TTS systems","author":"remes","year":"2013","journal-title":"Proc 8th ISCA Workshop Speech Synth"},{"key":"ref181","first-page":"214","article-title":"Deep voice 3: 2000-speaker neural text-to-speech","author":"ping","year":"0","journal-title":"Proc ICLR"},{"key":"ref283","article-title":"Prediction of perceived sound quality of synthetic speech","author":"huang","year":"0","journal-title":"Proc Asia-Pacific Signal Inf Process Assoc Annu Summit Conf"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref282","first-page":"2725","article-title":"An evaluation of synthetic speech using the PESQ measure","author":"cernak","year":"0","journal-title":"Proc Eur Congr Acoust"},{"key":"ref281","article-title":"AutoMOS: Learning a non-intrusive assessor of naturalness-of-speech","author":"patton","year":"2016","journal-title":"arXiv 1611 09207"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-847"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"ref188","article-title":"ConvS2S-VC: Fully convolutional sequence-to-sequence voice conversion","author":"kameoka","year":"2018","journal-title":"arXiv abs\/1811 
01609"},{"key":"ref187","article-title":"Convolutional sequence to sequence learning","author":"gehring","year":"2017","journal-title":"arXiv abs\/1705 03122"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref28","article-title":"Frame alignment method for cross-lingual voice conversion","author":"erro","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038669"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2041688"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472761"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269002"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855137"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1659962"},{"key":"ref23","article-title":"Cross-language voice conversion evaluation using bilingual databases","author":"mashimo","year":"2002","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref26","article-title":"VTLN-based crosslanguage voice conversion","author":"sundermann","year":"0","journal-title":"Proc IEEE Autom Speech Recognit Understanding 
Workshop"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1802"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178897"},{"key":"ref293","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-30"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-34"},{"key":"ref292","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1331"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref295","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1190"},{"key":"ref297","article-title":"Voice conversion challenge 2020: Intra-lingual semi-parallel and cross-lingual voice conversion","author":"zhao","year":"2020","journal-title":"arXiv 2008 12527"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2960721"},{"key":"ref299","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2671435"},{"key":"ref298","first-page":"2037","article-title":"ASVspoof 2015: The first automatic speaker verification spoofing and countermeasures challenge","author":"wu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref154","first-page":"1617","article-title":"High individuality voice conversion based on concatenative speech synthesis","volume":"1","author":"fujii","year":"2007","journal-title":"Int J Elect Comput Energetic Electron Commun Eng"},{"key":"ref153","doi-asserted-by":"crossref","DOI":"10.21437\/Eurospeech.1995-148","article-title":"Optimising selection of units from speech databases for concatenative synthesis","author":"black","year":"1995"},{"key":"ref156","article-title":"Voice conversion of non-aligned data using unit selection","author":"erro","year":"2006"},{"key":"ref155","author":"sagisaka","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref150","article-title":"A first step towards text-independent 
voice conversion","author":"ney","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref291","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2014.10.005"},{"key":"ref152","doi-asserted-by":"crossref","first-page":"1301","DOI":"10.1109\/TSA.2005.860839","article-title":"Quality-enhanced voice morphing using maximum likelihood transformations","volume":"14","author":"ye","year":"0","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"ref290","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-28"},{"key":"ref151","article-title":"Voice conversion for unknown speakers","author":"ye","year":"0","journal-title":"Proc INTERSPEECH 8th Int Conf Spoken Lang Process (ICSLP)"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-567"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472664"},{"key":"ref148","first-page":"1537","article-title":"Transformation of prosody in voice conversion","author":"\u015fi\u015fman","year":"0","journal-title":"Proc Asia-Pacific Signal Inf Process Assoc Annu Summit Conf"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2016.7918382"},{"key":"ref289","article-title":"Demystifying MMD GANs","author":"bi\u0144kowski","year":"2018","journal-title":"arXiv 1801 
01401"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003801"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639507"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-12"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1232"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-314"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269007"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461452"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2835720"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"ref167","first-page":"369","article-title":"Voice conversion in high-order eigen space using deep belief NETS","author":"nakashika","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2014.6936599"},{"key":"ref165","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2020-1542","article-title":"Cotatron: Transcription-guided speech encoder for any-to-many voice conversion without parallel data","author":"park","year":"2020"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-247"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953215"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683746"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2012.2225615"},{"key":"ref4","article-title":"Towards personalised synthesised voices for individuals with vocal disabilities: Voice banking and 
reconstruction","author":"veaux","year":"2013"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1357"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2014.17"},{"key":"ref5","article-title":"Evaluating voice conversion-based privacy protection against informed attackers","author":"srivastava","year":"2019"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1250\/ast.11.71"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref7","article-title":"Defending your voice: Adversarial attack on voice conversion","volume":"abs 2005 8781","author":"huang","year":"2020"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462342"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.857790"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.1991.176405"},{"key":"ref158","article-title":"Eigenvoice conversion based on Gaussian mixture model","author":"toda","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref46","first-page":"6793","article-title":"Blow: A single-scale hyperconditioned flow for non-parallel raw-audio voice conversion","author":"serr\u00e0","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1043"},{"key":"ref48","article-title":"Parallel-data-free voice conversion using cycle-consistent adversarial 
networks","author":"kaneko","year":"2017"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-63"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-116"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820901"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-32"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-31"},{"key":"ref71","article-title":"The voice conversion challenge 2018: Database and results","author":"lorenzo-trueba","year":"2018"},{"key":"ref70","article-title":"The voice conversion challenge: Promoting development of parallel and nonparallel methods","author":"lorenzo-trueba","year":"2018","journal-title":"arXiv 1804 04262"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(99)00015-1"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/89.890068"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(92)90012-V"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674422"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038663"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2019.8902651"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2910637"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1131"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref63","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"2018","journal-title":"arXiv 1802 08435"},{"key":"ref304","article-title":"CSTR VCTK corpus: English multi-speaker corpus 
for cstr voice cloning toolkit","author":"veaux","year":"2016"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref307","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-29"},{"key":"ref65","first-page":"3370","article-title":"Flowavenet: A generative flow for raw audio","author":"kim","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref306","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref66","article-title":"Wavenet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"arXiv 1609 03499"},{"key":"ref301","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2249"},{"key":"ref67","first-page":"1632","article-title":"The voice conversion challenge","author":"toda","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref300","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1111"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2016-7"},{"key":"ref303","article-title":"The CMU arctic speech databases","author":"kominek","year":"0","journal-title":"Proc 5th ISCA Workshop Speech Synth"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1331"},{"key":"ref302","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101114"},{"key":"ref308","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00186"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00821"},{"key":"ref199","first-page":"2672","article-title":"Generative adversarial NETS","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process 
Syst"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00804"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01064"},{"key":"ref196","article-title":"Deep learning approaches for attribute manipulation and text-to-image synthesis","author":"ak","year":"2019"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2872060"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462393"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1528"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00916"},{"key":"ref92","article-title":"Machine learning for limited data voice conversion","author":"sisman","year":"2019"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01065"},{"key":"ref91","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","author":"arik","year":"0","journal-title":"Proc 34th Int Conf 
Mach"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00794-2_48"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003801"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1316"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1514"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682938"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2835720"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1499"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960478"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1002\/ecja.4400660203"},{"key":"ref89","first-page":"6274","article-title":"Teacher-student training for robust tacotron-based TTS","author":"liu","year":"2019","journal-title":"Proc 2020 IEEE Int Conf Acoust Speech Signal Process (ICASSP)"},{"key":"ref85","first-page":"1878","article-title":"Unsupervised learning of disentangled and interpretable representations from sequential data","author":"hsu","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-349"},{"key":"ref87","article-title":"Digital speech processing, synthesis, and recognition(revised and expanded)","author":"furui","year":"2000","journal-title":"Digit Speech Process Synth Recognit"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref200","first-page":"172","article-title":"Multimodal unsupervised image-to-image translation","author":"huang","year":"0","journal-title":"Proc Eur Conf Comput 
Vis"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1424"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref209","article-title":"TimbreTron: A wavenet (cycleGAN (CQT (audio))) pipeline for musical timbre transfer","author":"huang","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00379"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00917"},{"key":"ref201","first-page":"465","article-title":"Toward multimodal image-to-image translation","author":"zhu","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2020.02.030"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CSCI46756.2018.00290"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1607\/1\/012046"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2409"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268927"},{"key":"ref211","first-page":"6820","article-title":"Cycle GAN-VC2: Improved cycle GAN-based non-parallel voice conversion","author":"kaneko","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref212","article-title":"Unsupervised cross-domain image generation","author":"taigman","year":"2016","journal-title":"arXiv abs\/1611 
02200"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682156"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003939"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-33"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2014"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639647"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-35"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.3016564"},{"key":"ref220","article-title":"Expressive TTS training with frame and style reconstruction loss","author":"liu","year":"2020","journal-title":"arXiv 2008 01490"},{"key":"ref222","article-title":"Voice transformer network: Sequence-to-sequence voice conversion using transformer with text-to-speech pretraining","author":"huang","year":"2019","journal-title":"arXiv 1912 06813"},{"key":"ref221","article-title":"Transfer learning from speech synthesis to voice conversion with non-parallel training data","author":"zhang","year":"2020"},{"key":"ref229","article-title":"Autoencoding beyond pixels using a learned similarity metric","author":"larsen","year":"0","journal-title":"Proc 33rd Int Conf Mach Learn PMLR"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPA.2018.8659543"},{"key":"ref227","article-title":"Multi-target emotional voice conversion with neural vocoders","author":"liu","year":"2020","journal-title":"arXiv 2004 03782"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1789"},{"key":"ref225","article-title":"Nautilus: A versatile voice cloning system","author":"luong","year":"2020","journal-title":"arXiv 2005 
11004"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9004008"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682380"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-35292-8_4"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2013.6694179"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.5923\/j.ajsp.20120205.06"},{"key":"ref124","article-title":"GMM-based voice conversion applied to emotional speech synthesis","author":"kawanami","year":"0","journal-title":"Proc 8th Eur Conf Speech Commun Technol"},{"key":"ref129","article-title":"A perceptual investigation of wavelet-based decomposition of f0 for text-to-speech synthesis","author":"ribeiro","year":"0","journal-title":"Proc 16th Annu Conf Int Speech Commun Assoc"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2014.12.004"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472734"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038663"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/ISSPIT.2003.1341181"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CHINSL.2008.ECP.44"},{"key":"ref132","first-page":"2318","article-title":"Hierarchical modeling of F0 contours for voice conversion","author":"sanchez","year":"0","journal-title":"Proc Annu Conf Int Speech Commun Assoc INTERSPEECH"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1443"},{"key":"ref233","first-page":"10","article-title":"Auto-encoding variational 
bayes","volume":"1050","author":"kingma","year":"2014","journal-title":"Stat"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053854"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPA.2018.8659628"},{"key":"ref235","first-page":"14 866","article-title":"Generating diverse high-fidelity images with vq-vae-2","author":"razavi","year":"2019","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref234","first-page":"6306","article-title":"Neural discrete representation learning","author":"van den oord","year":"2017","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2018.8706604"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1198"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2006.08.001"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2004.1325911"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2177820"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-34584-5_24"},{"key":"ref139","first-page":"556","article-title":"Algorithms for non-negative matrix factorization","author":"lee","year":"2001","journal-title":"Adv Neural Inf Process 
Syst"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2571727"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2013.2270369"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/ICCPCT.2015.7159386"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4518538"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674423"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2015.2427520"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/110311a0"},{"key":"ref145","article-title":"Examplar-based voice conversion using non-negative spectrogram deconvolution","author":"wu","year":"0","journal-title":"Proc 8th ISCA Speech Synth Workshop"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-970"},{"key":"ref242","first-page":"7354","article-title":"Self-attention generative adversarial networks","author":"zhang","year":"2019","journal-title":"Proc Int Conf Mach Learn (PMLR)"},{"key":"ref243","first-page":"469","article-title":"Coupled generative adversarial networks","author":"liu","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref244","first-page":"2401","article-title":"Edge-GAN: Edge conditioned multi-view face image generation","author":"zou","year":"2020","journal-title":"Proc IEEE Int Conf Image Process"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2020.2977678"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.5194\/gmd-7-1247-2014"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-00296-0_5"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref245","article-title":"Transferring source style in non-parallel voice conversion","author":"liu","year":"2020","journal-title":"arXiv 2005 
09178"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.3354\/cr030079"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2956145"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-1"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682298"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937165"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1288"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1563"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-2"},{"key":"ref111","first-page":"1","article-title":"An exemplar-based approach to frequency warping for voice conversion","volume":"25","author":"tian","year":"2016","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(94)00053-D"},{"key":"ref110","first-page":"14 910","article-title":"Melgan: Generative adversarial networks for conditional waveform synthesis","author":"kumar","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-49127-9_5"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-014-0446-1"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941039"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.4337\/9781781003152.00014"},{"key":"ref252","article-title":"Optimization of an objective measure for estimating mean opinion score of synthesized 
speech","author":"chu","year":"2008"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2003.12.001"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1109\/ICSP.2016.7877819"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2020.05.004"},{"key":"ref259","first-page":"1","article-title":"Mel frequency cepstral coefficients for music modeling","author":"logan","year":"2000","journal-title":"Proc Int Symp Music Inf Retrieval"},{"key":"ref10","article-title":"On the impact of alignment on voice conversion performance","author":"helander","year":"2008","journal-title":"Proc 9th Annu Conf Int Speech Commun Assoc"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2012.6288796"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref12","article-title":"Probabilistic feature mapping based on trajectory HMMs","author":"zen","year":"2008","journal-title":"Proc 9th Annu Conf Int Speech Commun Assoc"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-970"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2041699"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2165944"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6853897"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/79.543975"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424242"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2005.1415037"},{"key":"ref18","doi-asserted-by":"crossref","first-page":"1506","DOI":"10.1109\/TASLP.2014.2333242","article-title":"Exemplar-based sparse representation with residual compensation for voice conversion","volume":"22","author":"wu","year":"2014","journal-title":"IEEE\/ACM Trans Audio Speech Lang 
Process"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-40171-3_3"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1038\/nbt1406"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1250\/ast.14.353"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref116","article-title":"Voice conversion algorithm based on Gaussian mixture model applied to straight","author":"toda","year":"0"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941046"},{"key":"ref120","first-page":"145","article-title":"Em algorithms of Gaussian mixture model and hidden Markov model","author":"xuan","year":"0","journal-title":"Proc Int Conf Image Process"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1561\/9781601984319"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2014.7041540"},{"key":"ref123","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2006-582","article-title":"Maximum likelihood voice conversion based on GMM with straight mixed excitation","author":"ohtani","year":"2006"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2017.8282110"},{"key":"ref261","first-page":"3969","article-title":"Reducing F0 frame error of F0 tracking algorithms under noisy conditions with an unvoiced\/voiced classification frontend","author":"chu","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.09.003"},{"key":"ref263","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","author":"skerry-ryan","year":"2018","journal-title":"arXiv 1803 09047"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1109\/IALP.2017.8300542"},{"key":"ref265","article-title":"Transformation of spectral envelope for voice conversion based on radial basis function networks","author":"watanabe","year":"0","journal-title":"Proc 
Int Conf Spoken Lang Process"},{"key":"ref266","article-title":"Cross-lingual voice conversion-based polyglot speech synthesizer for indian languages","author":"ramani","year":"0","journal-title":"Proc 15th Annu Conf Int Speech Commun Assoc"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09262021.pdf?arnumber=9262021","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,28]],"date-time":"2022-11-28T14:19:18Z","timestamp":1669645158000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9262021\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":308,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.3038524","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}