{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T23:57:34Z","timestamp":1774742254671,"version":"3.50.1"},"reference-count":76,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2020,8,1]],"date-time":"2020-08-01T00:00:00Z","timestamp":1596240000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"MOST-Taiwan","award":["MOST 107-2221-E-001-008-MY3"],"award-info":[{"award-number":["MOST 107-2221-E-001-008-MY3"]}]},{"name":"MOST-Taiwan","award":["MOST 108-2634-F-001-004"],"award-info":[{"award-number":["MOST 108-2634-F-001-004"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Emerg. Top. Comput. Intell."],"published-print":{"date-parts":[[2020,8]]},"DOI":"10.1109\/tetci.2020.2977678","type":"journal-article","created":{"date-parts":[[2020,4,7]],"date-time":"2020-04-07T01:35:24Z","timestamp":1586223324000},"page":"468-479","source":"Crossref","is-referenced-by-count":28,"title":["Unsupervised Representation Disentanglement Using Cross Domain Features and Adversarial Learning in Variational Autoencoder Based Voice Conversion"],"prefix":"10.1109","volume":"4","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3172-3335","authenticated-orcid":false,"given":"Wen-Chin","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1900-279X","authenticated-orcid":false,"given":"Hao","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hsin-Te","family":"Hwang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen-Chou","family":"Lo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu-Huai","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6956-0418","authenticated-orcid":false,"given":"Yu","family":"Tsao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3599-5071","authenticated-orcid":false,"given":"Hsin-Min","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"arXiv 1607 06450"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"ref71","first-page":"5767","article-title":"Improved training of wasserstein gans","author":"gulrajani","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref70","article-title":"Wasserstein gan","author":"arjovsky","year":"2017","journal-title":"arXiv 1701 07875"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953090"},{"key":"ref74","first-page":"1436","article-title":"Ways to implement global variance in statistical speech synthesis","author":"siln","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2522655"},{"key":"ref38","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref33","first-page":"195","article-title":"The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods","author":"lorenzo-trueba","year":"0","journal-title":"IEEE Odyssey Speaker and Language Recognition Workshop"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1662"},{"key":"ref31","first-page":"282","article-title":"Adaptive wavenet vocoder for residual compensation in gan-based voice conversion","author":"sisman","year":"0","journal-title":"Proc IEEE Spoken Lang Technol Workshop"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2019.8902651"},{"key":"ref37","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2013","journal-title":"arXiv 1312 6114"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038669"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461932"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462663"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.11.063"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683204"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1190"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1761"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1131"},{"key":"ref65","first-page":"2610","article-title":"Isolating sources of disentanglement in variational autoencoders","author":"chen","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref66","article-title":"A framework for the quantitative evaluation of disentangled representations","author":"eastwood","year":"0","journal-title":"Int Conf Learn Representations"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639608"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1774"},{"key":"ref69","first-page":"1558","article-title":"Autoencoding beyond pixels using a learned similarity metric","volume":"48","author":"larsen","year":"0","journal-title":"Proc 33rd Int Conf Mach Learn"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1985.1168479"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2047683"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2353991"},{"key":"ref24","first-page":"1118","article-title":"Speaker-dependent wavenet vocoder","author":"tamamori","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref23","article-title":"Wavenet: A generative model for raw audio","author":"oord","year":"2016","journal-title":"arXiv 1609 03499"},{"key":"ref26","first-page":"1138","article-title":"Statistical voice conversion with wavenet-based waveform generation","author":"kobayashi","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref25","first-page":"712","article-title":"An investigation of multi-speaker training for wavenet vocoder","author":"hayashi","year":"0","journal-title":"Proc IEEE Autom Speech Recognit Understanding Workshop"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2341"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref59","first-page":"2096","article-title":"Domain-adversarial training of neural networks","volume":"17","author":"ganin","year":"2016","journal-title":"J Mach Learn Res"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007379606734"},{"key":"ref57","first-page":"1508","article-title":"Multi-objective learning and mask-based post-processing for deep neural network based speech enhancement","author":"xu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225953"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2018.8706604"},{"key":"ref53","first-page":"5210","article-title":"AutoVC: Zero-shot voice style transfer with only autoencoder loss","author":"qian","year":"0","journal-title":"Proceedings 36th Int Conf Mach Learn"},{"key":"ref52","first-page":"1947","article-title":"Emergence of invariance and disentanglement in deep representations","volume":"19","author":"achille","year":"2018","journal-title":"J Mach Learn Res"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ISSPIT.2003.1341181"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038663"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2177820"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424242"},{"key":"ref15","doi-asserted-by":"crossref","first-page":"1506","DOI":"10.1109\/TASLP.2014.2333242","article-title":"Exemplar-based sparse representation with residual compensation for voice conversion","volume":"22","author":"wu","year":"2014","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-567"},{"key":"ref17","first-page":"677","article-title":"Sparse representation of phonetic features for voice conversion with and without parallel data","author":"sisman","year":"0","journal-title":"Proc IEEE Autom Speech Recognit Understanding Workshop"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2723721"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(94)00058-I"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.674423"},{"key":"ref3","first-page":"3077","article-title":"Foreign accent conversion through voice morphing","author":"aryal","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2011.07.007"},{"key":"ref5","first-page":"1514","article-title":"Voice expression conversion with factorised hmm-tts models","author":"latorre","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref8","first-page":"3067","article-title":"A hybrid approach to electrolaryngeal speech enhancement based on spectral subtraction and statistical voice conversion","author":"tanaka","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2013.2286917"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683561"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref46","first-page":"240","article-title":"Can we steal your vocal identity from the internet?: Initial investigation of cloning obamas voice using gan, wavenet and low-quality found data","author":"lorenzo-trueba","year":"0","journal-title":"IEEE Odyssey Speaker and Language Recognition Workshop"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1830"},{"key":"ref48","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"hsu","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref47","first-page":"1878","article-title":"Unsupervised learning of disentangled and interpretable representations from sequential data","author":"hsu","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref42","article-title":"Parallel-data-free voice conversion using cycle-consistent adversarial networks","author":"kaneko","year":"2017","journal-title":"arXiv 1711 11293"},{"key":"ref41","first-page":"3364","article-title":"Voice conversion from unaligned corpora using variational autoencoding wasserstein generative adversarial networks","author":"hsu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682897"}],"container-title":["IEEE Transactions on Emerging Topics in Computational Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7433297\/9145895\/09057379.pdf?arnumber=9057379","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T10:54:50Z","timestamp":1643280890000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9057379\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,8]]},"references-count":76,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tetci.2020.2977678","relation":{},"ISSN":["2471-285X"],"issn-type":[{"value":"2471-285X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,8]]}}}