{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T09:11:33Z","timestamp":1775121093262,"version":"3.50.1"},"reference-count":50,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2019,1]]},"DOI":"10.1016\/j.specom.2018.11.007","type":"journal-article","created":{"date-parts":[[2018,11,30]],"date-time":"2018-11-30T06:17:04Z","timestamp":1543558624000},"page":"57-67","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":5,"special_numbering":"C","title":["Voice conversion with SI-DNN and KL divergence based mapping without parallel training data"],"prefix":"10.1016","volume":"106","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1206-3696","authenticated-orcid":false,"given":"Feng-Long","family":"Xie","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frank K.","family":"Soong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haifeng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.specom.2018.11.007_bib0001","series-title":"Proc. ICASPP","first-page":"655","article-title":"Voice conversion through vector quantization","author":"Abe","year":"1988"},{"key":"10.1016\/j.specom.2018.11.007_bib0002","series-title":"Proc. ICASSP","first-page":"7909","article-title":"Non-parallel voice conversion using joint optimization of alignment by temporal context and spectral distortion","author":"Benisty","year":"2014"},{"key":"10.1016\/j.specom.2018.11.007_bib0003","series-title":"Proc. INTERSPEECH","first-page":"3053","article-title":"Joint spectral distribution modeling using restricted boltzmann machines for voice conversion","author":"Chen","year":"2013"},{"issue":"12","key":"10.1016\/j.specom.2018.11.007_bib0004","doi-asserted-by":"crossref","first-page":"1859","DOI":"10.1109\/TASLP.2014.2353991","article-title":"Voice conversion using deep neural networks with layer-wise generative training","volume":"22","author":"Chen","year":"2014","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"5","key":"10.1016\/j.specom.2018.11.007_bib0005","doi-asserted-by":"crossref","first-page":"954","DOI":"10.1109\/TASL.2010.2047683","article-title":"Spectral mapping using artifical neural networks for voice conversion","volume":"18","author":"Desai","year":"2010","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2018.11.007_bib0006","series-title":"Proc. ICASSP","first-page":"677","article-title":"A new minimum divergence approach to discriminative training","author":"Du","year":"2007"},{"issue":"5","key":"10.1016\/j.specom.2018.11.007_bib0007","doi-asserted-by":"crossref","first-page":"944","DOI":"10.1109\/TASL.2009.2038669","article-title":"INCA algorithm for training voice conversion systems from nonparallel corpora","volume":"18","author":"Erro","year":"2010","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"6","key":"10.1016\/j.specom.2018.11.007_bib0008","doi-asserted-by":"crossref","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","article-title":"Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups","volume":"29","author":"Hinton","year":"2012","journal-title":"IEEE Signal Process. Mag."},{"key":"10.1016\/j.specom.2018.11.007_bib0009","series-title":"Proc. INTERSPEECH","first-page":"1886","article-title":"A new DNN-based high quality pronunciation evaluation for computer-aided language learning(CALL)","author":"Hu","year":"2013"},{"key":"10.1016\/j.specom.2018.11.007_bib0010","series-title":"Proc. ICASSP","first-page":"285","article-title":"Spectral Voice Conversion for Text-to-speech Synthesis","author":"Kain","year":"1998"},{"key":"10.1016\/j.specom.2018.11.007_bib0011","series-title":"Proc. INTERSPEECH","doi-asserted-by":"crossref","first-page":"1667","DOI":"10.21437\/Interspeech.2016-970","article-title":"The NU-NAIST Voice Conversion System for the Voice Conversion Challenge 2016","author":"Kobayashi","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_sbref0012","article-title":"The CMU ARCTIC databases for speech synthesis","author":"Kominek","year":"2003","journal-title":"Tech. Rep. CMU-LTI-03-177, Language Technologies Institute"},{"issue":"1","key":"10.1016\/j.specom.2018.11.007_bib0013","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1214\/aoms\/1177729694","article-title":"On information and sufficiency","volume":"22","author":"Kullback","year":"1951","journal-title":"Anal. Math. Stat."},{"key":"10.1016\/j.specom.2018.11.007_bib0014","series-title":"Proc. INTERSPEECH","first-page":"2254","article-title":"Map-based adaptation for speech conversion using adaptation data selection and non-parallel training","author":"Lee","year":"2006"},{"issue":"5","key":"10.1016\/j.specom.2018.11.007_bib0015","doi-asserted-by":"crossref","first-page":"1492","DOI":"10.1109\/TASL.2011.2182511","article-title":"Minimum kullback-leibler divergence parameter generation for HMM-based speech synthesis","volume":"20","author":"Ling","year":"2012","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2018.11.007_bib0016","series-title":"Proc. ICASSP","first-page":"5175","article-title":"Exemplar-based sparse representation of timbre and prosody for voice conversion","author":"Ming","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_bib0017","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1016\/j.specom.2017.01.008","article-title":"An overview of voice conversion systems","volume":"88","author":"Mohammadi","year":"2017","journal-title":"Speech Commun."},{"issue":"3","key":"10.1016\/j.specom.2018.11.007_bib0018","doi-asserted-by":"crossref","first-page":"952","DOI":"10.1109\/TSA.2005.857790","article-title":"Nonparallel training for voice conversion based on a parameter adaptation approach","volume":"14","author":"Mouchtaris","year":"2006","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"4","key":"10.1016\/j.specom.2018.11.007_bib0019","doi-asserted-by":"crossref","first-page":"1180","DOI":"10.1109\/TASL.2007.894511","article-title":"A spectral conversion approach to single-channel speech enhancement","volume":"15","author":"Mouchtaris","year":"2007","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2018.11.007_bib0020","series-title":"Proc. ICASSP","first-page":"552","article-title":"Optimal clustering of multivariate normal distributions using divergence and its application to HMM adaptation","author":"Myrvoll","year":"2003"},{"issue":"2","key":"10.1016\/j.specom.2018.11.007_bib0021","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1016\/0167-6393(94)00058-I","article-title":"Transformation of formants for voice conversion using artificial neural networks","volume":"16","author":"Narendranath","year":"1995","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2018.11.007_bib0022","unstructured":"Odell, J. J. The use of context in large vocabulary speech recognition, 1995."},{"key":"10.1016\/j.specom.2018.11.007_bib0023","series-title":"Proc. ICASSP","first-page":"1847","article-title":"Narrowband to wideband conversion of speech using GMM based transformation","author":"Park","year":"2000"},{"key":"10.1016\/j.specom.2018.11.007_bib0024","series-title":"Proc. DARPA SLS Workshop","first-page":"357","article-title":"The design for the wall street journal-based CSR corpus","author":"Paul","year":"1992"},{"issue":"3","key":"10.1016\/j.specom.2018.11.007_bib0025","doi-asserted-by":"crossref","first-page":"373","DOI":"10.1109\/LSP.2017.2723507","article-title":"Low latency acoustic modeling using temporal convolution and LSTMs","volume":"25","author":"Peddinti","year":"2018","journal-title":"IEEE Signal Process. Lett."},{"issue":"6","key":"10.1016\/j.specom.2018.11.007_bib0026","doi-asserted-by":"crossref","first-page":"1231","DOI":"10.1109\/TASL.2009.2015708","article-title":"A cross-language state sharing and mapping approach to bilingual (mandarin-english) TTS","volume":"17","author":"Qian","year":"2009","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2018.11.007_bib0027","doi-asserted-by":"crossref","first-page":"153","DOI":"10.1016\/S0885-2308(03)00005-6","article-title":"Modelling the uncertainty in recovering articulation from acoustics","volume":"17","author":"Richmond","year":"2003","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.specom.2018.11.007_bib0028","series-title":"Proc. INTERSPEECH","first-page":"1509","article-title":"Robust bandwidth extension of noise-corrupted narrowband speech","author":"Seltzer","year":"2005"},{"key":"10.1016\/j.specom.2018.11.007_bib0029","series-title":"Proc. ICASSP","first-page":"1.10.1","article-title":"Line spectrum pair (LSP) and speech data compression","author":"Soong","year":"1984"},{"issue":"2","key":"10.1016\/j.specom.2018.11.007_bib0030","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1109\/89.661472","article-title":"Continuous probabilistic transform for voice conversion","volume":"6","author":"Stylianou","year":"1998","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2018.11.007_bib0031","series-title":"Proc. ICASSP","first-page":"4869","article-title":"Voice conversion using deep bidirectional long short-term memory based recurrent neural networks","author":"Sun","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_bib0032","series-title":"Proc. ICME","first-page":"1","article-title":"Phonetic posteriorgrams for many-to-one voice conversion without parallel data training","author":"Sun","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_bib0033","series-title":"Proc. ICSLP","article-title":"A first step towards text-independent voice conversion","author":"Sundermann","year":"2004"},{"key":"10.1016\/j.specom.2018.11.007_bib0034","series-title":"Proc. ICASSP","first-page":"81","article-title":"Text-independent voice conversion based on unit selection","author":"Sundermann","year":"2006"},{"key":"10.1016\/j.specom.2018.11.007_bib0035","series-title":"Proc. CSICC","first-page":"495","article-title":"A new wavelet thresholding method for speech enhancement based on symmetric kullback-leibler divergence","author":"Tabibian","year":"2009"},{"issue":"5","key":"10.1016\/j.specom.2018.11.007_bib0036","doi-asserted-by":"crossref","first-page":"932","DOI":"10.1109\/TASL.2010.2041688","article-title":"Supervisory data alignment for text-independent voice conversion","volume":"18","author":"Tao","year":"2010","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"8","key":"10.1016\/j.specom.2018.11.007_bib0037","doi-asserted-by":"crossref","first-page":"2222","DOI":"10.1109\/TASL.2007.907344","article-title":"Voice conversion based on maximum-likelihood estimation of spectral parameter trajectory","volume":"15","author":"Toda","year":"2007","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"3","key":"10.1016\/j.specom.2018.11.007_bib0038","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1016\/j.specom.2007.09.001","article-title":"Statistical mapping between articulatory movements and acoustic spectrum using a gaussian mixture model","volume":"50","author":"Toda","year":"2008","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2018.11.007_bib0039","series-title":"Proc. INTERSPEECH","doi-asserted-by":"crossref","first-page":"1632","DOI":"10.21437\/Interspeech.2016-1066","article-title":"The voice conversion challenge 2016","author":"Toda","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_bib0040","series-title":"Proc. ICASSP","first-page":"3601","article-title":"Voice conversion for various types of body transmitted speech","author":"Toda","year":"2009"},{"key":"10.1016\/j.specom.2018.11.007_bib0041","series-title":"Proc. ICASSP","first-page":"1315","article-title":"Speech parameter generation algorithms for HMM-based speech synthesis","author":"Tokuda","year":"2000"},{"key":"10.1016\/j.specom.2018.11.007_bib0042","series-title":"Proc. ICASPP","first-page":"145","article-title":"Voice transformation using PSOLA technique","author":"Valbret","year":"1992"},{"key":"10.1016\/j.specom.2018.11.007_bib0043","series-title":"Proc. ChinaSIP","first-page":"104","article-title":"Conditional restricted boltzmann machine for voice conversion","author":"Wu","year":"2013"},{"issue":"10","key":"10.1016\/j.specom.2018.11.007_bib0044","doi-asserted-by":"crossref","first-page":"1506","DOI":"10.1109\/TASLP.2014.2333242","article-title":"Exemplar-based sparse representation with residual compensation for voice conversion","volume":"22","author":"Wu","year":"2014","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2018.11.007_bib0045","series-title":"Proc. INTERSPEECH","first-page":"2283","article-title":"Sequence error (SE) minimization training of neural network for voice conversion","author":"Xie","year":"2014"},{"key":"10.1016\/j.specom.2018.11.007_bib0046","series-title":"Proc. ICASSP","first-page":"5515","article-title":"A KL divergence and DNN approach to cross-lingual TTS","author":"Xie","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_bib0047","series-title":"Proc. INTERSPEECH","doi-asserted-by":"crossref","first-page":"287","DOI":"10.21437\/Interspeech.2016-116","article-title":"A KL divergence and DNN approach to voice conversion without parallel training sentences","author":"Xie","year":"2016"},{"key":"10.1016\/j.specom.2018.11.007_bib0048","series-title":"Proc. ICSLP","article-title":"Voice conversion for unkown speakers","author":"Ye","year":"2004"},{"key":"10.1016\/j.specom.2018.11.007_bib0049","first-page":"7893","article-title":"KL-Divergence regularized deep neural network adaptation for improved large vocabulary speech recognition","author":"Yu","year":"2013","journal-title":"Proc. ICASSP"},{"key":"10.1016\/j.specom.2018.11.007_bib0050","series-title":"Proc. SSW","first-page":"206","article-title":"Measuring attribute dissimilarity with HMM KL-divergence for speech synthesis","author":"Zhao","year":"2007"}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639318300414?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639318300414?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2019,1,16]],"date-time":"2019-01-16T11:25:49Z","timestamp":1547637949000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639318300414"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,1]]},"references-count":50,"alternative-id":["S0167639318300414"],"URL":"https:\/\/doi.org\/10.1016\/j.specom.2018.11.007","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2019,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Voice conversion with SI-DNN and KL divergence based mapping without parallel training data","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2018.11.007","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2018 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}