{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T14:55:16Z","timestamp":1777128916200,"version":"3.51.4"},"reference-count":73,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"NSF","award":["1619212"],"award-info":[{"award-number":["1619212"]}]},{"name":"NSF","award":["1623750"],"award-info":[{"award-number":["1623750"]}]},{"DOI":"10.13039\/100007904","name":"Texas A and M University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100007904","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2021.3060813","type":"journal-article","created":{"date-parts":[[2021,7,8]],"date-time":"2021-07-08T19:48:01Z","timestamp":1625773681000},"page":"2367-2381","source":"Crossref","is-referenced-by-count":25,"title":["Converting Foreign Accent Speech Without a Reference"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6059-4053","authenticated-orcid":false,"given":"Guanlong","family":"Zhao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2108-3111","authenticated-orcid":false,"given":"Shaojin","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ricardo","family":"Gutierrez-Osuna","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"CoRR"},{"key":"ref72","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref71","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"J Mach Learn Res"},{"key":"ref70","first-page":"435","article-title":"Preventing gradient explosions in gated recurrent units","author":"kanai","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"540","DOI":"10.1109\/TASLP.2019.2960721","article-title":"Non-parallel sequence-to-sequence voice conversion with disentangled linguistic and speaker representations","volume":"28","author":"zhang","year":"2019","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078543"},{"key":"ref31","first-page":"2514","article-title":"Statistical singing voice conversion with direct waveform modification based on the spectrum differential","author":"kobayashi","year":"2014","journal-title":"Proc INTERSPEECH"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953213"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-28"},{"key":"ref36","first-page":"paper 125","article-title":"WaveNet: A generative model for raw audio","author":"oord","year":"2016","journal-title":"Proc Int Sci Community Assoc Workshop Speech Synth"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-247"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-1770.1995.tb00963.x"},{"key":"ref62","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)","author":"yamagishi","year":"2019"},{"key":"ref61","article-title":"Recommendation P.800.1.mean opinion score (MOS) terminology, International Telecommunication Union-Telecommunication Standardisation Sector (ITU-T)","year":"2003"},{"key":"ref63","article-title":"Speech signal processing toolkit (SPTK) version 3.11","author":"group","year":"2017"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1778"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-68"},{"key":"ref66","first-page":"4485","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-014-2180-2"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1972"},{"key":"ref68","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"Proc ICLR"},{"key":"ref69","first-page":"950","article-title":"A simple weight decay can improve generalization","author":"krogh","year":"1992","journal-title":"Proc Ad Neural Inf Process Syst"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(01)00009-7"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2008.11.004"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2015.02.003"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2014.2312456"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2004.1326078"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6855134"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-596"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2009.5372889"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462258"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462020"},{"key":"ref59","doi-asserted-by":"crossref","first-page":"3053","DOI":"10.21437\/Interspeech.2011-764","article-title":"Crowdsourcing preference tests, and how to detect cheating","author":"buchholz","year":"2011","journal-title":"Proc INTERSPEECH"},{"key":"ref58","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref57","article-title":"Audacity(R): Free audio editor and recorder","year":"2020","journal-title":"The Audacity Team"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1110"},{"key":"ref55","first-page":"223","article-title":"The CMU arctic speech databases","author":"kominek","year":"2004","journal-title":"Proc 5th Int Sci Community Assoc ITRW Speech Synth Workshop"},{"key":"ref54","first-page":"10\ufffd215","article-title":"Glow: Generative flow with invertible 1x1 convolutions","author":"kingma","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2021.3070203"},{"key":"ref52","article-title":"An overview of multi-task learning in deep neural networks","author":"ruder","year":"2017"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-116"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2926754"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3001456"},{"key":"ref12","first-page":"6785","article-title":"Improving sequence-to-sequence acoustic modeling by adding text-supervision","author":"zhang","year":"2019","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1160"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2325"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311537"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2201474"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1121\/1.4904701"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2010.08.015"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2019.10.005"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1043"},{"key":"ref5","first-page":"289","article-title":"Subband based voice conversion","author":"turk","year":"2002","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1789"},{"key":"ref7","first-page":"299","article-title":"Non-native speech synthesis preserving speaker individuality based on partial correction of prosodic and phonetic characteristics","author":"oshima","year":"2015","journal-title":"Proc INTERSPEECH"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref46","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"Proc IEEE Workshop on Automatic Speech Recognition and Understanding"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/VCC_BC.2020-17"},{"key":"ref47","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1417"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053797"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref43","first-page":"3214","article-title":"A time delay neural network architecture for efficient modeling of long temporal contexts","author":"peddinti","year":"2015","journal-title":"Proc INTERSPEECH"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/6570655\/9289074\/9477581-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09477581.pdf?arnumber=9477581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,3]],"date-time":"2023-01-03T11:38:05Z","timestamp":1672745885000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9477581\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":73,"URL":"https:\/\/doi.org\/10.1109\/taslp.2021.3060813","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}