{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T17:47:06Z","timestamp":1776880026755,"version":"3.51.2"},"reference-count":60,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Institute for Information &amp; Communications Technology Planning &amp; Evaluation","award":["2019-0-00079"],"award-info":[{"award-number":["2019-0-00079"]}]},{"name":"Artificial Intelligence Graduate School Program"},{"DOI":"10.13039\/501100002642","name":"Korea University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002642","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Netmarble AI Center"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/taslp.2022.3156757","type":"journal-article","created":{"date-parts":[[2022,3,7]],"date-time":"2022-03-07T20:51:53Z","timestamp":1646686313000},"page":"1173-1183","source":"Crossref","is-referenced-by-count":18,"title":["Duration Controllable Voice Conversion via Phoneme-Based Information Bottleneck"],"prefix":"10.1109","volume":"30","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8925-4474","authenticated-orcid":false,"given":"Sang-Hoon","family":"Lee","sequence":"first","affiliation":[{"name":"Department of Brain and Cognitive Engineering, Korea University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4511-0839","authenticated-orcid":false,"given":"Hyeong-Rae","family":"Noh","sequence":"additional","affiliation":[{"name":"Department of Brain and Cognitive Engineering, Korea University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6548-4486","authenticated-orcid":false,"given":"Woo-Jeoung","family":"Nam","sequence":"additional","affiliation":[{"name":"Department of Computer and Radio Communications Engineering, Korea University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6249-4996","authenticated-orcid":false,"given":"Seong-Whan","family":"Lee","sequence":"additional","affiliation":[{"name":"Department of Artificial Intelligence, Korea University, Seoul, South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1558"},{"key":"ref3","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jia","year":"2018"},{"key":"ref4","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Skerry-Ryan","year":"2018"},{"key":"ref5","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2018"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICACCI.2016.7732175"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953090"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639636"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683746"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639647"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2960721"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3049336"},{"key":"ref15","first-page":"5210","article-title":"AutoVC: Zero-shot voice style transfer with only autoencoder loss","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qian","year":"2019"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054734"},{"key":"ref17","first-page":"7836","article-title":"Unsupervised speech decomposition via triple information bottleneck","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qian","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414257"},{"key":"ref19","article-title":"Improving zero-shot voice style transfer via disentangled representation learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yuan","year":"2021"},{"key":"ref20","article-title":"Instance normalization: The missing ingredient for fast stylization","author":"Ulyanov","year":"2016"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414257"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2041699"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2333242"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2353991"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078543"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2236"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00916"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-63"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"ref37","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sutskever","year":"2014"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1066"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2123"},{"key":"ref40","article-title":"Parallel neural text-to-speech","author":"Peng","year":"2019"},{"key":"ref41","first-page":"3171","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ren","year":"2019"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054484"},{"key":"ref43","article-title":"Fastspeech 2: Fast and high-quality end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren","year":"2020"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.5555\/2946645.2946704"},{"key":"ref47","volume-title":"Pattern Recognit. Support Vector Mach.: First Int. Workshop, SVM 2002,","volume":"2388","author":"Lee","year":"2003"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref51","article-title":"WaveNet: A generative model for raw audio","author":"Oord","year":"2016"},{"key":"ref52","article-title":"Superseded-CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit","volume-title":"Univ. Edinburgh. Centre Speech Technol. Res.","author":"Veaux","year":"2016"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref54","first-page":"37","article-title":"Autoencoders, unsupervised learning, and deep architectures","volume-title":"Proc. ICML Workshop Unsupervised Transfer Learn.","author":"Baldi","year":"2012"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref56","article-title":"VoiceMixer: Adversarial voice style mixup","volume":"34","author":"Lee","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-1064"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2892235"},{"key":"ref60","article-title":"Non-attentive tacotron: Robust and controllable neural TTS synthesis including unsupervised duration modeling","author":"Shen","year":"2020"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9657755\/09729483.pdf?arnumber=9729483","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,17]],"date-time":"2024-01-17T23:44:21Z","timestamp":1705535061000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9729483\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/taslp.2022.3156757","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]}}}