{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:06:07Z","timestamp":1761897967302,"version":"3.37.3"},"reference-count":53,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Institute of Information and communications Technology Planning and Evaluation"},{"name":"Korea government","award":["2021-0-00456"],"award-info":[{"award-number":["2021-0-00456"]}]},{"name":"Development of Ultra-high Speech Quality Technology for Remote Multi-speaker Conference System"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3364085","type":"journal-article","created":{"date-parts":[[2024,2,8]],"date-time":"2024-02-08T18:55:02Z","timestamp":1707418502000},"page":"1519-1530","source":"Crossref","is-referenced-by-count":6,"title":["Transfer Learning for Low-Resource, Multi-Lingual, and Zero-Shot Multi-Speaker Text-to-Speech"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4689-3110","authenticated-orcid":false,"given":"Myeonghun","family":"Jeong","sequence":"first","affiliation":[{"name":"Kakao Enterprise, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8150-765X","authenticated-orcid":false,"given":"Minchan","family":"Kim","sequence":"additional","affiliation":[{"name":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1319-8215","authenticated-orcid":false,"given":"Byoung Jin","family":"Choi","sequence":"additional","affiliation":[{"name":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9978-0582","authenticated-orcid":false,"given":"Jaesam","family":"Yoon","sequence":"additional","affiliation":[{"name":"Kakao Enterprise, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4711-780X","authenticated-orcid":false,"given":"Won","family":"Jang","sequence":"additional","affiliation":[{"name":"Kakao Enterprise, Seongnam, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0568-4902","authenticated-orcid":false,"given":"Nam Soo","family":"Kim","sequence":"additional","affiliation":[{"name":"Institute of New Media and Communications, Department of Electrical and Computer Engineering, Seoul National University, Seoul, Republic of Korea"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.21437\/Interspeech.2021-469"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref3","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren","year":"2021"},{"key":"ref4","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim","year":"2021"},{"key":"ref5","first-page":"8067","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Kim","year":"2020"},{"key":"ref6","first-page":"741","article-title":"Low-resource multilingual and zero-shot multispeaker TTS","volume-title":"Proc. 2nd Conf. Asia-Pacific Chapter Assoc. Comput. Linguistics 12th Int. Joint Conf. Natural Lang. Process.","author":"Lux","year":"2022"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/ACCESS.2022.3141200"},{"key":"ref8","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Casanova","year":"2022"},{"key":"ref9","first-page":"5410","article-title":"Almost unsupervised text to speech and automatic speech recognition","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ren","year":"2019"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1145\/3394486.3403331"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.21437\/interspeech.2022-11071"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.21437\/Interspeech.2022-816"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.21437\/Interspeech.2022-225"},{"key":"ref14","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref15","first-page":"27826","article-title":"Unsupervised speech recognition","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Baevski","year":"2021"},{"key":"ref16","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Jia","year":"2018"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.21437\/Interspeech.2021-1774"},{"key":"ref19","first-page":"7748","article-title":"Meta-stylespeech: Multi-speaker adaptive text-to-speech generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Min","year":"2021"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.21437\/Interspeech.2021-441"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.21437\/Interspeech.2022-901"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/LSP.2022.3226655"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1109\/CVPR.2019.00453"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.23919\/APSIPAASC55919.2022.9979900"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.21437\/Interspeech.2020-1229"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1109\/ICASSP.2019.8682927"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.21437\/Interspeech.2020-2679"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.21437\/interspeech.2019-2668"},{"year":"2019","author":"Liu","article-title":"Cross-lingual multi-speaker text-to-speech synthesis for voice cloning without using parallel corpus for unseen speakers","key":"ref29"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.21437\/Interspeech.2019-1632"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.21437\/Interspeech.2022-46"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/ICASSP.2019.8682674"},{"year":"2021","author":"He","article-title":"Multilingual byte2speech models for scalable low-resource speech synthesis","key":"ref33"},{"year":"2018","author":"Oord","article-title":"Representation learning with contrastive predictive coding","key":"ref34"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1109\/TASLP.2021.3122291"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.21437\/Interspeech.2019-1873"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref38","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Choi","year":"2021"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.21437\/Interspeech.2022-10797"},{"key":"ref40","first-page":"16624","article-title":"HierSpeech: Bridging the gap between text and speech by hierarchical variational inference using self-supervised representations for speech synthesis","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Lee","year":"2022"},{"key":"ref41","first-page":"18003","article-title":"ContentVec: An improved self-supervised speech representation by disentangling speakers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qian","year":"2022"},{"key":"ref42","article-title":"NANSY++: Unified voice synthesis with neural analysis and synthesis","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Choi","year":"2023"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.1016\/j.wocn.2018.07.001"},{"volume-title":"The Art of VA Filter Design","year":"2012","author":"Zavalishin","key":"ref44"},{"doi-asserted-by":"publisher","key":"ref45","DOI":"10.21437\/Interspeech.2021-329"},{"doi-asserted-by":"publisher","key":"ref46","DOI":"10.21437\/Interspeech.2020-2650"},{"doi-asserted-by":"publisher","key":"ref47","DOI":"10.1016\/S1364-6613(99)01294-2"},{"doi-asserted-by":"publisher","key":"ref48","DOI":"10.21437\/Interspeech.2020-2826"},{"year":"2019","author":"Yamagishi","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)","key":"ref49"},{"key":"ref50","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"year":"2020","author":"Heo","article-title":"Clova baseline system for the voxceleb speaker recognition challenge 2020","key":"ref51"},{"year":"2021","author":"Ravanelli","article-title":"Speechbrain: A general-purpose speech toolkit","key":"ref52"},{"key":"ref53","article-title":"Visualizing data using t-SNE","volume":"9","author":"Maaten","year":"2008","journal-title":"J. Mach. Learn. Res."}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10428082.pdf?arnumber=10428082","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,14]],"date-time":"2024-03-14T05:21:57Z","timestamp":1710393717000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10428082\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3364085","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"type":"print","value":"2329-9290"},{"type":"electronic","value":"2329-9304"}],"subject":[],"published":{"date-parts":[[2024]]}}}