{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:32:59Z","timestamp":1780356779006,"version":"3.54.1"},"reference-count":86,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3451951","type":"journal-article","created":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T18:24:10Z","timestamp":1725647050000},"page":"4036-4051","source":"Crossref","is-referenced-by-count":26,"title":["ZMM-TTS: Zero-Shot Multilingual and Multispeaker Speech Synthesis Conditioned on Self-Supervised Discrete Speech Representations"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0272-3541","authenticated-orcid":false,"given":"Cheng","family":"Gong","sequence":"first","affiliation":[{"name":"Tianjin University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8246-0606","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"National Institute of Informatics (NII), Tokyo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2978-2793","authenticated-orcid":false,"given":"Erica","family":"Cooper","sequence":"additional","affiliation":[{"name":"National Institute of Informatics (NII), Tokyo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4942-4248","authenticated-orcid":false,"given":"Dan","family":"Wells","sequence":"additional","affiliation":[{"name":"Centre for Speech Technology Research, University of Edinburgh, Edinburgh, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8094-6861","authenticated-orcid":false,"given":"Longbiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9237-4821","authenticated-orcid":false,"given":"Jianwu","family":"Dang","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Korin","family":"Richmond","sequence":"additional","affiliation":[{"name":"Centre for Speech Technology Research, University of Edinburgh, Edinburgh, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2752-3955","authenticated-orcid":false,"given":"Junichi","family":"Yamagishi","sequence":"additional","affiliation":[{"name":"National Institute of Informatics (NII), Tokyo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref2","first-page":"1","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren","year":"2021"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3356232"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/575"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2668"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2679"},{"key":"ref8","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. 39th Int. Conf. Mach. Learn.","volume":"162","author":"Casanova","year":"2022"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-621"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097074"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-444"},{"key":"ref13","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Jia","year":"2018"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref15","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst","author":"Baevski","year":"2020"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746223"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747814"},{"key":"ref20","first-page":"16251","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Choi","year":"2021"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746430"},{"key":"ref22","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-363"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.3362\/0262-8104.2002.009"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref29","first-page":"1","article-title":"High fidelity neural audio compression","author":"Dfossez","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref30","article-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","author":"Shen","year":"2023"},{"key":"ref31","article-title":"Naturalspeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","author":"Ju","year":"2024"},{"key":"ref32","article-title":"Hierspeech++ : Bridging the gap between semantic and acoustic representation of speech by hierarchical variational inference for zero-shot speech synthesis","author":"Lee","year":"2023"},{"key":"ref33","first-page":"10040","article-title":"Neural voice cloning with a few samples","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","volume":"31","author":"Arik","year":"2018"},{"key":"ref34","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","author":"Wang","year":"2018"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1774"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414226"},{"key":"ref37","article-title":"Cross-lingual text-to-speech using multi-task learning and speaker classifier joint training","author":"Yang","year":"2022"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref39","first-page":"1","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Ren"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413889"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-1590"},{"key":"ref42","article-title":"WaveNet: A generative model for raw audio","volume-title":"Proc. Int. Speech. Community Assoc.","author":"Oord","year":"2016"},{"key":"ref43","first-page":"626","article-title":"MMM: Multilingual multiaccented multispeaker text to speech","volume-title":"Proc. Annu. Conf. Int. Speech Commun. Assoc.","author":"Badlani","year":"2023"},{"key":"ref44","first-page":"1","article-title":"RAD-TTS: Parallel flow-based TTS with robust alignment learning and diverse synthesis","volume-title":"Proc. ICML Workshop Invertible Neural Networks, Normalizing Flows, Explicit Likelihood Models","author":"Shih","year":"2021"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-46"},{"key":"ref46","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. 38th Int. Conf. Mac. Learn.","author":"Kim","year":"2021"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1632"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3369537"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682674"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1821"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2021-28"},{"key":"ref52","first-page":"2710","article-title":"Epitran: Precision G2P for many languages","volume-title":"Proc. 11th Int. Conf. Lang. Resour. Eval.","author":"Mortensen","year":"2018"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746929"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10797"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1356"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10019"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3278184"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-329"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-489"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26488"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10140"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2056"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448074"},{"key":"ref64","first-page":"16624","article-title":"Hierspeech: Bridging the gap between text and speech by hierarchical variational inference using self-supervised representations for speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Lee"},{"key":"ref65","first-page":"1","article-title":"Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Li"},{"key":"ref66","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023"},{"key":"ref67","article-title":"Seamless: Multilingual expressive and streaming speech translation","author":"Barrault","year":"2023"},{"key":"ref68","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Kong","year":"2020"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747707"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10796"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-952"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3272470"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1016"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639248"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1500"},{"key":"ref77","article-title":"The LJ speech dataset","author":"Ito","year":"2017"},{"key":"ref78","article-title":"NST swedish speech synthesis","author":"Technology","year":"2003"},{"key":"ref79","article-title":"Recommendation G.191: Software tools and audio coding standardization","year":"2005"},{"key":"ref80","article-title":"CLOVA baseline system for the VoxCeleb speaker recognition challenge 2020,","author":"Heo","year":"2020"},{"key":"ref81","article-title":"Chinese standard mandarin speech copus","year":"2022"},{"issue":"11","key":"ref82","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"J. Mach. Learn. Res."},{"key":"ref83","first-page":"16","article-title":"Text-to-speech for under-resourced languages: Phoneme mapping and source language selection in transfer learning","volume-title":"Proc. ELRA\/ISCA SIG Under-Resourced Lang.","author":"Do","year":"2022"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-439"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10669054.pdf?arnumber=10669054","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,19]],"date-time":"2024-09-19T07:22:35Z","timestamp":1726730555000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10669054\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":86,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3451951","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}