{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T21:54:59Z","timestamp":1775253299747,"version":"3.50.1"},"reference-count":48,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3453606","type":"journal-article","created":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T18:24:10Z","timestamp":1725647050000},"page":"4026-4035","source":"Crossref","is-referenced-by-count":7,"title":["U-Style: Cascading U-Nets With Multi-Level Speaker and Style Modeling for Zero-Shot Voice Cloning"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5578-3960","authenticated-orcid":false,"given":"Tao","family":"Li","sequence":"first","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8075-1784","authenticated-orcid":false,"given":"Zhichao","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9275-523X","authenticated-orcid":false,"given":"Xinfa","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}]},{"given":"Jian","family":"Cong","sequence":"additional","affiliation":[{"name":"Speech, Audio, and Music Intelligence (SAMI) Group, ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4078-1273","authenticated-orcid":false,"given":"Qiao","family":"Tian","sequence":"additional","affiliation":[{"name":"Speech, Audio, and Music Intelligence (SAMI) Group, ByteDance, Shanghai, China"}]},{"given":"Yuping","family":"Wang","sequence":"additional","affiliation":[{"name":"Speech, Audio, and Music Intelligence (SAMI) Group, ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8234-0823","authenticated-orcid":false,"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref2","first-page":"1094","article-title":"Deep voice 3: 2000-speaker neural text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representation","volume":"79","author":"Ping","year":"2018"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10054"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414001"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3226655"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746049"},{"key":"ref7","first-page":"10040","article-title":"Neural voice cloning with a few samples","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Arik","year":"2018"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414081"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095515"},{"key":"ref11","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Casanova","year":"2022"},{"key":"ref12","first-page":"7748","article-title":"Meta-stylespeech: Multi-speaker adaptive text-to-speech generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Min","year":"2021"},{"key":"ref13","first-page":"10970","article-title":"GenerSpeech: Towards style transfer for generalizable out-of-domain text-to-speech","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Huang","year":"2022"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11305"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414257"},{"key":"ref16","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2985"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1251"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095840"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-754"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054556"},{"key":"ref22","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Popov","year":"2021"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3203888"},{"key":"ref24","article-title":"PriorGrad: Improving conditional denoising diffusion models with data-dependent adaptive prior","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lee","year":"2021"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1774"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682535"},{"key":"ref28","article-title":"Instance normalization: The missing ingredient for fast stylization","author":"Ulyanov","year":"2016"},{"key":"ref29","article-title":"NaturalSpeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","author":"Shen","year":"2023"},{"key":"ref30","article-title":"Mega-TTS 2: Zero-shot text-to-speech with arbitrary length speech prompts","author":"Jiang","year":"2023"},{"key":"ref31","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Skerry-Ryan","year":"2018"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362069"},{"key":"ref34","article-title":"Multi-reference tacotron by intercross training for style disentangling, transfer and control in speech synthesis","author":"Bian","year":"2019"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-610"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3164181"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3313413"},{"key":"ref38","first-page":"1180","article-title":"Unsupervised domain adaptation by backpropagation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ganin","year":"2015"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095776"},{"key":"ref40","first-page":"16251","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Choi","year":"2021"},{"key":"ref41","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"Hsu","year":"2019"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Blizzard.2021-14"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413391"},{"key":"ref45","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","volume-title":"Proc. 33 rd Int. Conf. Neural Inf. Process. Syst.","author":"Ren","year":"2019"},{"key":"ref46","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kong","year":"2020"},{"key":"ref47","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3145297"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10669040.pdf?arnumber=10669040","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,14]],"date-time":"2024-09-14T06:03:39Z","timestamp":1726293819000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10669040\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3453606","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}