{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T18:46:23Z","timestamp":1762109183155,"version":"build-2065373602"},"reference-count":28,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,7]]},"DOI":"10.1109\/iscslp63861.2024.10800531","type":"proceedings-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T19:11:17Z","timestamp":1734981077000},"page":"596-600","source":"Crossref","is-referenced-by-count":2,"title":["LLM-Based Expressive Text-to-Speech Synthesizer with Style and Timbre Disentanglement"],"prefix":"10.1109","author":[{"given":"Yuanyuan","family":"Zhu","sequence":"first","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiaxu","family":"He","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruihao","family":"Jing","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yaodong","family":"Song","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Lian","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiao-lei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI), China Telecom,Beijing"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"Inspirational and Convincing Audio Generation Chal-lenge 2024 ICAGC 2024","volume-title":"The 14th International Symposium on Chinese Spoken Language Processing (ISCSLP 2024)","author":"Fu","key":"ref1"},{"volume-title":"Tacotron: Towards end-to-end speech synthesis","year":"2017","author":"Wang","key":"ref2"},{"journal-title":"Fast-speech: Fast, robust and controllable text to speech","year":"2019","author":"Ren","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2018.8461368"},{"journal-title":"Neural codec language models are zero-shot text to speech synthesizers","year":"2023","author":"Wang","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"journal-title":"Seed-tts: A family of high-quality versatile speech generation models","year":"2024","author":"Team","key":"ref7"},{"journal-title":"Soundstorm: Efficient parallel audio generation","year":"2023","author":"Borsos","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29747"},{"journal-title":"High fidelity neural audio compression","year":"2022","author":"D\u00e9fossez","key":"ref10"},{"journal-title":"High-fidelity audio compression with improved rvqgan","year":"2023","author":"Kumar","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2021.3129994"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"journal-title":"Speak for-eign languages with your own voice: Cross-lingual neural codec language modeling","year":"2023","author":"Zhang","key":"ref14"},{"volume-title":"Vall-e r: Robust and efficient zero-shot text-to-speech synthesis via monotonic alignment","year":"2024","author":"Han","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1559"},{"journal-title":"Base tts: Lessons from building a billion-parameter text-to-speech model on 100k hours of data","year":"2024","author":"Lajszczak","key":"ref18"},{"journal-title":"Wavlm: Large-scale self-supervised pretraining for full stack speech processing","year":"2021","author":"Chen","key":"ref19"},{"volume-title":"Unsupervised cross-lingual representation learning for speech recognition","year":"2020","author":"Conneau","key":"ref20"},{"volume-title":"Portaspeech: Portable and high-quality generative text-to-speech","year":"2022","author":"Ren","key":"ref21"},{"volume-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","year":"2020","author":"Kong","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2409"},{"volume-title":"Hier-speech++: Bridging the gap between semantic and acoustic representation of speech by hierarchical variational inference for zero-shot speech synthesis","year":"2023","author":"Lee","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.931"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1428"},{"journal-title":"Language models are unsupervised multitask learn-ers","year":"2019","author":"Radford","key":"ref27"},{"volume-title":"Fastpitch: Parallel text-to-speech with pitch prediction","year":"2021","author":"La\u0144cucki","key":"ref28"}],"event":{"name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","start":{"date-parts":[[2024,11,7]]},"location":"Beijing, China","end":{"date-parts":[[2024,11,10]]}},"container-title":["2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799944\/10799969\/10800531.pdf?arnumber=10800531","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,24]],"date-time":"2024-12-24T06:32:08Z","timestamp":1735021928000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800531\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,7]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/iscslp63861.2024.10800531","relation":{},"subject":[],"published":{"date-parts":[[2024,11,7]]}}}