{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T14:51:31Z","timestamp":1784299891916,"version":"3.55.0"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832365","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"885-890","source":"Crossref","is-referenced-by-count":56,"title":["Emilia: An Extensive, Multilingual, and Diverse Speech Dataset For Large-Scale Speech Generation"],"prefix":"10.1109","author":[{"given":"Haorui","family":"He","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zengqiang","family":"Shang","sequence":"additional","affiliation":[{"name":"Institute of Acoustics CAS,Laboratory of Speech &#x0026; Intelligent Information Processing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chaoren","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuyuan","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Acoustics CAS,Laboratory of Speech &#x0026; Intelligent Information Processing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yicheng","family":"Gu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hua","family":"Hua","sequence":"additional","affiliation":[{"name":"Institute of Acoustics CAS,Laboratory of Speech &#x0026; Intelligent Information Processing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liwei","family":"Liu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chen","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Acoustics CAS,Laboratory of Speech &#x0026; Intelligent Information Processing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiaqi","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Peiyang","family":"Shi","sequence":"additional","affiliation":[{"name":"Institute of Acoustics CAS,Laboratory of Speech &#x0026; Intelligent Information Processing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuancheng","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kai","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory,Shanghai,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pengyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Acoustics CAS,Laboratory of Speech &#x0026; Intelligent Information Processing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhizheng","family":"Wu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen,China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"SoundStorm: Efficient Parallel Audio Generation","author":"Borsos","year":"2023","journal-title":"arXiv preprint arXiv:2305.09636"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.3362\/0262-8104.2002.009"},{"key":"ref3","article-title":"NaturalSpeech 3: Zero-shot Speech Synthesis with Factorized Codec and Diffusion Models","volume-title":"Proc. of ICML","author":"Ju"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448436"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3362\/0262-8104.2002.009"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832255"},{"key":"ref9","article-title":"A Survey on Neural Speech Synthesis","volume-title":"arXiv preprint arXiv:2106.15561","author":"Tan"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447759"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2343"},{"key":"ref12","article-title":"The LJ Speech Dataset","author":"Ito","year":"2017"},{"key":"ref13","article-title":"CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit (version 0.92)","author":"Yamagishi","year":"2019"},{"key":"ref14","article-title":"AISHELL-3: A Multi-Speaker Mandarin TTS Corpus","volume-title":"Proc. of Interspeech","author":"Yao"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383498"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-205"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-78"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746108"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","location":"Macao","start":{"date-parts":[[2024,12,2]]},"end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832365.pdf?arnumber=10832365","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,20]],"date-time":"2025-02-20T19:42:43Z","timestamp":1740080563000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832365\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832365","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}