{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T16:56:25Z","timestamp":1778345785142,"version":"3.51.4"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10889894","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T13:52:43Z","timestamp":1741787563000},"page":"1-5","source":"Crossref","is-referenced-by-count":11,"title":["Enhancing Low-Resource ASR through Versatile TTS: Bridging the Data Gap"],"prefix":"10.1109","author":[{"given":"Guanrou","family":"Yang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,China"}]},{"given":"Fan","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group,Institute for Intelligent Computing,China"}]},{"given":"Ziyang","family":"Ma","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,China"}]},{"given":"Zhihao","family":"Du","sequence":"additional","affiliation":[{"name":"Alibaba Group,Institute for Intelligent Computing,China"}]},{"given":"Zhifu","family":"Gao","sequence":"additional","affiliation":[{"name":"Alibaba Group,Institute for Intelligent Computing,China"}]},{"given":"Shiliang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group,Institute for Intelligent Computing,China"}]},{"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute, X-LANCE Lab,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1561\/116.00000050"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054295"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1280"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.42"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CISP-BMEI51763.2020.9263564"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3362\/0262-8104.2002.009"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref8","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. NeurIPS","author":"Kim"},{"key":"ref9","article-title":"Grad-tts: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. 
ICML","author":"Popov"},{"key":"ref10","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref11","article-title":"Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens","author":"Du","year":"2024"},{"key":"ref12","article-title":"Seed-TTS: A family of high-quality versatile speech generation models","author":"Anastassiou","year":"2024"},{"key":"ref13","article-title":"BASE TTS: Lessons from building a billion-parameter text-to-speech model on 100k hours of data","author":"\u0141ajszczak","year":"2024"},{"key":"ref14","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3530270"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446889"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/SynData4GenAI.2024-2"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/SSD.2019.8893184"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1645"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445906"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1080"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/SynData4GenAI.2024-4"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746217"},{"key":"ref24","article-title":"Towards selection of text-to-speech data to augment ASR training","author":"Liu","year":"2023"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/SynData4GenAI.2024-5"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-2382"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053139"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref30","article-title":"FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs","author":"SpeechTeam","year":"2024"},{"key":"ref31","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. 
ICML","author":"Radford"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref33","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2019"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1049\/ell2.12823"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10889894.pdf?arnumber=10889894","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:22:19Z","timestamp":1774416139000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10889894\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10889894","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}