{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:03:33Z","timestamp":1775199813320,"version":"3.50.1"},"reference-count":45,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434663","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Code-switched Text-to-Speech Synthesis Capability in Large Language Models with only Monolingual Corpora"],"prefix":"10.1109","author":[{"given":"Jing","family":"Xu","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Daxin","family":"Tan","sequence":"additional","affiliation":[{"name":"Noah&#x2019;s Ark Lab, Huawei"}]},{"given":"Jiaqi","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}]},{"given":"Xiao","family":"Chen","sequence":"additional","affiliation":[{"name":"Noah&#x2019;s Ark Lab, Huawei"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"ref2","article-title":"Gpt-4 technical report","volume-title":"arXiv preprint arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref4","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang"},{"key":"ref5","article-title":"Salmonn: Towards generic hearing abilities for large language models","author":"Tang","year":"2023","journal-title":"arXiv preprint arXiv:2310.13289"},{"key":"ref6","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref7","article-title":"Listen, think, and understand","author":"Gong","year":"2023","journal-title":"arXiv preprint arXiv:2305.10790"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780198240594.001.0001"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2005.1415035"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TENCON.2013.6719019"},{"key":"ref11","first-page":"3422","article-title":"Speech synthesis of code-mixed text","volume-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC\u201916","author":"Sitaram"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-287"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053094"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682927"},{"key":"ref15","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023","journal-title":"arXiv preprint arXiv:2303.03926"},{"key":"ref16","first-page":"1244912460","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref20","first-page":"67","article-title":"Japanese-english code-switching speech data construction","volume-title":"2018 Oriental COCOSDA - International Conference on Speech Database and Assessments","author":"Nakayama"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383620"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00430"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"ref25","article-title":"Spoken question answering and speech continuation using spectrogrampowered 11 m","author":"Nachmani","year":"2023","journal-title":"arXiv preprint arXiv:2305.15255"},{"key":"ref26","article-title":"Audiopalm: A large language model that can speak and listen","author":"Rubenstein","year":"2023","journal-title":"arXiv preprint arXiv:2306.12925"},{"key":"ref27","article-title":"Viola: Unified codec language models for speech recognition, synthesis, and translation","volume-title":"arXiv preprint arXiv:2305.16107","author":"Wang","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447112"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.438"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref31","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International Conference on Machine Learning.","author":"Radford"},{"key":"ref32","article-title":"Google usm: Scaling automatic speech recognition beyond 100 languages","author":"Zhang","year":"2023","journal-title":"arXiv preprint arXiv:2303.01037"},{"key":"ref33","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"International Conference on Machine Learning.","author":"Kim"},{"key":"ref34","first-page":"5210","article-title":"Autovc: Zero-shot voice style transfer with only autoencoder loss","volume-title":"International Conference on Machine Learning.","author":"Qian"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"ref36","article-title":"Speechtokenizer: Unified speech tokenizer for speech large language models","author":"Zhang","year":"2023","journal-title":"arXiv preprint arXiv:2308.16692"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1716"},{"key":"ref38","article-title":"The Llama 3 Herd of Models","author":"Grattafiori","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref39","article-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv preprint arXiv:2106.09685"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097097"},{"key":"ref41","article-title":"Aishell-2: Transforming mandarin asr research into industrial scale","author":"Du","year":"2018","journal-title":"arXiv preprint arXiv:1808.10583"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref43","article-title":"Timit acoustic-phonetic continuous speech corpus","author":"Garofolo"},{"key":"ref44","article-title":"Ascend: A spontaneous chinese-english dataset for code-switching in multi-turn conversation","author":"Lovenia","year":"2021","journal-title":"arXiv preprint arXiv:2112.06223"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-9996"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434663.pdf?arnumber=11434663","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:57:54Z","timestamp":1775192274000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434663\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434663","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}