{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:05:17Z","timestamp":1757617517447,"version":"3.44.0"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"7-8","license":[{"start":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T00:00:00Z","timestamp":1740355200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T00:00:00Z","timestamp":1740355200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s13042-025-02535-x","type":"journal-article","created":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T10:09:33Z","timestamp":1740391773000},"page":"4673-4683","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Personalized Lao language synthesis via disentangled neural codec language model"],"prefix":"10.1007","volume":"16","author":[{"given":"Cunli","family":"Mao","sequence":"first","affiliation":[]},{"given":"Tian","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Linqin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhengtao","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Shengxiang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Ling","family":"Dong","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,24]]},"reference":[{"key":"2535_CR1","doi-asserted-by":"crossref","unstructured":"Shen J, Pang R, Weiss RJ, Schuster M, Jaitly N, Yang 
Z, Chen Z, Zhang Y, Wang Y, Skerrv-Ryan R, et al (2018) Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4779\u20134783. IEEE","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"2535_CR2","unstructured":"Tan X, Qin T, Soong F, Liu T-Y (2021) A survey on neural speech synthesis. arXiv preprint arXiv:2106.15561"},{"key":"2535_CR3","doi-asserted-by":"crossref","unstructured":"Mehrish A, Kashyap AR, Yingting L, Majumder N, Poria S (2023) Adaptermix: Exploring the efficacy of mixture of adapters for low-resource tts adaptation. arXiv preprint arXiv:2305.18028","DOI":"10.21437\/Interspeech.2023-1568"},{"key":"2535_CR4","doi-asserted-by":"crossref","unstructured":"Wang T, Tao J, Fu R, Yi J, Wen Z, Zhong R (2020) Spoken content and voice factorization for few-shot speaker adaptation. In: Interspeech, pp. 796\u2013800","DOI":"10.21437\/Interspeech.2020-1745"},{"key":"2535_CR5","unstructured":"Chen M, Tan X, Li B, Liu Y, Qin T, Zhao S, Liu T-Y (2021) Adaspeech: Adaptive text to speech for custom voice. arXiv preprint arXiv:2103.00993"},{"key":"2535_CR6","doi-asserted-by":"crossref","unstructured":"Wu Y, Tan X, Li B, He L, Zhao S, Song R, Qin T, Liu T-Y (2022) Adaspeech 4: Adaptive text to speech in zero-shot scenarios. arXiv preprint arXiv:2204.00436","DOI":"10.21437\/Interspeech.2022-901"},{"key":"2535_CR7","unstructured":"Wang C, Chen S, Wu Y, Zhang Z, Zhou L, Liu S, Chen Z, Liu Y, Wang H, Li J, et al (2023) Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111"},{"key":"2535_CR8","doi-asserted-by":"crossref","unstructured":"Anh NTN, Thanh NT, et al (2022) Development of a high quality text to speech system for lao. In: 2022 25th Conference of the Oriental COCOSDA International Committee for the Co-ordination and Standardisation of Speech Databases and Assessment Techniques (O-COCOSDA), pp. 1\u20135. 
IEEE","DOI":"10.1109\/O-COCOSDA202257103.2022.9997914"},{"key":"2535_CR9","unstructured":"Feng Z, Wang L, Gao S, Yu Z, Dong L (2023) A discretized self-supervised representation enhancement based non-autoregressive speech synthesis method for Lao language. In: Sun, M., Qin, B., Qiu, X., Jiang, J., Han, X. (eds.) Proceedings of the 22nd Chinese National Conference on Computational Linguistics, pp. 90\u2013101. Chinese Information Processing Society of China, Harbin, China"},{"key":"2535_CR10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3407509","volume-title":"Zero-shot text normalization via cross-lingual knowledge distillation","author":"L Wang","year":"2024","unstructured":"Wang L, Huang X, Yu Z, Peng H, Gao S, Mao C, Huang Y, Dong L, Philip SY (2024) Zero-shot text normalization via cross-lingual knowledge distillation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"2535_CR11","unstructured":"Arik S, Chen J, Peng K, Ping W, Zhou Y (2018) Neural voice cloning with a few samples. Advances in neural information processing systems 31"},{"key":"2535_CR12","doi-asserted-by":"crossref","unstructured":"Yan Y, Tan X, Li B, Qin T, Zhao S, Shen Y, Liu T-Y (2021) Adaspeech 2: Adaptive text to speech with untranscribed data. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6613\u20136617. IEEE","DOI":"10.1109\/ICASSP39728.2021.9414872"},{"key":"2535_CR13","unstructured":"Jia Y, Zhang Y, Weiss R, Wang Q, Shen J, Ren F, Nguyen P, Pang R, Lopez\u00a0Moreno I, Wu Y, et al (2018) Transfer learning from speaker verification to multispeaker text-to-speech synthesis. Advances in neural information processing systems 31"},{"key":"2535_CR14","doi-asserted-by":"crossref","unstructured":"Choi S, Han S, Kim D, Ha S (2020) Attentron: Few-shot text-to-speech utilizing attention-based variable-length embedding. 
arXiv preprint arXiv:2005.08484","DOI":"10.21437\/Interspeech.2020-2096"},{"key":"2535_CR15","doi-asserted-by":"crossref","unstructured":"Zhou Y, Song C, Li X, Zhang L, Wu Z, Bian Y, Su D, Meng H (2022) Content-dependent fine-grained speaker embedding for zero-shot speaker adaptation in text-to-speech synthesis. arXiv preprint arXiv:2204.00990","DOI":"10.21437\/Interspeech.2022-10054"},{"key":"2535_CR16","doi-asserted-by":"crossref","unstructured":"Lei Y, Yang S, Cong J, Xie L, Su D (2022) Glow-wavegan 2: High-quality zero-shot text-to-speech synthesis and any-to-any voice conversion. arXiv preprint arXiv:2207.01832","DOI":"10.21437\/Interspeech.2022-684"},{"key":"2535_CR17","volume-title":"Speechx: neural codec language model as a versatile speech transformer","author":"X Wang","year":"2024","unstructured":"Wang X, Thakker M, Chen Z, Kanda N, Eskimez SE, Chen S, Tang M, Liu S, Li J, Yoshioka T (2024) Speechx: neural codec language model as a versatile speech transformer. IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"2535_CR18","doi-asserted-by":"publisher","first-page":"1703","DOI":"10.1162\/tacl_a_00618","volume":"11","author":"E Kharitonov","year":"2023","unstructured":"Kharitonov E, Vincent D, Borsos Z, Marinier R, Girgin S, Pietquin O, Sharifi M, Tagliasacchi M, Zeghidour N (2023) Speak, read and prompt: High-fidelity text-to-speech with minimal supervision. Trans Assoc Comput Linguistics 11:1703\u20131718","journal-title":"Trans Assoc Comput Linguistics"},{"issue":"1","key":"2535_CR19","first-page":"12","volume":"2","author":"TM Cover","year":"1991","unstructured":"Cover TM, Thomas JA et al (1991) Entropy, relative entropy and mutual information. Elements Inform Theory 2(1):12\u201313","journal-title":"Elements Inform Theory"},{"key":"2535_CR20","unstructured":"Cheng P, Hao W, Dai S, Liu J, Gan Z, Carin L (2020) Club: A contrastive log-ratio upper bound of mutual information. In: International Conference on Machine Learning, pp. 
1779\u20131788. PMLR"},{"key":"2535_CR21","doi-asserted-by":"crossref","unstructured":"Wang D, Deng L, Yeung YT, Chen X, Liu X, Meng H (2021) Vqmivc: Vector quantization and mutual information-based unsupervised speech representation disentanglement for one-shot voice conversion. arXiv preprint arXiv:2106.10132","DOI":"10.21437\/Interspeech.2021-283"},{"key":"2535_CR22","doi-asserted-by":"crossref","unstructured":"Huang L, Yuan T, Liang Y, Chen Z, Wen C, Xie Y, Zhang J, Ke D (2023) Limi-vc: A light weight voice conversion model with mutual information disentanglement. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE","DOI":"10.1109\/ICASSP49357.2023.10096399"},{"key":"2535_CR23","unstructured":"Mortensen DR, Dalmia S, Littell P (2018) Epitran: Precision g2p for many languages. In: Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)"},{"key":"2535_CR24","unstructured":"D\u00e9fossez A, Copet J, Synnaeve G, Adi Y (2022) High fidelity neural audio compression. arXiv preprint arXiv:2210.13438"},{"key":"2535_CR25","doi-asserted-by":"crossref","unstructured":"Nekvinda T, Du\u0161ek O (2020) One model, many languages: Meta-learning for multilingual text-to-speech. arXiv preprint arXiv:2008.00768","DOI":"10.21437\/Interspeech.2020-2679"},{"key":"2535_CR26","doi-asserted-by":"crossref","unstructured":"Zhang Y, Weiss RJ, Zen H, Wu Y, Chen Z, Skerry-Ryan R, Jia Y, Rosenberg A, Ramabhadran B (2019) Learning to speak fluently in a foreign language: Multilingual speech synthesis and cross-language voice cloning. arXiv preprint arXiv:1907.04448","DOI":"10.21437\/Interspeech.2019-2668"},{"key":"2535_CR27","doi-asserted-by":"crossref","unstructured":"Kahn J, Riviere M, Zheng W, Kharitonov E, Xu Q, Mazar\u00e9 P-E, Karadayi J, Liptchinsky V, Collobert R, Fuegen C, et al (2020) Libri-light: A benchmark for asr with limited or no supervision. 
In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7669\u20137673. IEEE","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"2535_CR28","unstructured":"Hines A, Skoglund J, Kokaram A, Harte N (2012) Visqol: The virtual speech quality objective listener. In: IWAENC 2012; International Workshop on Acoustic Signal Enhancement, pp. 1\u20134. VDE"},{"key":"2535_CR29","doi-asserted-by":"crossref","unstructured":"Casanova E, Shulby C, G\u00f6lge E, M\u00fcller NM, De\u00a0Oliveira FS, Junior AC, Soares AdS, Aluisio SM, Ponti MA (2021) Sc-glowtts: An efficient zero-shot multi-speaker text-to-speech model. arXiv preprint arXiv:2104.05557","DOI":"10.21437\/Interspeech.2021-1774"},{"key":"2535_CR30","doi-asserted-by":"crossref","unstructured":"Snyder D, Garcia-Romero D, Sell G, Povey D, Khudanpur S (2018) X-vectors: Robust dnn embeddings for speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5329\u20135333. IEEE","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"2535_CR31","unstructured":"Ren Y, Hu C, Tan X, Qin T, Zhao S, Zhao Z, Liu T-Y (2020) Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558"},{"key":"2535_CR32","unstructured":"Casanova E, Weber J, Shulby CD, Junior AC, G\u00f6lge E, Ponti MA (2022) Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone. In: International Conference on Machine Learning, pp. 2709\u20132720. 
PMLR"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-025-02535-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-025-02535-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-025-02535-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T06:50:59Z","timestamp":1757141459000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-025-02535-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,24]]},"references-count":32,"journal-issue":{"issue":"7-8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2535"],"URL":"https:\/\/doi.org\/10.1007\/s13042-025-02535-x","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"type":"print","value":"1868-8071"},{"type":"electronic","value":"1868-808X"}],"subject":[],"published":{"date-parts":[[2025,2,24]]},"assertion":[{"value":"21 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}