{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T07:10:50Z","timestamp":1779174650633,"version":"3.51.4"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T00:00:00Z","timestamp":1779148800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T00:00:00Z","timestamp":1779148800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cogn Comput"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1007\/s12559-026-10598-3","type":"journal-article","created":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T06:24:20Z","timestamp":1779171860000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Multilingual Training Strategy for Low-Resource Text-to-Speech"],"prefix":"10.1007","volume":"18","author":[{"given":"Asma","family":"Amalas","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mounir","family":"Ghogho","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohamed","family":"Chetouani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rachid","family":"Oulad Haj Thami","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,5,19]]},"reference":[{"key":"10598_CR1","unstructured":"Ito K, Johnson L. The LJ speech dataset; 2017. 
https:\/\/keithito.com\/LJ-Speech-Dataset\/."},{"key":"10598_CR2","doi-asserted-by":"crossref","unstructured":"Liu AH, Lai CI, Hsu WN, Auli M, Baevski A, Glass J. Simple and effective unsupervised speech synthesis. In: Proceedings of Interspeech 2022; 2022. p. 843\u2013847.","DOI":"10.21437\/Interspeech.2022-11071"},{"key":"10598_CR3","doi-asserted-by":"crossref","unstructured":"Dunbar E, Algayres R, Karadayi J, Bernard M, Benjumea J, Cao XN, et al. The zero resource speech challenge 2019: TTS without T. In: Interspeech; 2019. Available from: https:\/\/api.semanticscholar.org\/CorpusID:131777066.","DOI":"10.21437\/Interspeech.2019-2904"},{"key":"10598_CR4","doi-asserted-by":"crossref","unstructured":"Chung YA, Wang Y, Hsu WN, Zhang Y, Skerry-Ryan R. Semi-supervised training for improving data efficiency in end-to-end speech synthesis. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2019. p. 6940\u20136944.","DOI":"10.1109\/ICASSP.2019.8683862"},{"key":"10598_CR5","unstructured":"Ren Y, Tan X, Qin T, Zhao S, Zhao Z, Liu TY. Almost unsupervised text to speech and automatic speech recognition. In: International conference on machine learning. PMLR; 2019. p. 5410\u20135419."},{"key":"10598_CR6","doi-asserted-by":"crossref","unstructured":"Chen YJ, Tu T, Yeh Cc, Lee Hy. End-to-end text-to-speech for low-resource languages by cross-lingual transfer learning. In: Interspeech. 2019. p. 2075\u20132079.","DOI":"10.21437\/Interspeech.2019-2730"},{"key":"10598_CR7","doi-asserted-by":"crossref","unstructured":"Tits N, El Haddad K, Dutoit T. Exploring transfer learning for low resource emotional tts. In: Intelligent systems and applications: Proceedings of the 2019 intelligent systems conference (IntelliSys) volume 1. Springer; 2020. p. 52\u201360.","DOI":"10.1007\/978-3-030-29516-5_5"},{"key":"10598_CR8","doi-asserted-by":"crossref","unstructured":"Zen H, Dang V, Clark RAJ, Zhang Y, Weiss RJ, Jia Y, et al. 
LibriTTS: A corpus derived from LibriSpeech for text-to-speech. In: Interspeech. 2019. Available from: https:\/\/api.semanticscholar.org\/CorpusID:102352475.","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"10598_CR9","doi-asserted-by":"crossref","unstructured":"Latorre J, Lachowicz J, Lorenzo-Trueba J, Merritt T, Drugman T, Ronanki S, et al. Effect of data reduction on sequence-to-sequence neural TTS. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2019. p. 7075\u20137079.","DOI":"10.1109\/ICASSP.2019.8682168"},{"key":"10598_CR10","doi-asserted-by":"crossref","unstructured":"Luong HT, Wang X, Yamagishi J, Nishizawa N. Training multi-speaker neural text-to-speech systems using speaker-imbalanced speech corpora. In: Interspeech; 2019. p. 1303\u20131307.","DOI":"10.21437\/Interspeech.2019-1311"},{"key":"10598_CR11","doi-asserted-by":"crossref","unstructured":"Cai Z, Yang Y, Li M. Cross-lingual multi-speaker speech synthesis with limited bilingual training data. Comput Speech Lang. 2023;77:101427. Available from:. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0885230822000584.","DOI":"10.1016\/j.csl.2022.101427"},{"key":"10598_CR12","unstructured":"Lee Y, Shon S, Kim T. Learning pronunciation from a foreign language in speech synthesis networks. 2018. arXiv preprint arXiv:1811.09364"},{"key":"10598_CR13","unstructured":"He M, Yang J, He L, Soong FK. Multilingual byte2speech models for scalable low-resource speech synthesis. 2021. arXiv preprint arXiv:2103.03541"},{"key":"10598_CR14","doi-asserted-by":"crossref","unstructured":"de Korte M, Kim J, Klabbers E. Efficient neural speech synthesis for low-resource languages through multilingual modeling. In: Interspeech. 2020.","DOI":"10.21437\/Interspeech.2020-2664"},{"key":"10598_CR15","doi-asserted-by":"crossref","unstructured":"Wu P, Shi J, Zhong Y, Watanabe S, Black AW. 
Cross-lingual transfer for speech processing using acoustic language similarity. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). IEEE; 2021. p. 1050\u20131057.","DOI":"10.1109\/ASRU51503.2021.9688276"},{"issue":"8","key":"10598_CR16","doi-asserted-by":"publisher","first-page":"179798","DOI":"10.1109\/ACCESS.2020.3027619","volume":"01","author":"K Azizah","year":"2020","unstructured":"Azizah K, Adriani M, Jatmiko W. Hierarchical Transfer Learning for Multilingual, Multi-Speaker, and Style Transfer DNN-Based TTS on Low-Resource Languages. IEEE Access. 2020;01(8):179798\u2013812.","journal-title":"IEEE Access."},{"key":"10598_CR17","doi-asserted-by":"publisher","first-page":"734","DOI":"10.1109\/TASLP.2022.3230453","volume":"31","author":"A Prakash","year":"2022","unstructured":"Prakash A, Murthy HA. Exploring the Role of Language Families for Building Indic Speech Synthesisers. IEEE\/ACM Trans Audio Speech Lang Process. 2022;31:734\u201347.","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process."},{"key":"10598_CR18","doi-asserted-by":"crossref","unstructured":"Odlin T. Language transfer. Cambridge, UK: Cambridge. 1989.","DOI":"10.1017\/CBO9781139524537"},{"issue":"1","key":"10598_CR19","doi-asserted-by":"publisher","first-page":"162","DOI":"10.1177\/00336882221081894","volume":"55","author":"K Perkins","year":"2024","unstructured":"Perkins K, Zhang LJ. The effect of first language transfer on second language acquisition and learning: From contrastive analysis to contemporary neuroimaging. RELC J. 2024;55(1):162\u201378.","journal-title":"RELC J."},{"key":"10598_CR20","doi-asserted-by":"crossref","unstructured":"Oliveira F, Casanova E, J\u00fanior A, Soares A, Filho A. CML-TTS: A Multilingual Dataset for Speech Synthesis in Low-Resource Languages. In: Text, Speech, and Dialogue. Springer Nature Switzerland; 2023. p. 
188\u2013199.","DOI":"10.1007\/978-3-031-40498-6_17"},{"key":"10598_CR21","doi-asserted-by":"crossref","unstructured":"Panayotov V, Chen G, Povey D, Khudanpur S. Librispeech: An ASR corpus based on public domain audio books. 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 2015;p. 5206\u20135210.","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"10598_CR22","doi-asserted-by":"crossref","unstructured":"Pratap V, Xu Q, Sriram A, Synnaeve G, Collobert R. MLS: A Large-Scale Multilingual Dataset for Speech Research. In: Proceedings of Interspeech 2020; 2020. p. 2757\u20132761.","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"10598_CR23","doi-asserted-by":"crossref","unstructured":"Black AW. Cmu wilderness multilingual speech dataset. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2019. p. 5971\u20135975.","DOI":"10.1109\/ICASSP.2019.8683536"},{"key":"10598_CR24","doi-asserted-by":"crossref","unstructured":"Meyer J, Adelani DI, Casanova E, \u00d6ktem A, Weber DWJ, Kabongo S, et al. BibleTTS: a large, high-fidelity, multilingual, and uniquely African speech corpus. In: Proceedings of Interspeech 2022; 2022. p. 2383\u20132387.","DOI":"10.21437\/Interspeech.2022-10850"},{"key":"10598_CR25","unstructured":"Cooper E. Text-to-speech synthesis using found data for low-resource languages. Columbia University; 2019."},{"key":"10598_CR26","unstructured":"Baali M, Hayashi T, Mubarak H, Maiti S, Watanabe S, El-Hajj W, et al. Unsupervised data selection for TTS: using Arabic Broadcast News as a case study. 2023. arXiv preprint arXiv:2301.09099"},{"key":"10598_CR27","unstructured":"Fang W, Chung YA, Glass J. Towards transfer learning for end-to-end speech synthesis from deep pre-trained language models. 2019. arXiv preprint arXiv:1906.07307"},{"key":"10598_CR28","doi-asserted-by":"crossref","unstructured":"Jia Y, Zen H, Shen J, Zhang Y, Wu Y. 
PnG BERT: Augmented BERT on Phonemes and Graphemes for Neural TTS. In: Interspeech. 2021. Available from: https:\/\/api.semanticscholar.org\/CorpusID:232404175.","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"10598_CR29","doi-asserted-by":"crossref","unstructured":"Liu AH, Tu T, Lee Hy, Lee Ls. Towards unsupervised speech recognition and synthesis with quantized speech representation learning. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2020. p. 7259\u20137263.","DOI":"10.1109\/ICASSP40776.2020.9053571"},{"key":"10598_CR30","doi-asserted-by":"crossref","unstructured":"Zhang H, Lin Y. Unsupervised Learning For Sequence-to-sequence Text-to-speech For Low-resource Languages. In: Interspeech. 2020. Available from: https:\/\/api.semanticscholar.org\/CorpusID:221142085.","DOI":"10.21437\/Interspeech.2020-1403"},{"key":"10598_CR31","doi-asserted-by":"publisher","first-page":"1336","DOI":"10.1162\/tacl_a_00430","volume":"9","author":"K Lakhotia","year":"2021","unstructured":"Lakhotia K, Kharitonov E, Hsu WN, Adi Y, Polyak A, Bolte B, et al. On generative spoken language modeling from raw audio. Trans Assoc Computat Linguist. 2021;9:1336\u201354.","journal-title":"Trans Assoc Computat Linguist."},{"key":"10598_CR32","doi-asserted-by":"crossref","unstructured":"Tjandra A, Sakti S, Nakamura S. Listening while speaking: Speech chain by deep learning. In: 2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). IEEE; 2017. p. 301\u2013308.","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"10598_CR33","doi-asserted-by":"crossref","unstructured":"Ni J, Wang L, Gao H, Qian K, Zhang Y, Chang S, et al. Unsupervised Text-to-Speech Synthesis by Unsupervised Automatic Speech Recognition. In: Interspeech. 2022. Available from:. 
https:\/\/api.semanticscholar.org\/CorpusID:247779077.","DOI":"10.21437\/Interspeech.2022-816"},{"key":"10598_CR34","first-page":"2021","volume":"12","author":"Z Byambadorj","year":"2021","unstructured":"Byambadorj Z, Nishimura R, Ayush A, Ohta K, Kitaoka N. Text-to-speech system for low-resource language using cross-lingual transfer learning and data augmentation. EURASIP J Audio Speech Music Process. 2021;12:2021.","journal-title":"EURASIP J Audio Speech Music Process."},{"key":"10598_CR35","doi-asserted-by":"crossref","unstructured":"Xu J, Tan X, Ren Y, Qin T, Li J, Zhao S, et al. Lrspeech: Extremely low-resource speech synthesis and recognition. In: Proceedings of the 26th ACM SIGKDD international conference on knowledge discovery & data mining. 2020. p. 2802\u20132812.","DOI":"10.1145\/3394486.3403331"},{"key":"10598_CR36","doi-asserted-by":"crossref","unstructured":"Zhang Y, Weiss RJ, Zen H, Wu Y, Chen Z, Skerry-Ryan R, et al. Learning to speak fluently in a foreign language: Multilingual speech synthesis and cross-language voice cloning. In: Interspeech. 2019.","DOI":"10.21437\/Interspeech.2019-2668"},{"key":"10598_CR37","doi-asserted-by":"crossref","unstructured":"Kim C, Stern RM. Robust signal-to-noise ratio estimation based on waveform amplitude distribution analysis. In: Interspeech. 2008. p. 2598\u20132601.","DOI":"10.21437\/Interspeech.2008-644"},{"key":"10598_CR38","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski A, Zhou Y, Mohamed A, Auli M. wav2vec 2.0: A framework for self-supervised learning of speech representations. Adv Neural Inf Process Syst. 2020;33:12449\u201360.","journal-title":"Adv Neural Inf Process Syst."},{"key":"10598_CR39","doi-asserted-by":"crossref","unstructured":"Babu A, Wang C, Tjandra A, Lakhotia K, Xu Q, Goyal N, et al. XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale. In: Interspeech. 2021. 
Available from: https:\/\/api.semanticscholar.org\/CorpusID:244270531.","DOI":"10.21437\/Interspeech.2022-143"},{"key":"10598_CR40","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J. Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2016. p. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10598_CR41","doi-asserted-by":"crossref","unstructured":"Chung JS, Nagrani A, Zisserman A. Voxceleb2: Deep speaker recognition. 2018. arXiv preprint arXiv:1806.05622","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"10598_CR42","doi-asserted-by":"crossref","unstructured":"Celano GG. A resnet-50-based convolutional neural network model for language id identification from speech recordings. In: Proceedings of the third workshop on computational typology and multilingual NLP. 2021. p. 136\u2013144.","DOI":"10.18653\/v1\/2021.sigtyp-1.13"},{"key":"10598_CR43","doi-asserted-by":"crossref","unstructured":"Shen J, Pang R, Weiss RJ, Schuster M, Jaitly N, Yang Z, et al. Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE; 2018. p. 4779\u20134783.","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"10598_CR44","unstructured":"Li N, Liu S, Liu Y, Zhao S, Liu M, Zhou M. Close to human quality TTS with transformer. 2018;2. arXiv preprint arXiv:1809.08895"},{"key":"10598_CR45","unstructured":"Ren Y, Hu C, Tan X, Qin T, Zhao S, Zhao Z, et al. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. In: 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. 2021."},{"key":"10598_CR46","unstructured":"Ren Y, Ruan Y, Tan X, Qin T, Zhao S, Zhao Z, et al. Fastspeech: Fast, robust and controllable text to speech. Adv Neural Inf Process Syst. 
2019;32."},{"key":"10598_CR47","doi-asserted-by":"crossref","unstructured":"Tachibana H, Uenoyama K, Aihara S. Efficiently trainable text-to-speech system based on deep convolutional networks with guided attention. In: 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP). IEEE; 2018. p. 4784\u20134788.","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"10598_CR48","doi-asserted-by":"crossref","unstructured":"Kulkarni A, Kulkarni A, Shatnawi SAM, Aldarmaki H. ClArTTS: An Open-Source Classical Arabic Text-to-Speech Corpus. In: Proceedings of Interspeech 2023. 2023. p. 5511\u20135515.","DOI":"10.21437\/Interspeech.2023-2224"},{"key":"10598_CR49","doi-asserted-by":"crossref","unstructured":"Sharoni O, Shenberg R, Cooper E. Saspeech: A hebrew single speaker dataset for text to speech and voice conversion. In: Proceedings of Interspeech. 2023.","DOI":"10.21437\/Interspeech.2023-430"},{"key":"10598_CR50","doi-asserted-by":"crossref","unstructured":"Park K, Mulc T. CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages. In: Interspeech. 2019. Available from: https:\/\/api.semanticscholar.org\/CorpusID:85531187.","DOI":"10.21437\/Interspeech.2019-1500"},{"key":"10598_CR51","doi-asserted-by":"crossref","unstructured":"Perraudin N, Balazs P, S\u00f8ndergaard PL. A fast Griffin-Lim algorithm. In: 2013 IEEE workshop on applications of signal processing to audio and acoustics. IEEE; 2013. p. 1\u20134.","DOI":"10.1109\/WASPAA.2013.6701851"},{"key":"10598_CR52","doi-asserted-by":"crossref","unstructured":"Yamamoto R, Song E, Kim JM. Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2020. p. 6199\u20136203.","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"10598_CR53","doi-asserted-by":"crossref","unstructured":"Reddy CK, Gopal V, Cutler R. DNSMOS P. 
835: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE; 2022. p. 886\u2013890.","DOI":"10.1109\/ICASSP43922.2022.9746108"}],"container-title":["Cognitive Computation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12559-026-10598-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s12559-026-10598-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12559-026-10598-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T06:24:33Z","timestamp":1779171873000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s12559-026-10598-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,19]]},"references-count":53,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,12]]}},"alternative-id":["10598"],"URL":"https:\/\/doi.org\/10.1007\/s12559-026-10598-3","relation":{},"ISSN":["1866-9956","1866-9964"],"issn-type":[{"value":"1866-9956","type":"print"},{"value":"1866-9964","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,19]]},"assertion":[{"value":"5 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 May 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 May 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}],"article-number":"53"}}