{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T19:42:47Z","timestamp":1773776567321,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T00:00:00Z","timestamp":1756166400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T00:00:00Z","timestamp":1756166400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"name":"Manipal University Jaipur"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Discov Computing"],"DOI":"10.1007\/s10791-025-09701-3","type":"journal-article","created":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T10:42:35Z","timestamp":1756204955000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Advancing Bangla text-to-speech synthesis using a VITS-based model with a custom dataset and comprehensive evaluation"],"prefix":"10.1007","volume":"28","author":[{"given":"Sujeet","family":"Kumar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2548-1520","authenticated-orcid":false,"given":"Siddharth","family":"Kumar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kushal","family":"Sathe","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4348-073X","authenticated-orcid":false,"given":"Jayadeep","family":"Pati","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,8,26]]},"reference":[{"key":"9701_CR1","doi-asserted-by":"crossref","unstructured":"Hunt AJ, Black AW. Unit selection in a concatenative speech synthesis system using a large speech database, in 1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings, vol. 1. IEEE, 1996, pp. 373\u2013376.","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"9701_CR2","doi-asserted-by":"crossref","unstructured":"Yoshimura T, Tokuda K, Masuko T, Kobayashi T, Kitamura T. Simultaneous modeling of spectrum, pitch, and duration in hmm based speech synthesis, in Sixth European Conference on Speech Communication and Technology, 1999.","DOI":"10.21437\/Eurospeech.1999-596"},{"key":"9701_CR3","unstructured":"Xu Tan T, Qin F, Soong, Liu T-Y. A survey on neural speech synthesis, arXiv preprint arXiv:2106.15561, 2021."},{"key":"9701_CR4","doi-asserted-by":"crossref","unstructured":"Hunt AJ, Black AW. (1996) Unit selection in a concatenative speech synthesis system using a large speech database. In: 1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings, IEEE, vol 1, pp 373\u2013376.","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"9701_CR5","doi-asserted-by":"publisher","first-page":"1227","DOI":"10.1016\/j.specom.2006.05.003","volume":"48","author":"J Latorre","year":"2006","unstructured":"Latorre J, Iwano K, Furui S. New approach to the polyglot speech generation by means of an HMM-based speaker adaptable synthesizer. Speech Commun. 2006;48:1227\u201342.","journal-title":"Speech Commun"},{"key":"9701_CR6","doi-asserted-by":"crossref","unstructured":"Wang Y, Skerry-Ryan RJ, Stanton D, Wu Y, Weiss RJ, Jaitly N, Saurous RA (2017). Tacotron: Towards end-to-end speech synthesis. https:\/\/arxiv.org\/abs\/1703.10135","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"9701_CR7","unstructured":"van den Aaron S, Dieleman H, Zen K, Simonyan O, Vinyals A, Graves N, Kalchbrenner A, Senior, Kavukcuoglu K. Wavenet: A generative model for raw audio, arXivpreprintarXiv:1609.03499,2016."},{"key":"9701_CR8","doi-asserted-by":"crossref","unstructured":"Geng Yang S, Yang K, Liu P, Fang W, Chen, Xie L. Multi-band mel- gan: faster waveform generation for high-quality text-to-speech. in 2021 IEEE spoken Language technology workshop (SLT). IEEE; 2021. pp. 492\u20138.","DOI":"10.1109\/SLT48900.2021.9383551"},{"key":"9701_CR9","first-page":"2338","volume":"30","author":"X Wang","year":"2022","unstructured":"Wang X, Takaki S, Yamagishi J. Neural source-filter-based waveform model for statistical parametric speech synthesis. IEEE\/ACM Trans Audio Speech Lang Process. 2022;30:2338\u201349.","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"issue":"2","key":"9701_CR10","first-page":"127","volume":"35","author":"A Baby","year":"2018","unstructured":"Baby A, Swain SK, Routray A, Kabi JD. Statistical parametric speech synthesis in Indian languages: a review. IETE Tech Rev. 2018;35(2):127\u201348.","journal-title":"IETE Tech Rev"},{"key":"9701_CR11","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1016\/j.procs.2016.06.096","volume":"89","author":"A Nayak","year":"2016","unstructured":"Nayak A, Mishra S, Jyothi P. Review on development of speech synthesis system for Indian languages. Procedia Comput Sci. 2016;89:69\u201377.","journal-title":"Procedia Comput Sci"},{"key":"9701_CR12","unstructured":"Kumar A, Rajendran S, Anderson A, Black AW, Singh R. 2020. Improved text- to-speech for Indic languages using neural network. Proc. Interspeech 2020, pp.4146\u20134150."},{"key":"9701_CR13","unstructured":"http:\/\/tdil.meity.gov.in\/"},{"key":"9701_CR14","unstructured":"https:\/\/dhvani.sourceforge.net\/"},{"key":"9701_CR15","unstructured":"Kishore SP, Black AW. 2021. Building synthetic voices for the languages of India. Proceedings of the 1st Workshop on Natural Language Processing for Indigenous Languages of the Americas, pp.1\u201310."},{"key":"9701_CR16","unstructured":"Prasad PWC, Eldhu Fatima A, Sai Mohit G, Ayush Garg K, Aditya N, Kumar N, Prakash PVN, S. and, Premjith B. 2022. Indic NLP Suite: Monolingual Corpora, Bench- mark Datasets and Pre-trained Language Models for Indian Languages. arXiv preprint arXiv:2209.08721."},{"key":"9701_CR17","unstructured":"https:\/\/vakyansh.ai."},{"key":"9701_CR18","doi-asserted-by":"crossref","unstructured":"Saha Raju R, Bhattacharjee P, Ahmad A, Rahman MS. (2019). A Bangla text-to- speech system using deep neural networks.","DOI":"10.1109\/ICBSLP47725.2019.202055"},{"key":"9701_CR19","doi-asserted-by":"crossref","unstructured":"Akuzawa K, Iwasawa Y,  Matsuo Y. (2018). Expressive speech synthesis via modeling expressions with variational autoencoder. arXiv preprint arXiv:1804.02135. https:\/\/arxiv.org\/abs\/1804.02135","DOI":"10.21437\/Interspeech.2018-1113"},{"key":"9701_CR20","unstructured":"Van Den Oord ., Dieleman S, Zen H, Simonyan K, Vinyals O, Graves A, Kavukcuoglu K. (2016). Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499, 12, 1. https:\/\/arxiv.org\/abs\/1609.03499"},{"key":"9701_CR21","unstructured":"Klejch O, Peter J, Espa\u00f1a-Bonet C, Gales MJ. 2022. Low Footprint Text-to- Speech Streaming from Compressed Encoder State. Proc. Interspeech 2022, pp.722\u2013726."},{"key":"9701_CR22","unstructured":"Cooper EW, \u00dcnal C, Pru\u0161a Z, Black AW, Zhang Y. 2022. Personalized neural text-to-speech. Proc. ICASSP 2022, pp.5747\u20135751."},{"key":"9701_CR23","unstructured":"Kishore SP, Zhang Y, Black AW. May. A clustering approach to low resource Language modeling. 2021 IEEE spoken Language technology workshop (SLT). IEEE; 2021. pp. 452\u20139."},{"key":"9701_CR24","doi-asserted-by":"crossref","unstructured":"Pranjol MA, Rahman F, Shuvo R A, Ahmed T, Anika BY, Anas M A A M, Rasel AA. (2022). Bengali speech recognition: An overview. In 2022 IEEE International Conference on Artificial Intelligence in Engineering and Technology (IICAIET) (pp. 1\u20136). IEEE. https:\/\/ieeexplore.ieee.org\/abstract\/document\/9936819","DOI":"10.1109\/IICAIET55139.2022.9936819"},{"key":"9701_CR25","unstructured":"Ali MG, Islam MF, Islam T, Nath B, Rahman F, Islam MS. 2022. Katha: An Advanced Bangla Text to Speech Using Deep Learning."},{"issue":"12","key":"9701_CR26","first-page":"1","volume":"2","author":"MAH Roushan","year":"2020","unstructured":"Roushan MAH, Hassan MM, Bhuiyan M, Khan MRI. Bangla text-to- speech conversion using deep convolutional neural networks. SN Appl Sci. 2020;2(12):1\u201311.","journal-title":"SN Appl Sci"},{"key":"9701_CR27","unstructured":"https:\/\/cloud.google.com\/text-to-speech\/docs\/voices"},{"key":"9701_CR28","unstructured":"https:\/\/www.narakeet.com\/"},{"key":"9701_CR29","doi-asserted-by":"crossref","unstructured":"Kim M, Jeong M, Choi BJ, Ahn S, Lee JY, Kim NS (2022). Transfer learning framework for low-resource text-to-speech using a large-scale unlabeled speech corpus. arXiv preprint arXiv:2203.15447. https:\/\/arxiv.org\/abs\/2203.15447","DOI":"10.21437\/Interspeech.2022-225"},{"key":"9701_CR30","unstructured":"Kim J, Kong J, Son J. Conditional variational autoencoder with adversarial learning for end-to end text-to-speech, in International Conference on Machine Learning. PMLR, 2021, pp. 5530\u20135540."},{"key":"9701_CR31","unstructured":"Google international language resources. accessed: 2019-07-28. [Online]]. Available: https:\/\/github.com\/google\/languageresources\/blob\/master\/bn\/festvox\/phonology.json"},{"key":"9701_CR32","unstructured":"Rahman MM, Hussain MA, Rahman MS. (2010). Text normalization and diphone preparation for Bangla speech synthesis."},{"key":"9701_CR33","unstructured":"Baby A, Thomas AL, Nishanthi NL, Consortium TTS et al. (2016). Resources for Indian languages."},{"key":"9701_CR34","unstructured":"Honnet PE, Lazaridis A, Garner PN,  Yamagishi J. (2017). The siwis french speech synthesis database? design and recording of a high quality french database for speech synthesis. Online Database. https:\/\/datashare.ed.ac.uk\/handle\/10283\/2353"},{"key":"9701_CR35","unstructured":"Sonobe R, Takamichi S, Saruwatari H. Jsut corpus: free largescale Japanese speech corpus for end-to-end speech synthesis. ArXiv Preprint arXiv:1711.00354, 2017."},{"key":"9701_CR36","doi-asserted-by":"crossref","unstructured":"Gabdrakhmanov L, Garaev R, Razinkov E. Ruslan: Russian spoken language corpus for speech synthesis, arXiv preprint arXiv:1906.11645, 2019.","DOI":"10.1007\/978-3-030-26061-3_12"},{"key":"9701_CR37","unstructured":"Ping W, Peng K, Gibiansky A, Arik SO, Kannan A, Narang S, Raiman J, Miller J (2018) ClariNet: Parallel wave generation in end-to-end text-to-speech. arXiv preprint arXiv:180707281"},{"key":"9701_CR38","first-page":"8067","volume":"33","author":"J Kim","year":"2020","unstructured":"Kim J, Kim S, Kong J,  Yoon S. (2020). Glow-tts: A generative flow for text-to-speech via monotonic alignment search. Advances in Neural Information Processing Systems, 33, 8067\u20138077. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/5c3b99e8f92532e5ad1556e53ceea00c-Abstract.html","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9701_CR39","first-page":"3199","volume":"28","author":"R Yamamoto","year":"2020","unstructured":"Yamamoto R, Song E, Kim J. Parallel wavegan: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. IEEE\/ACM Trans Audio Speech Lang Process. 2020;28:3199\u2013210.","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"9701_CR40","doi-asserted-by":"crossref","unstructured":"Li WC, Lee HT, Lee HY. Comparison of losses and generators for TTS applications using generative adversarial networks. Volume 29. IEEE\/ACM Transactions on Audio, Speech, and Language Processing; 2021. pp. 3463\u201376.","DOI":"10.1109\/TASLP.2021.3111566"},{"key":"9701_CR41","doi-asserted-by":"crossref","unstructured":"Qian Y, Fan Y, Hu W, Soong FK. On the training aspects of Deep Neural Network (DNN) for parametric TTS synthesis, in Proc. ICASSP, 2014, pp. 3829\u20133833.","DOI":"10.1109\/ICASSP.2014.6854318"},{"key":"9701_CR42","doi-asserted-by":"crossref","unstructured":"Kubichek R. (1993). Mel-cepstral distance measure for objective speech quality assessment. In Proceedings of IEEE Pacific Rim Conference on Communications Computers and Signal Processing (pp. 125\u2013128). IEEE.","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"9701_CR43","doi-asserted-by":"crossref","unstructured":"Juvela L, Bollepalli B, Wang X, Kameoka H, Airaksinen M, Yamagishi J, Alku P. Speech waveform synthesis from MFCC sequences with Generative Adversarial Networks, in Proc. ICASSP, 2018, pp. 5679\u20135683.","DOI":"10.1109\/ICASSP.2018.8461852"},{"key":"9701_CR44","unstructured":"Skerry-Ryan RJ, Battenberg E, Xiao Y, Wang Y, Stanton D, Shor J, Saurous RA. (2018). Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron. arXiv preprint arXiv:1803.09047. https:\/\/proceedings.mlr.press\/v80\/skerry-ryan18a.html"},{"key":"9701_CR45","unstructured":"Yi Z, Huang J, Zhang X, Liu J, Chen Y. (2022). Spelling Error Generation for End-to-End Speech Synthesis. Proc. Interspeech 2022, 4997\u20135001."},{"key":"9701_CR46","unstructured":"Cooper EW, \u00dcnal C, Pru\u0161a Z, Black AW, Zhang Y. (2022). Personalized neural text-to-speech. Proc. ICASSP 2022, 5747\u20135751."},{"key":"9701_CR47","unstructured":"Klejch O, Peter J, Espa\u00f1a-Bonet C, Gales MJ. (2022). Low Footprint Text-to- Speech Streaming from Compressed Encoder State. Proc. Interspeech 2022, 722\u2013726."}],"container-title":["Discover Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-025-09701-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10791-025-09701-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-025-09701-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T16:45:04Z","timestamp":1757436304000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10791-025-09701-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,26]]},"references-count":47,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["9701"],"URL":"https:\/\/doi.org\/10.1007\/s10791-025-09701-3","relation":{},"ISSN":["2948-2992"],"issn-type":[{"value":"2948-2992","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,26]]},"assertion":[{"value":"28 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Informed consent was obtained from all individual participants included in the study. The dataset used in this research is not publicly available due to privacy and ethical considerations. Administrative permissions to access and use the raw data were granted by the Institutional Ethics Committee\/Institutional Review Board (IRB) of the Indian Institute of Information Technology Ranchi (IRB Approval No. IIITR\/2023\/ETHICS\/007). We confirm that the Indian Institute of Information Technology Ranchi has a formally constituted Ethics Committee\/IRB, which reviewed and approved the entire study. All experimental protocols were conducted in accordance with the ethical standards outlined in the Declaration of Helsinki. The study also complied with national guidelines for research involving human participants, ensuring confidentiality and anonymity. Participants\u2019 ages were aggregated and reported in ranges to prevent any potential identification.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare no competing interests.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"183"}}