{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T03:48:22Z","timestamp":1752983302689,"version":"3.37.3"},"reference-count":84,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,2,2]],"date-time":"2023-02-02T00:00:00Z","timestamp":1675296000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,2,2]],"date-time":"2023-02-02T00:00:00Z","timestamp":1675296000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,3]]},"DOI":"10.1007\/s10772-023-10021-4","type":"journal-article","created":{"date-parts":[[2023,2,2]],"date-time":"2023-02-02T17:50:47Z","timestamp":1675360247000},"page":"221-243","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A framework for quality assessment of synthesised speech using learning-based objective evaluation"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7539-3721","authenticated-orcid":false,"given":"Shrikant","family":"Malviya","sequence":"first","affiliation":[]},{"given":"Rohit","family":"Mishra","sequence":"additional","affiliation":[]},{"given":"Santosh Kumar","family":"Barnwal","sequence":"additional","affiliation":[]},{"given":"Uma Shanker","family":"Tiwary","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,2,2]]},"reference":[{"key":"10021_CR1","unstructured":"Ar\u0131k, S.\u00d6., Chrzanowski, M., Coates, A., Diamos, G., Gibiansky, A., Kang, Y. & Shoeybi, M. (2017). Deep voice: Real-time neural text-to-speech. In Proceedings of the 34th international conference on machine learning, Vol. 70, (pp. 195\u2013204)."},{"key":"10021_CR2","doi-asserted-by":"crossref","unstructured":"Baby, A., Nishanthi, N., Thomas, A. L. & Murthy, H. A. (2016a). Resources for Indian languages. In International conference on text, speech, and dialogue (pp. 514\u2013521).","DOI":"10.1007\/978-3-319-45510-5_59"},{"key":"10021_CR3","doi-asserted-by":"crossref","unstructured":"Baby, A., Nishanthi, N., Thomas, A. L. & Murthy, H. A. (2016b). A unified parser for developing Indian language text to speech synthesizers. In International conference on text, speech, and dialogue (pp. 514\u2013521).","DOI":"10.1007\/978-3-319-45510-5_59"},{"key":"10021_CR4","doi-asserted-by":"crossref","unstructured":"Beutnagel, M., Conkie, A., Schroeter, J., Stylianou, Y. & Syrdal, A. (1999). The at &t next-gen tts system. In Joint meeting of ASA, EAA, and DAGA (pp. 18\u201324).","DOI":"10.1121\/1.424924"},{"key":"10021_CR5","unstructured":"Black, A. W. (n.d.). CMU INDIC speech synthesis databases. Retrieved December 15, 2021, from http:\/\/festvox.org\/cmu_indic\/index.html"},{"key":"10021_CR6","doi-asserted-by":"crossref","unstructured":"Black, A. W. (2002). Perfect synthesis for all of the people all of the time. In Proceedings of 2002 IEEE workshop on speech synthesis, 2002. (pp. 167\u2013170).","DOI":"10.1109\/WSS.2002.1224400"},{"key":"10021_CR7","doi-asserted-by":"crossref","unstructured":"Black, A. W. (2006). Clustergen: A statistical parametric synthesizer using trajectory modeling. In Proceedings of Interspeech-2006, ninth international conference on spoken language processing (pp. 1762\u20131765).","DOI":"10.21437\/Interspeech.2006-488"},{"key":"10021_CR8","doi-asserted-by":"crossref","unstructured":"Black, A. W. & Taylor, P. (1997). Automatically clustering similar units for unit selection in speech synthesis. In Eurospeech97 (pp. 601\u2013604).","DOI":"10.21437\/Eurospeech.1997-219"},{"key":"10021_CR10","unstructured":"Cernak, M. & Rusko, M. (2005). An evaluation of synthetic speech using the PESQ measure. In Proceedings of the European congress on acoustics (pp. 2725\u20132728)."},{"issue":"3","key":"10021_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1961189.1961199","volume":"2","author":"CC Chang","year":"2011","unstructured":"Chang, C. C., & Lin, C. J. (2011). Libsvm: A library for support vector machines. ACM Transactions on Intelligent Systems and Technology (TIST), 2(3), 1\u201327.","journal-title":"ACM Transactions on Intelligent Systems and Technology (TIST)"},{"key":"10021_CR12","unstructured":"Chang, Y. Y. (2011). Evaluation of TTS systems in intelligibility and comprehension tasks. In Proceedings of the 23rd conference on computational linguistics and speech processing (pp. 64\u201378)."},{"key":"10021_CR13","doi-asserted-by":"crossref","unstructured":"Chen, J. D. & Campbell, N. (1999). Objective distance measures for assessing concatenative speech synthesis. In Sixth European conference on speech communication and technology.","DOI":"10.21437\/Eurospeech.1999-157"},{"key":"10021_CR14","doi-asserted-by":"crossref","unstructured":"Choi, Y., Jung, Y. & Kim, H. (2020). Deep MOS predictor for synthetic speech using cluster-based modeling. In Proceedings of Interspeech 2020 (pp. 1743\u20131747).","DOI":"10.21437\/Interspeech.2020-2111"},{"issue":"2","key":"10021_CR15","doi-asserted-by":"publisher","first-page":"190","DOI":"10.1109\/TAFFC.2015.2457417","volume":"7","author":"F Eyben","year":"2016","unstructured":"Eyben, F., Scherer, K. R., Schuller, B. W., Sundberg, J., Andr\u00e9, E., Busso, C., et al. (2016). The Geneva minimalistic acoustic parameter set (GeMAPS) for voice research and affective computing. IEEE Transactions on Affective Computing, 7(2), 190\u2013202.","journal-title":"IEEE Transactions on Affective Computing"},{"key":"10021_CR16","doi-asserted-by":"crossref","unstructured":"Eyben, F., Weninger, F., Gross, F. & Schuller, B. (2013). Recent developments in opensmile, the Munich open-source multimedia feature extractor. In Proceedings of the 21st ACM international conference on multimedia (pp. 835\u2013838).","DOI":"10.1145\/2502081.2502224"},{"key":"10021_CR17","unstructured":"Falk, T. & Chan, W. (2004). Single ended method for objective speech quality assessment in narrowband telephony applications. In ITU-T (p. 563)."},{"key":"10021_CR18","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1109\/LSP.2008.2006709","volume":"15","author":"TH Falk","year":"2008","unstructured":"Falk, T. H., & Moller, S. (2008). Towards signal-based instrumental quality diagnosis for text-to-speech systems. IEEE Signal Processing Letters, 15, 781\u2013784.","journal-title":"IEEE Signal Processing Letters"},{"key":"10021_CR19","doi-asserted-by":"crossref","unstructured":"Fu, S. W., Tsao, Y., Hwang, H. T. & Wang, H. M. (2018). Quality-net: An end-to-end non-intrusive speech quality assessment model based on BLSTM. In Proceedings of Interspeech 2018 (pp. 1873\u20131877).","DOI":"10.21437\/Interspeech.2018-1802"},{"key":"10021_CR20","unstructured":"Gibiansky, A., Arik, S., Diamos, G., Miller, J., Peng, K., Ping, W. & Zhou, Y. (2017). Deep voice 2: Multi-speaker neural text-to-speech. In I. Guyon et al. (Eds.), Advances in neural information processing systems, Vol. 30, (pp. 2962\u20132970). Curran Associates Inc."},{"issue":"6","key":"10021_CR21","doi-asserted-by":"publisher","first-page":"1948","DOI":"10.1109\/TASL.2006.883250","volume":"14","author":"V Grancharov","year":"2006","unstructured":"Grancharov, V., Zhao, D. Y., Lindblom, J., & Kleijn, W. B. (2006). Low-complexity, nonintrusive speech quality assessment. IEEE Transactions on Audio, Speech, and Language Processing, 14(6), 1948\u20131956.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"10021_CR22","unstructured":"Grice, M., Vagges, K. & Hirst, D. (1992). Prosodic form tests and \u201cprosodic function tests\u201d. SAM final report."},{"key":"10021_CR23","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-84858-7","volume-title":"The elements of statistical learning: Data mining, inference, and prediction","author":"T Hastie","year":"2009","unstructured":"Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements of statistical learning: Data mining, inference, and prediction. Springer."},{"key":"10021_CR24","unstructured":"Heute, U. (2008). Speech-transmission quality: Aspects and assessment for wideband vs. narrowband signals. Advances in Digital Speech Transmission, 572."},{"key":"10021_CR25","doi-asserted-by":"crossref","unstructured":"Hinterleitner, F., Norrenbrock, C., M\u00f6ller, S. & Heute, U. (2013). Predicting the quality of text-to-speech systems from a large-scale feature set. In Interspeech (pp. 383\u2013387).","DOI":"10.21437\/Interspeech.2013-105"},{"key":"10021_CR26","unstructured":"Huang, D. Y. (2011). Prediction of perceived sound quality of synthetic speech. In Proceedings of APSIPA."},{"key":"10021_CR27","doi-asserted-by":"crossref","unstructured":"Hunt, A.J. & Black, A.W. (1996). Unit selection in a concatenative speech synthesis system using a large speech database. In 1996 IEEE international conference on acoustics, speech, and signal processing conference proceedings, Vol. 1, (pp. 373\u2013376).","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"10021_CR28","volume-title":"Voice and speech quality perception: Assessment and evaluation","author":"U Jekosch","year":"2006","unstructured":"Jekosch, U. (2006). Voice and speech quality perception: Assessment and evaluation. Springer."},{"issue":"5","key":"10021_CR29","doi-asserted-by":"publisher","first-page":"821","DOI":"10.1109\/TSA.2005.851924","volume":"13","author":"DS Kim","year":"2005","unstructured":"Kim, D. S. (2005). Anique: An auditory model for single-ended speech quality estimation. IEEE Transactions on Speech and Audio Processing, 13(5), 821\u2013831.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"issue":"3","key":"10021_CR30","doi-asserted-by":"publisher","first-page":"949","DOI":"10.1109\/TASL.2006.885250","volume":"15","author":"E Klabbers","year":"2007","unstructured":"Klabbers, E., Van Santen, J. P., & Kain, A. (2007). The contribution of various sources of spectral mismatch to audible discontinuities in a diphone database. IEEE Transactions on Audio, Speech, and Language Processing, 15(3), 949\u2013956.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"10021_CR31","doi-asserted-by":"crossref","unstructured":"Lewis, J. R. (2004). Effect of speaker and sampling rate on mos-x ratings of concatenative TTS voices. In Proceedings of the human factors and ergonomics society annual meeting, Vol. 48, (pp. 759\u2013763).","DOI":"10.1177\/154193120404800504"},{"key":"10021_CR32","doi-asserted-by":"crossref","unstructured":"Lo, C. C., Fu, S. W., Huang, W. C., Wang, X., Yamagishi, J., Tsao, Y. & Wang, H. M. (2019). MOSNet: Deep learning-based objective assessment for voice conversion. In Proceedings of Interspeech 2019 (pp. 1541\u20131545).","DOI":"10.21437\/Interspeech.2019-2003"},{"key":"10021_CR33","doi-asserted-by":"crossref","unstructured":"Loizou, P. C. (2011). Speech quality assessment. In Multimedia analysis, processing and communications (pp. 623\u2013654). Springer.","DOI":"10.1007\/978-3-642-19551-8_23"},{"key":"10021_CR34","doi-asserted-by":"publisher","DOI":"10.1201\/b14529","volume-title":"Speech enhancement: Theory and practice Speech enhancement: Theory and practice","author":"PC Loizou","year":"2013","unstructured":"Loizou, P. C. (2013). Speech enhancement: Theory and practice speech enhancement: Theory and practice. CRC Press."},{"key":"10021_CR35","doi-asserted-by":"crossref","unstructured":"Lorenzo-Trueba, J., Yamagishi, J., Toda, T., Saito, D., Villavicencio, F., Kinnunen, T. & Ling, Z. (2018) The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods. In Proceedings of Odyssey 2018 the speaker and language recognition workshop (pp. 195\u2013202).","DOI":"10.21437\/Odyssey.2018-28"},{"key":"10021_CR36","doi-asserted-by":"publisher","first-page":"2517","DOI":"10.1109\/TASLP.2021.3065833","volume":"29","author":"S Malviya","year":"2021","unstructured":"Malviya, S., Mishra, R., Barnwal, S. K., & Tiwary, U. S. (2021). HDRS: Hindi dialogue restaurant search corpus for dialogue state tracking in task-oriented environment. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 29, 2517\u20132528. https:\/\/doi.org\/10.1109\/TASLP.2021.3065833","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10021_CR37","doi-asserted-by":"crossref","unstructured":"Malviya, S., Mishra, R. & Tiwary, U. S. (2016). Structural analysis of Hindi phonetics and a method for extraction of phonetically rich sentences from a very large Hindi text corpus. In 2016 conference of O-COCOSDA (pp. 188\u2013193).","DOI":"10.1109\/ICSDA.2016.7919009"},{"key":"10021_CR38","doi-asserted-by":"crossref","unstructured":"Mariniak, A. (1993). A global framework for the assessment of synthetic speech without subjects. In Third European conference on speech communication and technology.","DOI":"10.21437\/Eurospeech.1993-379"},{"issue":"3","key":"10021_CR39","doi-asserted-by":"publisher","first-page":"311","DOI":"10.1016\/j.specom.2010.10.003","volume":"53","author":"C Mayo","year":"2011","unstructured":"Mayo, C., Clark, R. A., & King, S. (2011). Listeners\u2019 weighting of acoustic cues to synthetic speech naturalness: A multidimensional scaling analysis. Speech Communication, 53(3), 311\u2013326.","journal-title":"Speech Communication"},{"key":"10021_CR40","doi-asserted-by":"crossref","unstructured":"Mishra, R., Barnwal, S. K., Malviya, S., Mishra, P. & Tiwary, U. S. (2018). Prosodic feature selection of personality traits for job interview performance. In International conference on intelligent systems design and applications (pp. 673\u2013682).","DOI":"10.1007\/978-3-030-16657-1_63"},{"key":"10021_CR41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-56046-4","volume-title":"Quality engineering: Qualit\u00e4t kommunikationstechnischer systeme","author":"S M\u00f6ller","year":"2017","unstructured":"M\u00f6ller, S. (2017). Quality engineering: Qualit\u00e4t kommunikationstechnischer Systeme. Springer."},{"key":"10021_CR42","doi-asserted-by":"crossref","unstructured":"M\u00f6ller, S., Hinterleitner, F., Falk, T. H. & Polzehl, T. (2010). Comparison of approaches for instrumentally predicting the quality of text-to-speech systems. In Eleventh annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2010-413"},{"key":"10021_CR43","doi-asserted-by":"crossref","unstructured":"Monzo, C., Iriondo, I. & Socor\u00f3, J. C. (2014). Voice quality modelling for expressive speech synthesis. The Scientific World Journal.","DOI":"10.1155\/2014\/627189"},{"key":"10021_CR44","volume-title":"An introduction to the psychology of hearing","author":"BC Moore","year":"2012","unstructured":"Moore, B. C. (2012). An introduction to the psychology of hearing. Brill."},{"issue":"6","key":"10021_CR45","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1109\/MSP.2011.942469","volume":"28","author":"S M\u00fcller","year":"2011","unstructured":"M\u00fcller, S., Chan, W., C\u00f4t\u00e9, N., Falk, T. H., Raake, A., & W\u00e4ltermann, M. (2011). Speech quality estimation: Models and trends. IEEE Signal Processing Magazine, 28(6), 18\u201328.","journal-title":"IEEE Signal Processing Magazine"},{"issue":"5","key":"10021_CR46","doi-asserted-by":"publisher","first-page":"255","DOI":"10.1109\/LSP.2012.2189562","volume":"19","author":"CR Norrenbrock","year":"2012","unstructured":"Norrenbrock, C. R., Hinterleitner, F., Heute, U., & Moller, S. (2012). Instrumental assessment of prosodic quality for text-to-speech signals. IEEE Signal Processing Letters, 19(5), 255\u2013258.","journal-title":"IEEE Signal Processing Letters"},{"key":"10021_CR47","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1016\/j.specom.2014.06.003","volume":"66","author":"CR Norrenbrock","year":"2015","unstructured":"Norrenbrock, C. R., Hinterleitner, F., Heute, U., & M\u00f6ller, S. (2015). Quality prediction of synthesized speech based on perceptual quality dimensions. Speech Communication, 66, 17\u201335.","journal-title":"Speech Communication"},{"key":"10021_CR48","doi-asserted-by":"crossref","unstructured":"Novorita, B. (1999). Incorporation of temporal masking effects into bark spectral distortion measure. In Proceedings of ICASSP, Vol. 2, (pp. 665\u2013668).","DOI":"10.1109\/ICASSP.1999.759754"},{"key":"10021_CR49","unstructured":"Pammi, S., Charfuelan, M. & Schr\u00f6der, M. (2010). Multilingual voice creation toolkit for the mary TTS platform. In LREC."},{"key":"10021_CR50","doi-asserted-by":"crossref","unstructured":"Papadopoulos, P., Travadi, R. & Narayanan, S. (2017). Global SNR estimation of speech signals for unknown noise conditions using noise adapted non-linear regression. In Proceedings of Interspeech 2017 (pp. 3842\u20133846).","DOI":"10.21437\/Interspeech.2017-230"},{"key":"10021_CR51","unstructured":"Parlikar, A., Sitaram, S., Wilkinson, A. & Black, A. W. (2016). The festvox indic frontend for grapheme to phoneme conversion. In WILDRE: Workshop on indian language data-resources and evaluation."},{"issue":"4","key":"10021_CR52","doi-asserted-by":"publisher","first-page":"448","DOI":"10.1080\/00335635109381697","volume":"37","author":"WM Parrish","year":"1951","unstructured":"Parrish, W. M. (1951). The concept of \u201cnaturalness\u2019\u2019. Quarterly Journal of Speech, 37(4), 448\u2013454.","journal-title":"Quarterly Journal of Speech"},{"key":"10021_CR53","unstructured":"Ping, W., Peng, K., Gibiansky, A., Arik, S.\u00d6., Kannan, A. , Narang, S. & Miller, J. (2018). Deep voice 3: Scaling text-to-speech with convolutional sequence learning. In ICLR-2018. OpenReview.net."},{"key":"10021_CR54","doi-asserted-by":"crossref","unstructured":"Prakash, A., Prakash, J. J. & Murthy, H. A. (2016). Acoustic analysis of syllables across Indian languages. In INTERSPEECH (pp. 327\u2013331).","DOI":"10.21437\/Interspeech.2016-1127"},{"key":"10021_CR55","volume-title":"Objective measures of speech quality Objective measures of speech quality","author":"SR Quackenbush","year":"1988","unstructured":"Quackenbush, S. R., Barnwell, T. P., & Clements, M. A. (1988). Objective measures of speech quality. Prentice Hall."},{"key":"10021_CR56","volume-title":"P. 85. A method for subjective performance assessment of the quality of speech voice output devices","author":"I Rec","year":"1994","unstructured":"Rec, I. (1994). P. 85. A method for subjective performance assessment of the quality of speech voice output devices. International Telecommunication Union."},{"key":"10021_CR57","unstructured":"Recommendation, I. T. (2001). Perceptual evaluation of speech quality (PESQ): An objective method for end-to-end speech quality assessment of narrow-band telephone networks and speech codecs. In Rec. ITU-T P. 862."},{"key":"10021_CR58","doi-asserted-by":"crossref","unstructured":"Rosipal, R. & Kr\u00e4mer, N. (2005). Overview and recent advances in partial least squares. In International statistical and optimization perspectives workshop \"subspace, latent structure and feature selection\u201d (pp. 34\u201351).","DOI":"10.1007\/11752790_2"},{"key":"10021_CR59","unstructured":"Schr\u00f6der, M. & Hunecke, A. (2007). Creating German unit selection voices for the Mary TTSs platform from the Bits corpora. In Proceedings of SSW6."},{"key":"10021_CR60","unstructured":"Schuller, B. (2006). Automatische emotionserkennung aus sprachlicher und manueller Interaktion (Unpublished doctoral dissertation). Technische Universit\u00e4t M\u00fcnchen."},{"key":"10021_CR9","doi-asserted-by":"crossref","unstructured":"Schuller et al., B. (2009). The Interspeech 2009 emotion challenge. In Proceedings 10th ISCA.","DOI":"10.21437\/Interspeech.2009-103"},{"key":"10021_CR61","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., Burkhardt, F., Devillers, L., M\u00fcller, C. & Narayanan, S. S. (2010). The Interspeech 2010 paralinguistic challenge. In Proceedings 11th ISCA.","DOI":"10.21437\/Interspeech.2010-739"},{"key":"10021_CR62","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., N\u00f6th, E., Vinciarelli, A., Burkhardt, F., & Weiss, B (2012). The Interspeech 2012 speaker trait challenge. In Proceedings 13th ISCA.","DOI":"10.21437\/Interspeech.2012-86"},{"key":"10021_CR63","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., Schiel, F., Krajewski, J., et al. (2011). The Interspeech 2011 speaker state challenge. In Proceedings 12th ISCA.","DOI":"10.21437\/Interspeech.2011-801"},{"key":"10021_CR64","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., Vinciarelli, A., Scherer, K., Ringeval, F. & Kim, S. (2013). The Interspeech 2013 computational paralinguistics challenge: Social signals, conflict, emotion, autism. In Proceedings 14th ISCA.","DOI":"10.21437\/Interspeech.2013-56"},{"key":"10021_CR65","doi-asserted-by":"crossref","unstructured":"Schuller, B.W., Steidl, S., Batliner, A., Hirschberg, J., Burgoon, J.K., Baird, A. & Evanini, K. (2016). The Interspeech 2016 computational paralinguistics challenge: Deception, sincerity & native language. In Interspeech, Vol. 2016, (pp. 2001\u20132005).","DOI":"10.21437\/Interspeech.2016-129"},{"key":"10021_CR66","doi-asserted-by":"crossref","unstructured":"Shen, J., Pang, R., Weiss, R.J., Schuster, M., Jaitly, N., Yang, Z. & Wu, Y. (2018). Natural TTS synthesis by conditioning wavenet on MEL spectrogram predictions. In Proceedings of ICASSP (pp. 4779\u20134783). IEEE.","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"10021_CR67","doi-asserted-by":"crossref","unstructured":"Stylianou, Y. & Syrdal, A. K. (2001). Perceptual and objective detection of discontinuities in concatenative speech synthesis. In 2001 IEEE international conference on acoustics, speech, and signal processing. proceedings (Cat. No. 01CH37221), Vol. 2, (pp. 837\u2013840).","DOI":"10.1109\/ICASSP.2001.941045"},{"issue":"2\u20133","key":"10021_CR68","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1016\/0167-6393(92)90013-W","volume":"11","author":"H Sydeserff","year":"1992","unstructured":"Sydeserff, H., Caley, R., Isard, S. D., Jack, M. A., Monaghan, A. I., & Verhoeven, J. (1992). Evaluation of speech synthesis techniques in a comprehension task. Speech Communication, 11(2\u20133), 189\u2013194.","journal-title":"Speech Communication"},{"key":"10021_CR69","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511816338","volume-title":"Text-to-speech synthesis Text-to-speech synthesis","author":"P Taylor","year":"2009","unstructured":"Taylor, P. (2009). Text-to-speech synthesis. Cambridge University Press."},{"issue":"1","key":"10021_CR70","first-page":"72","volume":"18","author":"R Thangarajan","year":"2008","unstructured":"Thangarajan, R., & Natarajan, A. (2008). Syllable based continuous speech recognition for Tamil. South Asian Language Review, 18(1), 72\u201385.","journal-title":"South Asian Language Review"},{"key":"10021_CR71","doi-asserted-by":"crossref","unstructured":"Tokuda, K., Kobayashi, T., Masuko, T. & Imai, S. (1994). Mel-generalized cepstral analysis-a unified approach to speech spectral estimation. In Third international conference on spoken language processing.","DOI":"10.21437\/ICSLP.1994-275"},{"key":"10021_CR72","volume-title":"Hypothesis testing in the multiple regression model","author":"E Uriel","year":"2013","unstructured":"Uriel, E. (2013). Hypothesis testing in the multiple regression model. Universidad de Valencia, Department of Economics."},{"key":"10021_CR73","doi-asserted-by":"crossref","unstructured":"Valentini-Botinhao, C., Yamagishi, J. & King, S. (2011). Can objective measures predict the intelligibility of modified hmm-based synthetic speech in noise? In Twelfth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2011-42"},{"key":"10021_CR74","doi-asserted-by":"crossref","unstructured":"Valstar, M., Schuller, B., Smith, K., Eyben, F., Jiang, B., Bilakhia, S. & Pantic, M. (2013). Avec 2013: The continuous audio\/visual emotion and depression recognition challenge. In Proceedings of the 3rd ACM international workshop on audio\/visual emotion challenge (pp. 3\u201310).","DOI":"10.1145\/2512530.2512533"},{"key":"10021_CR75","unstructured":"van Bezooijen, R., van Heuven, V., Gibbon, D., Moore, R. & Winski, R. (1997). Assessment of synthesis systems. In D. Gibbon, R. Moore, & R. Winski (Eds.) Handbook of standards and resources for spoken language systems (pp. 481\u2013563)."},{"key":"10021_CR76","unstructured":"van\u00a0den Oord, A., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A. & Kavukcuoglu, K. (2016). Wavenet: A generative model for raw audio. In 9th ISCA speech synthesis workshop (pp. 125\u2013125)."},{"key":"10021_CR77","unstructured":"van Heuven, V. J. & van Bezooijen, R. (1995). Quality evaluation of synthesized speech. In Speech coding and synthesis (p.\u00a0707738). Citeseer."},{"issue":"5","key":"10021_CR78","doi-asserted-by":"publisher","first-page":"1763","DOI":"10.1109\/TSA.2005.858548","volume":"14","author":"J Vepa","year":"2006","unstructured":"Vepa, J., & King, S. (2006). Subjective evaluation of join cost and smoothing methods for unit selection speech synthesis. IEEE Transactions on Audio, Speech, and Language Processing, 14(5), 1763\u20131771.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"1","key":"10021_CR79","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1016\/j.csl.2003.12.001","volume":"19","author":"M Viswanathan","year":"2005","unstructured":"Viswanathan, M., & Viswanathan, M. (2005). Measuring speech quality for text-to-speech systems: Development and assessment of a modified mean opinion score (MOS) scale. Computer Speech & Language, 19(1), 55\u201383.","journal-title":"Computer Speech & Language"},{"key":"10021_CR80","doi-asserted-by":"crossref","unstructured":"Wang, Y., Skerry-Ryan, R., Stanton, D., Wu, Y., Weiss, R. J., Jaitly, N. & Saurous, R. A. (2017). Tacotron: Towards end-to-end speech synthesis. In Proceedings of Interspeech 2017 (pp. 4006\u20134010).","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"10021_CR81","unstructured":"Wei, B. & Gibson, J. D. (2001). Comparison of distance measures in discrete spectral modeling. S. M. U."},{"key":"10021_CR82","doi-asserted-by":"crossref","unstructured":"Yi, Z., Huang, W. C., Tian, X., Yamagishi, J., Das, R.K., Kinnunen, T. & Toda, T. (2020). Voice conversion challenge 2020: Intra-lingual semi-parallel and cross-lingual voice conversion. In Proceedings of the joint workshop for the blizzard challenge and voice conversion challenge 2020 (pp. 80\u201398).","DOI":"10.21437\/VCC_BC.2020-14"},{"key":"10021_CR83","unstructured":"Young, S.J., Kershaw, D., Odell, J., Ollason, D., Valtchev, V. & Woodland, P. (2006). In The HTK Book Version 3.4. Cambridge University Press."},{"issue":"11","key":"10021_CR84","doi-asserted-by":"publisher","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","volume":"51","author":"H Zen","year":"2009","unstructured":"Zen, H., Tokuda, K., & Black, A. W. (2009). Statistical parametric speech synthesis. Speech Communication, 51(11), 1039\u20131064.","journal-title":"Speech Communication"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10021-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10021-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10021-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,3,27]],"date-time":"2023-03-27T11:15:59Z","timestamp":1679915759000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10021-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,2]]},"references-count":84,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,3]]}},"alternative-id":["10021"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10021-4","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2023,2,2]]},"assertion":[{"value":"20 February 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 January 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}