{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T19:37:06Z","timestamp":1776109026788,"version":"3.50.1"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T00:00:00Z","timestamp":1659916800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T00:00:00Z","timestamp":1659916800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100008628","name":"Ministry of Electronics and Information technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008628","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2023,1]]},"DOI":"10.1007\/s00034-022-02126-z","type":"journal-article","created":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T07:04:03Z","timestamp":1659942243000},"page":"361-384","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Prosody-TTS: An End-to-End Speech Synthesis System with Prosody Control"],"prefix":"10.1007","volume":"42","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5945-6860","authenticated-orcid":false,"given":"Giridhar","family":"Pamisetty","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6355-5287","authenticated-orcid":false,"given":"K.","family":"Sri Rama Murty","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,8,8]]},"reference":[{"key":"2126_CR1","unstructured":"S.\u00d6. Arik, M. Chrzanowski, A. Coates, G. Diamos, A. Gibiansky, Y. Kang, X. Li, J. Miller, J. Raiman, S. Sengupta, M. Shoeybi, Deep voice: real-time neural text-to-speech. CoRR. (2017). arXiv preprint arXiv:1702.07825"},{"key":"2126_CR2","unstructured":"A. Baby, A.L.N. Thomas, T. Consortium, Resources for Indian Languages. Community-Based Building of Language Resources (2016)"},{"key":"2126_CR3","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1007\/BF00995674","volume":"15","author":"R Banse","year":"1991","unstructured":"R. Banse, K.R. Scherer, Acoustic profiles in vocal emotion expression. Motivation Emotion 15, 123\u2013148 (1991). https:\/\/doi.org\/10.1007\/BF00995674","journal-title":"Motivation Emotion"},{"key":"2126_CR4","doi-asserted-by":"publisher","unstructured":"A.W. Black, H. Zen, K. Tokuda, Statistical parametric speech synthesis, in 2007 IEEE International Conference on Acoustics, Speech and Signal Processing\u2014ICASSP \u201907, vol. 4, pp. IV-1229-IV\u20131232 (2007). https:\/\/doi.org\/10.1109\/ICASSP.2007.367298","DOI":"10.1109\/ICASSP.2007.367298"},{"key":"2126_CR5","doi-asserted-by":"publisher","unstructured":"W. Chu, A. Alwan, Reducing f0 frame error of f0 tracking algorithms under noisy conditions with an unvoiced\/voiced classification frontend, in 2009 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 3969\u20133972 (2009). https:\/\/doi.org\/10.1109\/ICASSP.2009.4960497","DOI":"10.1109\/ICASSP.2009.4960497"},{"key":"2126_CR6","doi-asserted-by":"crossref","unstructured":"Y. Chung, Y. Wang, W. Hsu, Y. Zhang, R.J. Skerry-Ryan, Semi-supervised training for improving data efficiency in end-to-end speech synthesis. CoRR (2018). arXiv preprint arXiv:1808.10128","DOI":"10.1109\/ICASSP.2019.8683862"},{"key":"2126_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2015.12.114","author":"A de Myttenaere","year":"2016","unstructured":"A. de Myttenaere, B. Golden, B. Le Grand, F. Rossi, Mean absolute percentage error for regression models. Neurocomputing (2016). https:\/\/doi.org\/10.1016\/j.neucom.2015.12.114","journal-title":"Neurocomputing"},{"key":"2126_CR8","doi-asserted-by":"crossref","unstructured":"G. Divu, S. Prasanna, B. Yegnanarayana, Neutral to target emotion conversion using source and suprasegmental information, in Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH, pp. 2969\u20132972 (2011)","DOI":"10.21437\/Interspeech.2011-743"},{"issue":"2","key":"2126_CR9","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1109\/TASSP.1984.1164317","volume":"32","author":"D Griffin","year":"1984","unstructured":"D. Griffin, Jae Lim: signal estimation from modified short-time Fourier transform. IEEE Trans. Acoust. Speech Signal Process. 32(2), 236\u2013243 (1984)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"2126_CR10","doi-asserted-by":"publisher","first-page":"354","DOI":"10.1016\/j.patcog.2017.10.013.","volume":"77","author":"J Gu","year":"2018","unstructured":"J. Gu, Z. Wang, J. Kuen, L. Ma, A. Shahroudy, B. Shuai, T. Liu, X. Wang, G. Wang, J. Cai, T. Chen, Recent advances in convolutional neural networks. Pattern Recognit. 77, 354\u2013377 (2018). https:\/\/doi.org\/10.1016\/j.patcog.2017.10.013.","journal-title":"Pattern Recognit."},{"key":"2126_CR11","unstructured":"X. Huang, A. Acero, H.W. Hon, R. Reddy, Spoken Language Processing: A Guide to Theory, Algorithm, and System Development, 1st edn. (Prentice Hall PTR, USA, 2001)"},{"key":"2126_CR12","unstructured":"A.J. Hunt, A.W. Black, Unit selection in a concatenative speech synthesis system using a large speech database, in 1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings, vol. 1, pp. 373\u2013376 (1996)"},{"key":"2126_CR13","unstructured":"ITU-T Recommendation P.800, Methods for Subjective Determination of Transmission Quality (1996). https:\/\/www.itu.int\/rec\/T-REC-P.800-199608-I"},{"key":"2126_CR14","unstructured":"N. Kalchbrenner, E. Elsen, K. Simonyan, S. Noury, N. Casagrande, E. Lockhart, F. Stimberg, A. van\u00a0den Oord, S. Dieleman, K. Kavukcuoglu, Efficient neural audio synthesis. CoRR (2018). arXiv preprint arXiv:1802.08435"},{"key":"2126_CR15","unstructured":"D.P. Kingma, J. Ba, Adam: A method for stochastic optimization, in 3rd International Conference on Learning Representations, ICLR 2015, May 7-9, 2015, Conference Track Proceedings, ed. by Y.\u00a0Bengio, Y.\u00a0LeCun (eds.) (San Diego, CA, USA) (2015). arXiv preprint arXiv:1412.6980"},{"key":"2126_CR16","unstructured":"N. Li, S. Liu, Y. Liu, S. Zhao, M. Liu, M. Zhou, Close to human quality TTS with transformer. CoRR (2018). arXiv preprint arXiv:1809.08895"},{"key":"2126_CR17","unstructured":"S. Liu, A. Davison, E. Johns, Self-supervised generalisation with meta auxiliary learning, in Advances in Neural Information Processing Systems, vol.\u00a032 (Curran Associates, Inc., 2019). arXiv preprint arXiv:1901.08933"},{"key":"2126_CR18","doi-asserted-by":"publisher","unstructured":"Z. Luo, T. Takiguchi, Y. Ariki, Emotional voice conversion using deep neural networks with mcc and f0 features, in 2016 IEEE\/ACIS 15th International Conference on Computer and Information Science (ICIS), pp. 1\u20135 (2016). https:\/\/doi.org\/10.1109\/ICIS.2016.7550889","DOI":"10.1109\/ICIS.2016.7550889"},{"issue":"7","key":"2126_CR19","doi-asserted-by":"publisher","first-page":"1877","DOI":"10.1587\/transinf.2015EDP7457","volume":"E99.D","author":"M Morise","year":"2016","unstructured":"M. Morise, F. Yokomori, K. Ozawa, World: a vocoder-based high-quality speech synthesis system for real-time applications. IEICE Trans. Inf. Syst. E99.D(7), 1877\u20131884 (2016). https:\/\/doi.org\/10.1587\/transinf.2015EDP7457","journal-title":"IEICE Trans. Inf. Syst."},{"issue":"3","key":"2126_CR20","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1016\/j.specom.2007.09.003.","volume":"50","author":"T Nakatani","year":"2008","unstructured":"T. Nakatani, S. Amano, T. Irino, K. Ishizuka, T. Kondo, A method for fundamental frequency estimation and voicing decision: application to infant utterances recorded in real acoustical environments. Speech Commun. 50(3), 203\u2013214 (2008). https:\/\/doi.org\/10.1016\/j.specom.2007.09.003.","journal-title":"Speech Commun."},{"key":"2126_CR21","doi-asserted-by":"crossref","unstructured":"A. \u00d6ktem, M. Farr\u00fas, A. Bonafonte, Prosodic phrase alignment for machine dubbing. CoRR (2019). arXiv preprint arXiv:1908.07226","DOI":"10.21437\/Interspeech.2019-1621"},{"key":"2126_CR22","doi-asserted-by":"publisher","unstructured":"A. Oktem, M. Farr\u00fas, A. Bonafonte, Prosodic phrase alignment for machine dubbing, in Interspeech 2019, pp. 4215\u20134219 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1621","DOI":"10.21437\/Interspeech.2019-1621"},{"key":"2126_CR23","unstructured":"W. Ping, K. Peng, J. Chen, Clarinet: parallel wave generation in end-to-end text-to-speech. CoRR (2018). arXiv preprint arXiv:1807.07281"},{"key":"2126_CR24","doi-asserted-by":"crossref","unstructured":"R. Prenger, R. Valle, B. Catanzaro, Waveglow: a flow-based generative network for speech synthesis. CoRR (2018). arXiv preprint arXiv:1811.00002","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"2126_CR25","unstructured":"Y. Ren, Y. Ruan, X. Tan, T. Qin, S. Zhao, Z. Zhao, T. Liu, Fastspeech: fast, robust and controllable text to speech. CoRR (2019). arXiv preprint arXiv:1905.09263"},{"key":"2126_CR26","unstructured":"J. Shen, Y. Jia, M. Chrzanowski, Y. Zhang, I. Elias, H. Zen, Y. Wu, Non-attentive tacotron: Robust and controllable neural TTS synthesis including unsupervised duration modeling. CoRR (2020). arXiv preprint arXiv:2010.04301"},{"key":"2126_CR27","unstructured":"R.J. Skerry-Ryan, E. Battenberg, Y. Xiao, Y. Wang, D. Stanton, J. Shor, R.J. Weiss, R. Clark, R.A. Saurous, Towards end-to-end prosody transfer for expressive speech synthesis with tacotron. CoRR (2018). arXiv preprint arXiv:1803.09047"},{"key":"2126_CR28","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1017\/S0952675700003894","volume":"17","author":"C Smith","year":"2000","unstructured":"C. Smith, Handbook of the international phonetic association: a guide to the use of the international phonetic alphabet (1999). Phonology 17, 291\u2013295 (2000). https:\/\/doi.org\/10.1017\/S0952675700003894","journal-title":"Phonology"},{"key":"2126_CR29","unstructured":"J. Sotelo, S. Mehri, K. Kumar, J.F. Santos, K. Kastner, A. Courville, Y. Bengio, Char2wav: end-to-end speech synthesis, in ICLR (2017)"},{"key":"2126_CR30","doi-asserted-by":"publisher","unstructured":"C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, A. Rabinovich, Going deeper with convolutions, in 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1\u20139 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298594","DOI":"10.1109\/CVPR.2015.7298594"},{"issue":"3","key":"2126_CR31","doi-asserted-by":"publisher","first-page":"223","DOI":"10.2307\/409203","volume":"17","author":"GL Trager","year":"1941","unstructured":"G.L. Trager, B. Bloch, The syllabic phonemes of English. Language 17(3), 223\u2013246 (1941) http:\/\/www.jstor.org\/stable\/409203","journal-title":"Language"},{"key":"2126_CR32","unstructured":"A. van\u00a0den Oord, S. Dieleman, H. Zen, K. Simonyan, O. Vinyals, A. Graves, N. Kalchbrenner, A.W. Senior, K. Kavukcuoglu, Wavenet: a generative model for raw audio. CoRR (2016). arXiv preprint arXiv:1609.03499"},{"key":"2126_CR33","unstructured":"Y. Wang, R.J. Skerry-Ryan, D. Stanton, Y. Wu, R.J. Weiss, N. Jaitly, Z. Yang, Y. Xiao, Z. Chen, S. Bengio, Q.V. Le, Y. Agiomyrgiannakis, R. Clark, R.A. Saurous, Tacotron: a fully end-to-end text-to-speech synthesis model. CoRR (2017). arXiv preprint arXiv:1703.10135"},{"key":"2126_CR34","unstructured":"Y. Wang, D. Stanton, Y. Zhang, R.J. Skerry-Ryan, E. Battenberg, J. Shor, Y. Xiao, F. Ren, Y. Jia, R.A. Saurous, Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis. CoRR (2018). arXiv preprint arXiv:1803.09017"},{"key":"2126_CR35","doi-asserted-by":"publisher","unstructured":"O. Watts, C. Valentini-Botinhao, S. King, Speech waveform reconstruction using convolutional neural networks with noise and periodic inputs, in:ICASSP 2019 - IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 7045\u20137049 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8683398","DOI":"10.1109\/ICASSP.2019.8683398"},{"key":"2126_CR36","doi-asserted-by":"crossref","unstructured":"D. Weber, C. G\u00fchmann, Non-autoregressive vs autoregressive neural networks for system identification. CoRR (2021). arXiv preprint arXiv:2105.02027","DOI":"10.1016\/j.ifacol.2021.11.252"},{"key":"2126_CR37","doi-asserted-by":"crossref","unstructured":"R. Yamamoto, E. Song, J.M. Kim, Parallel wavegan: a fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram, in ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6199\u20136203 (IEEE, 2020)","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"2126_CR38","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1007\/s13244-018-0639-9","volume":"9","author":"R Yamashita","year":"2018","unstructured":"R. Yamashita, M. Nishio, R. Do, K. Togashi, Convolutional neural networks: an overview and application in radiology. Insights Imaging 9, 611\u2013629 (2018). https:\/\/doi.org\/10.1007\/s13244-018-0639-9","journal-title":"Insights Imaging"},{"key":"2126_CR39","doi-asserted-by":"crossref","unstructured":"Y. Yasuda, X. Wang, J. Yamagishi, Investigation of learning abilities on linguistic features in sequence-to-sequence text-to-speech synthesis (2020)","DOI":"10.1016\/j.csl.2020.101183"},{"key":"2126_CR40","unstructured":"S. Young, G. Evermann, M. Gales, T. Hain, D. Kershaw, X. Liu, G. Moore, J. Odell, D. Ollason, D. Povey, V. Valtchev, P. Woodland, The HTK book. Cambridge University Engineering Department (2002)"},{"key":"2126_CR41","unstructured":"H. Zen, T. Nose, J. Yamagishi, S. Sako, T. Masuko, A.W. Black, K. Tokuda, The hmm-based speech synthesis system (hts) version 2.0. In: SSW, pp. 294\u2013299. Citeseer (2007)"},{"key":"2126_CR42","unstructured":"S.K. Zhizheng\u00a0Wu Oliver\u00a0Watts, Merlin: an open source neural network speech synthesis system, in 9th ISCA Speech Synthesis Workshop (SSW9) (2016)"},{"key":"2126_CR43","doi-asserted-by":"crossref","unstructured":"K. Zhou, B. Sisman, M. Zhang, H. Li, Converting anyone\u2019s emotion: towards speaker-independent emotional voice conversion. CoRR (2020). arXiv preprint arXiv:2005.07025","DOI":"10.21437\/Interspeech.2020-2014"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-022-02126-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-022-02126-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-022-02126-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T22:52:12Z","timestamp":1727736732000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-022-02126-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,8]]},"references-count":43,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,1]]}},"alternative-id":["2126"],"URL":"https:\/\/doi.org\/10.1007\/s00034-022-02126-z","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8,8]]},"assertion":[{"value":"10 September 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 July 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 August 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}