{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T06:24:12Z","timestamp":1750832652957},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2020,3,9]],"date-time":"2020-03-09T00:00:00Z","timestamp":1583712000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,3,9]],"date-time":"2020-03-09T00:00:00Z","timestamp":1583712000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2020,9]]},"DOI":"10.1007\/s10772-020-09691-1","type":"journal-article","created":{"date-parts":[[2020,3,9]],"date-time":"2020-03-09T16:03:53Z","timestamp":1583769833000},"page":"597-613","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["Speaker-independent expressive voice synthesis using learning-based hybrid network model"],"prefix":"10.1007","volume":"23","author":[{"given":"Susmitha","family":"Vekkot","sequence":"first","affiliation":[]},{"given":"Deepa","family":"Gupta","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,3,9]]},"reference":[{"issue":"5","key":"9691_CR2","doi-asserted-by":"publisher","first-page":"134","DOI":"10.5923\/j.ajsp.20120205.06","volume":"2","author":"R Aihara","year":"2012","unstructured":"Aihara, R., Takashima, R., Takiguchi, T., & Ariki, Y. (2012). GMM-based emotional voice conversion using spectrum and prosody features. American Journal of Signal Processing, 2(5), 134\u2013138.","journal-title":"American Journal of Signal Processing"},{"key":"9691_CR1","doi-asserted-by":"crossref","unstructured":"Aihara, R., Ueda, R., Takiguchi, T., & Ariki, Y. (2014). Exemplar-based emotional voice conversion using non-negative matrix factorization. In Proceedings of APSIPA, 2014 (pp. 1\u20137). IEEE.","DOI":"10.1109\/APSIPA.2014.7041640"},{"key":"9691_CR3","doi-asserted-by":"crossref","unstructured":"Akagi, M., Han, X., Elbarougy, R., Hamada, Y., & Li, J. (2014). Toward affective speech-to-speech translation: Strategy for emotional speech recognition and synthesis in multiple languages. In Proceedings of APSIPA, 2014 (pp. 1\u201310). IEEE.","DOI":"10.1109\/APSIPA.2014.7041623"},{"key":"9691_CR4","unstructured":"Anon. (2013). Technology development for Indian languages programme. In Diety. Retrieved January 22, 2018 from http:\/\/tdil.mit.gov.in\/AboutUs.aspx."},{"key":"9691_CR5","doi-asserted-by":"crossref","unstructured":"Benisty, H., & Malah, D. (2011). Voice conversion using GMM with enhanced global variance. In Proceedings of INTERSPEECH.","DOI":"10.21437\/Interspeech.2011-272"},{"key":"9691_CR7","unstructured":"Burkhardt, F., Paeschke, A., Rolfes, M., Sendlemeier, W., & Weiss, B. (2011). A database of German emotional speech. In Proceedings of INTERSPEECH (pp. 1517\u20131520)."},{"key":"9691_CR6","unstructured":"Burkhardt, F., & Sendlmeier, W. F. (2000). Verification of acoustical correlates of emotional speech using formant-synthesis. In ISCA tutorial and research workshop (ITRW) on speech and emotion."},{"key":"9691_CR8","doi-asserted-by":"crossref","unstructured":"Cabral, J. P., & Oliveira, L. C. (2006). Emovoice: A system to generate emotions in speech. 
In Proceedings of the ninth international conference on spoken language processing.","DOI":"10.21437\/Interspeech.2006-497"},{"key":"9691_CR9","first-page":"1","volume":"8","author":"J Cahn","year":"1990","unstructured":"Cahn, J. (1990). The generation of affect in synthesized speech. Journal of the American Voice I\/O Society, 8, 1\u201319.","journal-title":"Journal of the American Voice I\/O Society"},{"issue":"5","key":"9691_CR10","doi-asserted-by":"publisher","first-page":"954","DOI":"10.1109\/TASL.2010.2047683","volume":"18","author":"S Desai","year":"2010","unstructured":"Desai, S., Black, A. W., Yegnanarayana, B., & Prahallad, K. (2010). Spectral mapping using artificial neural networks for voice conversion. IEEE Transactions on Audio, Speech, and Language Processing, 18(5), 954\u2013964.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"1","key":"9691_CR11","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1007\/s10772-012-9155-3","volume":"16","author":"D Govind","year":"2013","unstructured":"Govind, D., & Prasanna, S. R. M. (2013). Dynamic prosody modification using zero frequency filtered signal. International Journal of Speech Technology, 16(1), 41\u201354.","journal-title":"International Journal of Speech Technology"},{"key":"9691_CR12","unstructured":"Haq, S., & Jackson, P. (2009). Speaker-dependent audio-visual emotion recognition. In Proceedings of international conference on audio visual speech processing (pp. 53\u201358)."},{"key":"9691_CR13","volume-title":"Machine audition: Principles, algorithms and systems","author":"S Haq","year":"2010","unstructured":"Haq, S., & Jackson, P. (2010). Multimodal emotion recognition. In W. Wang (Ed.), Machine audition: Principles, algorithms and systems. Hershey: IGI Global Press."},{"issue":"1","key":"9691_CR14","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/s10772-016-9386-9","volume":"20","author":"A Haque","year":"2017","unstructured":"Haque, A., & Rao, K. S. (2017). Modification of energy spectra, epoch parameters and prosody for emotion conversion in speech. International Journal of Speech Technology, 20(1), 15\u201325.","journal-title":"International Journal of Speech Technology"},{"issue":"5","key":"9691_CR15","doi-asserted-by":"publisher","first-page":"912","DOI":"10.1109\/TASL.2010.2041699","volume":"18","author":"E Helander","year":"2010","unstructured":"Helander, E., Virtanen, T., Nurminen, J., & Gabbouj, M. (2010). Voice conversion using partial least squares regression. IEEE Transactions on Audio, Speech, and Language Processing, 18(5), 912\u2013921.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"7","key":"9691_CR16","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton, G. E., Osindero, S., & Teh, Y. (2006). A fast learning algorithm for deep belief nets. Neural Computation, 18(7), 1527\u20131554.","journal-title":"Neural Computation"},{"key":"9691_CR17","doi-asserted-by":"crossref","unstructured":"Hunt, A. J., & Black, A. W. (1996). Unit selection in a concatenative speech synthesis system using a large speech database. In Proceedings of ICASSP (Vol.\u00a01, pp. 373\u2013376). IEEE.","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"9691_CR18","doi-asserted-by":"crossref","unstructured":"Inanoglu, Z., & Young, S. (2007). 
A system for transforming the emotion in speech: Combining data-driven conversion techniques for prosody and voice quality. In Eighth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2007-226"},{"issue":"5","key":"9691_CR19","doi-asserted-by":"publisher","first-page":"713","DOI":"10.1007\/s12046-011-0043-3","volume":"36","author":"H Kawahara","year":"2011","unstructured":"Kawahara, H., & Morise, M. (2011). Technical foundations of TANDEM-STRAIGHT, a speech analysis, modification and synthesis framework. Sadhana, 36(5), 713\u2013727.","journal-title":"Sadhana"},{"key":"9691_CR20","doi-asserted-by":"crossref","unstructured":"Kawanami, H., Iwami, Y., Toda, T., Saruwatari, H., & Shikano, K. (2003). GMM-based voice conversion applied to emotional speech synthesis. In Eighth European conference on speech communication and technology.","DOI":"10.21437\/Eurospeech.2003-661"},{"key":"9691_CR21","doi-asserted-by":"crossref","unstructured":"Koolagudi, S. G., Maity, S., Kumar, V. A., Chakrabarti, S., & Rao, K. S. (2009). IITKGP-SESC: Speech database for emotion analysis. In Proceedings of IC3 (pp. 485\u2013492). Springer.","DOI":"10.1007\/978-3-642-03547-0_46"},{"issue":"3","key":"9691_CR22","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10772-012-9148-2","volume":"15","author":"SR Krothapalli","year":"2012","unstructured":"Krothapalli, S. R., Yadav, J., Sarkar, S., Koolagudi, S. G., & Vuppala, A. (2012). Neural network based feature transformation for emotion independent speaker identification. International Journal of Speech Technology, 15(3), 335\u2013349.","journal-title":"International Journal of Speech Technology"},{"key":"9691_CR23","doi-asserted-by":"crossref","unstructured":"Liu, K., Zhang, J., & Yan, Y. (2007). High quality voice conversion through phoneme-based linear mapping functions with STRAIGHT for mandarin. In Proceedings of the international conference on fuzzy systems and knowledge discovery (Vol.\u00a04, pp. 410\u2013414). IEEE.","DOI":"10.1109\/FSKD.2007.347"},{"key":"9691_CR24","doi-asserted-by":"crossref","unstructured":"Luo, Z., Chen, J., Takiguchi, T., & Ariki, Y. (2017a). Emotional voice conversion with adaptive scales F0 based on wavelet transform using limited amount of emotional data. In Proceedings of Interspeech (pp. 3399\u20133403).","DOI":"10.21437\/Interspeech.2017-984"},{"issue":"1","key":"9691_CR26","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1186\/s13636-017-0116-2","volume":"2017","author":"Z Luo","year":"2017","unstructured":"Luo, Z., Chen, J., Takiguchi, T., & Ariki, Y. (2017b). Emotional voice conversion using neural networks with arbitrary scales F0 based on wavelet transform. EURASIP Journal on Audio, Speech, and Music Processing, 2017(1), 18.","journal-title":"EURASIP Journal on Audio, Speech, and Music Processing"},{"key":"9691_CR25","doi-asserted-by":"crossref","unstructured":"Luo, Z., Takiguchi, T., & Ariki, Y. (2016). Emotional voice conversion using deep neural networks with MCC and F0 features. In Proceedings of ICIS (pp. 1\u20135). IEEE.","DOI":"10.1109\/ICIS.2016.7550889"},{"key":"9691_CR27","doi-asserted-by":"crossref","unstructured":"Ming, H., Huang, D., Dong, M., Li, H., Xie., L., & Zhang, S. (2015). Fundamental frequency modeling using wavelets for emotional voice conversion. In Proceedings of (ACII) (pp. 804\u2013809). 
IEEE.","DOI":"10.1109\/ACII.2015.7344665"},{"key":"9691_CR28","doi-asserted-by":"crossref","unstructured":"Ming, H., Huang, D., Xie, L., Zhang, S., Dong, M., & Li, H. (2016). Exemplar-based sparse representation of timbre and prosody for voice conversion. In Proceedings of ICASSP (pp. 5175\u20135179). IEEE.","DOI":"10.1109\/ICASSP.2016.7472664"},{"key":"9691_CR29","unstructured":"Mohamed, A., Dahl, G., & Hinton, G. (2009). Deep belief networks for phone recognition. In NIPS workshop on deep learning for speech recognition and related applications (Vol.\u00a01, p.\u00a039). Vancouver, Canada."},{"issue":"2","key":"9691_CR30","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1016\/0167-6393(94)00054-E","volume":"16","author":"E Moulines","year":"1995","unstructured":"Moulines, E., & Laroche, J. (1995). Non-parametric techniques for pitch-scale and time-scale modification of speech. Speech Communication, 16(2), 175\u2013205.","journal-title":"Speech Communication"},{"issue":"1","key":"9691_CR31","doi-asserted-by":"publisher","first-page":"134","DOI":"10.1016\/j.specom.2011.07.007","volume":"54","author":"K Nakamura","year":"2012","unstructured":"Nakamura, K., Toda, T., Saruwatari, H., & Shikano, K. (2012). Speaking-aid systems using GMM-based voice conversion for electrolaryngeal speech. Speech Communication, 54(1), 134\u2013146.","journal-title":"Speech Communication"},{"key":"9691_CR32","doi-asserted-by":"crossref","unstructured":"Nakashika, T., Takashima, R., Takiguchi, T., & Ariki, Y. (2013). Voice conversion in high-order eigen space using deep belief nets. In Proceedings of Interspeech (pp. 369\u2013372).","DOI":"10.21437\/Interspeech.2013-102"},{"key":"9691_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.asoc.2014.06.040","volume":"24","author":"J Nirmal","year":"2014","unstructured":"Nirmal, J., Zaveri, M., Patnaik, S., & Kachare, P. (2014). Voice conversion using general regression neural network. Applied Soft Computing, 24, 1\u201312.","journal-title":"Applied Soft Computing"},{"issue":"4","key":"9691_CR34","doi-asserted-by":"publisher","first-page":"928","DOI":"10.1007\/s12559-014-9283-y","volume":"6","author":"J P\u0159ibil","year":"2014","unstructured":"P\u0159ibil, J., & P\u0159ibilov\u00e1, A. (2014). GMM-based evaluation of emotional style transformation in czech and slovak. Cognitive Computation, 6(4), 928\u2013939.","journal-title":"Cognitive Computation"},{"key":"9691_CR35","doi-asserted-by":"crossref","unstructured":"Raitio, T., Suni, A., Juvela, L., Vainio, M., & Alku, P. (2014). Deep neural network based trainable voice source model for synthesis of speech with varying vocal effort. In 15th annual conference of international speech communication association","DOI":"10.21437\/Interspeech.2014-444"},{"issue":"3\u20134","key":"9691_CR36","doi-asserted-by":"publisher","first-page":"271","DOI":"10.1016\/j.specom.2003.10.002","volume":"42","author":"J Ram\u0131rez","year":"2004","unstructured":"Ram\u0131rez, J., Segura, J. C., Ben\u0131tez, C., De La Torre, A., & Rubio, A. (2004). Efficient voice activity detection algorithms using long-term speech information. Speech Communication, 42(3\u20134), 271\u2013287.","journal-title":"Speech Communication"},{"issue":"6","key":"9691_CR37","doi-asserted-by":"publisher","first-page":"745","DOI":"10.1016\/j.specom.2013.03.002","volume":"55","author":"KS Rao","year":"2013","unstructured":"Rao, K. S., & Vuppala, A. K. (2013). 
Non-uniform time scale modification using instants of significant excitation and vowel onset points. Speech Communication, 55(6), 745\u2013756.","journal-title":"Speech Communication"},{"key":"9691_CR38","doi-asserted-by":"crossref","unstructured":"Ribeiro, M. S., & Clark, R. A. (2015). A multi-level representation of F0 using the continuous wavelet transform and the discrete cosine transform. In Proceedings of ICASSP (pp. 4909\u20134913). IEEE.","DOI":"10.1109\/ICASSP.2015.7178904"},{"key":"9691_CR39","doi-asserted-by":"crossref","unstructured":"Sarkar, P., Haque, A., Dutta, A. K., Reddy, G., Harikrishna, D., Dhara, P., et\u00a0al. (2014). Designing prosody rule-set for converting neutral TTS speech to storytelling style speech for Indian languages: Bengali, Hindi and Telugu. In 2014 seventh international conference on contemporary computing (IC3) (pp. 473\u2013477). IEEE.","DOI":"10.1109\/IC3.2014.6897219"},{"issue":"6","key":"9691_CR40","doi-asserted-by":"publisher","first-page":"568","DOI":"10.1109\/72.97934","volume":"2","author":"DF Specht","year":"1991","unstructured":"Specht, D. F. (1991). A general regression neural network. IEEE Transactions on Neural Networks, 2(6), 568\u2013576.","journal-title":"IEEE Transactions on Neural Networks"},{"issue":"2","key":"9691_CR41","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1109\/89.661472","volume":"6","author":"Y Stylianou","year":"1998","unstructured":"Stylianou, Y., Olivier, C., & Moulines, E. (1998). Continuous probabilistic transform for voice conversion. IEEE Transactions on Speech and Audio Processing, 6(2), 131\u2013142.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"9691_CR42","unstructured":"Suni, A., Aalto, D., Raitio, T., Alku, P., Vainio, M., et\u00a0al. (2013). Wavelets for intonation modeling in HMM speech synthesis. In Proceedings of the 8th ISCA workshop on speech synthesis, Barcelona, August 31\u2013September 2."},{"key":"9691_CR43","doi-asserted-by":"crossref","unstructured":"Tanaka, M., & Okutomi, M. (2014). A novel inference of a restricted Boltzmann machine. In 22nd international conference on pattern recognition (ICPR) (pp. 1526\u20131531). IEEE.","DOI":"10.1109\/ICPR.2014.271"},{"issue":"4","key":"9691_CR44","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1109\/TASL.2006.876113","volume":"14","author":"J Tao","year":"2006","unstructured":"Tao, J., Kang, Y., & Li, A. (2006). Prosody conversion from neutral speech to emotional speech. IEEE Transactions on Audio, Speech, and Language Processing, 14(4), 1145\u20131154.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"4","key":"9691_CR45","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TASL.2006.876129","volume":"14","author":"M Theune","year":"2006","unstructured":"Theune, M., Meijis, K., & Heylen, D. (2006). Generating expressive speech for storytelling applications. IEEE Transactions on Audio, Speech, and Language Processing, 14(4), 1137\u20131144.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"9691_CR46","unstructured":"Vainio, M., Suni, A., & Aalto, D. (2013). Continuous wavelet transform for analysis of speech prosody. In Proceedings of the TRASP 2013-tools and resources for the analysys of speech prosody."},{"key":"9691_CR47","doi-asserted-by":"publisher","first-page":"521","DOI":"10.1007\/s10772-018-9503-z","volume":"21","author":"VVR Vegesna","year":"2018","unstructured":"Vegesna, V. V. 
R., Gurugubelli, K., & Vuppala, A. (2018). Prosody modification for speech recognition in emotionally mismatched conditions. International Journal of Speech Technology, 21, 521\u2013532.","journal-title":"International Journal of Speech Technology"},{"key":"9691_CR48","doi-asserted-by":"crossref","unstructured":"Vekkot, S. (2017). Building a generalized model for multi-lingual vocal emotion conversion. In Seventh international conference on affective computing and intelligent interaction, ACII 2017 (pp. 576\u2013580), San Antonio, TX, USA, 23\u201326 October, 2017. https:\/\/doi.org\/10.1109\/ACII.2017.8273658.","DOI":"10.1109\/ACII.2017.8273658"},{"key":"9691_CR52","doi-asserted-by":"crossref","unstructured":"Vekkot, S., & Gupta, D. (2019a). Emotion conversion in Telugu using constrained variance GMM and continuous wavelet transform-$$F_0$$. In TENCON 2019-2019 IEEE Region 10 Conference (TENCON) (pp. 991\u2013996). IEEE.","DOI":"10.1109\/TENCON.2019.8929432"},{"issue":"3","key":"9691_CR53","doi-asserted-by":"publisher","first-page":"533","DOI":"10.1007\/s10772-019-09626-5","volume":"22","author":"S Vekkot","year":"2019","unstructured":"Vekkot, S., & Gupta, D. (2019b). Prosodic transformation in vocal emotion conversion for multi-lingual scenarios: A pilot study. International Journal of Speech Technology, 22(3), 533\u2013549.","journal-title":"International Journal of Speech Technology"},{"key":"9691_CR54","doi-asserted-by":"publisher","first-page":"81883","DOI":"10.1109\/ACCESS.2019.2923003","volume":"7","author":"S Vekkot","year":"2019","unstructured":"Vekkot, S., Gupta, D., Zakariah, M., & Alotaibi, Y. A. (2019). Hybrid framework for speaker-independent emotion conversion using i-vector PLDA and neural network. IEEE Access, 7, 81883\u201381902.","journal-title":"IEEE Access"},{"key":"9691_CR49","unstructured":"Vekkot, S., & Tripathi, S. (2016). Inter-emotion conversion using dynamic time warping and prosody imposition. In The international symposium on intelligent systems technologies and applications (pp. 913\u2013924). Springer."},{"key":"9691_CR50","unstructured":"Vekkot, S., & Tripathi, S. (2016). Significance of glottal closure instants detection algorithms in vocal emotion conversion. International workshop soft computing applications (pp. 462\u2013473). Springer."},{"key":"9691_CR51","doi-asserted-by":"crossref","unstructured":"Vekkot, S., & Tripathi, S. (2017). Vocal emotion conversion using WSOLA and linear prediction. In Proceedings of the speech and computer\u201419th international conference, SPECOM 2017 (pp. 777\u2013787), Hatfield, UK, 12\u201316 September, 2017.","DOI":"10.1007\/978-3-319-66429-3_78"},{"key":"9691_CR55","doi-asserted-by":"crossref","unstructured":"Verhelst, W., & Roelands, M. (1993). An overlap-add technique based on waveform similarity (WSOLA) for high quality time-scale modification of speech. In 1993 IEEE international conference on acoustics, speech, and signal processing. ICASSP-93 (Vol.\u00a02, pp. 554\u2013557). IEEE.","DOI":"10.1109\/ICASSP.1993.319366"},{"key":"9691_CR56","doi-asserted-by":"crossref","unstructured":"Verma, R., Sarkar, P., & Rao, K. S. (2015). Conversion of neutral speech to storytelling style speech. In Proceedings of IEEE ICAPR.","DOI":"10.1109\/ICAPR.2015.7050705"},{"key":"9691_CR57","doi-asserted-by":"crossref","unstructured":"Vuppala, A., & Kadiri, S. (2014). Neutral to anger speech conversion using non-uniform duration modification. In 9th international conference on industrial and information systems (ICIIS) (pp. 
1\u20134).","DOI":"10.1109\/ICIINFS.2014.7036614"},{"issue":"5","key":"9691_CR59","doi-asserted-by":"publisher","first-page":"1643","DOI":"10.1007\/s00034-015-0134-1","volume":"35","author":"H Vydana","year":"2016","unstructured":"Vydana, H., Kadiri, S., & Vuppala, A. (2016). Vowel-based non-uniform prosody modification for emotion conversion. Circuits, Systems, and Signal Processing, 35(5), 1643\u20131663.","journal-title":"Circuits, Systems, and Signal Processing"},{"key":"9691_CR58","doi-asserted-by":"crossref","unstructured":"Vydana, H., Raju, V. V., Gangashetty, S. V., & Vuppala, A. (2015). Significance of emotionally significant regions of speech for emotive to neutral conversion. In International conference on mining intelligence and knowledge exploration (pp. 287\u2013296). Springer.","DOI":"10.1007\/978-3-319-26832-3_28"},{"key":"9691_CR60","doi-asserted-by":"crossref","unstructured":"Wu, Z., Chng, E. S., & Li, H. (2013). Conditional restricted Boltzmann machine for voice conversion. In Proceedings of the international conference on signal and information processing (ChinaSIP) (pp. 104\u2013108). IEEE.","DOI":"10.1109\/ChinaSIP.2013.6625307"},{"issue":"6","key":"9691_CR61","doi-asserted-by":"publisher","first-page":"1394","DOI":"10.1109\/TASL.2009.2034771","volume":"18","author":"C Wu","year":"2010","unstructured":"Wu, C., Hsia, C., Lee, C., & Lin, M. (2010). Hierarchical prosody conversion using regression-based clustering for emotional speech synthesis. IEEE Transactions on Audio, Speech, and Lang Processing, 18(6), 1394\u20131405.","journal-title":"IEEE Transactions on Audio, Speech, and Lang Processing"},{"issue":"10","key":"9691_CR62","doi-asserted-by":"publisher","first-page":"1506","DOI":"10.1109\/TASLP.2014.2333242","volume":"22","author":"Z Wu","year":"2014","unstructured":"Wu, Z., Virtanen, T., Chng, E., & Li, H. (2014). Exemplar-based sparse representation with residual compensation for voice conversion. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 22(10), 1506\u20131521.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"issue":"1","key":"9691_CR63","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/s00034-015-0051-3","volume":"35","author":"J Yadav","year":"2016","unstructured":"Yadav, J., & Rao, K. S. (2016). Prosodic mapping using neural networks for emotion conversion in Hindi language. 
Circuits, Systems, and Signal Processing, 35(1), 139\u2013162.","journal-title":"Circuits, Systems, and Signal Processing"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-020-09691-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10772-020-09691-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-020-09691-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,18]],"date-time":"2022-10-18T01:47:09Z","timestamp":1666057629000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10772-020-09691-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3,9]]},"references-count":63,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2020,9]]}},"alternative-id":["9691"],"URL":"https:\/\/doi.org\/10.1007\/s10772-020-09691-1","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3,9]]},"assertion":[{"value":"1 November 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 January 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 March 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}
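The record above is a Crossref "work" object as returned by the public Crossref REST API (https://api.crossref.org/works/<DOI>); all metadata sits under its "message" key. The following is a minimal sketch, assuming the `requests` package is available and that the endpoint returns a record shaped like the one shown, of how the main bibliographic fields could be read back out. Field names ("title", "author", "container-title", "volume", "issue", "page", "issued", "references-count") are taken directly from the record; nothing else is implied about the Crossref API.

```python
# Sketch: fetch a Crossref work record and print a simple citation line.
# Assumes the public Crossref REST API (api.crossref.org) and the `requests` package.
import requests

DOI = "10.1007/s10772-020-09691-1"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]          # the "message" object holds the metadata shown above

title = work["title"][0]               # "Speaker-independent expressive voice synthesis ..."
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])
container = work["container-title"][0] # "International Journal of Speech Technology"
volume, issue, pages = work["volume"], work["issue"], work["page"]
year = work["issued"]["date-parts"][0][0]

print(f"{authors} ({year}). {title}. {container}, {volume}({issue}), {pages}.")
print(f"References deposited with Crossref: {work['references-count']}")
```

For this record the printed citation would read roughly: "Susmitha Vekkot, Deepa Gupta (2020). Speaker-independent expressive voice synthesis using learning-based hybrid network model. International Journal of Speech Technology, 23(3), 597-613."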