{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T07:55:28Z","timestamp":1758268528569},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2018,3,28]],"date-time":"2018-03-28T00:00:00Z","timestamp":1522195200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2018,9]]},"DOI":"10.1007\/s10772-018-9503-z","type":"journal-article","created":{"date-parts":[[2018,3,28]],"date-time":"2018-03-28T08:41:19Z","timestamp":1522226479000},"page":"521-532","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Prosody modification for speech recognition in emotionally mismatched conditions"],"prefix":"10.1007","volume":"21","author":[{"given":"Vishnu Vidyadhara Raju","family":"Vegesna","sequence":"first","affiliation":[]},{"given":"Krishna","family":"Gurugubelli","sequence":"additional","affiliation":[]},{"given":"Anil kumar","family":"Vuppala","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,3,28]]},"reference":[{"key":"9503_CR1","doi-asserted-by":"crossref","unstructured":"Adiga, N., Govind, D., Prasanna, S. M. (2014). Significance of epoch identification accuracy for prosody modification. In proceedings of SPCOM, Bangalore, India. IEEE, (pp. 1\u20136).","DOI":"10.1109\/SPCOM.2014.6984007"},{"key":"9503_CR2","first-page":"3","volume":"1","author":"A Batliner","year":"2010","unstructured":"Batliner, A., Steidl, S., Seppi, D., & Schuller, B. (2010). Segmenting into adequate units for automatic recognition of emotion-related episodes: A speech-based approach. Advances in Human-Computer Interaction, 1, 3.","journal-title":"Advances in Human-Computer Interaction"},{"key":"9503_CR3","doi-asserted-by":"crossref","unstructured":"Bulut, M., Narayanan, S. S., & Syrdal, A. K. (2002). Expressive speech synthesis using a concatenative synthesizer. In proceedings of INTERSPEECH, Denver, Colorado, USA.","DOI":"10.21437\/ICSLP.2002-389"},{"key":"9503_CR4","doi-asserted-by":"crossref","unstructured":"Cabral, J. P., & Oliveira, L. (2005). Pitch-synchronous time-scaling for prosodic and voice quality transformations. In proceedings of Ninth European Conference on Speech Communication and Technology.","DOI":"10.21437\/Interspeech.2005-209"},{"key":"9503_CR5","doi-asserted-by":"crossref","unstructured":"Cabral, J. P., & Oliveira, L. C. (2006). Emovoice: A system to generate emotions in speech. In proceedings of INTERSPEECH, Pittsburgh, Pennsylvania.","DOI":"10.21437\/Interspeech.2006-497"},{"key":"9503_CR6","doi-asserted-by":"crossref","unstructured":"Crumpton, J., & Bethel, C. L. (2015). Validation of vocal prosody modifications to communicate emotion in robot speech. In International Conference on Collaboration Technologies and Systems (CTS). IEEE, pp. 39\u201346.","DOI":"10.1109\/CTS.2015.7210396"},{"issue":"3","key":"9503_CR7","doi-asserted-by":"publisher","first-page":"273","DOI":"10.1109\/LSP.2009.2038507","volume":"17","author":"N Dhananjaya","year":"2010","unstructured":"Dhananjaya, N., & Yegnanarayana, B. (2010). Voiced\/nonvoiced detection based on robustness of voiced epochs. IEEE Signal Processing Letters, 17(3), 273\u2013276.","journal-title":"IEEE Signal Processing Letters"},{"key":"9503_CR8","unstructured":"Eide, E., Aaron, A., Bakis, R., Hamza, W., Picheny, M., & Pitrelli, J. (2004). A corpus-based approach to expressive speech synthesis. In Fifth ISCA Workshop on Speech Synthesis. Pittsburgh, PA, USA."},{"key":"9503_CR9","doi-asserted-by":"crossref","unstructured":"Ellis, D. P., & Weiss, R. J. (2006). Model-based monaural source separation using a vector-quantized phase-vocoder representation. In proceedings of International conference on Acoustics, Speech and Signal Processing, Toulouse, France, Vol. 5. IEEE.","DOI":"10.1109\/ICASSP.2006.1661436"},{"key":"9503_CR10","doi-asserted-by":"crossref","unstructured":"Gangamohan, P., Mittal, V. K., & Yegnanarayana, B. (2012). A flexible analysis synthesis tool (fast) for studying the characteristic features of emotion in speech. In Proc. of Consumer Communications and Networking Conference (CCNC), Lasvegas, USA. IEEE, pp. 250\u2013254.","DOI":"10.1109\/CCNC.2012.6181096"},{"key":"9503_CR11","doi-asserted-by":"crossref","unstructured":"Gangamohan, P., Mittal, V., & Yegnanarayana, B. (2012). Relative importance of different components of speech contributing to perception of emotion. In proceedings of Sixth International Conference on Speech Prosody, China.","DOI":"10.21437\/SpeechProsody.2012-164"},{"key":"9503_CR12","unstructured":"Govind, D., & Prasanna, S. (2009). Expressive speech synthesis using prosodic modification and dynamic time warping. In proceedings of NCC, Guwahati, India."},{"key":"9503_CR13","doi-asserted-by":"crossref","unstructured":"Govind, D., Prasanna, S. M., & Yegnanarayana, B. (2011). Neutral to target emotion conversion using source and suprasegmental information. In proceedings of INTERSPEECH, Florence, Italy, pp. 2969\u20132972.","DOI":"10.21437\/Interspeech.2011-743"},{"issue":"6","key":"9503_CR14","doi-asserted-by":"publisher","first-page":"349","DOI":"10.1250\/ast.27.349","volume":"27","author":"H Kawahara","year":"2006","unstructured":"Kawahara, H. (2006). Straight, exploitation of the other aspect of vocoder: Perceptually isomorphic decomposition of speech sounds. Acoustical Science and Technology, 27(6), 349\u2013353.","journal-title":"Acoustical Science and Technology"},{"issue":"4","key":"9503_CR15","doi-asserted-by":"publisher","first-page":"2202","DOI":"10.1121\/1.418204","volume":"101","author":"RW Kortekaas","year":"1997","unstructured":"Kortekaas, R. W., & Kohlrausch, A. (1997). Psychoacoustical evaluation of the pitch-synchronous overlap-and-add speech-waveform manipulation technique using single-formant stimuli. The Journal of the Acoustical Society of America, 101(4), 2202\u20132213.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"3","key":"9503_CR16","doi-asserted-by":"publisher","first-page":"323","DOI":"10.1109\/89.759041","volume":"7","author":"J Laroche","year":"1999","unstructured":"Laroche, J., & Dolson, M. (1999). Improved phase vocoder time-scale modification of audio. IEEE Transactions on Speech and Audio processing, 7(3), 323\u2013332.","journal-title":"IEEE Transactions on Speech and Audio processing"},{"key":"9503_CR17","doi-asserted-by":"crossref","unstructured":"Lotfian, R., & Busso, C. (2015). Emotion recognition using synthetic speech as neutral reference. In proceedings of International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, (pp. 4759\u20134763).","DOI":"10.1109\/ICASSP.2015.7178874"},{"issue":"5\u20136","key":"9503_CR18","doi-asserted-by":"publisher","first-page":"453","DOI":"10.1016\/0167-6393(90)90021-Z","volume":"9","author":"E Moulines","year":"1990","unstructured":"Moulines, E., & Charpentier, F. (1990). Pitch-synchronous waveform processing techniques for text-to-speech synthesis using diphones. Speech Communication, 9(5\u20136), 453\u2013467.","journal-title":"Speech Communication"},{"issue":"8","key":"9503_CR19","doi-asserted-by":"publisher","first-page":"1602","DOI":"10.1109\/TASL.2008.2004526","volume":"16","author":"KSR Murty","year":"2008","unstructured":"Murty, K. S. R., & Yegnanarayana, B. (2008). Epoch extraction from speech signals. IEEE Transactions on Audio, Speech, and Language Processing, 16(8), 1602\u20131613.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"6","key":"9503_CR20","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1109\/LSP.2009.2016829","volume":"16","author":"KSR Murty","year":"2009","unstructured":"Murty, K. S. R., Yegnanarayana, B., & Joseph, M. A. (2009). Characterization of glottal activity from speech signals. IEEE Signal Processing Letters, 16(6), 469\u2013472.","journal-title":"IEEE Signal Processing Letters"},{"key":"9503_CR21","doi-asserted-by":"crossref","unstructured":"Nakayama, K., Oshima, C., Higashihara, R., Machishima, K. (2015). Mood induction through emotional prosody modification experiments of students reading a folk story scenario. In Annual Conference of the Society of Instrument and Control Engineers of Japan (SICE). IEEE, (pp. 391\u2013396).","DOI":"10.1109\/SICE.2015.7285406"},{"issue":"4","key":"9503_CR22","doi-asserted-by":"publisher","first-page":"1099","DOI":"10.1109\/TASL.2006.876123","volume":"14","author":"JF Pitrelli","year":"2006","unstructured":"Pitrelli, J. F., Bakis, R., Eide, E. M., Fernandez, R., Hamza, W., & Picheny, M. A. (2006). The ibm expressive text-to-speech synthesis system for american english. IEEE Transactions on Audio, Speech, and Language Processing, 14(4), 1099\u20131108.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"9503_CR23","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., Hannemann, M., Motlicek, P., Qian, Y., Schwarz, P. et al. (2011). The kaldi speech recognition toolkit. In IEEE 2011 Workshop on Automatic Speech Recognition and Understanding. IEEE Signal Processing Society."},{"key":"9503_CR24","doi-asserted-by":"crossref","unstructured":"Prasanna, S. M., & Govind, D. (2010). Analysis of excitation source information in emotional speech. In proceedings of INTERSPEECH, Japan.","DOI":"10.21437\/Interspeech.2010-284"},{"key":"9503_CR25","doi-asserted-by":"crossref","unstructured":"Prasanna, S., Govind, D., Rao, K. S., & Yenanarayana, B. (2010). Fast prosody modification using instants of significant excitation. In proceedings of Speech Prosody, Chicago, USA.","DOI":"10.21437\/SpeechProsody.2010-126"},{"issue":"3","key":"9503_CR26","doi-asserted-by":"publisher","first-page":"972","DOI":"10.1109\/TSA.2005.858051","volume":"14","author":"KS Rao","year":"2006","unstructured":"Rao, K. S., & Yegnanarayana, B. (2006). Prosody modification using instants of significant excitation. IEEE Transactions on Audio, Speech, and Language Processing, 14(3), 972\u2013980.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"9503_CR27","unstructured":"Reddy, G., & Rao, K. S. (2015). Neutral to happy emotion conversion by blending prosody and laughter. In proceedings of Eighth International Conference on Contemporary Computing (IC3), Noida, India. IEEE, (pp. 342\u2013347)."},{"key":"9503_CR28","doi-asserted-by":"crossref","unstructured":"Sagha, H., Deng, J., & Schuller, B. (2017). The effect of personality trait, age, and gender on the performance of automatic speech valence recognition. In proceedings of Seventh International Conference on Affective Computing and Intelligent Interaction (ACII). IEEE, (pp. 86\u201391).","DOI":"10.1109\/ACII.2017.8273583"},{"issue":"11","key":"9503_CR29","doi-asserted-by":"publisher","first-page":"1749","DOI":"10.1109\/LSP.2017.2756347","volume":"24","author":"S Shahnawazuddin","year":"2017","unstructured":"Shahnawazuddin, S., Adiga, N., & Kathania, H. K. (2017). Effect of prosody modification on children\u2019s asr. IEEE Signal Processing Letters, 24(11), 1749\u20131753.","journal-title":"IEEE Signal Processing Letters"},{"key":"9503_CR30","doi-asserted-by":"crossref","unstructured":"Sharma, B., & Prasanna, S. M. (2015). Improvement of syllable based tts system in assamese using prosody modification. In proceedings of Annual India Conference (INDICON). IEEE, (pp. 1\u20136).","DOI":"10.1109\/INDICON.2015.7443698"},{"key":"9503_CR31","doi-asserted-by":"crossref","unstructured":"Sorin, A., Shechtman, S., & Pollet, V. (2015). Coherent modification of pitch and energy for expressive prosody implantation. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, (pp. 4914\u20134918).","DOI":"10.1109\/ICASSP.2015.7178905"},{"issue":"4","key":"9503_CR32","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1109\/TASL.2006.876113","volume":"14","author":"J Tao","year":"2006","unstructured":"Tao, J., Kang, Y., & Li, A. (2006). Prosody conversion from neutral speech to emotional speech. IEEE Transactions on Audio, Speech, and Language Processing, 14(4), 1145\u20131154.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"9503_CR33","unstructured":"Thomas, M. R., Gudnason, J., & Naylor, P. A. (2008). Application of dypsa algorithm to segmented time scale modification of speech. In proceedings of EUSIPCO, Switzerland. IEEE."},{"issue":"4","key":"9503_CR34","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1016\/S0167-6393(99)00051-5","volume":"30","author":"W Verhelst","year":"2000","unstructured":"Verhelst, W. (2000). Overlap-add methods for time-scaling of speech. Speech Communication, 30(4), 207\u2013221.","journal-title":"Speech Communication"},{"key":"9503_CR35","doi-asserted-by":"crossref","unstructured":"Vidyadhara Raju, V., Vydana, V, H. K., Gangashetty, S. V., & Vuppala, A. K. (2017). Importance of non-uniform prosody modification for speech recognition in emotion conditions. In proceedings of Asia-Pacific Signal and information processing association annual summit and conference (APSIPA), Kuala Lumpur. IEEE.","DOI":"10.1109\/APSIPA.2017.8282109"},{"key":"9503_CR36","unstructured":"VidyadharaRaju, V., Gangamohan, P., Gangashetty, S. V., & Vuppala, A. K. (2016). Application of prosody modification for speech recognition in different emotion conditions. In proceedings of Region 10 Conference (TENCON), Singapore. IEEE, (pp. 951\u2013954)."},{"key":"9503_CR37","doi-asserted-by":"crossref","unstructured":"Vydana, H. K., Vidyadhara Raju, V., Gangashetty, V, S. V., & Vuppala, A. K. (2015). Significance of emotionally significant regions of speech for emotive to neutral conversion. In proceedings of International Conference on Mining Intelligence and Knowledge Exploration, Hyderabad, India. Springer, New York, (pp. 287\u2013296).","DOI":"10.1007\/978-3-319-26832-3_28"},{"issue":"5","key":"9503_CR38","doi-asserted-by":"publisher","first-page":"1643","DOI":"10.1007\/s00034-015-0134-1","volume":"35","author":"HK Vydana","year":"2016","unstructured":"Vydana, H. K., Kadiri, S. R., & Vuppala, A. K. (2016). Vowel-based non-uniform prosody modification for emotion conversion. Circuits, Systems, and Signal Processing, 35(5), 1643\u20131663.","journal-title":"Circuits, Systems, and Signal Processing"},{"issue":"5","key":"9503_CR39","doi-asserted-by":"publisher","first-page":"2527","DOI":"10.1121\/1.1616923","volume":"114","author":"U Z\u00f6lzer","year":"2003","unstructured":"Z\u00f6lzer, U., & Smith Iii, J. O. (2003). Dafxdigital audio effects. The Journal of the Acoustical Society of America, 114(5), 2527\u20132528.","journal-title":"The Journal of the Acoustical Society of America"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10772-018-9503-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-018-9503-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-018-9503-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T11:37:10Z","timestamp":1719920230000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10772-018-9503-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,3,28]]},"references-count":39,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2018,9]]}},"alternative-id":["9503"],"URL":"https:\/\/doi.org\/10.1007\/s10772-018-9503-z","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,3,28]]},"assertion":[{"value":"11 November 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 February 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 March 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}