{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:27:00Z","timestamp":1740122820344,"version":"3.37.3"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2018,12,3]],"date-time":"2018-12-03T00:00:00Z","timestamp":1543795200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2019,3]]},"DOI":"10.1007\/s10772-018-09583-5","type":"journal-article","created":{"date-parts":[[2018,12,3]],"date-time":"2018-12-03T07:22:00Z","timestamp":1543821720000},"page":"79-91","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Speech synthesis for glottal activity region processing"],"prefix":"10.1007","volume":"22","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3438-567X","authenticated-orcid":false,"given":"Nagaraj","family":"Adiga","sequence":"first","affiliation":[]},{"given":"S. R. M","family":"Prasanna","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,12,3]]},"reference":[{"key":"9583_CR1","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1016\/j.dsp.2017.09.007","volume":"71","author":"N Adiga","year":"2017","unstructured":"Adiga, N., Khonglah, B. K., & Prasanna, S. M. (2017). Improved voicing decision using glottal activity features for statistical parametric speech synthesis. Digital Signal Processing, 71, 131\u2013143.","journal-title":"Digital Signal Processing"},{"issue":"11","key":"9583_CR2","doi-asserted-by":"publisher","first-page":"2107","DOI":"10.1109\/LSP.2015.2461008","volume":"22","author":"N Adiga","year":"2015","unstructured":"Adiga, N., & Prasanna, S. R. M. (2015). Detection of glottal activity using different attributes of source information. The IEEE Signal Processing Letters, 22(11), 2107\u20132111.","journal-title":"The IEEE Signal Processing Letters"},{"key":"9583_CR3","doi-asserted-by":"publisher","unstructured":"Adiga, N. & Prasanna, S. R. M. (2018). Acoustic features modelling for statistical parametric speech synthesis: A review. IETE Technical Review. https:\/\/doi.org\/10.1080\/02564602.2018.1432422","DOI":"10.1080\/02564602.2018.1432422"},{"key":"9583_CR4","unstructured":"Airaksinen, M., Bollepalli, B., Juvela, L., Wu, Z., King, S. & Alku, P. (2016). Glottdnna full-band glottal vocoder for statistical parametric speech synthesis. In Proc. Interspeech."},{"issue":"2","key":"9583_CR5","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/0167-6393(92)90005-R","volume":"1","author":"P Alku","year":"1992","unstructured":"Alku, P. (1992). Glottal wave analysis with pitch synchronous iterative adaptive inverse filtering. Speech Communication, 1(2), 109\u2013118.","journal-title":"Speech Communication"},{"key":"9583_CR6","unstructured":"Ananthapadmanabha, T. V. (1984). Acoustic analysis of voice source dynamics. STL-QPSR 23. Speech, Music and Hearing, Royal Institute of Technology, Stockholm: Tech. Rep."},{"key":"9583_CR8","doi-asserted-by":"crossref","unstructured":"Aragonda, H. & Seelamantula, C. (2013) Riesz-transform-based demodulation of narrowband spectrograms of voiced speech. In Proc. IEEE Int. Conf. Acoust. Speech Signal Process., May (pp. 8203\u20138207).","DOI":"10.1109\/ICASSP.2013.6639264"},{"issue":"11","key":"9583_CR7","doi-asserted-by":"publisher","first-page":"1824","DOI":"10.1109\/TASLP.2015.2449088","volume":"23","author":"H Aragonda","year":"2015","unstructured":"Aragonda, H., & Seelamantula, C. (2015). Demodulation of narrowband speech spectrograms using the Riesz transform. The IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 23(11), 1824\u20131834.","journal-title":"The IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"9583_CR9","unstructured":"Arik, S. O., Chrzanowski, M., Coates, A., Diamos, G., Gibiansky, A., Kang, Y., Li, X., Miller, J., Raiman, J. & Sengupta, S. et\u00a0al. (2017). Deep Voice: Real-time neural text-to-speech. arXiv:1702.07825 ."},{"issue":"2","key":"9583_CR11","doi-asserted-by":"publisher","first-page":"239","DOI":"10.1016\/0165-1684(93)E0019-H","volume":"41","author":"C-Y Chi","year":"1995","unstructured":"Chi, C.-Y., & Kung, J.-Y. (1995). A new identification algorithm for allpass systems by higher-order statistics. Signal Processing, 41(2), 239\u2013256.","journal-title":"Signal Processing"},{"issue":"4","key":"9583_CR10","doi-asserted-by":"publisher","first-page":"1917","DOI":"10.1121\/1.1458024","volume":"111","author":"A Cheveign\u00e9 De","year":"2002","unstructured":"De Cheveign\u00e9, A., & Kawahara, H. (2002). YIN, a fundamental frequency estimator for speech and music. The Journal of the Acoustical Society of America, 111(4), 1917\u20131930.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"9583_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13636-014-0038-1","volume":"1","author":"G Degottex","year":"2014","unstructured":"Degottex, G., & Erro, D. (2014). A uniform phase representation for the harmonic model in speech synthesis applications. EURASIP Journal on Audio Speech Music Process, 1, 1\u201316. https:\/\/doi.org\/10.1186\/s13636-014-0038-1 .","journal-title":"EURASIP Journal on Audio Speech Music Process"},{"key":"9583_CR13","unstructured":"Eleftherios, B., Daniel, E., Antonio, B., & Asuncion, M. (2008). Flexible harmonic\/stochastic modeling for HMM-based speech synthesis. V Jornadas en Tecnologa del Habla."},{"issue":"2","key":"9583_CR14","doi-asserted-by":"publisher","first-page":"184","DOI":"10.1109\/JSTSP.2013.2283471","volume":"8","author":"D Erro","year":"2014","unstructured":"Erro, D., Sainz, I., Navas, E., & Hernaez, I. (2014). Harmonics plus noise model based vocoder for statistical parametric speech synthesis. IEEE Journal of Selected Topics in Signal Process, 8(2), 184\u2013194.","journal-title":"IEEE Journal of Selected Topics in Signal Process"},{"key":"9583_CR15","unstructured":"Fisher, W. M., Doddington, G. R. & Goudie-Marshall, K. M. (1986). The DARPA speech recognition research database: Specifications and status. In Proc. DARPA workshop on speech recognition (pp. 93\u201399)."},{"key":"9583_CR16","volume-title":"Speech analysis, synthesis and perception","author":"J\u00a0L Flanagan","year":"2013","unstructured":"Flanagan, J\u00a0. L. (2013). Speech analysis, synthesis and perception (Vol. 3). New York: Springer."},{"key":"9583_CR17","first-page":"137","volume":"1","author":"T Fukada","year":"1992","unstructured":"Fukada, T., Tokuda, K., Kobayashi, T., & Imai, S. (1992). An adaptive algorithm for mel-cepstral analysis of speech. Proceedings IEEE International Conference on Acoustics, Speech, and Signal Processing, 1, 137\u2013140.","journal-title":"Proceedings IEEE International Conference on Acoustics, Speech, and Signal Processing"},{"key":"9583_CR19","unstructured":"Hemptinne, C. (2006). Integration of the harmonic plus noise model (HNM) into the Hidden Markov Model-Based speech synthesis system (HTS). Master\u2019s thesis, Idiap Research Institute."},{"issue":"1","key":"9583_CR20","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1121\/1.396427","volume":"83","author":"DJ Hermes","year":"1988","unstructured":"Hermes, D. J. (1988). Measurement of pitch by subharmonic summation. The Journal of the Acoustical Society of America, 83(1), 257\u2013264.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"9583_CR18","unstructured":"HTS. http:\/\/hts.sp.nitech.ac.jp\/"},{"key":"9583_CR21","doi-asserted-by":"publisher","first-page":"373","DOI":"10.1109\/ICIP.1996.560836","volume":"1","author":"AJ Hunt","year":"1996","unstructured":"Hunt, A. J., & Black, A. W. (1996). Unit selection in a concatenative speech synthesis system using a large speech database. Proceedings IEEE International Conference on Acoustics, Speech, and Signal Processing, 1, 373\u2013376.","journal-title":"Proceedings IEEE International Conference on Acoustics, Speech, and Signal Processing"},{"key":"9583_CR23","unstructured":"Kawahara, H., Estill, J. & Osamu, F. (2001). Aperiodicity extraction and control using mixed mode excitation and group delay manipulation for a high quality speech analysis, modification and synthesis system straight. In Proc. MAVEBA (pp. 59\u201364)."},{"key":"9583_CR22","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/S0167-6393(98)00085-5","volume":"27","author":"H Kawahara","year":"1999","unstructured":"Kawahara, H., Masuda-Katsuse, I., & de Cheveign, A. (1999). Restructuring speech representations using a pitch-adaptive time frequency smoothing and an instantaneous-frequency-based F0 extraction. Speech Communication, 27, 187\u2013207.","journal-title":"Speech Communication"},{"issue":"5","key":"9583_CR24","doi-asserted-by":"publisher","first-page":"837","DOI":"10.1007\/s12046-011-0048-y","volume":"36","author":"S King","year":"2011","unstructured":"King, S. (2011). An introduction to statistical parametric speech synthesis. Sadhana, 36(5), 837\u2013852.","journal-title":"Sadhana"},{"issue":"4","key":"9583_CR25","doi-asserted-by":"publisher","first-page":"730","DOI":"10.1109\/TASSP.1986.1164909","volume":"34","author":"A Krishnamurthy","year":"1986","unstructured":"Krishnamurthy, A., & Childers, D. (1986). Two-channel speech analysis. IEEE Transactions on Acoustics, Speech, and Signal Processing, 34(4), 730\u2013743.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"issue":"8","key":"9583_CR26","doi-asserted-by":"publisher","first-page":"1862","DOI":"10.1364\/JOSAA.18.001862","volume":"18","author":"KG Larkin","year":"2001","unstructured":"Larkin, K. G., Bone, D. J., & Oldfield, M. A. (2001). Natural demodulation of two-dimensional fringe patterns. I. General background of the spiral phase quadrature transform. The Journal of the Optical Society of America A, 18(8), 1862\u20131870.","journal-title":"The Journal of the Optical Society of America A"},{"issue":"4","key":"9583_CR27","doi-asserted-by":"publisher","first-page":"561","DOI":"10.1109\/PROC.1975.9792","volume":"63","author":"J Makhoul","year":"1975","unstructured":"Makhoul, J. (1975). Linear prediction: A tutorial review. Proceedings of the IEEE, 63(4), 561\u2013580.","journal-title":"Proceedings of the IEEE"},{"issue":"4","key":"9583_CR28","doi-asserted-by":"publisher","first-page":"744","DOI":"10.1109\/TASSP.1986.1164910","volume":"34","author":"RJ McAulay","year":"1986","unstructured":"McAulay, R. J., & Quatieri, T. F. (1986). Speech analysis\/synthesis based on a sinusoidal representation. IEEE Transactions on Acoustics, Speech, and Signal Processing, 34(4), 744\u2013754.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"key":"9583_CR29","unstructured":"Mehri, S., Kumar, K., Gulrajani, I., Kumar, R., Jain, S., Sotelo, J., Courville, A. & Bengio, Y. (2016). SampleRNN: An unconditional end-to-end neural audio generation model. arXiv:1612.07837 ."},{"key":"9583_CR30","doi-asserted-by":"publisher","first-page":"1602","DOI":"10.1109\/TASL.2008.2004526","volume":"16","author":"KSR Murthy","year":"2008","unstructured":"Murthy, K. S. R., & Yegnanarayana, B. (2008). Epoch extraction from speech signals. IEEE Transactions on Audio Speech and Language Processing, 16, 1602\u20131613.","journal-title":"IEEE Transactions on Audio Speech and Language Processing"},{"issue":"6","key":"9583_CR31","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1109\/LSP.2009.2016829","volume":"16","author":"KSR Murthy","year":"2009","unstructured":"Murthy, K. S. R., Yegnanarayana, B., & Joseph, M. A. (2009). Characterization of glottal activity from speech signals. The IEEE Signal Processing Letters, 16(6), 469\u2013472.","journal-title":"The IEEE Signal Processing Letters"},{"issue":"3","key":"9583_CR32","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1109\/89.905996","volume":"9","author":"E Nemer","year":"2001","unstructured":"Nemer, E., Goubran, R., & Mahmoud, S. (2001). Robust voice activity detection using higher-order statistics in the LPC residual domain. IEEE Transactions on Speech and Audio Processing, 9(3), 217\u2013231.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"issue":"2","key":"9583_CR33","doi-asserted-by":"publisher","first-page":"458","DOI":"10.1121\/1.1911395","volume":"45","author":"AV Oppenheim","year":"1969","unstructured":"Oppenheim, A. V. (1969). Speech analysis-synthesis system based on homomorphic filtering. The Journal of the Acoustical Society of America, 45(2), 458\u2013465.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"9583_CR34","doi-asserted-by":"crossref","unstructured":"Pantazis, Y. & Stylianou, Y. (2008). Improving the modeling of the noise part in the harmonic plus noise model of speech. In Proc. IEEE Int. Conf. Acoust. Speech Signal Process, March (pp. 4609\u20134612).","DOI":"10.1109\/ICASSP.2008.4518683"},{"key":"9583_CR35","doi-asserted-by":"crossref","unstructured":"Patil, H. A., Patel, T. B., Shah, N. J., Sailor, H. B., Krishnan, R., Kasthuri, G., Nagarajan, T., Christina, L., Kumar, N. & Raghavendra V. et\u00a0al. (2013). A syllable-based framework for unit selection synthesis in 13 Indian languages. In Proc. Oriental COCOSDA (pp. 1\u20138). IEEE.","DOI":"10.1109\/ICSDA.2013.6709851"},{"issue":"12","key":"9583_CR36","first-page":"30","volume":"8","author":"F Plante","year":"1995","unstructured":"Plante, F., Meyer, G., & Ainsworth, W. (1995). A pitch extraction reference database. Children, 8(12), 30\u201350.","journal-title":"Children"},{"issue":"12","key":"9583_CR37","doi-asserted-by":"publisher","first-page":"2471","DOI":"10.1109\/TASL.2013.2273717","volume":"21","author":"A Prathosh","year":"2013","unstructured":"Prathosh, A., Ananthapadmanabha, T., & Ramakrishnan, A. (2013). Epoch extraction based on integrated linear prediction residual using plosion index. IEEE Transactions on Audio Speech and Language Processing, 21(12), 2471\u20132480.","journal-title":"IEEE Transactions on Audio Speech and Language Processing"},{"key":"9583_CR38","doi-asserted-by":"crossref","unstructured":"Quatieri, T. F. (2002). 2-D processing of speech with application to pitch estimation. In Proc. Interspeech.","DOI":"10.21437\/ICSLP.2002-518"},{"key":"9583_CR40","doi-asserted-by":"crossref","unstructured":"Raitio, T., Suni, A., Pulakka, H., Vainio, M. & Alku, P. (2011). Utilizing glottal source pulse library for generating improved excitation signal for HMM-based speech synthesis. In Proc. IEEE Int. Conf. Acoust. Speech Signal Process. (pp. 4564\u20134567).","DOI":"10.1109\/ICASSP.2011.5947370"},{"key":"9583_CR39","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1109\/TASL.2010.2045239","volume":"19\u20131","author":"T Raitio","year":"2011","unstructured":"Raitio, T., Suni, A., Yamagishi, J., Pulakka, H., Nurminen, J., Vainio, M., et al. (2011). HMM-based speech synthesis utilizing glottal inverse filtering. IEEE Transactions on Audio Speech and Language Processing, 19\u20131, 153\u2013165.","journal-title":"IEEE Transactions on Audio Speech and Language Processing"},{"issue":"10","key":"9583_CR41","doi-asserted-by":"publisher","first-page":"2118","DOI":"10.1364\/JOSAA.29.002118","volume":"29","author":"CS Seelamantula","year":"2012","unstructured":"Seelamantula, C. S., Pavillon, N., Depeursinge, C., & Unser, M. (2012). Local demodulation of holograms using the Riesz transform with application to microscopy. The Journal of the Optical Society of America A, 29(10), 2118\u20132129.","journal-title":"The Journal of the Optical Society of America A"},{"issue":"8","key":"9583_CR42","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1016\/S1364-6613(00)01704-6","volume":"5","author":"S Shamma","year":"2001","unstructured":"Shamma, S. (2001). On the role of space and time in auditory processing. Trends in Cognitive Sciences, 5(8), 340\u2013348.","journal-title":"Trends in Cognitive Sciences"},{"key":"9583_CR43","doi-asserted-by":"crossref","unstructured":"Sharma, B., Adiga, N. & Prasanna, S. M. (2015). Development of Assamese text-to-speech synthesis system. In Proc. TENCON (pp. 1\u20136). IEEE.","DOI":"10.1109\/TENCON.2015.7372786"},{"key":"9583_CR44","doi-asserted-by":"crossref","unstructured":"Sj\u00f6lander, K. & Beskow, J. (2000). Wavesurfer\u2014An open source speech tool. In Proc. Interspeech (pp. 464\u2013467).","DOI":"10.21437\/ICSLP.2000-849"},{"issue":"1","key":"9583_CR45","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1109\/89.890068","volume":"9","author":"Y Stylianou","year":"2001","unstructured":"Stylianou, Y. (2001). Applying the harmonic plus noise model in concatenative speech synthesis. IEEE Transactions on Speech and Audio Processing, 9(1), 21\u201329.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"9583_CR46","unstructured":"Stylianou, I. (1996). Harmonic plus noise models for speech, combined with statistical methods, for speech and speaker modification. Ph.D. dissertation, Ecole Nationale Sup\u00e9rieure des T\u00e9l\u00e9communications"},{"key":"9583_CR48","doi-asserted-by":"crossref","unstructured":"Tokuda, K., Kobayashi, T., Masuko, T. & Imai, S. (1994). Mel-generalized cepstral analysis-a unified approach to speech spectral estimation. In Proceedings of ICSLP.","DOI":"10.21437\/ICSLP.1994-275"},{"key":"9583_CR47","doi-asserted-by":"publisher","first-page":"1234","DOI":"10.1109\/JPROC.2013.2251852","volume":"101\u20135","author":"K Tokuda","year":"2013","unstructured":"Tokuda, K., Nankaku, Y., Toda, T., Zen, H., Yamagishi, J., & Oura, K. (2013). Speech synthesis based on hidden Markov models. Proceedings of the IEEE, 101\u20135, 1234\u20131252.","journal-title":"Proceedings of the IEEE"},{"key":"9583_CR54","unstructured":"van den oord, A., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., Kalchbrenner, N., Senior, A. & Kavukcuoglu, K. (2016). WaveNet: A generative model for raw audio. arXiv:1609.03499 ."},{"issue":"6","key":"9583_CR49","doi-asserted-by":"publisher","first-page":"1843","DOI":"10.1109\/TASL.2012.2188795","volume":"20","author":"T Wang","year":"2012","unstructured":"Wang, T., & Quatieri, T. (2012). Two-dimensional speech-signal modeling. IEEE Transactions on Audio Speech and Language Processing, 20(6), 1843\u20131856.","journal-title":"IEEE Transactions on Audio Speech and Language Processing"},{"key":"9583_CR50","unstructured":"Wang, Y., Skerry-Ryan, R., Stanton, D., Wu, Y., Weiss, R.\u00a0J., Jaitly, N., Yang, Z., Xiao, Y., Chen, Z., Bengio, S., Le, Q., Agiomyrgiannakis, Y., Clark, R. & Saurous, R. A. (2017). Tacotron: A fully end-to-end text-to-speech synthesis model. arXiv:1703.10135 ."},{"key":"9583_CR51","unstructured":"Wu, Z., Watts, O., & King, S. (2016). Merlin: An open source neural network speech synthesis system. In Proceedings of the speech synthesis workshop (SSW). Sunnyvale, USA: SSW."},{"key":"9583_CR52","doi-asserted-by":"crossref","unstructured":"Yoshimura, T., Tokuda, K., Masuko, T., Kobayashi, T. & Kitamura, T. (1999). Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis. In Proceedings of Eurospeech.","DOI":"10.21437\/Eurospeech.1999-513"},{"key":"9583_CR53","doi-asserted-by":"publisher","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","volume":"51\u201311","author":"H Zen","year":"2009","unstructured":"Zen, H., Tokuda, K., & Black, A. W. (2009). Statistical parametric speech synthesis. Speech Communication, 51\u201311, 1039\u20131064.","journal-title":"Speech Communication"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10772-018-09583-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-018-09583-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-018-09583-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,7]],"date-time":"2022-09-07T09:57:59Z","timestamp":1662544679000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10772-018-09583-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12,3]]},"references-count":54,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2019,3]]}},"alternative-id":["9583"],"URL":"https:\/\/doi.org\/10.1007\/s10772-018-09583-5","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2018,12,3]]},"assertion":[{"value":"12 April 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 November 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 December 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}