{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:27:37Z","timestamp":1775230057192,"version":"3.50.1"},"reference-count":38,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"7","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2016]]},"DOI":"10.1587\/transinf.2015edp7457","type":"journal-article","created":{"date-parts":[[2016,6,30]],"date-time":"2016-06-30T23:09:13Z","timestamp":1467328153000},"page":"1877-1884","source":"Crossref","is-referenced-by-count":711,"title":["WORLD: A Vocoder-Based High-Quality Speech Synthesis System for Real-Time Applications"],"prefix":"10.1587","volume":"E99.D","author":[{"given":"Masanori","family":"MORISE","sequence":"first","affiliation":[{"name":"Interdisciplinary Graduate School, University of Yamanashi"}]},{"given":"Fumiya","family":"YOKOMORI","sequence":"additional","affiliation":[{"name":"Graduate School of Medicine and Engineering Science Department of Education, University of Yamanashi"}]},{"given":"Kenji","family":"OZAWA","sequence":"additional","affiliation":[{"name":"Interdisciplinary Graduate School, University of Yamanashi"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] H. Kenmochi, \u201cSinging synthesis as a new musical instrument,\u201d in Proc. ICASSP2012, pp.5385-5388, 2012.","DOI":"10.1109\/ICASSP.2012.6289138"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] Y. Ohtani, T. Toda, H. Saruwatari, and K. Shikano, \u201cImprovements of the one-to-many eigenvoice conversion system,\u201d IEICE Trans. on Information and Systems, vol.E93-D, no.9, pp.2491-2499, 2010.","DOI":"10.1587\/transinf.E93.D.2491"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] H. Dudley, \u201cRemaking speech,\u201d J. Acoust. Soc. Am., vol.11, no.2, pp.169-177, 1939.","DOI":"10.1121\/1.1916020"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] A.W. Black and N. Campbell, \u201cOptimising selection of units from speech databases for concatenative synthesis,\u201d in Proc. EUROSPEECH95, vol.1, pp.581-584, 1995.","DOI":"10.21437\/Eurospeech.1995-148"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] H. Kawahara, I. Masuda-Katsuse, and A. Cheveign\u00e9, \u201cRestructuring speech representations using a pitch-adaptive time-frequency smoothing and an instantaneous-frequency-based f0 extraction,\u201d Speech Communication, vol.27, no.3-4, pp.187-207, 1999.","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] J.L. Flanagan and R.M. Golden, \u201cPhase vocoder,\u201d Bell System Technical Journal, vol.45, no.9, pp.1493-1509, 1966.","DOI":"10.1002\/j.1538-7305.1966.tb01706.x"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] E. Moulines and F. Charpentier, \u201cPitch-synchronous waveform processing techniques for text-to-speech synthesis using diphones,\u201d Speech Communication, vol.9, no.5-6, pp.453-467, 1990.","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] R. McAulay and T.F. Quatieri, \u201cSpeech analysis\/synthesis based on a sinusoidal representation,\u201d IEEE Trans. on Acoustics, Speech and Signal Processing, vol.34, no.4, pp.744-754, 1986.","DOI":"10.1109\/TASSP.1986.1164910"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] K. Nakano, M. Morise, and T. Nishiura, \u201cVocal manipulation based on pitch transcription and its application to interactive entertainment for karaoke,\u201d Lecture Notes in Computer Science, vol.LNCS 6851, pp.52-60, 2011.","DOI":"10.1007\/978-3-642-22950-3_6"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] H. Banno, H. Hata, M. Morise, T. Takahashi, T. Irino, and H. Kawahara, \u201cImplementation of realtime straight speech manipulation system,\u201d Acoust. Sci. &amp; Tech., vol.28, no.3, pp.140-146, 2007.","DOI":"10.1250\/ast.28.140"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] M. Morise, M. Onishi, H. Kawahara, and H. Katayose, \u201cv.morish&apos;09: A morphing-based singing design interface for vocal melodies,\u201d Lecture Notes in Computer Science, vol.LNCS 5709, pp.185-190, 2009.","DOI":"10.1007\/978-3-642-04052-8_18"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] H. Kawahara, M. Morise, T. Takahashi, R. Nisimura, T. Irino, and H. Banno, \u201cTandem-straight: A temporally stable power spectral representation for periodic signals and applications to interference-free spectrum, f0, and aperiodicity estimation,\u201d in Proc. ICASSP 2008, pp.3933-3936, 2008.","DOI":"10.1109\/ICASSP.2008.4518514"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] H. Kawahara and M. Morise, \u201cTechnical foundations of tandem-straight, a speech analysis, modification and synthesis framework,\u201d SADHANA \u2014 Academy Proceedings in Engineering Sciences, vol.36, no.5, pp.713-727, 2011.","DOI":"10.1007\/s12046-011-0043-3"},{"key":"14","unstructured":"[14] M. Morise, H. Kawahara, and H. Katayose, \u201cFast and reliable f0 estimation method based on the period extraction of vocal fold vibration of singing voice and speech,\u201d in Proc. AES 35th International Conference, CD-ROM Proceedings, 2009."},{"key":"15","unstructured":"[15] M. Morise, H. Kawahara, and T. Nishiura, \u201cRapid f0 estimation for high-snr speech based on fundamental component extraction,\u201d IEICE Trans. Inf. &amp; Syst. (Japanese Edition), vol.J93-D, no.2, pp.109-117, 2010."},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] M. Morise, \u201cCheaptrick, a spectral envelope estimator for high-quality speech synthesis,\u201d Speech Communication, vol.67, pp.1-7, 2015.","DOI":"10.1016\/j.specom.2014.09.003"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] M. Morise, \u201cPlatinum: A method to extract excitation signals for voice synthesis system,\u201d Acoust. Sci. &amp; Tech., vol.33, no.2, pp.123-125, 2012.","DOI":"10.1250\/ast.33.123"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] M. Morise, \u201cError evaluation of an f0-adaptive spectral envelope estimator in robustness against the additive noise and f0 error,\u201d IEICE Trans. on Information Systems, vol.E98-D, no.7, pp.1405-1408, 2015.","DOI":"10.1587\/transinf.2015EDL8015"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] W. Hess, Pitch determination of speech signals, Springer-Verlag, 1983.","DOI":"10.1007\/978-3-642-81926-1"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] A.M. Noll, \u201cShort-time spectrum and \u201ccepstrum\u201d techniques for vocal pitch detection,\u201d J. Acoust. Soc. Am., vol.36, no.2, pp.269-302, 1964.","DOI":"10.1121\/1.1918949"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] A. Cheveign\u00e9 and H. Kawahara, \u201cYin, a fundamental frequency estimator for speech and music,\u201d J. Acoust. Soc. Am., vol.111, no.4, pp.1917-1930, 2002.","DOI":"10.1121\/1.1458024"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] A. Camacho and J.G. Harris, \u201cA sawtooth waveform inspired pitch estimator for speech and music,\u201d J. Acoust. Soc. Am., vol.124, no.3, pp.1638-1652, 2008.","DOI":"10.1121\/1.2951592"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] B.S. Atal and S.L. Hanauer, \u201cSpeech analysis and synthesis by linear prediction of the speech wave,\u201d J. Acoust. Soc. Am., vol.50, no.2B, pp.637-655, 1971.","DOI":"10.1121\/1.1912679"},{"key":"24","unstructured":"[24] T. Nakano and M. Goto, \u201cA spectral envelope estimation method based on f0-adaptive multi-frame integration analysis,\u201d in Proc. SAPA-SCALE 2012, pp.11-16, 2012."},{"key":"25","unstructured":"[25] M. Morise, T. Matsubara, K. Nakano, and T. Nishiura, \u201cA rapid spectrum envelope estimation technique of vowel for high-quality speech synthesis,\u201d IEICE Trans. Inf. &amp; Syst. (Japanese Edition), vol.J94-D, no.7, pp.1079-1987, 2011."},{"key":"26","doi-asserted-by":"crossref","unstructured":"[26] M.V. Mathews, J.E. Miller, and E.E. David, \u201cPitch synchronous analysis of voiced sounds,\u201d J. Acoust. Soc. Am., vol.33, no.2, pp.179-185, 1961.","DOI":"10.1121\/1.1908614"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] A.V. McCree and T.P. Barnwell, \u201cA mixed excitation lpc vocoder model for low bit rate speech coding,\u201d IEEE Transactions on Speech Audio Processing, vol.3, no.4, pp.242-250, 1995.","DOI":"10.1109\/89.397089"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] D.W. Griffin and J.S. Lim, \u201cA new model-based speech analysis\/synthesis,\u201d in Proc. ICASSP1985, vol.10, pp.513-516, 1985.","DOI":"10.1109\/ICASSP.1985.1168385"},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] H. Kawahara and M. Morise, \u201cSimplified aperiodicity representation for high-quality speech manipulation systems,\u201d in Proc. ICSP2012, pp.579-584, 2012.","DOI":"10.1109\/ICoSP.2012.6491555"},{"key":"30","unstructured":"[30] H. Kawahara, M. Morise, T. Toda, H. Banno, R. Nisimura, and T. Irino, \u201cExcitation source analysis for high-quality speech manipulation systems based on an interference-free representation of group delay with minimum phase response compensation,\u201d in Proc. Interspeech2014, pp.2243-2247, 2014."},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] H. Kawahara, J. Estill, and O. Fujimura, \u201cSpeech representation and transformation using adaptive interpolation of weighted spectrum: vocoder revisited,\u201d in Proc. ICASSP1997, pp.1303-1306, 1997.","DOI":"10.1109\/ICASSP.1997.596185"},{"key":"32","unstructured":"[32] ITU-R Recommendation BS.1534-1, \u201cMethod for the subjective assessment of intermediate quality level of coding systems,\u201d 2003."},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] H. Kawahara, A. Cheveign\u00e9, H. Banno, T. Takahashi, and T. Irino, \u201cNearly defect-free f0 trajectory extraction for expressive speech modifications based on straight,\u201d in Proc. Interspeech2005, pp.537-540, 2005.","DOI":"10.21437\/Interspeech.2005-335"},{"key":"34","doi-asserted-by":"crossref","unstructured":"[34] R. Plomp and H.J. Steeneken, \u201cEffect of phase on the timbre of complex tones,\u201d J. Acoust. Soc. Am., vol.46, no.2, pp.409-421, 1969.","DOI":"10.1121\/1.1911705"},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] R. Maia, M. Akamine, and M.J.F. Gales, \u201cComplex cepstrum as phase information in statistical parametric speech synthesis,\u201d in Proc. ICASSP2012, pp.4581-4584, 2012.","DOI":"10.1109\/ICASSP.2012.6288938"},{"key":"36","doi-asserted-by":"crossref","unstructured":"[36] Y. Agiomyrgiannakis, \u201cVocaine the vocoder and applications in speech synthesis,\u201d in Proc. ICASSP2015, pp.4230-4234, 2015.","DOI":"10.1109\/ICASSP.2015.7178768"},{"key":"37","doi-asserted-by":"crossref","unstructured":"[37] H. Kawahara, M. Morise, H. Banno, and V.G. Skuk, \u201cTemporally variable multi-aspect n-way morphing based on interference-free speech representations,\u201d in Proc. APSIPA ASC 2013, pp.1-10, 2013.","DOI":"10.1109\/APSIPA.2013.6694355"},{"key":"38","doi-asserted-by":"crossref","unstructured":"[38] H. Zen, K. Tokuda, and A.W. Black, \u201cStatistical parametric speech synthesis,\u201d Speech Communication, vol.51, no.11, pp.1039-1064, 2009.","DOI":"10.1016\/j.specom.2009.04.004"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E99.D\/7\/E99.D_2015EDP7457\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,19]],"date-time":"2023-08-19T02:43:58Z","timestamp":1692413038000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E99.D\/7\/E99.D_2015EDP7457\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"references-count":38,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2016]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2015edp7457","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]}}}