{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,8,7]],"date-time":"2024-08-07T12:31:20Z","timestamp":1723033880144},"reference-count":25,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"12","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2020,12,1]]},"DOI":"10.1587\/transinf.2020edp7075","type":"journal-article","created":{"date-parts":[[2020,11,30]],"date-time":"2020-11-30T22:19:44Z","timestamp":1606774784000},"page":"2673-2681","source":"Crossref","is-referenced-by-count":2,"title":["DNN-Based Full-Band Speech Synthesis Using GMM Approximation of Spectral Envelope"],"prefix":"10.1587","volume":"E103.D","author":[{"given":"Junya","family":"KOGUCHI","sequence":"first","affiliation":[{"name":"Meiji University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinnosuke","family":"TAKAMICHI","sequence":"additional","affiliation":[{"name":"The University of Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Masanori","family":"MORISE","sequence":"additional","affiliation":[{"name":"Meiji University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hiroshi","family":"SARUWATARI","sequence":"additional","affiliation":[{"name":"The University of Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shigeki","family":"SAGAYAMA","sequence":"additional","affiliation":[{"name":"The University of Electro-Communications"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","unstructured":"[1] Y. Sagisaka, K. Takeda, M. Abe, S. Katagiri, T. Umeda, and H. Kuawhara, \u201cA large-scale Japanese speech database,\u201d ICSLP90, Kobe, Japan, pp.1089-1092, Nov. 1990."},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] Y. Wang, R.J.S.-Ryan, D. Stanton, Y. Wu, R.J. Weiss, N. Jaitly, Z. Yang, Y. Xiao, Z. Chen, S. Bengio, Q. Le, Y. Agiomyrgiannakis, R. Clark, and R.A. Saurous, \u201cTacotron: Towards end-to-end speech synthesis,\u201d Proc. INTERSPEECH, Stockholm, Sweden, pp.4006-4010, Aug. 2017. 10.21437\/interspeech.2017-1452","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"3","doi-asserted-by":"publisher","unstructured":"[3] H. Zen, K. Tokuda, and A. Black, \u201cStatistical parametric speech synthesis,\u201d Speech Communication, vol.51, no.11, pp.1039-1064, 2009. 10.1016\/j.specom.2009.04.004","DOI":"10.1016\/j.specom.2009.04.004"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] K. Tokuda, T. Kobayashi, T. Masuko, and S. Imai, \u201cMel-generalized cepstral analysis-a unified approach to speech spectral estimation,\u201d Proc. ICSLP, Yokohama, Japan, pp.410-415, Sept. 1994.","DOI":"10.21437\/ICSLP.1994-275"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] P. Zolfaghari and T. Robinson, \u201cFormant analysis using mixtures of gaussians,\u201d Proc. ICSLP, vol.2, pp.1229-1232, 1996. 10.1109\/icslp.1996.607830","DOI":"10.21437\/ICSLP.1996-317"},{"key":"6","unstructured":"[6] P. Boersma and D. Weenink, \u201cPraat: doing phonetics by computer (version 6.1.16) [computer program]. retrieved 10 july 2020,\u201d 2020."},{"key":"7","doi-asserted-by":"publisher","unstructured":"[7] F. Itakura, \u201cLine spectrum representation of linear predictor coefficients of speech signals,\u201d The Journal of the Acoustical Society of America, vol.57, no.S1, p.S35, 1975. 10.1121\/1.1995189","DOI":"10.1121\/1.1995189"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] Y. Ohtani, M. Tamura, M. Morita, T. Kagoshima, and M. Akamine, \u201cHistogram-based spectral equalization for HMM-based speech synthesis using mel-LSP,\u201d Proc. INTERSPEECH, Portland, U.S.A., Sept. 2012.","DOI":"10.21437\/Interspeech.2012-362"},{"key":"9","doi-asserted-by":"publisher","unstructured":"[9] B.P. Nguyen and M. Akagi, \u201cA flexible spectral modification method based on temporal decomposition and gaussian mixture model,\u201d Acoustical Science and Technology, vol.30, no.3, pp.170-179, 2009. 10.1250\/ast.30.170","DOI":"10.1250\/ast.30.170"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] H. Zen, A. Senior, and M. Schuster, \u201cStatistical parametric speech synthesis using deep neural networks,\u201d Proc. ICASSP, Vancouver, Canada, May 2013. 10.1109\/icassp.2013.6639215","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"11","doi-asserted-by":"publisher","unstructured":"[11] M. Morise, F. Yokomori, and K. Ozawa, \u201cWORLD: a vocoder-based high-quality speech synthesis system for real-time applications,\u201d IEICE Trans. Inf. &amp; Syst., vol.E99-D, no.7, pp.1877-1884, July 2016. 10.1587\/transinf.2015edp7457","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"12","doi-asserted-by":"publisher","unstructured":"[12] M. Morise, \u201cD4C, a band-aperiodicity estimator for high-quality speech synthesis,\u201d Speech Communication, vol.84, pp.57-65, 2016. 10.1016\/j.specom.2016.09.001","DOI":"10.1016\/j.specom.2016.09.001"},{"key":"13","unstructured":"[13] N. Hojo, K. Minami, D. Saito, H. Kameoka, and S. Sagayama, \u201cHMM speech synthesis using speech analysis based on composite wavelet model,\u201d Proc. ASJ Autumn Meeting, vol.2012, pp.2-2-7, 2012, in Japanese."},{"key":"14","unstructured":"[14] F. Itakura and S. Saito, \u201cAnalysis synthesis telephony based on the maximum likelihood method,\u201d Journal of the Royal Statistical Society B, vol.39, pp.185-197, 1968."},{"key":"15","unstructured":"[15] S. Sagayama and F. Itakura, \u201cTheorical study in CSM, LSP and their properties,\u201d Transaction of the Comittee on Speech Research, The Acoustical Society of Japan, vol.S82-14, pp.105-112, 1982, in Japanese."},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] H. Zen and A. Senior, \u201cDeep mixture density networks for acoustic modeling in statistical parametric speech synthesis,\u201d Proc. ICASSP, Florence, Italy, pp.3872-3876, May 2014. 10.1109\/icassp.2014.6854321","DOI":"10.1109\/ICASSP.2014.6854321"},{"key":"17","doi-asserted-by":"publisher","unstructured":"[17] T. Yoshimura, K. Tokuda, T. Masuko, T. Kobayashi, and T.Kitamura, \u201cIncorporating a mixed excitation model and postfilter into hmm-based text-to-speech synthesis,\u201d Systems and Computers in Japan, vol.36, no.12, pp.43-50, 11 2005. 10.1002\/scj.20354","DOI":"10.1002\/scj.20354"},{"key":"18","unstructured":"[18] \u201cLancers,\u201d https:\/\/www.lancers.jp\/."},{"key":"19","unstructured":"[19] S. Takamichi, K. Mitsui, Y. Saito, T. Koriyama, N. Tanji, and H. Saruwatari, \u201cJVS corpus: free Japanese multi-speaker voice corpus,\u201d arXiv preprint 1908.06248, Aug. 2019."},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] J. Koguchi and S. Sagayama, \u201cComposite wavelet model for stability-oriented speech synthesis from cepstral features,\u201d Proc.APSIPA ASC, pp.1697-1701, Nov. 2018. 10.23919\/apsipa.2018.8659717","DOI":"10.23919\/APSIPA.2018.8659717"},{"key":"21","unstructured":"[21] MathWorks, \u201cMATLAB findpeaks,\u201d https:\/\/jp.mathworks.com\/help\/signal\/ref\/findpeaks.html."},{"key":"22","unstructured":"[22] Scipy, \u201cScipy find_peaks,\u201d https:\/\/docs.scipy.org\/doc\/scipy\/reference\/generated\/scipy.signal.find_peaks.html."},{"key":"23","unstructured":"[23] R. Sonobe, S. Takamichi, and H. Saruwatari, \u201cJSUT corpus: free large-scale Japanese speech corpus for end-to-end speech synthesis,\u201d abs\/1711.00354, Nov. 2017."},{"key":"24","unstructured":"[24] X. Glorot, A. Bordes, and Y. Bengio, \u201cDeep sparse rectifier neural networks,\u201d Proc. AISTATS, Lauderdale, U.S.A., pp.315-323, April 2011."},{"key":"25","unstructured":"[25] Y. Ohtani, T. Toda, H. Saruwatari, and K. Shikano, \u201cMaximum likelihood voice conversion based on GMM with STRAIGHT mixed excitation,\u201d Proc. INTERSPEECH, Pittsburgh, U.S.A., pp.2266-2269, Sept. 2006."}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E103.D\/12\/E103.D_2020EDP7075\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T12:57:51Z","timestamp":1669899471000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E103.D\/12\/E103.D_2020EDP7075\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,12,1]]},"references-count":25,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2020]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2020edp7075","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,12,1]]}}}