{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,10,23]],"date-time":"2023-10-23T23:19:35Z","timestamp":1698103175826},"reference-count":41,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"7","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2021,7,1]]},"DOI":"10.1587\/transinf.2020edp7252","type":"journal-article","created":{"date-parts":[[2021,6,30]],"date-time":"2021-06-30T22:21:02Z","timestamp":1625091662000},"page":"1002-1016","source":"Crossref","is-referenced-by-count":2,"title":["Real-Time Full-Band Voice Conversion with Sub-Band Modeling and Data-Driven Phase Estimation of Spectral Differentials"],"prefix":"10.1587","volume":"E104.D","author":[{"given":"Takaaki","family":"SAEKI","sequence":"first","affiliation":[{"name":"University of Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuki","family":"SAITO","sequence":"additional","affiliation":[{"name":"University of Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinnosuke","family":"TAKAMICHI","sequence":"additional","affiliation":[{"name":"University of Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hiroshi","family":"SARUWATARI","sequence":"additional","affiliation":[{"name":"University of Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","unstructured":"[1] M. Abe, S. Nakamura, K. Shikano, and H. Kuwabara, \u201cVoice conversion through vector quantization,\u201d Proc. ICASSP, New York, U.S.A., pp.655-658, April 1988. 10.1109\/icassp.1988.196671"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] T. Toda, \u201cAugmented speech production based on real-time statistical voice conversion,\u201d Proc. GlobalSIP, Atlanta, U.S.A., pp.592-596, Dec. 2014. 10.1109\/globalsip.2014.7032186","DOI":"10.1109\/GlobalSIP.2014.7032186"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] Y. Stylianou, O. Cappe, and E. Moulines, \u201cContinuous probabilistic transform for voice conversion,\u201d IEEE Transactions on Speech and Audio Processing, vol.6, no.2, pp.131-142, 1998. 10.1109\/89.661472","DOI":"10.1109\/89.661472"},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] T. Toda, A.W. Black, and K. Tokuda, \u201cVoice conversion based on maximum-likelihood estimation of spectral parameter trajectory,\u201d IEEE Transactions on Audio, Speech, and Language Processing, vol.15, no.8, pp.2222-2235, 2007. 10.1109\/tasl.2007.907344","DOI":"10.1109\/TASL.2007.907344"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] S. Desai, E.V. Raghavendra, B. Yegnanarayana, A.W. Black, and K. Prahallad, \u201cVoice conversion using artificial neural networks,\u201d Proc. ICASSP, Taipei, Taiwan, pp.3893-3896, April 2009. 10.1109\/icassp.2009.4960478","DOI":"10.1109\/ICASSP.2009.4960478"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] L. Sun, S. Kang, K. Li, and H. Meng, \u201cVoice conversion using deep bidirectional long short-term memory based recurrent neural networks,\u201d Proc. ICASSP, Brisbane, Australia, pp.4869-4873, April 2015. 10.1109\/icassp.2015.7178896","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] T. Toda, T. Muramatsu, and H. Banno, \u201cImplementation of computationally efficient real-time voice conversion,\u201d Proc. INTERSPEECH, Portland, U.S.A., pp.94-97, Sept. 2012.","DOI":"10.21437\/Interspeech.2012-34"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] R. Arakawa, S. Takamichi, and H. Saruwatari, \u201cImplementation of DNN-based real-time voice conversion and its improvements by audio data augmentation and mask-shaped device,\u201d Proc. SSW10, Vienna, Austria, pp.93-98, Sept. 2019. 10.21437\/ssw.2019-17","DOI":"10.21437\/SSW.2019-17"},{"key":"9","doi-asserted-by":"publisher","unstructured":"[9] K. Kobayashi, T. Toda, and S. Nakamura, \u201cIntra-gender statistical singing voice conversion with direct waveform modification using log-spectral differential,\u201d Speech Communication, vol.99, pp.211-220, 2018. 10.1016\/j.specom.2018.03.011","DOI":"10.1016\/j.specom.2018.03.011"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] A. Tamamori, T. Hayashi, K. Kobayashi, K. Takeda, and T. Toda, \u201cSpeaker-dependent WaveNet vocoder,\u201d Proc. INTERSPEECH, Stockholm, Sweden, pp.1118-1122, Aug. 2017. 10.21437\/interspeech.2017-314","DOI":"10.21437\/Interspeech.2017-314"},{"key":"11","unstructured":"[11] N. Kalchbrenner, E. Elsen, K. Simonyan, S. Noury, N. Casagrande, E. Lockhart, F. Stimberg, A.v.d. Oord, S. Dieleman, and K. Kavukcuoglu, \u201cEfficient neural audio synthesis,\u201d arXiv, vol.abs\/ 1609.03499, 2018."},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] X. Wang, S. Takaki, and J. Yamagishi, \u201cNeural source-filter-based waveform model for statistical parametric speech synthesis,\u201d Proc. ICASSP, Calgary, Canada, pp.5916-5920, April 2018. 10.1109\/icassp.2019.8682298","DOI":"10.1109\/ICASSP.2019.8682298"},{"key":"13","doi-asserted-by":"publisher","unstructured":"[13] S. Imai, K. Sumita, and C. Furuichi, \u201cMel log spectrum approximation (MLSA) filter for speech synthesis,\u201d Electronics and Communications in Japan, vol.66, no.2, pp.10-18, 1983. 10.1002\/ecja.4400660203","DOI":"10.1002\/ecja.4400660203"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] H. Suda, G. Kotani, S. Takamichi, and D. Saito, \u201cA revisit to feature handling for high-quality voice conversion,\u201d Proc. APSIPA ASC, Hawaii, U.S.A., pp.816-822, Nov. 2018. 10.23919\/apsipa.2018.8659658","DOI":"10.23919\/APSIPA.2018.8659658"},{"key":"15","doi-asserted-by":"publisher","unstructured":"[15] S. Takamichi, Y. Saito, N. Takamune, D. Kitamura, and H. Saruwatari, \u201cPhase reconstruction from amplitude spectrograms based on directional-statistics deep neural networks,\u201d Signal Processing, vol.169, p.107368, 2020. 10.1016\/j.sigpro.2019.107368","DOI":"10.1016\/j.sigpro.2019.107368"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] R. Crochiere and L. Rabiner, Multirate digital signal processing, Englewood Cliffs, N.J.: Prentice-Hall, 1983.","DOI":"10.1016\/0165-1684(83)90013-0"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] T. Okamoto, K. Tachibana, T. Toda, Y. Shiga, and H. Kawai, \u201cSubband WaveNet with overlapped single-sideband filterbanks,\u201d Proc. ASRU, Okinawa, Japan, pp.698-704, Dec. 2017. 10.1109\/asru.2017.8269005","DOI":"10.1109\/ASRU.2017.8269005"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] T. Saeki, Y. Saito, S. Takamichi, and H. Saruwatari, \u201cLifter training and sub-band modeling for computationally efficient and high-quality voice conversion using spectral differentials,\u201d Proc. ICASSP, Barcelona, Spain, pp.7784-7788, May 2020. 10.1109\/icassp40776.2020.9054490","DOI":"10.1109\/ICASSP40776.2020.9054490"},{"key":"19","unstructured":"[19] T. Saeki, Y. Saito, S. Takamichi, and H. Saruwatari, \u201cReal-time, full-band, online DNN-based voice conversion system using a single CPU,\u201d Proc. INTERSPEECH, Shanghai, China, pp.1021-1022, Oct. 2020."},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] T. Fukada, K. Tokuda, T. Kobayashi, and S. Imai, \u201cAn adaptive algorithm for mel-cepstral analysis of speech,\u201d Proc. ICASSP, San Francisco, U.S.A., pp.137-140, March 1992. 10.1109\/icassp.1992.225953","DOI":"10.1109\/ICASSP.1992.225953"},{"key":"21","doi-asserted-by":"publisher","unstructured":"[21] S.-C. Pei and H.-S. Lin, \u201cMinimum-phase FIR filter design using real cepstrum,\u201d IEEE Transactions on Circuits and Systems II: Express Briefs, vol.53, no.10, pp.1113-1117, 2006. 10.1109\/tcsii.2006.882193","DOI":"10.1109\/TCSII.2006.882193"},{"key":"22","doi-asserted-by":"publisher","unstructured":"[22] D.E. Rumelhart, G.E. Hinton, and R.J. Williams, \u201cLearning representations by back-propagating errors,\u201d Nature, vol.323, pp.533-536, 1986. 10.1038\/323533a0","DOI":"10.1038\/323533a0"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] P.S. Nidadavolu, C. Lai, J. Villalba, and N. Dehak, \u201cInvestigation on bandwidth extension for speaker recognition,\u201d Proc. INTERSPEECH, Hyderabad, India, pp.1111-1115, Sept. 2018. 10.21437\/interspeech.2018-2394","DOI":"10.21437\/Interspeech.2018-2394"},{"key":"24","doi-asserted-by":"crossref","unstructured":"[24] H. Miyoshi, Y. Saito, S. Takamichi, and H. Saruwatari, \u201cVoice conversion using sequence-to-sequence learning of context posterior probabilities,\u201d Proc. INTERSPEECH, Stockholm, Sweden, pp.1268-1272, Aug. 2017. 10.21437\/interspeech.2017-247","DOI":"10.21437\/Interspeech.2017-247"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] T. Kaneko, H. Kameoka, K. Hiramatsu, and K. Kashino, \u201cSequence-to-sequence voice conversion with similarity metric learned using generative adversarial networks,\u201d Proc. INTERSPEECH, Stockholm, Sweden, pp.1283-1287, Aug. 2017. 10.21437\/interspeech.2017-970","DOI":"10.21437\/Interspeech.2017-970"},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] H. Kameoka, K. Tanaka, D. Kwasny, T. Kaneko, and N. Hojo, \u201cConvs2s-vc: Fully convolutional sequence-to-sequence voice conversion,\u201d IEEE Transactions on Audio, Speech, and Language Processing, vol.28, pp.1849-1863, June 2020. 10.1109\/taslp.2020.3001456","DOI":"10.1109\/TASLP.2020.3001456"},{"key":"27","unstructured":"[27] N. Morita and F. Itakura, \u201cTime-scale modification algorithm for speech by use of autocorrelation method and its evaluation,\u201d IEICE Technical Report, vol.86, pp.9-16, May 1986."},{"key":"28","doi-asserted-by":"publisher","unstructured":"[28] M. Morise, F. Yokomori, and K. Ozawa, \u201cWORLD: a vocoder-based high-quality speech synthesis system for real-time applications,\u201d IEICE transactions on information and systems, vol.E99-D, no.7, pp.1877-1884, July 2016. 10.1587\/transinf.2015edp7457","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"29","doi-asserted-by":"publisher","unstructured":"[29] H. Kawahara, I. Masuda-Katsuse, and A.D. Cheveign\u00e9, \u201cRestructuring speech representations using a pitch-adaptive time-frequency smoothing and an instantaneous-frequency-based F0 extraction: Possible role of a repetitive structure in sounds,\u201d Speech Communication, vol.27, no.3-4, pp.187-207, 1999. 10.1016\/s0167-6393(98)00085-5","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"30","doi-asserted-by":"publisher","unstructured":"[30] Y. Saito, S. Takamichi, and H. Saruwatari, \u201cStatistical parametric speech synthesis incorporating generative adversarial networks,\u201d IEEE\/ACM Transactions on Audio, Speech, and Language Processing, vol.26, no.1, pp.84-96, Jan. 2018. 10.1109\/taslp.2017.2761547","DOI":"10.1109\/TASLP.2017.2761547"},{"key":"31","doi-asserted-by":"publisher","unstructured":"[31] S. Takamichi, R. Sonobe, K. Mitsui, Y. Saito, T. Koriyama, N. Tanji, and H. Saruwatari, \u201cJsut and jvs: free japanese voice corpora for accelerating speech synthesis research,\u201d Acoustical Science and Technology, vol.41, pp.761-768, 2020. 10.1250\/ast.41.761","DOI":"10.1250\/ast.41.761"},{"key":"32","unstructured":"[32] y_benjo and MagnesiumRibbon, \u201cVoice-actress corpus.\u201d http:\/\/voice-statistics.github.io\/"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] T. Akiba, S. Sano, T. Yanase, T. Ohta, and M. Koyama, \u201cOptuna: A next-generation hyperparameter optimization framework,\u201d Proc. KDD, Anchorage, U.S.A., pp.2623-2631, Aug. 2019. 10.1145\/3292500.3330701","DOI":"10.1145\/3292500.3330701"},{"key":"34","unstructured":"[34] Y.N. Dauphin, A. Fan, M. Auli, and D. Grangier, \u201cLanguage modeling with gated convolutional networks,\u201d Proc. ICML, Sydney Australia, pp.933-941, Aug. 2017."},{"key":"35","unstructured":"[35] S. Ioffe and C. Szegedy, \u201cBatch normalization: Accelerating deep network training by reducing internal covariate shift,\u201d Proc. ICML, Lille, France, pp.448-456, July 2015."},{"key":"36","unstructured":"[36] D. Kingma and B. Jimmy, \u201cAdam: A method for stochastic optimization,\u201d arXiv, vol.abs\/1412.6980, 2014."},{"key":"37","doi-asserted-by":"crossref","unstructured":"[37] J.-M. Valin and J. Skoglund, \u201cLpcnet: Improving neural speech synthesis through linear prediction,\u201d Proc. ICASSP, Brighton, U.K., pp.5891-5895, May 2019. 10.1109\/icassp.2019.8682804","DOI":"10.1109\/ICASSP.2019.8682804"},{"key":"38","doi-asserted-by":"crossref","unstructured":"[38] Y. He, T.N. Sainath, R. Prabhavalkar, I. McGraw, R. Alvarez, D. Zhao, D. Rybach, A. Kannan, Y. Wu, R. Pang, Q. Liang, D. Bhatia, Y. Shangguan, B. Li, G. Pundak, K.C. Sim, T. Bagby, S. Chang, K. Rao, and A. Gruenstein, \u201cStreaming end-to-end speech recognition for mobile devices,\u201d Proc. ICASSP, Brighton, United Kingdom, pp.6381-6385, May 2019. 10.1109\/icassp.2019.8682336","DOI":"10.1109\/ICASSP.2019.8682336"},{"key":"39","doi-asserted-by":"crossref","unstructured":"[39] T. Kaneko and H. Kameoka, \u201cCycleGAN-VC: Non-parallel voice conversion using cycle-consistent adversarial networks,\u201d Proc. EUSIPCO, Rome, Italy, pp.2100-2104, Sept. 2018. 10.23919\/eusipco.2018.8553236","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"40","doi-asserted-by":"crossref","unstructured":"[40] T. Kaneko, H. Kameoka, K. Tanaka, and N. Hojo, \u201cCycleGAN-VC2: Improved CycleGAN-based non-parallel voice conversion,\u201d Proc. ICASSP, Brighton, U.K., pp.6820-6824, May 2019. 10.1109\/icassp.2019.8682897","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"41","unstructured":"[41] D.-Y. Wu and H.-Y. Lee, \u201cOne-shot voice conversion by vector quantization,\u201d Proc. ICASSP, Barcelona, Spain, pp.7734-7738, May 2020. 10.1109\/icassp40776.2020.9053854"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/7\/E104.D_2020EDP7252\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,2]],"date-time":"2023-01-02T06:11:35Z","timestamp":1672639895000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/7\/E104.D_2020EDP7252\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,1]]},"references-count":41,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2021]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2020edp7252","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,7,1]]},"article-number":"2020EDP7252"}}