{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T04:06:56Z","timestamp":1751342816193,"version":"3.41.0"},"reference-count":21,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2018]]},"DOI":"10.1587\/transinf.2017edp7165","type":"journal-article","created":{"date-parts":[[2018,1,31]],"date-time":"2018-01-31T22:33:29Z","timestamp":1517438009000},"page":"462-472","source":"Crossref","is-referenced-by-count":26,"title":["DNN-Based Speech Synthesis Using Speaker Codes"],"prefix":"10.1587","volume":"E101.D","author":[{"given":"Nobukatsu","family":"HOJO","sequence":"first","affiliation":[{"name":"NTT Media Intelligence Laboratories, NTT Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yusuke","family":"IJIMA","sequence":"additional","affiliation":[{"name":"NTT Media Intelligence Laboratories, NTT Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hideyuki","family":"MIZUNO","sequence":"additional","affiliation":[{"name":"Tokyo University of Science"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] N. Hojo, Y. Ijima, and H. Mizuno, \u201cAn investigation of DNN-based speech synthesis using speaker codes,\u201d Proc. Intersepech 2016, pp.2278-2282, 2016. 10.21437\/interspeech.2016-589","DOI":"10.21437\/Interspeech.2016-589"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] H. Zen, A. Senior, and M. Schuster, \u201cStatistical parametric speech synthesis using deep neural networks,\u201d Proc. ICASSP 2013, pp.7962-7966, 2013. 10.1109\/icassp.2013.6639215","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] H. Zen and A. Senior, \u201cDeep mixture density networks for acoustic modeling in statistical parametric speech synthesis,\u201d Proc. ICASSP 2014, pp.3844-3848, 2014. 10.1109\/icassp.2014.6854321","DOI":"10.1109\/ICASSP.2014.6854321"},{"key":"4","unstructured":"[4] Y. Fan, Y. Qian, F. Xie, and F.K. Soong, \u201cTTS synthesis with bidirectional LSTM based recurrent neural networks,\u201d Proc. Interspeech, pp.1964-1968, 2014."},{"key":"5","doi-asserted-by":"publisher","unstructured":"[5] J. Yamagishi and T. Kobayashi, \u201cAverage-voice-based speech synthesis using HSMM-based speaker adaptation and adaptive training,\u201d IEICE Transactions on Information and Systems, vol.E90-D, no.2, pp.533-543, 2007. 10.1093\/ietisy\/e90-d.2.533","DOI":"10.1093\/ietisy\/e90-d.2.533"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] J. Yamagishi, T. Kobayashi, Y. Nakano, K. Ogata, and J. Isogai, \u201cAnalysis of speaker adaptation algorithms for HMM-based speech synthesis and a constrained SMAPLR adaptation algorithm,\u201d IEEE Transactions Audio, Speech, and Language Process., vol.17, no.1, pp.66-83, 2009. 10.1109\/tasl.2008.2006647","DOI":"10.1109\/TASL.2008.2006647"},{"key":"7","unstructured":"[7] V. Wan, J. Latorre, K. Chin, L. Chen, M. Gales, H. Zen, K. Knill, and M. Akamine, \u201cCombining multiple high quality corpora for improving HMM-TTS,\u201d Proc. Interspeech 2012, pp.1134-1137, 2012."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] Y. Fan, Y. Qian, F.K. Soong, and L. He, \u201cMulti-speaker modeling and speaker adaptation for DNN-based tts synthesis,\u201d Proc. ICASSP 2015, pp.4475-4479, IEEE, 2015. 10.1109\/icassp.2015.7178817","DOI":"10.1109\/ICASSP.2015.7178817"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] G. Saon, H. Soltau, D. Nahamoo, and M. Picheny, \u201cSpeaker adaptation of neural network acoustic models using i-vectors,\u201d Proc. ASRU 2013, pp.55-59, 2013. 10.1109\/asru.2013.6707705","DOI":"10.1109\/ASRU.2013.6707705"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] O. Abdel-Hamid and H. Jiang, \u201cFast speaker adaptation of hybrid NN\/HMM model for speech recognition based on discriminative learning of speaker code,\u201d Proc. ICASSP 2013, pp.7942-7946, 2013. 10.1109\/icassp.2013.6639211","DOI":"10.1109\/ICASSP.2013.6639211"},{"key":"11","doi-asserted-by":"publisher","unstructured":"[11] S. Xue, O. Abdel-Hamid, H. Jiang, L. Dai, and Q. Liu, \u201cFast adaptation of deep neural network based on discriminant codes for speech recognition,\u201d IEEE\/ACM Trans. Audio, Speech, Lang. Process., vol.22, no.12, pp.1713-1725, 2014. 10.1109\/taslp.2014.2346313","DOI":"10.1109\/TASLP.2014.2346313"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] Z. Wu, P. Swietojanski, C. Veaux, S. Renals, and S. King, \u201cA study of speaker adaptation for DNN-based speech synthesis,\u201d Proc. Interspeech 2015, pp.879-883, 2015.","DOI":"10.21437\/Interspeech.2015-270"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] H.-T. Luong, S. Takaki, G.E. Henter, and J. Yamagishi, \u201cAdapting and controlling DNN-based speech synthesis using input codes,\u201d Proc. ICASSP 2017, pp.4905-4909, IEEE, 2017. 10.1109\/icassp.2017.7953089","DOI":"10.1109\/ICASSP.2017.7953089"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] B. Potard, P. Motlicek, and D. Imseng, \u201cPreliminary work on speaker adaptation for DNN-based speech synthesis,\u201d tech. rep., Idiap, Rep. Idiap-RR-02-2015, 2015.","DOI":"10.1186\/s13636-015-0058-5"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] Y. Fan, Y. Qian, F.K. Soong, and L. He, \u201cUnsupervised speaker adaptation for DNN-based TTS synthesis,\u201d Proc. ICASSP 2016, pp.5135-5139, IEEE, 2016. 10.1109\/icassp.2016.7472656","DOI":"10.1109\/ICASSP.2016.7472656"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] Y. Fan, Y. Qian, F.K. Soong, and L. He, \u201cSpeaker and language factorization in DNN-based TTS synthesis,\u201d Proc. ICASSP 2016, pp.5540-5544, IEEE, 2016. 10.1109\/icassp.2016.7472737","DOI":"10.1109\/ICASSP.2016.7472737"},{"key":"17","doi-asserted-by":"publisher","unstructured":"[17] H. Kawahara, I. Masuda-Katsuse, and A. De Cheveign\u00e9, \u201cRestructuring speech representations using a pitch-adaptive time-frequency smoothing and an instantaneous-frequency-based f0 extraction: Possible role of a repetitive structure in sounds,\u201d Speech communication, vol.27, no.3-4, pp.187-207, 1999. 10.1016\/s0167-6393(98)00085-5","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"18","unstructured":"[18] D. Kingma and J. Ba, \u201cAdam: A method for stochastic optimization,\u201d arXiv preprint arXiv:1412.6980, 2014."},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] K. Shinoda and T. Watanabe, \u201cAcoustic modeling based on the MDL criterion for speech recognition,\u201d Proc. EUROSPEECH 1997, pp.99-102, 1997.","DOI":"10.21437\/Eurospeech.1997-52"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] K. Tokuda, T. Yoshimura, T. Masuko, T. Kobayashi, and T. Kitamura, \u201cSpeech parameter generation algorithms for HMM-based speech synthesis,\u201d Proc. ICASSP 2000, pp.1315-1318, 2000. 10.1109\/icassp.2000.861820","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"21","doi-asserted-by":"publisher","unstructured":"[21] T. Toda and K. Tokuda, \u201cA speech parameter generation algorithm considering global variance for HMM-based speech synthesis,\u201d IEICE Transactions on Information and Systems, vol.E90-D, no.5, pp.816-824, 2007. 10.1093\/ietisy\/e90-d.5.816","DOI":"10.1093\/ietisy\/e90-d.5.816"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E101.D\/2\/E101.D_2017EDP7165\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T10:53:22Z","timestamp":1751280802000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E101.D\/2\/E101.D_2017EDP7165\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"references-count":21,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2018]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2017edp7165","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"type":"print","value":"0916-8532"},{"type":"electronic","value":"1745-1361"}],"subject":[],"published":{"date-parts":[[2018]]}}}