{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2022,12,2]],"date-time":"2022-12-02T05:22:20Z","timestamp":1669958540716},"reference-count":28,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"12","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2020,12,1]]},"DOI":"10.1587\/transinf.2020edp7078","type":"journal-article","created":{"date-parts":[[2020,11,30]],"date-time":"2020-11-30T22:19:46Z","timestamp":1606774786000},"page":"2659-2672","source":"Crossref","is-referenced-by-count":0,"title":["Loss Function Considering Multiple Attributes of a Temporal Sequence for Feed-Forward Neural Networks"],"prefix":"10.1587","volume":"E103.D","author":[{"given":"Noriyuki","family":"MATSUNAGA","sequence":"first","affiliation":[{"name":"R&D group, AI, Inc."},{"name":"Graduate School of Engineering, Toyama Prefectural University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yamato","family":"OHTANI","sequence":"additional","affiliation":[{"name":"R&D group, AI, Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tatsuya","family":"HIRAHARA","sequence":"additional","affiliation":[{"name":"Graduate School of Engineering, Toyama Prefectural University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] H. Zen, A. Senior, and M. Schuster, \u201cStatistical Parametric Speech Synthesis Using Deep Neural Networks,\u201d Proc. ICASSP, pp.7962-7966, 2013. 10.1109\/icassp.2013.6639215","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"2","unstructured":"[2] K. Nakamura, S. Takaki, K. Hashimoto, K. Oura, Y. Nankaku, and K. Tokuda, \u201cComputational complexity reduction method for CNN-based singing voice synthesis,\u201d Proc. ASJ2019A, pp.939-940, Sept. 2019."},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] Z. Kons, S. Shechtman, A. Sorin, C. Rabinovitz, and R. Hoory, \u201cHigh quality, lightweight and adaptable TTS using LPCNet,\u201d Proc. INTERSPEECH 2019, pp.176-180, Sept. 2019. 10.21437\/interspeech.2019-1705","DOI":"10.21437\/Interspeech.2019-1705"},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] H. Banno, H. Hata, M. Morise, T. Takahashi, T. Irino, and H.Kawahara, \u201cImplementation of realtime straight speech manipulation system,\u201d Acoust. Sci. &amp; Tech., vol.28, no.3, pp.140-146, 2007. 10.1250\/ast.28.140","DOI":"10.1250\/ast.28.140"},{"key":"5","doi-asserted-by":"publisher","unstructured":"[5] M. Morise, F. Yokomori, and K. Ozawa, \u201cWORLD: a vocoder-based high-quality speech synthesis system for real-time applications,\u201d IEICE Trans. Inf. &amp; Syst., vol.E99-D, no.7, pp.1877-1884, 2016. 10.1587\/transinf.2015edp7457","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"6","unstructured":"[6] A. van den Oord, S. Dieleman, H. Zen, K. Simonyan, O. Vinyals, A. Graves, N. Kalchbrenner, A. Senior, and K. Kavukcuoglu, \u201cWaveNet: A Generative Model for Raw Audio,\u201d Proc. ISCA Speech Synthesis Workshop, Sunnyvale, CA, USA, p.125, Sept. 2016."},{"key":"7","unstructured":"[7] N. Kalchbrenner, E. Elsen, K. Simonyan, S. Noury, N. Casagrande, E. Lockhart, F. Stimberg, A. van den Oord, S. Dieleman, and K. Kavukcuoglu, \u201cEfficient Neural Audio Synthesis,\u201d Proc. International Conference on Machine Learning (ICML), Stockholm,Sweden, pp.2415-2424, 2018."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] R. Prenger, R. Valle, and B. Catanzaro, \u201cWaveGlow: A Flow-based Generative Network for Speech Synthesis,\u201d Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Brighton, UK, pp.3617-3621, May 2019. 10.1109\/icassp.2019.8683143","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"9","doi-asserted-by":"publisher","unstructured":"[9] K. Tokuda, Y. Nankaku, T. Toda, H. Zen, J. Yamagishi, and K. Oura, \u201cSpeech synthesis based on hidden Markov models,\u201d Proc. IEEE, vol.101, no.5, pp.1234-1252, May 2013. 10.1109\/jproc.2013.2251852","DOI":"10.1109\/JPROC.2013.2251852"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] H. Zen and H. Sak, \u201cUnidirectional Long Short-Term Memory Recurrent Neural Network with Recurrent Output Layer for Low-Latency Speech Synthesis,\u201d Proc. ICASSP, pp.4470-4474, 2015. 10.1109\/icassp.2015.7178816","DOI":"10.1109\/ICASSP.2015.7178816"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] N. Matsunaga, Y. Ohtani, and T. Hirahara, \u201cLoss Function considering Temporal Sequence for Feed-Forward Neural Network-Fundamental Frequency Case,\u201d Proc. 10th ISCA Speech Synthesis Workshop, pp.143-148, Vienna, Austria, 2019. 10.21437\/ssw.2019-26","DOI":"10.21437\/SSW.2019-26"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] T. Nose, V. Chunwijitra, and T. Kobayashi, \u201cA parameter Generation Algorithm Using Local Variance for HMM-Based Speech Synthesis,\u201d Proc. IEEE, vol.8, no.2, pp.221-228, 2014.","DOI":"10.1109\/JSTSP.2013.2283459"},{"key":"13","unstructured":"[13] T. Toda and K. Tokuda, \u201cSpeech Parameter Generation Algorithm Considering Global Variance for HMM-Based Speech Synthesis,\u201d Proc. INTERSPEECH 2005, pp.2801-2804, Lisbon, Portugal, Sept. 2005."},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] Z. Wu and S. King, \u201cMinimum trajectory error training for deep neural networks combined with stacked bottleneck features,\u201d Proc. Interspeech 2015, pp.309-313, 2015.","DOI":"10.21437\/Interspeech.2015-123"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] K. Hashimoto, K. Oura, Y. Nankaku, and K. Tokuda, \u201cTrajectory training considering global variance for speech synthesis based on neural networks,\u201d Proc. Int. Conf. Acoust., Speech, Signal Process., pp.5600-5604, Shanghai, China, March 2016. 10.1109\/icassp.2016.7472749","DOI":"10.1109\/ICASSP.2016.7472749"},{"key":"16","unstructured":"[16] M. Morise, Y. Toyoda, and K. Ozawa, \u201cInfluence of exaggerated temporal fluctuation on singing voice of perception of humanity,\u201d [Translated from Japanese.], IPSJ Special Interest Group Technical Report, vol.2017-MUS-115, no.55, pp.1-6, June 2017."},{"key":"17","unstructured":"[17] T.C. Ishi, N. Minematsu, and K. Hirose, \u201cIdentification of Japanese accent in continuous speech considering pitch perception,\u201d IEICE Technical Report SP., 101(270), pp.23-30, 2001."},{"key":"18","unstructured":"[18] HTS, http:\/\/hts.sp.nitech.ac.jp\/"},{"key":"19","unstructured":"[19] N. Matsunaga, Y. Ohtani, and T. Hirahara, \u201cNormalized Method of Linguistic Feature Suitable for Fundamental Frequency in Japanese Text to Speech Using Deep Learning,\u201d Trans. IEICE, vol.J102-D, no.10, pp.721-729, Oct. 2019."},{"key":"20","unstructured":"[20] V. Nair and G.E. Hinton, \u201cRectified Linear Units Improve Restricted Boltzmann Machines,\u201d Proc. 27th International Conference on Machine Learning, pp.807-814, Haifa, Israel, 2010."},{"key":"21","unstructured":"[21] D.P. Kingma and J. Ba, \u201cAdam: A Method for Stochastic Optimization,\u201d arXiv:1412.6980, 2014."},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] Y. Saito, S. Takamichi, and H. Saruwatari, \u201cStatistical Parametric Speech Synthesis Incorporating Generative Adversarial Networks,\u201d IEEE\/ACM Trans., vol.26, no.1, pp.84-96, 2018.","DOI":"10.1109\/TASLP.2017.2761547"},{"key":"23","unstructured":"[23] Speech Signal Processing Toolkit, http:\/\/sp-tk.sourceforge.net\/"},{"key":"24","doi-asserted-by":"publisher","unstructured":"[24] T. Yoshimura, K. Tokuda, T. Masuko, T. Kobayashi, and T.Kitamura, \u201cIncorporating a mixed excitation model and postfilter into HMM-based text-to-speech synthesis,\u201d Systems and Computers in Japan, vol.36, no.12, 2005. 10.1002\/scj.20354","DOI":"10.1002\/scj.20354"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] Z. Wu, O. Watts, and S. King, \u201cMerlin: An Open Source Neural Network Speech Synthesis System,\u201d Proc. 9th ISCA Speech Synthesis Workshop, pp.202-207, Sunny vale, USA, 2016. https:\/\/github.com\/CSTR-Edinburgh\/merlin","DOI":"10.21437\/SSW.2016-33"},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] S. Takamichi, T. Toda, A.W. Black, G. Neubig, S. Sakti, and S. Nakamura, \u201cPost-Filters to Modify the Modulation Spectrum for Statistical Parametric Speech Synthesis,\u201d IEEE\/ACM Trans. Audio, Speech, Language Process., vol.24, no.4, pp.755-767, April 2016. 10.1109\/taslp.2016.2522655","DOI":"10.1109\/TASLP.2016.2522655"},{"key":"27","unstructured":"[27] Recommendation ITU-R BS.1534-1: Methods for subjective assessment of intermediate quality level of coding systems, https:\/\/www.itu.int\/dms_pubrec\/itu-r\/rec\/bs\/R-REC-BS.1534-3-2015-I!!PDF-E.pdf"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] T. Toda, T. Muramatsu, and H. Banno, \u201cImplementation of computationally efficient real-time voice conversion,\u201d Proc. INTERSPEECH 2012, pp.94-97, Portland, OR, USA, 2012.","DOI":"10.21437\/Interspeech.2012-34"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E103.D\/12\/E103.D_2020EDP7078\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,1]],"date-time":"2022-12-01T12:57:00Z","timestamp":1669899420000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E103.D\/12\/E103.D_2020EDP7078\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,12,1]]},"references-count":28,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2020]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2020edp7078","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,12,1]]}}}