{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T10:56:16Z","timestamp":1764413776150},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9054148","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T16:21:13Z","timestamp":1586449273000},"page":"6734-6738","source":"Crossref","is-referenced-by-count":13,"title":["Improving End-to-End Speech Synthesis with Local Recurrent Neural Network Enhanced Transformer"],"prefix":"10.1109","author":[{"given":"Yibin","family":"Zheng","sequence":"first","affiliation":[]},{"given":"Xinhui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Fenglong","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Li","family":"Lu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"article-title":"Semi-supervised training for improving data efficiency in end-to-end speech synthesis","year":"2018","author":"chung","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2325"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"article-title":"WaveNet: A generative model for raw audio","year":"2016","author":"den oord","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682804"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","year":"2014","author":"chung","key":"ref16"},{"article-title":"Deep Voice 3: Scaling text-to-speech with convo-lutional sequence learning","year":"2017","author":"ping","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref19","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref4","doi-asserted-by":"crossref","first-page":"7962","DOI":"10.1109\/ICASSP.2013.6639215","article-title":"Statistical parametric speech synthesis using deep neural networks","author":"zen","year":"2013","journal-title":"Acoustics Speech and Signal Processing (ICASSP) 2013 IEEE International Conference on"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"kingma","key":"ref27"},{"key":"ref3","first-page":"4475","article-title":"Multi-speaker modeling and speaker adaptation for DNN-based TTS synthesis","author":"fan","year":"2015","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref5","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017"},{"article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron","year":"2018","author":"skerry-ryan","key":"ref8"},{"article-title":"Uncovering latent style factors for expressive speech synthesis","year":"2017","author":"wang","key":"ref7"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511816338"},{"article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","year":"2018","author":"wang","key":"ref9"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1006\/csla.1999.0123"},{"article-title":"Close to human quality TTS with Transformer","year":"2018","author":"li","key":"ref20"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"article-title":"R-Transformer: Recurrent neural network enhanced Transformer","year":"2019","author":"wang","key":"ref24"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682861"},{"article-title":"Layer normalization","year":"2016","author":"ba","key":"ref26"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2020,5,4]]},"location":"Barcelona, Spain","end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09054148.pdf?arnumber=9054148","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,29]],"date-time":"2023-09-29T15:27:58Z","timestamp":1696001278000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9054148\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9054148","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}