{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:43:21Z","timestamp":1776883401260,"version":"3.51.2"},"reference-count":24,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,12]]},"DOI":"10.1109\/slt.2018.8639682","type":"proceedings-article","created":{"date-parts":[[2019,2,14]],"date-time":"2019-02-14T23:36:34Z","timestamp":1550187394000},"page":"595-602","source":"Crossref","is-referenced-by-count":75,"title":["Predicting Expressive Speaking Style from Text in End-To-End Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Daisy","family":"Stanton","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxuan","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"RJ","family":"Skerry-Ryan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"crossref","first-page":"146","DOI":"10.21437\/Interspeech.2010-71","article-title":"AuToBI-a tool for automatic ToBI annotation","author":"rosenberg","year":"2010","journal-title":"InterSpeech"},{"key":"ref11","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2015-336","article-title":"High-level feature representation using recurrent neural network for speech emotion recognition","author":"lee","year":"2015","journal-title":"Proceedings of Interspeech 2015"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953138"},{"key":"ref13","article-title":"Capturing long-term temporal dependencies with convolutional networks for continuous emotion recognition","author":"khorram","year":"2017","journal-title":"CoRR"},{"key":"ref14","article-title":"Variational autoencoders for learning latent representations of speech emotion","author":"latif","year":"2017","journal-title":"CoRR"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2018.03.002"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2759338"},{"key":"ref17","article-title":"Unsupervised learning for expressive speech synthesis","author":"jauk","year":"2017","journal-title":"Ph D Dissertation"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref19","article-title":"Voice synthesis for in-the-wild speakers via a phonological loop","author":"taigman","year":"2017","journal-title":"CoRR"},{"key":"ref4","article-title":"La repr&#x00E9;sentation linguistique des syst&#x00E8;mes prosodiques: une approche cognitive","author":"hirst","year":"1987","journal-title":"Ph D Dissertation"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1113"},{"key":"ref6","first-page":"71","article-title":"Automatic modelling of fundamental frequency using a quadratic spline function","volume":"15","author":"hirst","year":"1993","journal-title":"Travaux de l&#x2019;Institut de Phon&#x00E9;tique d&#x2019;Aix"},{"key":"ref5","article-title":"ToBI: A standard for labeling english prosody","author":"silverman","year":"1992","journal-title":"Second International Conference on Spoken Language Processing"},{"key":"ref8","article-title":"The tilt intonation model","author":"taylor","year":"1998","journal-title":"ICSLP"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1121\/1.416983"},{"key":"ref2","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","author":"wang","year":"2018","journal-title":"International Conference on Machine Learning"},{"key":"ref1","doi-asserted-by":"crossref","first-page":"4006","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"246","DOI":"10.21437\/SpeechProsody.2014-37","article-title":"Slam: Automatic stylization and labelling of speech melody","author":"obin","year":"2014","journal-title":"Speech Prosody"},{"key":"ref20","article-title":"Highway networks","author":"srivastava","year":"2015","journal-title":"International Conference on Machine Learning Workshop on Deep Learning"},{"key":"ref22","article-title":"A note on the evaluation of generative models","author":"theis","year":"2015"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref24","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron","author":"skerry-ryan","year":"0"},{"key":"ref23","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014"}],"event":{"name":"2018 IEEE Spoken Language Technology Workshop (SLT)","location":"Athens, Greece","start":{"date-parts":[[2018,12,18]]},"end":{"date-parts":[[2018,12,21]]}},"container-title":["2018 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8632666\/8639030\/08639682.pdf?arnumber=8639682","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T06:48:21Z","timestamp":1643266101000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8639682\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,12]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/slt.2018.8639682","relation":{},"subject":[],"published":{"date-parts":[[2018,12]]}}}