{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T11:16:52Z","timestamp":1763810212975},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/icassp40776.2020.9053520","type":"proceedings-article","created":{"date-parts":[[2020,4,9]],"date-time":"2020-04-09T20:21:13Z","timestamp":1586463673000},"source":"Crossref","is-referenced-by-count":58,"title":["Fully-Hierarchical Fine-Grained Prosody Modeling For Interpretable Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Guangzhi","family":"Sun","sequence":"first","affiliation":[]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ron J.","family":"Weiss","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Heiga","family":"Zen","sequence":"additional","affiliation":[]},{"given":"Yonghui","family":"Wu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","year":"0","journal-title":"Audio samples from &#x201D;Fully-hierarchical Fine-grained Prosody Modelling for Interpretable Speech Synthesis\""},{"key":"ref38","article-title":"Reducing f0 frame error of f0 tracking algorithms under noisy conditions with an unvoiced\/voiced classification frontend","author":"chu","year":"2009","journal-title":"Proc ICASSP"},{"key":"ref33","article-title":"RNADE: The real-valued neural autoregressive density-estimator","author":"uria","year":"2013","journal-title":"Advances in neural information processing systems"},{"key":"ref32","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref31","first-page":"2525","article-title":"Structured disentangled representations","author":"esmaeili","year":"2019","journal-title":"Proc Int Conf on Artificial Intelligence and Statistics"},{"key":"ref30","article-title":"Disentangling factors of variation in deep representations using adversarial training","author":"mathieu","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1121\/1.1458024"},{"key":"ref36","article-title":"The blizzard challenge 2013","author":"king","year":"2013","journal-title":"Blizzard Challenge Workshop"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref34","first-page":"2338","article-title":"Masked autoregressive flow for density estimation","author":"papamakarios","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683561"},{"key":"ref11","author":"battenberg","year":"2019","journal-title":"Effective use of variational embedding capacity in expressive end-to-end speech synthesis"},{"key":"ref12","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with Tacotron","author":"skerry-ryan","year":"2018","journal-title":"Proc Int Conf on Machine Learning (ICML)"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref16","author":"henter","year":"2018","journal-title":"Deep encoder-decoder models for unsupervised learning of controllable speech synthesis"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1113"},{"key":"ref18","author":"razavi","year":"2019","journal-title":"Generating diverse high-fidelity images with vq-vae-2"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683862"},{"key":"ref28","author":"habib","year":"2019","journal-title":"Semi-supervised generative modeling for controllable speech synthesis"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref27","author":"gyawali","year":"2019","journal-title":"Semi-supervised learning by disentangling and self-ensembling over stochastic latent space"},{"key":"ref3","article-title":"Deep Voice 3: 2000-speaker neural text-to-speech","author":"ping","year":"2018","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref6","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"hsu","year":"2019","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref29","first-page":"2172","article-title":"Infogan: Interpretable representation learning by information maximizing generative adversarial nets","author":"chen","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref5","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref8","article-title":"A generative adversarial network for style modeling in a text-to-speech system","author":"ma","year":"2019","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref7","first-page":"5167","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","author":"wang","year":"2018","journal-title":"Proc Int Conf on Machine Learning (ICML)"},{"key":"ref2","doi-asserted-by":"crossref","first-page":"4006","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1080\/01690961003589492"},{"key":"ref1","article-title":"Char2wav: End-to-end speech synthesis","author":"sotelo","year":"2017","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1097"},{"key":"ref22","article-title":"Beta-VAE: Learning basic visual concepts with a constrained variational framework","author":"higgins","year":"2017","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref21","article-title":"Unsupervised learning of disentangled and interpretable representations from sequential data","author":"hsu","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref24","article-title":"Variational inference of disentangled latent concepts from unlabeled observations","author":"kumar","year":"2017","journal-title":"Proc of the Int Conf on Learning Representations (ICLR)"},{"key":"ref23","first-page":"2649","article-title":"Disentangling by factorising","author":"kim","year":"2018","journal-title":"Proc Int Conf on Machine Learning (ICML)"},{"key":"ref26","first-page":"5925","article-title":"Learning disentangled representations with semi-supervised deep generative models","author":"narayanaswamy","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref25","article-title":"Challenging common assumptions in the unsupervised learning of disentangled representations","author":"locatello","year":"2019","journal-title":"Proc Int Conf on Machine Learning (ICML)"}],"event":{"name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Barcelona, Spain","start":{"date-parts":[[2020,5,4]]},"end":{"date-parts":[[2020,5,8]]}},"container-title":["ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9040208\/9052899\/09053520.pdf?arnumber=9053520","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,28]],"date-time":"2022-06-28T00:26:11Z","timestamp":1656375971000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9053520\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/icassp40776.2020.9053520","relation":{},"subject":[],"published":{"date-parts":[[2020,5]]}}}