{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T23:47:30Z","timestamp":1780444050812,"version":"3.54.1"},"reference-count":30,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,1,19]]},"DOI":"10.1109\/slt48900.2021.9383629","type":"proceedings-article","created":{"date-parts":[[2021,3,25]],"date-time":"2021-03-25T20:46:54Z","timestamp":1616705214000},"page":"446-453","source":"Crossref","is-referenced-by-count":22,"title":["Hierarchical Prosody Modeling for Non-Autoregressive Speech Synthesis"],"prefix":"10.1109","author":[{"given":"Chung-Ming","family":"Chien","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hung-yi","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref30","first-page":"623","author":"loizou","year":"2011","journal-title":"Speech Quality Assessment"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639682"},{"key":"ref11","first-page":"281","article-title":"Predicting prosodic prominence from text with pre-trained contextualized word representations","author":"talman","year":"2019","journal-title":"Proceedings of the 22nd Nordic Conference on Computational Linguistics"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2016.11.001"},{"key":"ref13","first-page":"3331","article-title":"Chive: Varying prosody in speech synthesis with a linguistically driven dynamic hierarchical conditional variational network","author":"kenter","year":"2019","journal-title":"Proceedings of the 36th International Conference on Machine Learning ICML 
2019"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/SpeechProsody.2020-193"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053520"},{"key":"ref16","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref19","article-title":"Fast and reliable f0 estimation method based on the period extraction of vocal fold vibration of singing voice and speech","author":"morise","year":"2009","journal-title":"Audio Engineering Society Conference 35th International Conference Audio for Games"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960497"},{"key":"ref4","article-title":"Hierarchical generative modeling for controllable speech synthesis","author":"hsu","year":"2019","journal-title":"International Conference on Learning Representations"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1006"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref6","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"2020"},{"key":"ref29","article-title":"Synthesizer voice quality of new languages calibrated with mean mel cepstral distortion","author":"kominek","year":"2008","journal-title":"SLTU"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"ref8","first-page":"3171","article-title":"Fastspeech: Fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref7","article-title":"Fastpitch: Parallel text-to-speech with pitch 
prediction","author":"łańcucki","year":"2020"},{"key":"ref2","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume":"80","author":"wang","year":"2018","journal-title":"Proceedings of the 35th International Conference on Machine Learning"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053436"},{"key":"ref1","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume":"80","author":"skerry-ryan","year":"2018","journal-title":"Proceedings of the 35th International Conference on Machine Learning"},{"key":"ref20","first-page":"6306","article-title":"Neural discrete representation learning","author":"van den oord","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref22","article-title":"Huggingface’s transformers: State-of-the-art natural language processing","volume":"abs 1910 3771","author":"wolf","year":"2019","journal-title":"ArXiv"},{"key":"ref21","article-title":"Fasttext.zip: Compressing text classification models","author":"joulin","year":"2016","journal-title":"arXiv preprint arXiv 1612 03651"},{"key":"ref24","first-page":"14910","article-title":"Melgan: Generative adversarial networks for conditional waveform synthesis","author":"kumar","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref23","article-title":"The lj speech dataset","author":"ito","year":"2017"},{"key":"ref26","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies Volume 1 (Long and Short Papers)"},{"key":"ref25","article-title":"Adam: A method for stochastic 
optimization","author":"kingma","year":"2015","journal-title":"3rd International Conference on Learning Representations ICLR 2015"}],"event":{"name":"2021 IEEE Spoken Language Technology Workshop (SLT)","location":"Shenzhen, China","start":{"date-parts":[[2021,1,19]]},"end":{"date-parts":[[2021,1,22]]}},"container-title":["2021 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9383468\/9383452\/09383629.pdf?arnumber=9383629","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,5,31]],"date-time":"2021-05-31T21:30:53Z","timestamp":1622496653000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9383629\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,19]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/slt48900.2021.9383629","relation":{},"subject":[],"published":{"date-parts":[[2021,1,19]]}}}