{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T15:19:49Z","timestamp":1767626389717},"reference-count":28,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10096530","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T13:28:30Z","timestamp":1683293310000},"page":"1-5","source":"Crossref","is-referenced-by-count":1,"title":["Prosody-Aware Speecht5 for Expressive Neural TTS"],"prefix":"10.1109","author":[{"given":"Yan","family":"Deng","sequence":"first","affiliation":[{"name":"Microsoft, China"}]},{"given":"Long","family":"Zhou","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Yuanhao","family":"Yi","sequence":"additional","affiliation":[{"name":"Microsoft, China"}]},{"given":"Shujie","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Lei","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2016"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref5","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","volume-title":"Proc. NIPS","author":"Ren"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414718"},{"article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. NIPS","author":"Kong","key":"ref7"},{"key":"ref8","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. ICML","author":"Kim"},{"key":"ref9","article-title":"NaturalSpeech: End-to-end text to speech synthesis with human-level quality","author":"Tan","year":"2022","journal-title":"preprint arXiv:2205.04421v2"},{"key":"ref10","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. ICML","author":"Wang"},{"article-title":"Hierarchical generative modeling for controllable speech synthesis","volume-title":"Proc. ICLR","author":"Hsu","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683501"},{"article-title":"FastSpeech 2: Fast and high-quality end-to-end text-to-speech","volume-title":"Proc. ICLR","author":"Ren","key":"ref13"},{"key":"ref14","article-title":"Feature reinforcement with word embedding and parsing information in neural TTS","author":"Ming","year":"2019","journal-title":"preprint arXiv:1901.00707"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3177"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054337"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414102"},{"article-title":"Semi-supervised training for improving data efficiency in end-to-end speech synthesis","volume-title":"Proc. ICASSP","author":"Chung","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-621"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413864"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746858"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746883"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.393"},{"key":"ref25","article-title":"DelightfulTTS: The Microsoft speech synthesis system for Blizzard Challenge 2021","author":"Liu","year":"2021","journal-title":"preprint arXiv:2110.12612"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2023,6,4]]},"location":"Rhodes Island, Greece","end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10096530.pdf?arnumber=10096530","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,11]],"date-time":"2024-01-11T21:06:11Z","timestamp":1705007171000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10096530\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10096530","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}