{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T01:58:15Z","timestamp":1781315895973,"version":"3.54.1"},"reference-count":60,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"British Broadcasting Corporation Research and Development"},{"DOI":"10.13039\/501100000266","name":"Engineering and Physical Sciences Research Council","doi-asserted-by":"publisher","award":["EP\/T019751\/1"],"award-info":[{"award-number":["EP\/T019751\/1"]}],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Centre for Vision, Speech and Signal Processing"},{"name":"Faculty of Engineering and Physical Science"},{"DOI":"10.13039\/501100003513","name":"University of Surrey","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003513","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,6]]},"DOI":"10.1109\/tpami.2024.3356232","type":"journal-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:43:03Z","timestamp":1705689783000},"page":"4234-4245","source":"Crossref","is-referenced-by-count":160,"title":["<i>NaturalSpeech<\/i>: End-to-End Text-to-Speech Synthesis With Human-Level Quality"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5631-0639","authenticated-orcid":false,"given":"Xu","family":"Tan","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2667-0654","authenticated-orcid":false,"given":"Jiawei","family":"Chen","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1036-7888","authenticated-orcid":false,"given":"Haohe","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0223-3492","authenticated-orcid":false,"given":"Jian","family":"Cong","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3596-4683","authenticated-orcid":false,"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3520-1887","authenticated-orcid":false,"given":"Yanqing","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0434-7939","authenticated-orcid":false,"given":"Xi","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3440-074X","authenticated-orcid":false,"given":"Yichong","family":"Leng","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0135-9838","authenticated-orcid":false,"given":"Yuanhao","family":"Yi","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2106-740X","authenticated-orcid":false,"given":"Lei","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9624-5381","authenticated-orcid":false,"given":"Sheng","family":"Zhao","sequence":"additional","affiliation":[{"name":"Microsoft Azure Speech, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9095-0776","authenticated-orcid":false,"given":"Tao","family":"Qin","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9088-3577","authenticated-orcid":false,"given":"Frank","family":"Soong","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0476-8020","authenticated-orcid":false,"given":"Tie-Yan","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A survey on neural speech synthesis","author":"Tan","year":"2021"},{"key":"ref2","article-title":"WaveNet: A generative model for raw audio","author":"van den Oord","year":"2016"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref5","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"\u00d6 Ar\u0131k"},{"key":"ref6","first-page":"2966","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Gibiansky"},{"key":"ref7","first-page":"214","article-title":"Deep voice 3: 2000-speaker neural text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ping"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461829"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref10","article-title":"FastSpeech: Fast, robust and controllable text to speech","volume-title":"Proc. 33rd Int. Conf. Neural Inf. Process. Syst.","author":"Ren"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Blizzard.2021-14"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref13","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kim"},{"key":"ref14","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Popov"},{"key":"ref15","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref16","first-page":"2410","article-title":"Efficient neural audio synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kalchbrenner"},{"key":"ref17","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Kong"},{"key":"ref18","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren"},{"key":"ref19","article-title":"End-to-end adversarial text-to-speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Donahue"},{"key":"ref20","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"van den Oord"},{"key":"ref21","article-title":"Zero-shot text-to-image generation","author":"Ramesh","year":"2021"},{"key":"ref22","article-title":"Auto-encoding variational bayes","author":"Kingma","year":"2013"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-621"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1461"},{"key":"ref25","article-title":"NICE: Non-linear independent components estimation","author":"Dinh","year":"2014"},{"key":"ref26","first-page":"4743","article-title":"Improved variational inference with inverse autoregressive flow","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Durk"},{"key":"ref27","first-page":"10236","article-title":"Glow: Generative flow with invertible 1\u00d7 1 convolutions","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Kingma"},{"key":"ref28","article-title":"The LJ speech dataset","author":"Ito","year":"2017"},{"key":"ref29","article-title":"Superseded-CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit","author":"Veaux","year":"2016"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2003"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413877"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-4380-9_16"},{"key":"ref36","article-title":"CogView: Mastering text-to-image generation via transformers","author":"Ding","year":"2021"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"ref38","article-title":"VideoGPT: Video generation using VQ-VAE and transformers","author":"Yan","year":"2021"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.5220\/0010241801010112"},{"key":"ref40","first-page":"14866","article-title":"Generating diverse high-fidelity images with VQ-VAE-2","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Razavi"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i20.30271"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054337"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"ref46","article-title":"BERT: Pre-training of deep bidirectional Transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref47","article-title":"Searching for activation functions","author":"Ramachandran","year":"2017"},{"key":"ref48","first-page":"1530","article-title":"Variational inference with normalizing flows","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rezende"},{"key":"ref49","first-page":"894","article-title":"Soft-DTW: A differentiable loss function for time-series","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Cuturi"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.304"},{"key":"ref51","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref52","article-title":"AdaSpeech: Adaptive text to speech for custom voice","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-901"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413851"},{"key":"ref55","article-title":"The news-crawl dataset","year":"2022"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1208"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21105\/joss.03958"},{"key":"ref58","article-title":"Density estimation using real NVP","author":"Dinh","year":"2016"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p16-1162"},{"key":"ref60","article-title":"Decoupled weight decay regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Loshchilov"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10522060\/10409539.pdf?arnumber=10409539","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,8]],"date-time":"2024-05-08T04:42:51Z","timestamp":1715143371000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10409539\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6]]},"references-count":60,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3356232","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,6]]}}}