{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T19:30:55Z","timestamp":1771961455863,"version":"3.50.1"},"reference-count":52,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3453598","type":"journal-article","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T17:29:48Z","timestamp":1727717388000},"page":"4263-4276","source":"Crossref","is-referenced-by-count":5,"title":["Cross-Utterance Conditioned VAE for Speech Generation"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7073-9698","authenticated-orcid":false,"given":"Yang","family":"Li","sequence":"first","affiliation":[{"name":"Department of Computer Science, The University of Manchester, Manchester, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0102-9441","authenticated-orcid":false,"given":"Cheng","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Creativity and Art, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5886-056X","authenticated-orcid":false,"given":"Guangzhi","family":"Sun","sequence":"additional","affiliation":[{"name":"Machine Intelligence Lab, University of Cambridge, Cambridge, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9745-4457","authenticated-orcid":false,"given":"Weiqin","family":"Zu","sequence":"additional","affiliation":[{"name":"School of Creativity and Art, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0622-8512","authenticated-orcid":false,"given":"Zheng","family":"Tian","sequence":"additional","affiliation":[{"name":"School of Creativity and Art, ShanghaiTech University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1247-2382","authenticated-orcid":false,"given":"Ying","family":"Wen","sequence":"additional","affiliation":[{"name":"School of Electronic, Information and Electrical Engineering (SEIEE), Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1121-9879","authenticated-orcid":false,"given":"Wei","family":"Pan","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Manchester, Manchester, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7730-5131","authenticated-orcid":false,"given":"Chao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Electronic Engineering, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4021-4228","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University College London, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0608-9408","authenticated-orcid":false,"given":"Yang","family":"Yang","sequence":"additional","affiliation":[{"name":"Thrust of Internet of Things, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4302-2512","authenticated-orcid":false,"given":"Fanglei","family":"Sun","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Shanghai for Science and Technology, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Auto-encoding variational Bayes","volume-title":"Proc. 2nd Int. Conf. Learn. Representations","author":"Kingma","year":"2014"},{"key":"ref2","article-title":"Hierarchical generative modeling for controllable speech synthesis","volume-title":"Proc. 7th Int. Conf. Learn. Representations","author":"Hsu","year":"2019"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053436"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053520"},{"key":"ref6","first-page":"3483","article-title":"Learning structured output representation using deep conditional generative models","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Sohn","year":"2015"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073702"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/asru51503.2021.9688051"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3190717"},{"key":"ref10","first-page":"1399","article-title":"A $^{3}$ T: Alignment-aware acoustic and text pretraining for speech synthesis and editing","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"162","author":"Bai","year":"2022"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3177"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1430"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-252"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-412"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2848"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-528"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.30"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2558"},{"key":"ref20","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ren","year":"2019"},{"key":"ref21","article-title":"Parallel neural text-to-speech","author":"Peng","year":"2019"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413889"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054119"},{"key":"ref24","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. 9th Int. Conf. Learn. Representations","author":"Ren","year":"2021"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1461"},{"key":"ref26","first-page":"7700","article-title":"EfficientTTS: An efficient and high-quality text-to-speech architecture","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Miao","year":"2021"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054484"},{"key":"ref28","first-page":"8067","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Kim","year":"2020"},{"key":"ref29","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Popov","year":"2021"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-469"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746883"},{"key":"ref32","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-349"},{"key":"ref34","first-page":"1876","article-title":"Unsupervised learning of disentangled and interpretable representations from sequential data","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Hsu","year":"2017"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1113"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683561"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414499"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414102"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref40","article-title":"Towards transfer learning for end-to-end speech synthesis from deep pre-trained language models","author":"Fang","year":"2019"},{"key":"ref41","article-title":"Unified mandarin TTS front-end based on distilled BERT model","author":"Zhang","year":"2020"},{"key":"ref42","article-title":"Dependency parsing based semantic representation learning with graph neural network for enhancing expressiveness of text-to-speech","author":"Zhou","year":"2021"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref44","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 33: Annu. Conf. Neural Inf. Process. Syst. 20","author":"Kong","year":"2020"},{"key":"ref45","first-page":"498","article-title":"Montreal forced aligner: Trainable text-speech alignment using Kaldi","volume-title":"Proc. Interspeech 18th Annu. Conf. Int. Speech Commun. Assoc.","author":"McAuliffe","year":"2017"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2441"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2009.4960497"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref49","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Popov","year":"2021"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref51","article-title":"Efficient estimation of word representations in vector space","volume-title":"Proc. 1st Int. Conf. Learn. Representations","author":"Mikolov","year":"2013"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10699460.pdf?arnumber=10699460","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T04:05:10Z","timestamp":1728187510000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10699460\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":52,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3453598","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}