{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:05:45Z","timestamp":1775199945302,"version":"3.50.1"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434698","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Speech Synthesis From Continuous Features Using Per-Token Latent Diffusion"],"prefix":"10.1109","author":[{"given":"Arnon","family":"Turetzky","sequence":"first","affiliation":[{"name":"The Hebrew University of Jerusalem"}]},{"given":"Avihu","family":"Dekel","sequence":"additional","affiliation":[{"name":"The Hebrew University of Jerusalem"}]},{"given":"Nimrod","family":"Shabtay","sequence":"additional","affiliation":[{"name":"Tel Aviv University"}]},{"given":"Slava","family":"Shechtman","sequence":"additional","affiliation":[{"name":"IBM Research"}]},{"given":"David","family":"Haws","sequence":"additional","affiliation":[{"name":"IBM Research"}]},{"given":"Hagai","family":"Aronowitz","sequence":"additional","affiliation":[{"name":"IBM Research"}]},{"given":"Ron","family":"Hoory","sequence":"additional","affiliation":[{"name":"IBM Research"}]},{"given":"Yossi","family":"Adi","sequence":"additional","affiliation":[{"name":"The Hebrew University of Jerusalem"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Neural discrete representation learning","volume":"30","author":"Van Den Oord","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref3","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"International conference on machine learning","author":"Ramesh"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref6","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2022","journal-title":"arXiv preprint arXiv:2210.13438"},{"key":"ref7","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv preprint arXiv:2301.02111"},{"key":"ref8","article-title":"Simple and controllable music generation","volume":"36","author":"Copet","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","article-title":"Vall-e 2: Neural codec language models are human parity zero-shot text to speech synthesizers","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2406.05370"},{"key":"ref10","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023","journal-title":"arXiv preprint arXiv:2303.03926"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446998"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_17"},{"key":"ref13","article-title":"Autoregressive image generation without vector quantization","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv:2406.11838"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.65"},{"key":"ref15","article-title":"Continuous autoregressive modeling with stochastic monotonic alignment for speech synthesis","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Lin"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref17","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref21","article-title":"Voicebox: Textguided multilingual universal speech generation at scale","volume":"36","author":"Le","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref22","article-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","author":"Shen","year":"2023","journal-title":"arXiv preprint arXiv:2304.09116"},{"key":"ref23","article-title":"Base tts: Lessons from building a billion-parameter text-to-speech model on 100k hours of data","author":"\u0141ajszczak","year":"2024","journal-title":"arXiv preprint arXiv:2402.08093"},{"key":"ref24","article-title":"Speak, read and prompt: High-fidelity text-to-speech with minimal supervision","author":"Kharitonov","year":"2023","journal-title":"arXiv: 2302.03540 [cs. SD]"},{"key":"ref25","article-title":"Make-a-voice: Unified voice synthesis with discrete representation","volume-title":"arXiv preprint arXiv:2305.19269","author":"Huang","year":"2023"},{"key":"ref26","article-title":"Soundstorm: Efficient parallel audio generation","author":"Borsos","year":"2023","journal-title":"arXiv preprint arXiv:2305.09636"},{"key":"ref27","article-title":"AudioLM: A language modeling approach to audio generation","author":"Borsos","year":"2023","journal-title":"arXiv: 2209.03143 [cs. SD]"},{"key":"ref28","article-title":"Audiopalm: A large language model that can speak and listen","author":"Rubenstein","year":"2023","journal-title":"arXiv preprint arXiv:2306.12925"},{"key":"ref29","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref30","article-title":"Diffwave: A versatile diffusion model for audio synthesis","volume-title":"arXiv preprint arXiv:2009.09761","author":"Kong","year":"2020"},{"key":"ref31","article-title":"Wavegrad: Estimating gradients for waveform generation","author":"Chen","year":"2020","journal-title":"arXiv preprint arXiv:2009.00713"},{"key":"ref32","first-page":"8599","article-title":"Grad-tts: A diffusion probabilistic model for text-to-speech","volume-title":"International Conference on Machine Learning","author":"Popov"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2024-2366"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1214"},{"key":"ref37","article-title":"Seamless: Multilingual expressive and streaming speech translation","author":"Barrault","year":"2023","journal-title":"arXiv preprint arXiv:2312.05187"},{"key":"ref38","article-title":"Classifier-free diffusion guidance","author":"Ho","year":"2022","journal-title":"arXiv preprint arXiv:2207.12598"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-439"},{"key":"ref40","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/jstsp.2022.3188113"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/s10606-017-9283-z"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5946971"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2016-1331"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639550"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.2307\/3001968"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434698.pdf?arnumber=11434698","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:26Z","timestamp":1775192306000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434698\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434698","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}