{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,4]],"date-time":"2026-07-04T16:38:18Z","timestamp":1783183098250,"version":"3.54.6"},"reference-count":35,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2022ZD0116307"],"award-info":[{"award-number":["2022ZD0116307"]}]},{"name":"NSF China","award":["62271270"],"award-info":[{"award-number":["62271270"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/lsp.2025.3600376","type":"journal-article","created":{"date-parts":[[2025,8,19]],"date-time":"2025-08-19T18:18:44Z","timestamp":1755627524000},"page":"3530-3534","source":"Crossref","is-referenced-by-count":3,"title":["StreamMel: Real-Time Zero-Shot Text-to-Speech Via Interleaved Continuous Autoregressive Modeling"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8057-4644","authenticated-orcid":false,"given":"Hui","family":"Wang","sequence":"first","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0588-1812","authenticated-orcid":false,"given":"Yifan","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2599-6752","authenticated-orcid":false,"given":"Shujie","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1089-9748","authenticated-orcid":false,"given":"Jinyu","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lingwei","family":"Meng","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0476-8020","authenticated-orcid":false,"given":"Yanqing","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4819-4572","authenticated-orcid":false,"given":"Jiaming","family":"Zhou","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haoqin","family":"Sun","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5383-6424","authenticated-orcid":false,"given":"Yan","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2748-3020","authenticated-orcid":false,"given":"Yong","family":"Qin","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref3","article-title":"Flow matching for generative modeling","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Lipman","year":"2023"},{"key":"ref4","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Ho","year":"2020"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3530270"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3530270"},{"key":"ref7","article-title":"CosyVoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens","author":"Du","year":"2024"},{"key":"ref8","article-title":"Seed-TTS: A family of high-quality versatile speech generation models","author":"Anastassiou","year":"2024"},{"key":"ref9","article-title":"CosyVoice 3: Towards in-the-wild speech generation via scaling-up and post-training","author":"Du","year":"2025"},{"key":"ref10","article-title":"Pseudo-autoregressive neural codec language models for efficient zero-shot text-to-speech synthesis","volume-title":"Proc. ACMMM","author":"Yang","year":"2025"},{"key":"ref11","article-title":"Efficient speech language modeling via energy distance in continuous latent space","author":"Ma","year":"2025"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.65"},{"key":"ref13","article-title":"CosyVoice 2: Scalable streaming speech synthesis with large language models","author":"Du","year":"2024"},{"key":"ref14","article-title":"Interleaved speech-text language models are simple streaming text to speech synthesizers","author":"Yang","year":"2024"},{"key":"ref15","article-title":"Zero-shot streaming text to speech synthesis with transducer and auto-regressive modeling","author":"Sun","year":"2025"},{"key":"ref16","article-title":"SyncSpeech: Low-latency and efficient dual-stream text-to-speech based on temporal masked transformer","author":"Sheng","year":"2025"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446998"},{"key":"ref18","article-title":"Autoregressive diffusion transformer for text-to-speech synthesis","author":"Liu","year":"2024"},{"key":"ref19","article-title":"Autoregressive image generation without vector quantization","author":"Li","year":"2024"},{"key":"ref20","article-title":"FELLE: Autoregressive speech synthesis with token-wise coarse-to-fine flow matching","volume-title":"Proc. ACMMM","author":"Wang","year":"2025"},{"key":"ref21","article-title":"DiTAR: Diffusion transformer autoregressive modeling for speech generation","author":"Jia","year":"2025"},{"key":"ref22","article-title":"Auto-encoding variational Bayes","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","year":"2014"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref25","article-title":"VALL-E R: Robust and efficient zero-shot text-to-speech synthesis via monotonic alignment","author":"Han","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.313"},{"key":"ref27","article-title":"MaskGCT: Zero-shot text-to-speech with masked generative codec transformer","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wang","year":"2024"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-851"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3552957"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref32","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447120"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832365"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/97\/10802935\/11129630.pdf?arnumber=11129630","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T05:42:43Z","timestamp":1758087763000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11129630\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/lsp.2025.3600376","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"value":"1070-9908","type":"print"},{"value":"1558-2361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}