{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T18:10:07Z","timestamp":1748455807042,"version":"3.41.0"},"reference-count":36,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icasspw65056.2025.11011026","type":"proceedings-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T17:05:14Z","timestamp":1748365514000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["StableTTS: Towards Efficient Denoising Acoustic Decoder for Text to Speech Synthesis with Consistency Flow Matching"],"prefix":"10.1109","author":[{"given":"Zhiyong","family":"Chen","sequence":"first","affiliation":[{"name":"Shanghai University,Shanghai,China"}]},{"given":"Xinnuo","family":"Li","sequence":"additional","affiliation":[{"name":"New York University,New York,USA"}]},{"given":"Shuhang","family":"Wu","sequence":"additional","affiliation":[{"name":"Shanghai University,Shanghai,China"}]},{"given":"Zhi","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai University,Shanghai,China"}]},{"given":"Zhiqi","family":"Ai","sequence":"additional","affiliation":[{"name":"Shanghai University,Shanghai,China"}]},{"given":"Shugong","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai University,Shanghai,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"article-title":"Consistency Flow Matching: Defining Straight Flows with Velocity Consistency","year":"2024","author":"Yang","key":"ref2"},{"article-title":"Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone","volume-title":"International Conference on Machine Learning","author":"Casanova","key":"ref3"},{"article-title":"Hierspeech++: Bridging the gap between semantic and acoustic representation of speech by hierarchical variational inference for zero-shot speech synthesis","year":"2023","author":"Lee","key":"ref4"},{"volume-title":"MARS5-TTS","year":"2023","key":"ref5"},{"article-title":"NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers","volume-title":"The Twelfth International Conference on Learning Representations","author":"Shen","key":"ref6"},{"article-title":"Neural codec language models are zero-shot text to speech synthesizers","year":"2023","author":"Wang","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"article-title":"Grad-tts: A diffusion probabilistic model for text-to-speech","volume-title":"International Conference on Machine Learning","author":"Popov","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"author":"Chen","key":"ref12","article-title":"Bridge-TTS: Text-to-Speech Synthesis with Schrodinger Bridge"},{"article-title":"BASE TTS: Lessons from building a billion-parameter text-to-speech model on 100K hours of data","year":"2024","author":"\u0141ajszczak","key":"ref13"},{"article-title":"Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens","year":"2024","author":"Du","key":"ref14"},{"article-title":"Autoregressive Speech Synthesis without Vector Quantization","year":"2024","author":"Meng","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-469"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10889"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612061"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096710"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445948"},{"key":"ref21","article-title":"P-flow: a fast and data-efficient zero-shot TTS through speech prompting","volume":"36","author":"Kim","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Neural codec language models are zero-shot text to speech synthesizers[J]","year":"2023","author":"Wang","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3530270"},{"article-title":"Soundstorm: Efficient parallel audio generation[J]","year":"2023","author":"Borsos","key":"ref24"},{"article-title":"Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow[C]","volume-title":"NeurIPS 2022 Workshop on Score-Based Methods","author":"Liu","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref27","first-page":"7748","article-title":"Meta-stylespeech: Multi-speaker adaptive text-to-speech generation[C]","volume-title":"International Conference on Machine Learning","author":"Min"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414878"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445948"},{"volume-title":"GitHub - Plachtaa\/Seed-vc: State-of-the-Art Zero-shot Voice Conversion and Singing Voice Conversion With in Context Learning","key":"ref31"},{"article-title":"Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis","year":"2023","author":"Siuzdak","key":"ref32"},{"volume-title":"GitHub - Resemble-ai\/Resemblyzer: A Python Package to Analyze and Compare Voices With Deep Learning","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"article-title":"Flow straight and fast: Learning to generate and transfer data with rectified flow","volume-title":"The Eleventh International Conference on Learning Representations","author":"Liu","key":"ref35"},{"key":"ref36","first-page":"32211","article-title":"Consistency models","volume-title":"International Conference on Machine Learning","author":"Song"}],"event":{"name":"2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","start":{"date-parts":[[2025,4,6]]},"location":"Hyderabad, India","end":{"date-parts":[[2025,4,11]]}},"container-title":["2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11010992\/11010997\/11011026.pdf?arnumber=11011026","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T17:50:15Z","timestamp":1748454615000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11011026\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/icasspw65056.2025.11011026","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}