{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:14:54Z","timestamp":1776885294234,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","funder":[{"name":"the National Key R&D Program of China","award":["2022ZD0116307"],"award-info":[{"award-number":["2022ZD0116307"]}]},{"name":"NSF China","award":["62271270"],"award-info":[{"award-number":["62271270"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755494","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"10229-10238","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["FELLE: Autoregressive Speech Synthesis with Token-Wise Coarse-to-Fine Flow Matching"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8057-4644","authenticated-orcid":false,"given":"Hui","family":"Wang","sequence":"first","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2599-6752","authenticated-orcid":false,"given":"Shujie","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1028-6017","authenticated-orcid":false,"given":"Lingwei","family":"Meng","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1089-9748","authenticated-orcid":false,"given":"Jinyu","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0588-1812","authenticated-orcid":false,"given":"Yifan","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5068-025X","authenticated-orcid":false,"given":"Shiwan","family":"Zhao","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3485-3869","authenticated-orcid":false,"given":"Haiyang","family":"Sun","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4150-0680","authenticated-orcid":false,"given":"Yanqing","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8554-8969","authenticated-orcid":false,"given":"Haoqin","family":"Sun","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4819-4572","authenticated-orcid":false,"given":"Jiaming","family":"Zhou","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5383-6424","authenticated-orcid":false,"given":"Yan","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2748-3020","authenticated-orcid":false,"given":"Yong","family":"Qin","sequence":"additional","affiliation":[{"name":"College of Computer Science, Nankai University, Tianjin, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, 
Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Zal\u00e1n Borsos Rapha\u00ebl Marinier Damien Vincent Eugene Kharitonov Olivier Pietquin Matt Sharifi Dominik Roblek Olivier Teboul David Grangier Marco Tagliasacchi et al. 2023. Audiolm: a language modeling approach to audio generation. IEEE\/ACM transactions on audio speech and language processing Vol. 31 (2023).","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"e_1_3_2_1_3_1","volume-title":"Explicitly Minimizing the Blur Error of Variational Autoencoders. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=9krnQ-ue9M","author":"Bredell Gustav","year":"2023","unstructured":"Gustav Bredell, Kyriakos Flouris, Krishna Chaitanya, Ertunc Erdil, and Ender Konukoglu. 2023. Explicitly Minimizing the Blur Error of Variational Autoencoders. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=9krnQ-ue9M"},{"key":"e_1_3_2_1_4_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020)."},{"key":"e_1_3_2_1_5_1","volume-title":"VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers. arXiv preprint arXiv:2406.05370","author":"Chen Sanyuan","year":"2024","unstructured":"Sanyuan Chen, Shujie Liu, Long Zhou, Yanqing Liu, Xu Tan, Jinyu Li, Sheng Zhao, Yao Qian, and Furu Wei. 2024a. VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers. 
arXiv preprint arXiv:2406.05370 (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3530270"},{"key":"e_1_3_2_1_8_1","volume-title":"F5-tts: A fairytaler that fakes fluent and faithful speech with flow matching. arXiv preprint arXiv:2410.06885","author":"Chen Yushen","year":"2024","unstructured":"Yushen Chen, Zhikang Niu, Ziyang Ma, Keqi Deng, Chunhui Wang, Jian Zhao, Kai Yu, and Xie Chen. 2024b. F5-tts: A fairytaler that fakes fluent and faithful speech with flow matching. arXiv preprint arXiv:2410.06885 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746395"},{"key":"e_1_3_2_1_10_1","volume-title":"Moshi: a speech-text foundation model for real-time dialogue. arXiv preprint arXiv:2410.00037","author":"D\u00e9fossez Alexandre","year":"2024","unstructured":"Alexandre D\u00e9fossez, Laurent Mazar\u00e9, Manu Orsini, Am\u00e9lie Royer, Patrick P\u00e9rez, Herv\u00e9 J\u00e9gou, Edouard Grave, and Neil Zeghidour. 2024. Moshi: a speech-text foundation model for real-time dialogue. arXiv preprint arXiv:2410.00037 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens. arXiv preprint arXiv:2407.05407","author":"Du Zhihao","year":"2024","unstructured":"Zhihao Du, Qian Chen, Shiliang Zhang, Kai Hu, Heng Lu, Yexin Yang, Hangrui Hu, Siqi Zheng, Yue Gu, Ziyang Ma, et al., 2024a. Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens. arXiv preprint arXiv:2407.05407 (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"Zhihao Du Yuxuan Wang Qian Chen Xian Shi Xiang Lv Tianyu Zhao Zhifu Gao Yexin Yang Changfeng Gao Hui Wang et al. 2024b. Cosyvoice 2: Scalable streaming speech synthesis with large language models. 
arXiv preprint arXiv:2412.10117 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"e_1_3_2_1_14_1","volume-title":"VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment. arXiv preprint arXiv:2406.07855","author":"Han Bing","year":"2024","unstructured":"Bing Han, Long Zhou, Shujie Liu, Sanyuan Chen, Lingwei Meng, Yanming Qian, Yanqing Liu, Sheng Zhao, Jinyu Li, and Furu Wei. 2024. VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment. arXiv preprint arXiv:2406.07855 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_16_1","volume-title":"Yao Hung Hubert Tsai, et al","author":"Hsu Wei Ning","year":"2021","unstructured":"Wei Ning Hsu, Benjamin Bolte, Yao Hung Hubert Tsai, et al., 2021. HuBERT: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, Vol. 29 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Ju Zeqian","year":"2024","unstructured":"Zeqian Ju, Yuancheng Wang, Kai Shen, Xu Tan, Detai Xin, Dongchao Yang, Eric Liu, Yichong Leng, Kaitao Song, Siliang Tang, et al., 2024. NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"e_1_3_2_1_19_1","volume-title":"CLaM-TTS: Improving Neural Codec Language Model for Zero-Shot Text-to-Speech. 
In The Twelfth International Conference on Learning Representations.","author":"Kim Jaehyeon","year":"2024","unstructured":"Jaehyeon Kim, Keon Lee, Seungjun Chung, and Jaewoong Cho. 2024. CLaM-TTS: Improving Neural Codec Language Model for Zero-Shot Text-to-Speech. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis. In Advances in Neural Information Processing Systems, Vol. 33."},{"key":"e_1_3_2_1_22_1","volume-title":"BASE TTS: Lessons from building a billion-parameter text-to-speech model on 100K hours of data. arXiv preprint arXiv:2402.08093","author":"\u0141ajszczak Mateusz","year":"2024","unstructured":"Mateusz \u0141ajszczak, Guillermo C\u00e1mbara, Yang Li, Fatih Beyhan, Arent van Korlaar, Fan Yang, Arnaud Joly, \u00c1lvaro Mart\u00edn-Cortinas, Ammar Abbas, Adam Michalski, et al., 2024. BASE TTS: Lessons from building a billion-parameter text-to-speech model on 100K hours of data. arXiv preprint arXiv:2402.08093 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Voicebox: Text-guided multilingual universal speech generation at scale. Advances in neural information processing systems","author":"Le Matthew","year":"2024","unstructured":"Matthew Le, Apoorv Vyas, Bowen Shi, Brian Karrer, Leda Sari, Rashel Moritz, Mary Williamson, Vimal Manohar, Yossi Adi, Jay Mahadeokar, et al., 2024. Voicebox: Text-guided multilingual universal speech generation at scale. 
Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Heli Ben-Hamu, Maximilian Nickel, and Matt Le.","author":"Lipman Yaron","year":"2022","unstructured":"Yaron Lipman, Ricky TQ Chen, Heli Ben-Hamu, Maximilian Nickel, and Matt Le. 2022. Flow matching for generative modeling. arXiv preprint arXiv:2210.02747 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"MusicEval: A Generative Music Corpus with Expert Ratings for Automatic Text-to-Music Evaluation. arXiv preprint arXiv:2501.10811","author":"Liu Cheng","year":"2025","unstructured":"Cheng Liu, Hui Wang, Jinghua Zhao, Shiwan Zhao, Hui Bu, Xin Xu, Jiaming Zhou, Haoqin Sun, and Yong Qin. 2025. MusicEval: A Generative Music Corpus with Expert Ratings for Automatic Text-to-Music Evaluation. arXiv preprint arXiv:2501.10811 (2025)."},{"key":"e_1_3_2_1_26_1","unstructured":"Zhengrui Ma Yang Feng Chenze Shao et al. 2025. Efficient Speech Language Modeling via Energy Distance in Continuous Latent Space. arXiv:2505.13181 [cs.CL] https:\/\/arxiv.org\/abs\/2505.13181"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"key":"e_1_3_2_1_28_1","unstructured":"Lingwei Meng Long Zhou Shujie Liu et al. 2024. Autoregressive speech synthesis without vector quantization. arXiv preprint arXiv:2407.08551 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR.","author":"Nichol Alexander Quinn","year":"2021","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In International conference on machine learning. PMLR."},{"key":"e_1_3_2_1_30_1","volume-title":"Librispeech: An ASR corpus based on public domain audio books. In ICASSP.","author":"Panayotov Vassil","year":"2015","unstructured":"Vassil Panayotov, Guoguo Chen, Daniel Povey, et al., 2015. Librispeech: An ASR corpus based on public domain audio books. 
In ICASSP."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446998"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.564"},{"key":"e_1_3_2_1_33_1","volume-title":"ELLA-V: Stable Neural Codec Language Modeling with Alignment-guided Sequence Reordering. arXiv preprint arXiv:2401.07333","author":"Song Yakun","year":"2024","unstructured":"Yakun Song, Zhuo Chen, Xiaofei Wang, Ziyang Ma, and Xie Chen. 2024. ELLA-V: Stable Neural Codec Language Modeling with Alignment-guided Sequence Reordering. arXiv preprint arXiv:2401.07333 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Artificial Intelligence and Statistics (AISTATS). PMLR.","author":"Jakub","unstructured":"Jakub M. Tomczak and Max Welling. 2018. VAE with a VampPrior. In International Conference on Artificial Intelligence and Statistics (AISTATS). PMLR."},{"key":"e_1_3_2_1_36_1","volume-title":"Continuous Speech Synthesis using per-token Latent Diffusion. arXiv preprint arXiv:2410.16048","author":"Turetzky Arnon","year":"2024","unstructured":"Arnon Turetzky, Nimrod Shabtay, Slava Shechtman, Hagai Aronowitz, David Haws, Ron Hoory, and Avihu Dekel. 2024. Continuous Speech Synthesis using per-token Latent Diffusion. arXiv preprint arXiv:2410.16048 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Melnet: A generative model for audio in the frequency domain. 
arXiv preprint arXiv:1906.01083","author":"Vasquez Sean","year":"2019","unstructured":"Sean Vasquez and Mike Lewis. 2019. Melnet: A generative model for audio in the frequency domain. arXiv preprint arXiv:1906.01083 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-851"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3552957"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-937"},{"key":"e_1_3_2_1_41_1","volume-title":"Intermediate-Task Learning with Pretrained Model for Synthesized Speech MOS Prediction. In 2023 IEEE International Conference on Multimedia and Expo (ICME). IEEE.","author":"Wang Hui","year":"2023","unstructured":"Hui Wang, Xiguang Zheng, and Yong Qin. 2023b. Intermediate-Task Learning with Pretrained Model for Synthesized Speech MOS Prediction. In 2023 IEEE International Conference on Multimedia and Expo (ICME). IEEE."},{"key":"e_1_3_2_1_42_1","volume-title":"MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ExuBFYtCQU","author":"Wang Yuancheng","year":"2025","unstructured":"Yuancheng Wang, Haoyue Zhan, Liwei Liu, Ruihong Zeng, Haotian Guo, Jiachen Zheng, Qiang Zhang, Xueyao Zhang, Shunsi Zhang, and Zhizheng Wu. 2025a. MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ExuBFYtCQU"},{"key":"e_1_3_2_1_43_1","unstructured":"Detai Xin Xu Tan Kai Shen Zeqian Ju Dongchao Yang Yuancheng Wang Shinnosuke Takamichi Hiroshi Saruwatari Shujie Liu Jinyu Li et al. 2024. RALL-E: Robust Codec Language Modeling with Chain-of-Thought Prompting for Text-to-Speech Synthesis. 
arXiv preprint arXiv:2404.03204 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"Yifan Yang Ziyang Ma Shujie Liu Jinyu Li Hui Wang Lingwei Meng Haiyang Sun Yuzhe Liang Ruiyang Xu Yuxuan Hu et al. 2024. Interleaved Speech-Text Language Models are Simple Streaming Text to Speech Synthesizers. arXiv preprint arXiv:2412.16102 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"SpeechGPT-Gen: Scaling Chain-of-Information Speech Generation. arXiv preprint arXiv:2401.13527","author":"Zhang Dong","year":"2024","unstructured":"Dong Zhang, Xin Zhang, Jun Zhan, Shimin Li, Yaqian Zhou, and Xipeng Qiu. 2024. SpeechGPT-Gen: Scaling Chain-of-Information Speech Generation. arXiv preprint arXiv:2401.13527 (2024)."},{"key":"e_1_3_2_1_46_1","unstructured":"Ziqiang Zhang Long Zhou Chengyi Wang Sanyuan Chen Yu Wu Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Speak foreign languages with your own voice: Cross-lingual neural codec language modeling. arXiv preprint arXiv:2303.03926 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Autoregressive Speech Synthesis with Next-Distribution Prediction. arXiv preprint arXiv:2412.16846","author":"Zhu Xinfa","year":"2024","unstructured":"Xinfa Zhu, Wenjie Tian, and Lei Xie. 2024. Autoregressive Speech Synthesis with Next-Distribution Prediction. 
arXiv preprint arXiv:2412.16846 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755494","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:19:15Z","timestamp":1765307955000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755494"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":47,"alternative-id":["10.1145\/3746027.3755494","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755494","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}