{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:14:46Z","timestamp":1776885286676,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":79,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62206171 and No. U23B2018"],"award-info":[{"award-number":["No. 62206171 and No. U23B2018"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"name":"Yangtze River Delta Science and Technology Innovation Community Joint Research Project","award":["2024CSJGG01100"],"award-info":[{"award-number":["2024CSJGG01100"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754745","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"9316-9325","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Pseudo-Autoregressive Neural Codec Language Models for Efficient Zero-Shot Text-to-Speech Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-0588-1812","authenticated-orcid":false,"given":"Yifan","family":"Yang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2599-6752","authenticated-orcid":false,"given":"Shujie","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1089-9748","authenticated-orcid":false,"given":"Jinyu","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5688-7488","authenticated-orcid":false,"given":"Yuxuan","family":"Hu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7166-5534","authenticated-orcid":false,"given":"Haibin","family":"Wu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8057-4644","authenticated-orcid":false,"given":"Hui","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2449-1436","authenticated-orcid":false,"given":"Jianwei","family":"Yu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Vancouver, BC, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1028-6017","authenticated-orcid":false,"given":"Lingwei","family":"Meng","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3485-3869","authenticated-orcid":false,"given":"Haiyang","family":"Sun","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4150-0680","authenticated-orcid":false,"given":"Yanqing","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5383-6424","authenticated-orcid":false,"given":"Yan","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7423-617X","authenticated-orcid":false,"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai Innovation Institute, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Keyu An Qian Chen Chong Deng et al. 2024. FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs. arXiv:2407.04051 [cs.SD] https:\/\/arxiv.org\/abs\/2407.04051"},{"key":"e_1_3_2_1_2_1","unstructured":"Philip Anastassiou Jiawei Chen Jitong Chen et al. 2024. Seed-TTS: A Family of High-Quality Versatile Speech Generation Models. arXiv:2406.02430 [eess.AS] https:\/\/arxiv.org\/abs\/2406.02430"},{"key":"e_1_3_2_1_3_1","volume-title":"Proc. ICLR. Virtual.","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Steffen Schneider, and Michael Auli. 2020. vq-wav2vec: Self-supervised learning of discrete speech representations. In Proc. ICLR. Virtual."},{"key":"e_1_3_2_1_4_1","volume-title":"Proc. NeurIPS. Virtual.","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Proc. NeurIPS. Virtual."},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. ICML","author":"Casanova Edresson","year":"2022","unstructured":"Edresson Casanova, Julian Weber, Christopher Dane Shulby, et al. 2022. YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for Everyone. In Proc. ICML. Baltimore."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"e_1_3_2_1_8_1","unstructured":"Sanyuan Chen Shujie Liu Long Zhou et al. 2024. VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers. arXiv:2406.05370 [cs.CL] https:\/\/arxiv.org\/abs\/2406.05370"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Yu Wu et al. 2025. Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. IEEE Transactions on Audio Speech and Language Processing 33 (2025).","DOI":"10.1109\/TASLPRO.2025.3530270"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.313"},{"key":"e_1_3_2_1_12_1","volume-title":"High Fidelity Neural Audio Compression. Transactions on Machine Learning Research","author":"D\u00e9fossez Alexandre","year":"2023","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2023. High Fidelity Neural Audio Compression. Transactions on Machine Learning Research (2023)."},{"key":"e_1_3_2_1_13_1","unstructured":"Wei Deng Siyi Zhou Jingchen Shu et al. 2025. IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System. arXiv:2502.05512 [cs.SD] https:\/\/arxiv.org\/abs\/2502.05512"},{"key":"e_1_3_2_1_14_1","volume-title":"Proc. NAACL-HLT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proc. NAACL-HLT. Minneapolis."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890943"},{"key":"e_1_3_2_1_16_1","unstructured":"Zhihao Du Qian Chen Shiliang Zhang et al. 2024. CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens. arXiv:2407.05407 [cs.CL] https:\/\/arxiv.org\/abs\/2407.05407"},{"key":"e_1_3_2_1_17_1","unstructured":"Zhihao Du Changfeng Gao Yuxuan Wang et al. 2025. CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Post-training. arXiv:2505.17589 [cs.SD] https:\/\/arxiv.org\/abs\/2505.17589"},{"key":"e_1_3_2_1_18_1","unstructured":"Zhihao Du Yuxuan Wang Qian Chen et al. 2024. CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models. arXiv:2412.10117 [cs.CL] https:\/\/arxiv.org\/abs\/2412.10117"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447523"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832320"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389766"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. ICASSP","author":"Graves Alex","unstructured":"Alex Graves, Abdel-rahman Mohamed, and Geoffrey E. Hinton. 2013. Speech recognition with deep recurrent neural networks. In Proc. ICASSP. Vancouver."},{"key":"e_1_3_2_1_24_1","unstructured":"Hao-Han Guo Kun Liu Fei-Yu Shen et al. 2024. FireRedTTS: A Foundation Text-To-Speech Framework for Industry-Level Generative Speech Applications. arXiv:2409.03283 [cs.SD] https:\/\/arxiv.org\/abs\/2409.03283"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414423"},{"key":"e_1_3_2_1_26_1","volume-title":"Audio Imagination: NeurIPS 2024 Workshop on AI-Driven Speech, Music, and Sound Generation","author":"Han Bing","year":"2024","unstructured":"Bing Han, Long Zhou, Shujie Liu, et al. 2024. VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment. In Audio Imagination: NeurIPS 2024 Workshop on AI-Driven Speech, Music, and Sound Generation. Vancouver. https:\/\/openreview.net\/forum?id=xvORqaYDgL"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832365"},{"key":"e_1_3_2_1_28_1","volume-title":"Yao Hung Hubert Tsai, et al","author":"Hsu Wei Ning","year":"2021","unstructured":"Wei Ning Hsu, Benjamin Bolte, Yao Hung Hubert Tsai, et al. 2021. HuBERT: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 29 (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-469"},{"key":"e_1_3_2_1_30_1","volume-title":"Proc. ICLR","author":"Ji Shengpeng","year":"2025","unstructured":"Shengpeng Ji, Ziyue Jiang, Xize Cheng, et al. 2025. WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling. In Proc. ICLR. Singapore."},{"key":"e_1_3_2_1_31_1","volume-title":"Proc. ICML","author":"Jia Dongya","year":"2025","unstructured":"Dongya Jia, Zhuo Chen, Jiawei Chen, et al. 2025. DiTAR: Diffusion Transformer Autoregressive Modeling for Speech Generation. In Proc. ICML. Vancouver."},{"key":"e_1_3_2_1_32_1","volume-title":"Proc. ICLR","author":"Jiang Ziyue","year":"2024","unstructured":"Ziyue Jiang, Jinglin Liu, Yi Ren, et al. 2024. Mega-TTS 2: Boosting Prompting Mechanisms for Zero-Shot Speech Synthesis. In Proc. ICLR. Vienna."},{"key":"e_1_3_2_1_33_1","unstructured":"Ziyue Jiang Yi Ren Ruiqi Li et al. 2025. MegaTTS 3: Sparse Alignment Enhanced Latent Diffusion Transformer for Zero-Shot Speech Synthesis. arXiv:2502.18924 [eess.AS] https:\/\/arxiv.org\/abs\/2502.18924"},{"key":"e_1_3_2_1_34_1","unstructured":"Ziyue Jiang Yi Ren Zhenhui Ye et al. 2023. Mega-TTS: Zero-Shot Text-to-Speech at Scale with Intrinsic Inductive Bias. arXiv:2306.03509 [eess.AS] https:\/\/arxiv.org\/abs\/2306.03509"},{"key":"e_1_3_2_1_35_1","volume-title":"Proc. ICML","author":"Ju Zeqian","year":"2024","unstructured":"Zeqian Ju, Yuancheng Wang, Kai Shen, et al. 2024. NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models. In Proc. ICML. Vienna."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447120"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Eugene Kharitonov Damien Vincent Zal\u00e1n Borsos et al. 2023. Speak Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision. Trans. Assoc. Comput. Linguistics 11 (2023).","DOI":"10.1162\/tacl_a_00618"},{"key":"e_1_3_2_1_39_1","volume-title":"Proc. ICML. Virtual.","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In Proc. ICML. Virtual."},{"key":"e_1_3_2_1_40_1","volume-title":"Proc. ICLR","author":"Kim Jaehyeon","year":"2024","unstructured":"Jaehyeon Kim, Keon Lee, Seungjun Chung, and Jaewoong Cho. 2024. CLaM-TTS: Improving Neural Codec Language Model for Zero-Shot Text-to-Speech. In Proc. ICLR. Vienna."},{"key":"e_1_3_2_1_41_1","volume-title":"Proc. NeurIPS. Virtual.","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis. In Proc. NeurIPS. Virtual."},{"key":"e_1_3_2_1_42_1","volume-title":"Proc. NeurIPS. New Orleans.","author":"Kumar Rithesh","year":"2023","unstructured":"Rithesh Kumar, Prem Seetharaman, Alejandro Luebs, et al. 2023. High-Fidelity Audio Compression with Improved RVQGAN. In Proc. NeurIPS. New Orleans."},{"key":"e_1_3_2_1_43_1","volume-title":"BASE TTS: Lessons from building a billion-parameter Text-to-Speech model on 100K hours of data. arXiv:2402.08093 [cs.LG] https:\/\/arxiv.org\/abs\/2402.08093","author":"Lajszczak Mateusz","year":"2024","unstructured":"Mateusz Lajszczak, Guillermo C\u00e1mbara, Yang Li, et al. 2024. BASE TTS: Lessons from building a billion-parameter Text-to-Speech model on 100K hours of data. arXiv:2402.08093 [cs.LG] https:\/\/arxiv.org\/abs\/2402.08093"},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. NeurIPS. New Orleans.","author":"Le Matthew","year":"2023","unstructured":"Matthew Le, Apoorv Vyas, Bowen Shi, et al. 2023. Voicebox: Text-Guided Multilingual Universal Speech Generation at Scale. In Proc. NeurIPS. New Orleans."},{"key":"e_1_3_2_1_45_1","volume-title":"Proc. ICLR","author":"Lee Keon","year":"2025","unstructured":"Keon Lee, Dong Won Kim, Jaehyeon Kim, et al. 2025. DiTTo-TTS: Diffusion Transformers for Scalable Text-to-Speech without Domain-Specific Factors. In Proc. ICLR. Singapore."},{"key":"e_1_3_2_1_46_1","volume-title":"Proc. ICASSP. Hyderabad.","author":"Li Bohan","year":"2025","unstructured":"Bohan Li, Hankun Wang, Situo Zhang, et al. 2025. Fast and High-Quality Autoregressive Speech Synthesis via Speculative Decoding. In Proc. ICASSP. Hyderabad."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"e_1_3_2_1_48_1","unstructured":"Zhijun Liu Shuai Wang Sho Inoue et al. 2024. Autoregressive Diffusion Transformer for Text-to-Speech Synthesis. arXiv:2406.05551 [eess.AS] https:\/\/arxiv.org\/abs\/2406.05551"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890230"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2343"},{"key":"e_1_3_2_1_51_1","unstructured":"Zhengrui Ma Yang Feng Chenze Shao et al. 2025. Efficient Speech Language Modeling via Energy Distance in Continuous Latent Space. arXiv:2505.13181 [cs.CL] https:\/\/arxiv.org\/abs\/2505.13181"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.65"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_54_1","unstructured":"Julian D. Parker Anton Smirnov Jordi Pons et al. 2024. Scaling Transformers for Low-Bitrate High-Quality Speech Coding. arXiv:2411.19842 [eess.AS] https:\/\/arxiv.org\/abs\/2411.19842"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"e_1_3_2_1_56_1","volume-title":"Proc. ICML. Honolulu.","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, et al. 2023. Robust speech recognition via large-scale weak supervision. In Proc. ICML. Honolulu."},{"key":"e_1_3_2_1_57_1","volume-title":"Proc. ICLR. Virtual.","author":"Ren Yi","year":"2021","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, et al. 2021. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. In Proc. ICLR. Virtual."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_59_1","volume-title":"Proc. ICLR","author":"Shen Kai","year":"2024","unstructured":"Kai Shen, Zeqian Ju, Xu Tan, et al. 2024. NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers. In Proc. ICLR. Vienna."},{"key":"e_1_3_2_1_60_1","unstructured":"Zhengyan Sheng Zhihao Du Shiliang Zhang et al. 2025. SyncSpeech: Low Latency and Efficient Dual-Stream Text-to-Speech based on Temporal Masked Transformer. arXiv:2502.11094 [cs.SD] https:\/\/arxiv.org\/abs\/2502.11094"},{"key":"e_1_3_2_1_61_1","unstructured":"Xingchen Song Mengtao Xing Changwei Ma et al. 2024. TouchTTS: An Embarrassingly Simple TTS Framework that Everyone Can Touch. arXiv:2412.08237 [cs.SD] https:\/\/arxiv.org\/abs\/2412.08237"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34703"},{"key":"e_1_3_2_1_63_1","unstructured":"Haiyang Sun Shujie Hu Shujie Liu et al. 2025. Zero-Shot Streaming Text to Speech Synthesis with Transducer and Auto-Regressive Modeling. arXiv:2505.19669 [cs.LG] https:\/\/arxiv.org\/abs\/2505.19669"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3356232"},{"key":"e_1_3_2_1_65_1","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu et al. 2023. Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. arXiv:2301.02111 [cs.CL] https:\/\/arxiv.org\/abs\/2301.02111"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755494"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Hui Wang Yifan Yang Shujie Liu et al. 2025. StreamMel: Real-Time Zero-Shot Text-to-Speech via Interleaved Continuous Autoregressive Modeling. arXiv:2506.12570 [cs.SD] https:\/\/arxiv.org\/abs\/2506.12570","DOI":"10.1109\/LSP.2025.3600376"},{"key":"e_1_3_2_1_68_1","volume-title":"Proc. ICLR","author":"Wang Yuancheng","year":"2024","unstructured":"Yuancheng Wang, Haoyue Zhan, Liwei Liu, et al. 2024. MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer. In Proc. ICLR. Singapore."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"e_1_3_2_1_70_1","volume-title":"Sefik Emre Eskimez, and Jinyu Li","author":"Wu Haibin","year":"2024","unstructured":"Haibin Wu, Naoyuki Kanda, Sefik Emre Eskimez, and Jinyu Li. 2024. TS3-Codec: Transformer-Based Simple Streaming Single Codec. arXiv:2411.18803 [eess.AS] https:\/\/arxiv.org\/abs\/2411.18803"},{"key":"e_1_3_2_1_71_1","unstructured":"Yifan Yang Ziyang Ma Shujie Liu Jinyu Li et al. 2024. Interleaved Speech Text Language Models are Simple Streaming Text to Speech Synthesizers. arXiv:2412.16102 [eess.AS] https:\/\/arxiv.org\/abs\/2412.16102"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.135"},{"key":"e_1_3_2_1_74_1","volume-title":"Proc. ICLR","author":"Yao Zengwei","year":"2024","unstructured":"Zengwei Yao, Liyong Guo, Xiaoyu Yang, et al. 2024. Zipformer: A Faster and Better Encoder for Automatic Speech Recognition. In Proc. ICLR. Vienna."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"crossref","unstructured":"Neil Zeghidour Alejandro Luebs Ahmed Omran et al. 2022. SoundStream: An End-to-End Neural Audio Codec. IEEE ACM Trans. Audio Speech Lang. Process. 30 (2022).","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"e_1_3_2_1_77_1","unstructured":"Bowen Zhang Congchao Guo Geng Yang et al. 2025. MiniMax-Speech: Intrinsic Zero-Shot Text-to-Speech with a Learnable Speaker Encoder. arXiv:2505.07916 [eess.AS] https:\/\/arxiv.org\/abs\/2505.07916"},{"key":"e_1_3_2_1_78_1","unstructured":"Siyi Zhou Yiquan Zhou Yi He et al. 2025. IndexTTS2: A Breakthrough in Emotionally Expressive and Duration-Controlled Auto-Regressive Zero-Shot Text-to-Speech. arXiv:2506.21619 [cs.CL] https:\/\/arxiv.org\/abs\/2506.21619"},{"key":"e_1_3_2_1_79_1","unstructured":"Han Zhu Wei Kang Zengwei Yao et al. 2025. ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching. arXiv:2506.13053 [eess.AS] https:\/\/arxiv.org\/abs\/2506.13053"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754745","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:03:49Z","timestamp":1765343029000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754745"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":79,"alternative-id":["10.1145\/3746027.3754745","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754745","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}