{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:54Z","timestamp":1765339794793,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755816","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"10632-10641","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Speech Token Prediction via Compressed-to-fine Language Modeling for Speech Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-5940-5369","authenticated-orcid":false,"given":"Wenrui","family":"Liu","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6939-7438","authenticated-orcid":false,"given":"Qian","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0356-1968","authenticated-orcid":false,"given":"Wen","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Sunnyvale, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3614-1346","authenticated-orcid":false,"given":"Guanrou","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0317-727X","authenticated-orcid":false,"given":"Weiqin","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6488-9695","authenticated-orcid":false,"given":"Minghui","family":"Fang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6876-9943","authenticated-orcid":false,"given":"Jialong","family":"Zuo","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7297-4536","authenticated-orcid":false,"given":"Xiaoda","family":"Yang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3564-1628","authenticated-orcid":false,"given":"Tao","family":"Jin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1409-6731","authenticated-orcid":false,"given":"Jin","family":"Xu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6262-9435","authenticated-orcid":false,"given":"Zemin","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4446-1391","authenticated-orcid":false,"given":"Yafeng","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7106-513X","authenticated-orcid":false,"given":"Jionghao","family":"Bai","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6037-2299","authenticated-orcid":false,"given":"Zhifang","family":"Guo","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Funaudiollm: Voice understanding and generation foundation models for natural interaction between humans and llms. arXiv preprint arXiv:2407.04051","author":"An Keyu","year":"2024","unstructured":"Keyu An, Qian Chen, Chong Deng, Zhihao Du, Changfeng Gao, Zhifu Gao, Yue Gu, Ting He, Hangrui Hu, Kai Hu, et al., 2024. Funaudiollm: Voice understanding and generation foundation models for natural interaction between humans and llms. arXiv preprint arXiv:2407.04051 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_2_1","DOI":"10.1121\/1.2188331"},{"key":"e_1_3_2_1_3_1","volume-title":"Syllablelm: Learning coarse semantic units for speech language models. arXiv preprint arXiv:2410.04029","author":"Baade Alan","year":"2024","unstructured":"Alan Baade, Puyuan Peng, and David Harwath. 2024. Syllablelm: Learning coarse semantic units for speech language models. arXiv preprint arXiv:2410.04029 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"MARS6: A Small and Robust Hierarchical-Codec Text-to-Speech Model. arXiv preprint arXiv:2501.05787","author":"Baas Matthew","year":"2025","unstructured":"Matthew Baas, Pieter Scholtz, Arnav Mehta, Elliott Dyson, Akshat Prakash, and Herman Kamper. 2025. MARS6: A Small and Robust Hierarchical-Codec Text-to-Speech Model. arXiv preprint arXiv:2501.05787 (2025)."},{"key":"e_1_3_2_1_5_1","volume-title":"Soundstorm: Efficient parallel audio generation. arXiv preprint arXiv:2305.09636","author":"Borsos Zal\u00e1n","year":"2023","unstructured":"Zal\u00e1n Borsos, Matt Sharifi, Damien Vincent, Eugene Kharitonov, Neil Zeghidour, and Marco Tagliasacchi. 2023. Soundstorm: Efficient parallel audio generation. arXiv preprint arXiv:2305.09636 (2023)."},{"doi-asserted-by":"crossref","unstructured":"Edresson Casanova Kelly Davis Eren G\u00f6lge G\u00f6rkem G\u00f6knar Iulian Gulea Logan Hart Aya Aljafari Joshua Meyer Reuben Morais Samuel Olayemi et al. 2024. Xtts: a massively multilingual zero-shot text-to-speech model. arXiv preprint arXiv:2406.04904 (2024).","key":"e_1_3_2_1_6_1","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"e_1_3_2_1_7_1","volume-title":"Minmo: A multimodal large language model for seamless voice interaction. arXiv preprint arXiv:2501.06282","author":"Chen Qian","year":"2025","unstructured":"Qian Chen, Yafeng Chen, Yanni Chen, Mengzhe Chen, Yingda Chen, Chong Deng, Zhihao Du, Ruize Gao, Changfeng Gao, Zhifu Gao, et al., 2025a. Minmo: A multimodal large language model for seamless voice interaction. arXiv preprint arXiv:2501.06282 (2025)."},{"key":"e_1_3_2_1_8_1","volume-title":"Vall-e 2: Neural codec language models are human parity zero-shot text to speech synthesizers. arXiv preprint arXiv:2406.05370","author":"Chen Sanyuan","year":"2024","unstructured":"Sanyuan Chen, Shujie Liu, Long Zhou, Yanqing Liu, Xu Tan, Jinyu Li, Sheng Zhao, Yao Qian, and Furu Wei. 2024a. Vall-e 2: Neural codec language models are human parity zero-shot text to speech synthesizers. arXiv preprint arXiv:2406.05370 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/TASLPRO.2025.3530270"},{"unstructured":"Yafeng Chen Siqi Zheng Hui Wang Luyao Cheng et al. 2024b. 
3D-Speaker-Toolkit: An Open Source Toolkit for Multi-modal Speaker Verification and Diarization. (2024). https:\/\/arxiv.org\/pdf\/2403.19971","key":"e_1_3_2_1_10_1"},{"key":"e_1_3_2_1_11_1","volume-title":"Sylber: Syllabic Embedding Representation of Speech from Raw Audio. arXiv preprint arXiv:2410.07168","author":"Cho Cheol Jun","year":"2024","unstructured":"Cheol Jun Cho, Nicholas Lee, Akshat Gupta, Dhruv Agarwal, Ethan Chen, Alan W Black, and Gopala K Anumanchipalli. 2024. Sylber: Syllabic Embedding Representation of Speech from Raw Audio. arXiv preprint arXiv:2410.07168 (2024)."},{"key":"e_1_3_2_1_12_1","first-page":"024","article-title":"Objective measure for estimating mean opinion score of synthesized speech","volume":"7","author":"Chu Min","year":"2006","unstructured":"Min Chu and Hu Peng. 2006. Objective measure for estimating mean opinion score of synthesized speech. US Patent 7,024,362.","journal-title":"US Patent"},{"key":"e_1_3_2_1_13_1","volume-title":"High fidelity neural audio compression. arXiv preprint arXiv:2210.13438","author":"D\u00e9fossez Alexandre","year":"2022","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2022. High fidelity neural audio compression. arXiv preprint arXiv:2210.13438 (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"Moshi: a speech-text foundation model for real-time dialogue. arXiv preprint arXiv:2410.00037","author":"D\u00e9fossez Alexandre","year":"2024","unstructured":"Alexandre D\u00e9fossez, Laurent Mazar\u00e9, Manu Orsini, Am\u00e9lie Royer, Patrick P\u00e9rez, Herv\u00e9 J\u00e9gou, Edouard Grave, and Neil Zeghidour. 2024. Moshi: a speech-text foundation model for real-time dialogue. arXiv preprint arXiv:2410.00037 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Adam: A method for stochastic optimization. (No Title)","author":"Diederik P Kingma","year":"2014","unstructured":"P Kingma Diederik. 2014. Adam: A method for stochastic optimization. (No Title) (2014)."},{"key":"e_1_3_2_1_16_1","volume-title":"Variable-rate discrete representation learning. arXiv preprint arXiv:2103.06089","author":"Dieleman Sander","year":"2021","unstructured":"Sander Dieleman, Charlie Nash, Jesse Engel, and Karen Simonyan. 2021. Variable-rate discrete representation learning. arXiv preprint arXiv:2103.06089 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens. arXiv preprint arXiv:2407.05407","author":"Du Zhihao","year":"2024","unstructured":"Zhihao Du, Qian Chen, Shiliang Zhang, Kai Hu, Heng Lu, Yexin Yang, Hangrui Hu, Siqi Zheng, Yue Gu, Ziyang Ma, et al., 2024a. Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens. arXiv preprint arXiv:2407.05407 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_18_1","DOI":"10.1109\/ICASSP48485.2024.10447523"},{"unstructured":"Chaoyou Fu Haojia Lin Xiong Wang Yi-Fan Zhang Yunhang Shen Xiaoyu Liu Yangze Li Zuwei Long Heting Gao Ke Li et al. 2025. Vita-1.5: Towards gpt-4o level real-time vision and speech interaction. arXiv preprint arXiv:2501.01957 (2025).","key":"e_1_3_2_1_19_1"},{"key":"e_1_3_2_1_20_1","volume-title":"Recent Advances in Discrete Speech Tokens: A Review. arXiv preprint arXiv:2502.06490","author":"Guo Yiwei","year":"2025","unstructured":"Yiwei Guo, Zhihan Li, Hankun Wang, Bohan Li, Chongtian Shao, Hanglei Zhang, Chenpeng Du, Xie Chen, Shujie Liu, and Kai Yu. 2025. 
Recent Advances in Discrete Speech Tokens: A Review. arXiv preprint arXiv:2502.06490 (2025)."},{"key":"e_1_3_2_1_21_1","volume-title":"VALL-E R: Robust and efficient zero-shot text-to-speech synthesis via monotonic alignment. arXiv preprint arXiv:2406.07855","author":"Han Bing","year":"2024","unstructured":"Bing Han, Long Zhou, Shujie Liu, Sanyuan Chen, Lingwei Meng, Yanming Qian, Yanqing Liu, Sheng Zhao, Jinyu Li, and Furu Wei. 2024. VALL-E R: Robust and efficient zero-shot text-to-speech synthesis via monotonic alignment. arXiv preprint arXiv:2406.07855 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"unstructured":"Shengpeng Ji Ziyue Jiang Wen Wang Yifu Chen Minghui Fang Jialong Zuo Qian Yang Xize Cheng Zehan Wang Ruiqi Li et al. 2024. Wavtokenizer: an efficient acoustic discrete codec tokenizer for audio language modeling. arXiv preprint arXiv:2408.16532 (2024).","key":"e_1_3_2_1_23_1"},{"key":"e_1_3_2_1_24_1","volume-title":"UniCodec: Unified Audio Codec with Single Domain-Adaptive Codebook. arXiv preprint arXiv:2502.20067","author":"Jiang Yidi","year":"2025","unstructured":"Yidi Jiang, Qian Chen, Shengpeng Ji, Yu Xi, Wen Wang, Chong Zhang, Xianghu Yue, ShiLiang Zhang, and Haizhou Li. 2025. UniCodec: Unified Audio Codec with Single Domain-Adaptive Codebook. arXiv preprint arXiv:2502.20067 (2025)."},{"volume-title":"Frequency and the emergence of linguistic structure","author":"Jurafsky Daniel","unstructured":"Daniel Jurafsky, Alan Bell, Michelle Gregory, and William D Raymond. 2008. Probabilistic relations between words: Evidence from reduction in lexical production. In Frequency and the emergence of linguistic structure. John Benjamins Publishing Company, 229-254.","key":"e_1_3_2_1_25_1"},{"key":"e_1_3_2_1_26_1","volume-title":"Audio Sparse-Transformer for Speech Classification. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5.","author":"Kavaki Hassan Salami","year":"2025","unstructured":"Hassan Salami Kavaki and Michael I Mandel. 2025. Audio Sparse-Transformer for Speech Classification. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.1109\/ICASSP40776.2020.9053591"},{"key":"e_1_3_2_1_28_1","first-page":"8067","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume":"33","author":"Kim Jaehyeon","year":"2020","unstructured":"Jaehyeon Kim, Sungwon Kim, Jungil Kong, and Sungroh Yoon. 2020b. Glow-tts: A generative flow for text-to-speech via monotonic alignment search. Advances in Neural Information Processing Systems, Vol. 33 (2020), 8067-8077.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. 
Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems, Vol. 33 (2020), 17022-17033."},{"unstructured":"Yadong Li Jun Liu Tao Zhang Song Chen Tianpeng Li Zehuan Li Lijun Liu Lingfeng Ming Guosheng Dong Da Pan et al. 2025. Baichuan-Omni-1.5 Technical Report. arXiv preprint arXiv:2501.15368 (2025).","key":"e_1_3_2_1_30_1"},{"key":"e_1_3_2_1_31_1","volume-title":"Vit-tts: visual text-to-speech with scalable diffusion transformer. arXiv preprint arXiv:2305.12708","author":"Liu Huadai","year":"2023","unstructured":"Huadai Liu, Rongjie Huang, Xuan Lin, Wenqiang Xu, Maozong Zheng, Hong Chen, Jinzheng He, and Zhou Zhao. 2023. Vit-tts: visual text-to-speech with scalable diffusion transformer. arXiv preprint arXiv:2305.12708 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Medic: Zero-shot music editing with disentangled inversion control. arXiv preprint arXiv:2407.13220","author":"Liu Huadai","year":"2024","unstructured":"Huadai Liu, Jialei Wang, Xiangtai Li, Rongjie Huang, Yang Liu, Jiayang Xu, and Zhou Zhao. 2024. Medic: Zero-shot music editing with disentangled inversion control. arXiv preprint arXiv:2407.13220 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_33_1","DOI":"10.18653\/v1\/2025.acl-long.1498"},{"unstructured":"Enzhe Lu Zhejun Jiang Jingyuan Liu Yulun Du Tao Jiang Chao Hong Shaowei Liu Weiran He Enming Yuan Yuzhi Wang et al. 2025. MoBA: Mixture of Block Attention for Long-Context LLMs. arXiv preprint arXiv:2502.13189 (2025).","key":"e_1_3_2_1_34_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.3389\/fcomm.2018.00025"},{"key":"e_1_3_2_1_36_1","volume-title":"HALL-E: hierarchical neural codec language model for minute-long zero-shot text-to-speech synthesis. arXiv preprint arXiv:2410.04380","author":"Nishimura Yuto","year":"2024","unstructured":"Yuto Nishimura, Takumi Hirose, Masanari Ohi, Hideki Nakayama, and Nakamasa Inoue. 2024. HALL-E: hierarchical neural codec language model for minute-long zero-shot text-to-speech synthesis. arXiv preprint arXiv:2410.04380 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_37_1","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_38_1","volume-title":"Voicecraft: Zero-shot speech editing and text-to-speech in the wild. arXiv preprint arXiv:2403.16973","author":"Peng Puyuan","year":"2024","unstructured":"Puyuan Peng, Po-Yao Huang, Shang-Wen Li, Abdelrahman Mohamed, and David Harwath. 2024. Voicecraft: Zero-shot speech editing and text-to-speech in the wild. arXiv preprint arXiv:2403.16973 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Mls: A large-scale multilingual dataset for speech research. arXiv preprint arXiv:2012.03411","author":"Pratap Vineel","year":"2020","unstructured":"Vineel Pratap, Qiantong Xu, Anuroop Sriram, Gabriel Synnaeve, and Ronan Collobert. 2020. Mls: A large-scale multilingual dataset for speech research. arXiv preprint arXiv:2012.03411 (2020)."},{"key":"e_1_3_2_1_40_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. 
PMLR, 28492-28518."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1109\/ICASSP48485.2024.10448454"},{"key":"e_1_3_2_1_42_1","volume-title":"Utmos: Utokyo-sarulab system for voicemos challenge","author":"Saeki Takaaki","year":"2022","unstructured":"Takaaki Saeki, Detai Xin, Wataru Nakata, Tomoki Koriyama, Shinnosuke Takamichi, and Hiroshi Saruwatari. 2022. Utmos: Utokyo-sarulab system for voicemos challenge 2022. arXiv preprint arXiv:2204.02152 (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"International Conference on Machine Learning. PMLR, 31210-31227","author":"Shi Freda","year":"2023","unstructured":"Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H Chi, Nathanael Sch\u00e4rli, and Denny Zhou. 2023. Large language models can be easily distracted by irrelevant context. In International Conference on Machine Learning. PMLR, 31210-31227."},{"key":"e_1_3_2_1_44_1","volume-title":"ICML Workshop on Invertible Neural Networks, Normalizing Flows, and Explicit Likelihood Models.","author":"Shih Kevin J","year":"2021","unstructured":"Kevin J Shih, Rafael Valle, Rohan Badlani, Adrian Lancucki, Wei Ping, and Bryan Catanzaro. 2021. RAD-TTS: Parallel flow-based TTS with robust alignment learning and diverse synthesis. In ICML Workshop on Invertible Neural Networks, Normalizing Flows, and Explicit Likelihood Models."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1109\/ICASSP49357.2023.10097097"},{"key":"e_1_3_2_1_46_1","volume-title":"Ella-v: Stable neural codec language modeling with alignment-guided sequence reordering. arXiv preprint arXiv:2401.07333","author":"Song Yakun","year":"2024","unstructured":"Yakun Song, Zhuo Chen, Xiaofei Wang, Ziyang Ma, and Xie Chen. 2024. Ella-v: Stable neural codec language modeling with alignment-guided sequence reordering. arXiv preprint arXiv:2401.07333 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_47_1","DOI":"10.1038\/s41467-025-56162-9"},{"key":"e_1_3_2_1_48_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"doi-asserted-by":"crossref","unstructured":"Yuhao Wang Heyang Liu Ziyang Cheng Ronghua Wu Qunshan Gu Yanfeng Wang and Yu Wang. 2025. VocalNet: Speech LLM with Multi-Token Prediction for Faster and High-Quality Generation. arXiv:2504.04060 [cs.CL] https:\/\/arxiv.org\/abs\/2504.04060","key":"e_1_3_2_1_49_1","DOI":"10.18653\/v1\/2025.emnlp-main.989"},{"key":"e_1_3_2_1_50_1","volume-title":"Maskgct: Zero-shot text-to-speech with masked generative codec transformer. arXiv preprint arXiv:2409.00750","author":"Wang Yuancheng","year":"2024","unstructured":"Yuancheng Wang, Haoyue Zhan, Liwei Liu, Ruihong Zeng, Haotian Guo, Jiachen Zheng, Qiang Zhang, Xueyao Zhang, Shunsi Zhang, and Zhizheng Wu. 2024. Maskgct: Zero-shot text-to-speech with masked generative codec transformer. arXiv preprint arXiv:2409.00750 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Bigcodec: Pushing the limits of low-bitrate neural speech codec. arXiv preprint arXiv:2409.05377","author":"Xin Detai","year":"2024","unstructured":"Detai Xin, Xu Tan, Shinnosuke Takamichi, and Hiroshi Saruwatari. 2024. 
Bigcodec: Pushing the limits of low-bitrate neural speech codec. arXiv preprint arXiv:2409.05377 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"Hifi-codec: Group-residual vector quantization for high fidelity audio codec. arXiv preprint arXiv:2305.02765","author":"Yang Dongchao","year":"2023","unstructured":"Dongchao Yang, Songxiang Liu, Rongjie Huang, Jinchuan Tian, Chao Weng, and Yuexian Zou. 2023. Hifi-codec: Group-residual vector quantization for high fidelity audio codec. arXiv preprint arXiv:2305.02765 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Yang Dongchao","year":"2024","unstructured":"Dongchao Yang, Jinchuan Tian, Xu Tan, Rongjie Huang, Songxiang Liu, Haohan Guo, Xuankai Chang, Jiatong Shi, Jiang Bian, Zhou Zhao, et al., 2024. Uniaudio: Towards universal audio generation with large language models. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_54_1","volume-title":"Emovoice: Llm-based emotional text-to-speech model with freestyle text prompting. arXiv preprint arXiv:2504.12867","author":"Yang Guanrou","year":"2025","unstructured":"Guanrou Yang, Chen Yang, Qian Chen, Ziyang Ma, Wenxi Chen, Wen Wang, Tianrui Wang, Yifan Yang, Zhikang Niu, Wenrui Liu, et al., 2025. Emovoice: Llm-based emotional text-to-speech model with freestyle text prompting. arXiv preprint arXiv:2504.12867 (2025)."},{"unstructured":"Zhen Ye Peiwen Sun Jiahe Lei Hongzhan Lin Xu Tan Zheqi Dai Qiuqiang Kong Jianyi Chen Jiahao Pan Qifeng Liu et al. 2024. Codec does matter: Exploring the semantic shortcoming of codec for audio language model. arXiv preprint arXiv:2408.17175 (2024).","key":"e_1_3_2_1_55_1"},{"key":"e_1_3_2_1_56_1","first-page":"78808","article-title":"Megabyte: Predicting million-byte sequences with multiscale transformers","volume":"36","author":"Yu Lili","year":"2023","unstructured":"Lili Yu, D\u00e1niel Simig, Colin Flaherty, Armen Aghajanyan, Luke Zettlemoyer, and Mike Lewis. 2023. Megabyte: Predicting million-byte sequences with multiscale transformers. Advances in Neural Information Processing Systems, Vol. 36 (2023), 78808-78823.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_57_1","DOI":"10.1109\/TASLP.2021.3129994"},{"doi-asserted-by":"crossref","unstructured":"Heiga Zen Viet Dang Rob Clark Yu Zhang Ron J. Weiss Ye Jia Zhifeng Chen and Yonghui Wu. 2019. LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech. arXiv:1904.02882 [cs.SD] https:\/\/arxiv.org\/abs\/1904.02882","key":"e_1_3_2_1_58_1","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"e_1_3_2_1_59_1","volume-title":"Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000","author":"Zhang Dong","year":"2023","unstructured":"Dong Zhang, Shimin Li, Xin Zhang, Jun Zhan, Pengyu Wang, Yaqian Zhou, and Xipeng Qiu. 2023a. Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692","author":"Zhang Xin","year":"2023","unstructured":"Xin Zhang, Dong Zhang, Shimin Li, Yaqian Zhou, and Xipeng Qiu. 2023b. Speechtokenizer: Unified speech tokenizer for speech large language models. 
arXiv preprint arXiv:2308.16692 (2023)."}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '25","name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755816","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:00Z","timestamp":1765339620000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755816"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":60,"alternative-id":["10.1145\/3746027.3755816","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755816","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
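
The record above is a standard Crossref "work" message (status/message-type/message envelope, with the bibliographic payload under "message"). As a minimal, hedged sketch of how such a record could be fetched and read, the snippet below uses only the public Crossref REST endpoint https://api.crossref.org/works/{doi} and the Python standard library; the DOI and field names (title, page, container-title, author, reference, references-count) are taken from the record itself, and everything else (variable names, printed layout) is illustrative rather than part of the record.

# Sketch: retrieve and inspect the Crossref record shown above.
# Assumes the public api.crossref.org endpoint and Python stdlib only.
import json
import urllib.request

DOI = "10.1145/3746027.3755816"  # DOI of the work described in the record

with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]  # bibliographic payload sits under "message"

# Basic fields, matching keys visible in the record above.
title = work["title"][0]            # "Speech Token Prediction via Compressed-to-fine ..."
pages = work.get("page")            # "10632-10641"
venue = work["container-title"][0]  # "Proceedings of the 33rd ACM International Conference on Multimedia"
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in work.get("author", [])]

print(title)
print(f"{venue}, pp. {pages}")
print(", ".join(authors))
print(f'{len(work.get("reference", []))} references deposited '
      f'(reference-count = {work.get("references-count")})')

Run as-is, this would print the paper's title, venue and page range, the author list, and confirm that the number of deposited reference entries matches the record's declared references-count of 60.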