{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:22Z","timestamp":1765309522779,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755710","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"689-698","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MuCodec: Ultra Low-Bitrate Music Codec for Music Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7063-7317","authenticated-orcid":false,"given":"Yaoxun","family":"Xu","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4085-4364","authenticated-orcid":false,"given":"Hangting","family":"Chen","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2449-1436","authenticated-orcid":false,"given":"Jianwei","family":"Yu","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4967-7361","authenticated-orcid":false,"given":"Wei","family":"Tan","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3597-3913","authenticated-orcid":false,"given":"Shun","family":"Lei","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6576-9814","authenticated-orcid":false,"given":"Zhiwei","family":"Lin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1861-9170","authenticated-orcid":false,"given":"Rongzhi","family":"Gu","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli Andrea","year":"2023","unstructured":"Andrea Agostinelli, Timo I Denk, Zal\u00e1n Borsos, Jesse Engel, Mauro Verzetti, Antoine Caillon, Qingqing Huang, Aren Jansen, Adam Roberts, Marco Tagliasacchi, et al., 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Yang Ai Xiao-Hang Jiang et al. 2024. APCodec: A Neural Audio Codec with Parallel Amplitude and Phase Spectrum Encoding and Decoding. arXiv preprint arXiv:2402.10533 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Philip Anastassiou Jiawei Chen et al. 2024. Seed-TTS: A Family of High-Quality Versatile Speech Generation Models. arXiv preprint arXiv:2406.02430 (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13054-023-04393-x"},{"key":"e_1_3_2_1_5_1","first-page":"10","article-title":"The million song dataset","volume":"2","author":"Bertin-Mahieux Thierry","year":"2011","unstructured":"Thierry Bertin-Mahieux, Daniel PW Ellis, Brian Whitman, and Paul Lamere. 2011. The million song dataset.. In Ismir, Vol. 2. 10.","journal-title":"Ismir"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053113"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Edresson Casanova Kelly Davis Eren G\u00f6lge G\u00f6rkem G\u00f6knar Iulian Gulea Logan Hart Aya Aljafari Joshua Meyer Reuben Morais Samuel Olayemi et al. 2024. Xtts: a massively multilingual zero-shot text-to-speech model. arXiv preprint arXiv:2406.04904 (2024).","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"e_1_3_2_1_8_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Copet Jade","year":"2024","unstructured":"Jade Copet, Felix Kreuk, et al., 2024. Simple and controllable music generation. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Alexandre D\u00e9fossez Jade Copet et al. 2022. High fidelity neural audio compression. arXiv preprint arXiv:2210.13438 (2022)."},{"key":"e_1_3_2_1_10_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_11_1","volume-title":"Songcomposer: A large language model for lyric and melody composition in song generation. arXiv preprint arXiv:2402.17645","author":"Ding Shuangrui","year":"2024","unstructured":"Shuangrui Ding, Zihan Liu, Xiaoyi Dong, Pan Zhang, Rui Qian, Conghui He, Dahua Lin, and Jiaqi Wang. 2024. Songcomposer: A large language model for lyric and melody composition in song generation. arXiv preprint arXiv:2402.17645 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447523"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447523"},{"key":"e_1_3_2_1_15_1","unstructured":"Shuochen Gao Shun Lei et al. 2024. An End-to-End Approach for Chord-Conditioned Song Generation. arXiv:2409.06307 [cs.SD] https:\/\/arxiv.org\/abs\/2409.06307"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_17_1","volume-title":"Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100","author":"Gulati Anmol","year":"2020","unstructured":"Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu, Wei Han, Shibo Wang, Zhengdong Zhang, Yonghui Wu, et al., 2020. Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"Socodec: A semantic-ordered multi-stream speech codec for efficient language model based text-to-speech synthesis. arXiv preprint arXiv:2409.00933","author":"Guo Haohan","year":"2024","unstructured":"Haohan Guo, Fenglong Xie, et al., 2024. Socodec: A semantic-ordered multi-stream speech codec for efficient language model based text-to-speech synthesis. arXiv preprint arXiv:2409.00933 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-015-0054-9"},{"key":"e_1_3_2_1_20_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_21_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Hubert: Self-supervised speech representation learning by masked prediction of hidden units","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, et al., 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451-3460."},{"key":"e_1_3_2_1_23_1","unstructured":"Qingqing Huang Daniel S Park Tao Wang Timo I Denk Andy Ly Nanxin Chen Zhengdong Zhang Zhishuai Zhang Jiahui Yu Christian Frank et al. 2023. Noise2music: Text-conditioned music generation with diffusion models. arXiv preprint arXiv:2302.03917 (2023)."},{"key":"e_1_3_2_1_24_1","unstructured":"Shengpeng Ji Ziyue Jiang et al. 2024. Wavtokenizer: an efficient acoustic discrete codec tokenizer for audio language modeling. arXiv preprint arXiv:2408.16532 (2024)."},{"key":"e_1_3_2_1_25_1","unstructured":"Ziyue Jiang Jinglin Liu Yi Ren Jinzheng He Zhenhui Ye Shengpeng Ji Qian Yang Chen Zhang Pengfei Wei Chunfeng Wang et al. 2023. Mega-tts 2: Boosting prompting mechanisms for zero-shot speech synthesis. arXiv preprint arXiv:2307.07218 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems, Vol. 33 (2020), 17022-17033."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747733"},{"key":"e_1_3_2_1_28_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Kumar Rithesh","year":"2024","unstructured":"Rithesh Kumar, Prem Seetharaman, et al., 2024a. High-fidelity audio compression with improved rvqgan. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Kumar Rithesh","year":"2024","unstructured":"Rithesh Kumar, Prem Seetharaman, et al., 2024b. High-fidelity audio compression with improved rvqgan. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_30_1","first-page":"17450","article-title":"Efficient neural music generation","volume":"36","author":"Lam Max WY","year":"2023","unstructured":"Max WY Lam, Qiao Tian, Tang Li, Zongyu Yin, Siyuan Feng, Ming Tu, Yuliang Ji, Rui Xia, Mingbo Ma, Xuchen Song, et al., 2023. Efficient neural music generation. Advances in Neural Information Processing Systems, Vol. 36 (2023), 17450-17463.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_31_1","unstructured":"Max WY Lam Yijin Xing Weiya You Jingcheng Wu Zongyu Yin Fuqiang Jiang Hangyu Liu Feng Liu Xingda Li Wei-Tsung Lu et al. 2025. Analyzable Chain-of-Musical-Thought Prompting for High-Fidelity Music Generation. arXiv preprint arXiv:2503.19611 (2025)."},{"key":"e_1_3_2_1_32_1","unstructured":"Shun Lei Yixuan Zhou et al. 2024. SongCreator: Lyrics-based Universal Song Generation. arXiv:2409.06029 [cs.SD] https:\/\/arxiv.org\/abs\/2409.06029"},{"key":"e_1_3_2_1_33_1","volume-title":"Single-Codec: Single-Codebook Speech Codec towards High-Performance Speech Generation. arXiv preprint arXiv:2406.07422","author":"Li Hanzhao","year":"2024","unstructured":"Hanzhao Li, Liumeng Xue, Haohan Guo, Xinfa Zhu, Yuanjun Lv, Lei Xie, Yunlin Chen, Hao Yin, and Zhifei Li. 2024. Single-Codec: Single-Codebook Speech Codec towards High-Performance Speech Generation. arXiv preprint arXiv:2406.07422 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Mert: Acoustic music understanding model with large-scale self-supervised training. arXiv preprint arXiv:2306.00107","author":"Li Yizhi","year":"2023","unstructured":"Yizhi Li, Ruibin Yuan, et al., 2023. Mert: Acoustic music understanding model with large-scale self-supervised training. arXiv preprint arXiv:2306.00107 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Ricky TQ Chen, et al","author":"Lipman Yaron","year":"2022","unstructured":"Yaron Lipman, Ricky TQ Chen, et al., 2022. Flow matching for generative modeling. arXiv preprint arXiv:2210.02747 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Haohe Liu Xuenan Xu et al. 2024a. SemantiCodec: An Ultra Low Bitrate Semantic Audio Codec for General Sound. arXiv preprint arXiv:2405.00233 (2024).","DOI":"10.1109\/JSTSP.2024.3506286"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Haohe Liu Yi Yuan et al. 2024b. Audioldm 2: Learning holistic audio generation with self-supervised pretraining. IEEE\/ACM Transactions on Audio Speech and Language Processing (2024).","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"e_1_3_2_1_38_1","volume-title":"SongGen: A Single Stage Auto-regressive Transformer for Text-to-Song Generation. arXiv preprint arXiv:2502.13128","author":"Liu Zihan","year":"2025","unstructured":"Zihan Liu, Shuangrui Ding, Zhixiong Zhang, Xiaoyi Dong, Pan Zhang, Yuhang Zang, Yuhang Cao, Dahua Lin, and Jiaqi Wang. 2025. SongGen: A Single Stage Auto-regressive Transformer for Text-to-Song Generation. arXiv preprint arXiv:2502.13128 (2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"DiffRhythm: Blazingly Fast and Embarrassingly Simple End-to-End Full-Length Song Generation with Latent Diffusion. arXiv preprint arXiv:2503.01183","author":"Ning Ziqian","year":"2025","unstructured":"Ziqian Ning, Huakang Chen, Yuepeng Jiang, Chunbo Hao, Guobin Ma, Shuai Wang, Jixun Yao, and Lei Xie. 2025. DiffRhythm: Blazingly Fast and Embarrassingly Simple End-to-End Full-Length Song Generation with Latent Diffusion. arXiv preprint arXiv:2503.01183 (2025)."},{"key":"e_1_3_2_1_40_1","volume-title":"arXiv preprint arXiv:2208.08706","author":"Pasini Marco","year":"2022","unstructured":"Marco Pasini and Jan Schl\u00fcter. 2022. Musika! fast infinite waveform music generation. arXiv preprint arXiv:2208.08706 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, et al., 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492-28518."},{"key":"e_1_3_2_1_42_1","volume-title":"Fastspeech: Fast, robust and controllable text to speech. Advances in neural information processing systems","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. Fastspeech: Fast, robust and controllable text to speech. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448454"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Simon Rouard Francisco Massa and Alexandre D\u00e9fossez. 2023. Hybrid Transformers for Music Source Separation. In ICASSP 23.","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"e_1_3_2_1_45_1","volume-title":"Mo usai: Text-to-music generation with long-context latent diffusion. arXiv preprint arXiv:2301.11757","author":"Schneider Flavio","year":"2023","unstructured":"Flavio Schneider, Ojasv Kamal, Zhijing Jin, and Bernhard Sch\u00f6lkopf. 2023. Mo usai: Text-to-music generation with long-context latent diffusion. arXiv preprint arXiv:2301.11757 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"1st Web Audio Conference. 1-6.","author":"Schoeffler Michael","year":"2015","unstructured":"Michael Schoeffler, Fabian-Robert St\u00f6ter, Bernd Edler, and J\u00fcrgen Herre. 2015. Towards the next generation of web-based experiments: A case study assessing basic audio quality following the ITU-R recommendation BS. 1534 (MUSHRA). In 1st Web Audio Conference. 1-6."},{"key":"e_1_3_2_1_47_1","first-page":"3","article-title":"Constant-Q transform toolbox for music processing. In 7th sound and music computing conference, Barcelona","author":"Sch\u00f6rkhuber Christian","year":"2010","unstructured":"Christian Sch\u00f6rkhuber and Anssi Klapuri. 2010. Constant-Q transform toolbox for music processing. In 7th sound and music computing conference, Barcelona, Spain. SMC, 3-64.","journal-title":"Spain. SMC"},{"key":"e_1_3_2_1_48_1","unstructured":"Andros Tjandra Yi-Chiao Wu Baishan Guo John Hoffman Brian Ellis Apoorv Vyas Bowen Shi Sanyuan Chen Matt Le Nick Zacharov et al. 2025. Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech Music and Sound. arXiv preprint arXiv:2502.05139 (2025)."},{"key":"e_1_3_2_1_49_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"Recent advances in autoencoder-based representation learning. arXiv preprint arXiv:1812.05069","author":"Tschannen Michael","year":"2018","unstructured":"Michael Tschannen, Olivier Bachem, and Mario Lucic. 2018. Recent advances in autoencoder-based representation learning. arXiv preprint arXiv:1812.05069 (2018)."},{"key":"e_1_3_2_1_51_1","unstructured":"Chengyi Wang Sanyuan Chen et al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Speechx: Neural codec language model as a versatile speech transformer","author":"Wang Xiaofei","year":"2024","unstructured":"Xiaofei Wang, Manthan Thakker, et al., 2024. Speechx: Neural codec language model as a versatile speech transformer. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms. arXiv preprint arXiv:2502.18008","author":"Wang Yashan","year":"2025","unstructured":"Yashan Wang, Shangda Wu, Jianhuai Hu, Xingjian Du, Yueqi Peng, Yongxin Huang, Shuai Fan, Xiaobing Li, Feng Yu, and Maosong Sun. 2025. NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms. arXiv preprint arXiv:2502.18008 (2025)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096509"},{"key":"e_1_3_2_1_55_1","volume-title":"Bigcodec: Pushing the limits of low-bitrate neural speech codec. arXiv preprint arXiv:2409.05377","author":"Xin Detai","year":"2024","unstructured":"Detai Xin, Xu Tan, et al., 2024. Bigcodec: Pushing the limits of low-bitrate neural speech codec. arXiv preprint arXiv:2409.05377 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"SongEditor","author":"Yang Chenyu","year":"2024","unstructured":"Chenyu Yang, Shuai Wang, Hangting Chen, Jianwei Yu, Wei Tan, Rongzhi Gu, Yaoxun Xu, Yizhi Zhou, Haina Zhu, and Haizhou Li. 2024. SongEditor: Adapting Zero-Shot Song Generation Language Model as a Multi-Task Editor. arXiv preprint arXiv:2412.13786 (2024)."},{"key":"e_1_3_2_1_57_1","unstructured":"Zhen Ye Peiwen Sun Jiahe Lei Hongzhan Lin Xu Tan Zheqi Dai Qiuqiang Kong Jianyi Chen Jiahao Pan Qifeng Liu et al. 2024. Codec does matter: Exploring the semantic shortcoming of codec for audio language model. arXiv preprint arXiv:2408.17175 (2024)."},{"key":"e_1_3_2_1_58_1","volume-title":"Representation alignment for generation: Training diffusion transformers is easier than you think. arXiv preprint arXiv:2410.06940","author":"Yu Sihyun","year":"2024","unstructured":"Sihyun Yu, Sangkyung Kwak, Huiwon Jang, Jongheon Jeong, Jonathan Huang, Jinwoo Shin, and Saining Xie. 2024. Representation alignment for generation: Training diffusion transformers is easier than you think. arXiv preprint arXiv:2410.06940 (2024)."},{"key":"e_1_3_2_1_59_1","unstructured":"Ruibin Yuan Hanfeng Lin Shuyue Guo Ge Zhang Jiahao Pan Yongyi Zang Haohe Liu Yiming Liang Wenye Ma Xingjian Du et al. 2025. YuE: Scaling Open Foundation Models for Long-Form Music Generation. arXiv preprint arXiv:2503.08638 (2025)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"e_1_3_2_1_61_1","volume-title":"Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692","author":"Zhang Xin","year":"2023","unstructured":"Xin Zhang, Dong Zhang, et al., 2023. Speechtokenizer: Unified speech tokenizer for speech large language models. arXiv preprint arXiv:2308.16692 (2023)."},{"key":"e_1_3_2_1_62_1","unstructured":"Siqi Zheng Luyao Cheng et al. 2023. 3d-speaker: A large-scale multi-device multi-distance and multi-dialect corpus for speech representation disentanglement. arXiv preprint arXiv:2306.15354 (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"Muq: Self-supervised music representation learning with mel residual vector quantization. arXiv preprint arXiv:2501.01108","author":"Zhu Haina","year":"2025","unstructured":"Haina Zhu, Yizhi Zhou, Hangting Chen, Jianwei Yu, Ziyang Ma, Rongzhi Gu, Yi Luo, Wei Tan, and Xie Chen. 2025. Muq: Self-supervised music representation learning with mel residual vector quantization. arXiv preprint arXiv:2501.01108 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755710","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:40:43Z","timestamp":1765309243000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755710"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":63,"alternative-id":["10.1145\/3746027.3755710","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755710","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}