{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:40Z","timestamp":1765357720754,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No.62076144"],"award-info":[{"award-number":["Grant No.62076144"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["Grant No.2024QY1400"],"award-info":[{"award-number":["Grant No.2024QY1400"]}]},{"name":"Shenzhen Science and Technology Program","award":["Grant No.WDZC20220816140515001"],"award-info":[{"award-number":["Grant No.WDZC20220816140515001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681680","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"554-563","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6363-891X","authenticated-orcid":false,"given":"Yixuan","family":"Zhou","sequence":"first","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9720-3220","authenticated-orcid":false,"given":"Xiaoyu","family":"Qin","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8465-8878","authenticated-orcid":false,"given":"Zeyu","family":"Jin","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6824-4824","authenticated-orcid":false,"given":"Shuoyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3597-3913","authenticated-orcid":false,"given":"Shun","family":"Lei","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5972-3955","authenticated-orcid":false,"given":"Songtao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8449-278X","authenticated-orcid":false,"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"SoundStorm: Efficient Parallel Audio Generation. arXiv preprint arXiv:2305.09636","author":"Borsos Zal\u00e1n","year":"2023","unstructured":"Zal\u00e1n Borsos, Matt Sharifi, Damien Vincent, Eugene Kharitonov, Neil Zeghidour, and Marco Tagliasacchi. 2023. SoundStorm: Efficient Parallel Audio Generation. arXiv preprint arXiv:2305.09636 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"A comprehensive survey of ai-generated content (aigc): A history of generative ai from gan to chatgpt. arXiv preprint arXiv:2303.04226","author":"Cao Yihan","year":"2023","unstructured":"Yihan Cao, Siyu Li, Yixin Liu, Zhiling Yan, Yutong Dai, Philip S Yu, and Lichao Sun. 2023. A comprehensive survey of ai-generated content (aigc): A history of generative ai from gan to chatgpt. arXiv preprint arXiv:2303.04226 (2023)."},{"key":"e_1_3_2_1_4_1","first-page":"3645","article-title":"SC-GlowTTS","volume":"2021","author":"Casanova Edresson","year":"2021","unstructured":"Edresson Casanova, Christopher Shulby, Eren G\u00f6lge, Nicolas Michael M\u00fcller, Frederico Santos de Oliveira, Arnaldo Candido Jr, Anderson da Silva Soares, Sandra Maria Aluisio, and Moacir Antonelli Ponti. 2021. SC-GlowTTS: An Efficient Zero-Shot Multi-Speaker Text-To-Speech Model. In Interspeech 2021. 3645--3659.","journal-title":"An Efficient Zero-Shot Multi-Speaker Text-To-Speech Model. In Interspeech"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"e_1_3_2_1_7_1","volume-title":"ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Cheng Xuxin","year":"2023","unstructured":"Xuxin Cheng, Qianqian Dong, Fengpeng Yue, Tom Ko, Mingxuan Wang, and Yuexian Zou. 2023. M 3 st: Mix at three levels for speech translation. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_8_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"High Fidelity Neural Audio Compression. 
Transactions on Machine Learning Research","author":"D\u00e9fossez Alexandre","year":"2023","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2023. High Fidelity Neural Audio Compression. Transactions on Machine Learning Research (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"VALL-T: Decoder-Only Generative Transducer for Robust and Decoding-Controllable Text-to-Speech. arXiv preprint arXiv:2401.14321","author":"Du Chenpeng","year":"2024","unstructured":"Chenpeng Du, Yiwei Guo, Hankun Wang, Yifan Yang, Zhikang Niu, Shuai Wang, Hui Zhang, Xie Chen, and Kai Yu. 2024. VALL-T: Decoder-Only Generative Transducer for Robust and Decoding-Controllable Text-to-Speech. arXiv preprint arXiv:2401.14321 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Prompttts: Controllable Text-To-Speech With Text Descriptions. In ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Guo Zhifang","year":"2023","unstructured":"Zhifang Guo, Yichong Leng, Yihan Wu, Sheng Zhao, and Xu Tan. 2023. Prompttts: Controllable Text-To-Speech With Text Descriptions. In ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},{"key":"e_1_3_2_1_12_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445879"},{"key":"e_1_3_2_1_15_1","volume-title":"Boosting Prompting Mechanisms for Zero-Shot Speech Synthesis. In The Twelfth International Conference on Learning Representations.","author":"Jiang Ziyue","year":"2023","unstructured":"Ziyue Jiang, Jinglin Liu, Yi Ren, Jinzheng He, Zhenhui Ye, Shengpeng Ji, Qian Yang, Chen Zhang, Pengfei Wei, Chunfeng Wang, et al. 2023. Boosting Prompting Mechanisms for Zero-Shot Speech Synthesis. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681674"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"e_1_3_2_1_18_1","volume-title":"Unified Speech-Text Pretraining for Spoken Dialog Modeling. arXiv preprint arXiv:2402.05706","author":"Kim Heeseung","year":"2024","unstructured":"Heeseung Kim, Soonshin Seo, Kyeongseok Jeong, Ohsung Kwon, Jungwhan Kim, Jaehong Lee, Eunwoo Song, Myungwoo Oh, Sungroh Yoon, and Kang Min Yoo. 2024. Unified Speech-Text Pretraining for Spoken Dialog Modeling. arXiv preprint arXiv:2402.05706 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Leng Yichong","year":"2023","unstructured":"Yichong Leng, ZHifang Guo, Kai Shen, Zeqian Ju, Xu Tan, Eric Liu, Yufei Liu, Dongchao Yang, Kaitao Song, Lei He, et al. 2023. PromptTTS 2: Describing and Generating Voices with Text Prompt. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","volume-title":"Promptstyle: Controllable style transfer for text-to-speech with natural language descriptions. 
arXiv preprint arXiv:2305.19522","author":"Liu Guanghou","year":"2023","unstructured":"Guanghou Liu, Yongmao Zhang, Yi Lei, Yunlin Chen, Rui Wang, Zhifei Li, and Lei Xie. 2023. Promptstyle: Controllable style transfer for text-to-speech with natural language descriptions. arXiv preprint arXiv:2305.19522 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. 21450--21474","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. AudioLDM: text-to-audio generation with latent diffusion models. In Proceedings of the 40th International Conference on Machine Learning. 21450--21474."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612485"},{"key":"e_1_3_2_1_23_1","volume-title":"Natural language guidance of high-fidelity text-to-speech with synthetic annotations. arXiv preprint arXiv:2402.01912","author":"Lyth Dan","year":"2024","unstructured":"Dan Lyth and Simon King. 2024. Natural language guidance of high-fidelity text-to-speech with synthetic annotations. arXiv preprint arXiv:2402.01912 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3618408.3619590"},{"key":"e_1_3_2_1_25_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv e-prints","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv e-prints (2022), arXiv--2204."},{"key":"e_1_3_2_1_26_1","volume-title":"Pawan Sasanka Ammanamanchi, and Stella Biderman","author":"Sanchez Guillaume","year":"2023","unstructured":"Guillaume Sanchez, Honglu Fan, Alexander Spangher, Elad Levi, Pawan Sasanka Ammanamanchi, and Stella Biderman. 2023. Stay on topic with classifier-free guidance. arXiv preprint arXiv:2306.17806 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Shen Kai","year":"2023","unstructured":"Kai Shen, Zeqian Ju, Xu Tan, Eric Liu, Yichong Leng, Lei He, Tao Qin, Jiang Bian, et al. 2023. NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448173"},{"key":"e_1_3_2_1_29_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al. 2022. Make-A-Video: Text-to-Video Generation without Text-Video Data. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_30_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Siuzdak Hubert","year":"2023","unstructured":"Hubert Siuzdak. 2023. Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_2_1_32_1","volume-title":"Llama: Open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Audiobox: Unified audio generation with natural language prompts. arXiv preprint arXiv:2312.15821","author":"Vyas Apoorv","year":"2023","unstructured":"Apoorv Vyas, Bowen Shi, Matthew Le, Andros Tjandra, Yi-Chiao Wu, Baishan Guo, Jiemin Zhang, Xinyue Zhang, Robert Adkins, William Ngan, et al. 2023. Audiobox: Unified audio generation with natural language prompts. arXiv preprint arXiv:2312.15821 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. arXiv preprint arXiv:2301.02111","author":"Wang Chengyi","year":"2023","unstructured":"Chengyi Wang, Sanyuan Chen, Yu Wu, Ziqiang Zhang, Long Zhou, Shujie Liu, Zhuo Chen, Yanqing Liu, Huaming Wang, Jinyu Li, Lei He, Sheng Zhao, and Furu Wei. 2023. Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_35_1","unstructured":"Detai Xin Xu Tan Kai Shen Zeqian Ju Dongchao Yang Yuancheng Wang Shinnosuke Takamichi Hiroshi Saruwatari Shujie Liu Jinyu Li et al. 2024. RALL-E: Robust Codec Language Modeling with Chain-of-Thought Prompting for Text-to-Speech Synthesis. arXiv preprint arXiv:2404.03204 (2024)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"key":"e_1_3_2_1_37_1","volume-title":"Instructtts: Modelling expressive tts in discrete latent space with natural language style prompt","author":"Yang Dongchao","year":"2024","unstructured":"Dongchao Yang, Songxiang Liu, Rongjie Huang, Chao Weng, and Helen Meng. 2024. Instructtts: Modelling expressive tts in discrete latent space with natural language style prompt. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746682"},{"key":"e_1_3_2_1_39_1","unstructured":"Yu Zhang Wei Han James Qin Yongqiang Wang Ankur Bapna Zhehuai Chen Nanxin Chen Bo Li Vera Axelrod Gary Wang Zhong Meng Ke Hu Andrew Rosenberg Rohit Prabhavalkar Daniel S. Park Parisa Haghani Jason Riesa Ginger Perng Hagen Soltau Trevor Strohman Bhuvana Ramabhadran Tara Sainath Pedro Moreno Chung-Cheng Chiu Johan Schalkwyk Fran\u00e7oise Beaufays and Yonghui Wu. 2023. Google USM: Scaling Automatic Speech Recognition Beyond 100 Languages."},{"key":"e_1_3_2_1_40_1","volume-title":"Promptspeaker: Speaker Generation Based on Text Descriptions. In 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). IEEE, 1--7.","author":"Zhang Yongmao","year":"2023","unstructured":"Yongmao Zhang, Guanghou Liu, Yi Lei, Yunlin Chen, Hao Yin, Lei Xie, and Zhifei Li. 2023. Promptspeaker: Speaker Generation Based on Text Descriptions. In 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). IEEE, 1--7."},{"key":"e_1_3_2_1_41_1","unstructured":"Ziqiang Zhang Long Zhou Chengyi Wang Sanyuan Chen Yu Wu Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Speak foreign languages with your own voice: Cross-lingual neural codec language modeling. 
arXiv preprint arXiv:2303.03926 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681680","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681680","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681680"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3681680","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681680","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
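The record above is the standard envelope returned by the public Crossref REST API for a single work: the payload lives under "message", while "status" and "message-type" describe the response itself. A minimal sketch of fetching and reading the same record, using only the Python standard library (network access is assumed; the "example-fetcher" name and the mailto address are placeholders for your own identification, which Crossref's "polite pool" convention asks clients to supply):

    import json
    import urllib.request

    # Fetch the work record for this paper's DOI from the Crossref REST API.
    # NOTE: the User-Agent value below is a placeholder, not a real contact.
    DOI = "10.1145/3664647.3681680"
    req = urllib.request.Request(
        "https://api.crossref.org/works/" + DOI,
        headers={"User-Agent": "example-fetcher/0.1 (mailto:you@example.org)"},
    )
    with urllib.request.urlopen(req) as resp:
        record = json.load(resp)

    # The bibliographic payload sits under "message".
    work = record["message"]
    print(work["title"][0])
    print(", ".join(a["given"] + " " + a["family"] for a in work["author"]))
    print(work["DOI"], "pages", work["page"], "refs", work["references-count"])

Note that the works endpoint accepts the DOI with its slash unescaped, as shown, so no URL-encoding of the identifier is needed here.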