{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:47:37Z","timestamp":1767340057391,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024QY1400"],"award-info":[{"award-number":["2024QY1400"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681674","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"1255-1264","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["SpeechCraft: A Fine-Grained Expressive Speech Dataset with Natural Language Description"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8465-8878","authenticated-orcid":false,"given":"Zeyu","family":"Jin","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8449-278X","authenticated-orcid":false,"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5832-8192","authenticated-orcid":false,"given":"Qixin","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2487-9107","authenticated-orcid":false,"given":"Kehan","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6824-4824","authenticated-orcid":false,"given":"Shuoyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5972-3955","authenticated-orcid":false,"given":"Songtao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9720-3220","authenticated-orcid":false,"given":"Xiaoyu","family":"Qin","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Common voice: A massively-multilingual speech corpus. 
arXiv preprint arXiv:1912.06670","author":"Ardila Rosana","year":"2019","unstructured":"Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M Tyers, and Gregor Weber. 2019. Common voice: A massively-multilingual speech corpus. arXiv preprint arXiv:1912.06670 (2019)."},{"key":"e_1_3_2_1_2_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 12449--12460. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/92d1e1eb1cd6f9fba3227870bb6d7f07-Paper.pdf"},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen technical report. arXiv preprint arXiv:2309.16609","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, and others. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 1877--1901. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008--9076--6"},{"key":"e_1_3_2_1_6_1","volume-title":"Frederico Santos De Oliveira, Arnaldo Candido Junior, Anderson da Silva Soares, Sandra Maria Aluisio, and Moacir Antonelli Ponti.","author":"Casanova Edresson","year":"2021","unstructured":"Edresson Casanova, Christopher Shulby, Eren G\u00f6lge, Nicolas Michael M\u00fcller, Frederico Santos De Oliveira, Arnaldo Candido Junior, Anderson da Silva Soares, Sandra Maria Aluisio, and Moacir Antonelli Ponti. 2021. SC-GlowTTS: An efficient zero-shot multi-speaker text-to-speech model. arXiv preprint arXiv:2104.05557 (2021)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2999388"},{"key":"e_1_3_2_1_8_1","volume-title":"Gigaspeech: An evolving, multi-domain asr corpus with 10,000 hours of transcribed audio. arXiv preprint arXiv:2106.06909","author":"Chen Guoguo","year":"2021","unstructured":"Guoguo Chen, Shuzhou Chai, Guanbo Wang, Jiayu Du, Wei-Qiang Zhang, Chao Weng, Dan Su, Daniel Povey, Jan Trmal, Junbo Zhang, and others. 2021. Gigaspeech: An evolving, multi-domain asr corpus with 10,000 hours of transcribed audio. arXiv preprint arXiv:2106.06909 (2021)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_1_10_1","volume-title":"Toronto emotional speech set (tess)-younger talker_happy. 
(2010)","author":"Dupuis Kate","year":"2010","unstructured":"Kate Dupuis and M Kathleen Pichora-Fuller. 2010. Toronto emotional speech set (tess)-younger talker_happy. (2010). Publisher: Toronto: University of Toronto, Psychology Department, 2010.."},{"key":"e_1_3_2_1_11_1","volume-title":"High Fidelity Neural Audio Compression. arXiv preprint arXiv:2210.13438","author":"D\u00e9fossez Alexandre","year":"2022","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2022. High Fidelity Neural Audio Compression. arXiv preprint arXiv:2210.13438 (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29769"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096285"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"Shengpeng Ji Jialong Zuo Minghui Fang Ziyue Jiang Feiyang Chen Xinyu Duan Baoxing Huai and Zhou Zhao. 2024. TextrolSpeech: A Text Style Control Speech Corpus with Codec Language Text-to-Speech Models. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 10301--10305. https:\/\/doi.org\/10.1109\/ICASSP48485.2024.10445879","DOI":"10.1109\/ICASSP48485.2024.10445879"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1108\/RR-08--2013-0197"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_18_1","volume-title":"Byoung Jin Choi, Jong Jin Kim, and Nam Soo Kim.","author":"Kim Minchan","year":"2021","unstructured":"Minchan Kim, Sung Jun Cheon, Byoung Jin Choi, Jong Jin Kim, and Nam Soo Kim. 2021. Expressive text-to-speech using style tag. arXiv preprint arXiv:2104.00436 (2021)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3149712"},{"key":"e_1_3_2_1_20_1","volume-title":"Libritts-r: A restored multi-speaker text-to-speech corpus. arXiv preprint arXiv:2305.18802","author":"Koizumi Yuma","year":"2023","unstructured":"Yuma Koizumi, Heiga Zen, Shigeki Karita, Yifan Ding, Kohei Yatabe, Nobuyuki Morioka, Michiel Bacchiani, Yu Zhang, Wei Han, and Ankur Bapna. 2023. Libritts-r: A restored multi-speaker text-to-speech corpus. arXiv preprint arXiv:2305.18802 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"VITS2: Improving Quality and Efficiency of Single-Stage Text-to-Speech with Adversarial Learning and Architecture Design. arXiv preprint arXiv:2307.16430","author":"Kong Jungil","year":"2023","unstructured":"Jungil Kong, Jihoon Park, Beomjeong Kim, Jeongmin Kim, Dohee Kong, and Sangjin Kim. 2023. VITS2: Improving Quality and Efficiency of Single-Stage Text-to-Speech with Adversarial Learning and Architecture Design. arXiv preprint arXiv:2307.16430 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964310"},{"key":"e_1_3_2_1_23_1","volume-title":"PromptTTS 2: Describing and Generating Voices with Text Prompt. arXiv preprint arXiv:2309.02285","author":"Leng Yichong","year":"2023","unstructured":"Yichong Leng, Zhifang Guo, Kai Shen, Xu Tan, Zeqian Ju, Yanqing Liu, Yufei Liu, Dongchao Yang, Leying Zhang, Kaitao Song, Lei He, Xiang-Yang Li, Sheng Zhao, Tao Qin, and Jiang Bian. 2023. PromptTTS 2: Describing and Generating Voices with Text Prompt. 
arXiv preprint arXiv:2309.02285 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Natural language guidance of high-fidelity text-to-speech with synthetic annotations. arXiv preprint arXiv:2402.01912","author":"Lyth Dan","year":"2024","unstructured":"Dan Lyth and Simon King. 2024. Natural language guidance of high-fidelity text-to-speech with synthetic annotations. arXiv preprint arXiv:2402.01912 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation. arXiv preprint arXiv:2312.15185","author":"Ma Ziyang","year":"2023","unstructured":"Ziyang Ma, Zhisheng Zheng, Jiaxin Ye, Jinchao Li, Zhifu Gao, Shiliang Zhang, and Xie Chen. 2023. emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation. arXiv preprint arXiv:2312.15185 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395","author":"Mei Xinhao","year":"2023","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. 2023. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139). PMLR, 8748--8763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2212.04356"},{"key":"e_1_3_2_1_30_1","volume-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_32_1","volume-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers. arXiv preprint arXiv:2304.09116","author":"Shen Kai","year":"2023","unstructured":"Kai Shen, Zeqian Ju, Xu Tan, Yanqing Liu, Yichong Leng, Lei He, Tao Qin, Sheng Zhao, and Jiang Bian. 2023. Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers. arXiv preprint arXiv:2304.09116 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Aishell-3: A multi-speaker mandarin tts corpus and the baselines. arXiv preprint arXiv:2010.11567","author":"Shi Yao","year":"2020","unstructured":"Yao Shi, Hui Bu, Xin Xu, Shaoji Zhang, and Ming Li. 2020. Aishell-3: A multi-speaker mandarin tts corpus and the baselines. arXiv preprint arXiv:2010.11567 (2020)."},{"key":"e_1_3_2_1_35_1","volume-title":"Salmonn: Towards generic hearing abilities for large language models. 
{"key":"e_1_3_2_1_35_1","volume-title":"Salmonn: Towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289","author":"Tang Changli","year":"2023","unstructured":"Changli Tang, Wenyi Yu, Guangzhi Sun, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, Zejun Ma, and Chao Zhang. 2023. Salmonn: Towards generic hearing abilities for large language models. arXiv preprint arXiv:2310.13289 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, and others. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2007-611"},{"key":"e_1_3_2_1_38_1","volume-title":"Audiobox: Unified audio generation with natural language prompts. arXiv preprint arXiv:2312.15821","author":"Vyas Apoorv","year":"2023","unstructured":"Apoorv Vyas, Bowen Shi, Matthew Le, Andros Tjandra, Yi-Chiao Wu, Baishan Guo, Jiemin Zhang, Xinyue Zhang, Robert Adkins, William Ngan, and others. 2023. Audiobox: Unified audio generation with natural language prompts. arXiv preprint arXiv:2312.15821 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111","author":"Wang Chengyi","year":"2023","unstructured":"Chengyi Wang, Sanyuan Chen, Yu Wu, Ziqiang Zhang, Long Zhou, Shujie Liu, Zhuo Chen, Yanqing Liu, Huaming Wang, Jinyu Li, and others. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"MEAD: A Large-Scale Audio-Visual Dataset for Emotional Talking-Face Generation. In Computer Vision -- ECCV","author":"Wang Kaisiyuan","year":"2020","unstructured":"Kaisiyuan Wang, Qianyi Wu, Linsen Song, Zhuoqian Yang, Wayne Wu, Chen Qian, Ran He, Yu Qiao, and Chen Change Loy. 2020. MEAD: A Large-Scale Audio-Visual Dataset for Emotional Talking-Face Generation. In Computer Vision -- ECCV 2020. Springer International Publishing, Cham, 700--717."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","unstructured":"Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, and Shlomo Dubnov. 2023. Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation. In ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 1--5. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10095969","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29902"},{"key":"e_1_3_2_1_44_1","unstructured":"Aiyuan Yang et al. 2023. Baichuan 2: Open Large-scale Language Models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Instructtts: Modelling expressive TTS in discrete latent space with natural language style prompt. arXiv preprint arXiv:2301.13662","author":"Yang Dongchao","year":"2023","unstructured":"Dongchao Yang, Songxiang Liu, Rongjie Huang, Chao Weng, and Helen Meng. 2023. Instructtts: Modelling expressive TTS in discrete latent space with natural language style prompt. arXiv preprint arXiv:2301.13662 (2023)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2017.01.026"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681674","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681674","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681674"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":44,"alternative-id":["10.1145\/3664647.3681674","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681674","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
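
The record above is a Crossref REST API "work" message. As a usage note, the sketch below shows how such a record could be fetched and a few of its fields read. This is a minimal sketch, not part of the record: it assumes the third-party Python "requests" package, and the contact address in the User-Agent header is a placeholder; the endpoint and the field names are taken from the record itself.

# Fetch the Crossref work record for this DOI and print a few fields.
import requests

DOI = "10.1145/3664647.3681674"
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    # Crossref asks polite clients to identify themselves; the mailto is a placeholder.
    headers={"User-Agent": "example-fetcher/0.1 (mailto:you@example.org)"},
    timeout=30,
)
resp.raise_for_status()
msg = resp.json()["message"]  # same object as the "message" field above

print(msg["title"][0])                       # paper title
print(msg["container-title"][0])             # proceedings name
print(msg["page"], msg["references-count"])  # "1255-1264" 44
for ref in msg.get("reference", [])[:3]:
    # a reference entry may carry a DOI, an unstructured citation string, or both
    print("-", ref.get("DOI") or ref.get("unstructured", "")[:80])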