{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T13:09:43Z","timestamp":1765544983523,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681465","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"7513-7522","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["UniStyle: Unified Style Modeling for Speaking Style Captioning and Stylistic Speech Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9275-523X","authenticated-orcid":false,"given":"Xinfa","family":"Zhu","sequence":"first","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3859-0927","authenticated-orcid":false,"given":"Wenjie","family":"Tian","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1826-7419","authenticated-orcid":false,"given":"Xinsheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9870-8739","authenticated-orcid":false,"given":"Lei","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5624-1704","authenticated-orcid":false,"given":"Yujia","family":"Xiao","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0434-7939","authenticated-orcid":false,"given":"Xi","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5631-0639","authenticated-orcid":false,"given":"Xu","family":"Tan","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9624-5381","authenticated-orcid":false,"given":"Sheng","family":"Zhao","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8234-0823","authenticated-orcid":false,"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"doi-asserted-by":"publisher","key":"e_1_3_2_1_1_1","DOI":"10.1109\/APSIPA.2017.8282282"},{"key":"e_1_3_2_1_2_1","volume-title":"Designing and Evaluating Speech Emotion Recognition Systems: A Reality Check Case Study with IEMOCAP. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"Antoniou Nikolaos","year":"2023","unstructured":"Nikolaos Antoniou, Athanasios Katsamanis, Theodoros Giannakopoulos, and Shrikanth Narayanan. 2023. Designing and Evaluating Speech Emotion Recognition Systems: A Reality Check Case Study with IEMOCAP. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4--10, 2023. IEEE, 1--5."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization@ACL 2005","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization@ACL 2005, Ann Arbor, Michigan, USA, June 29, 2005, Jade Goldstein, Alon Lavie, Chin-Yew Lin, and Clare R. Voss (Eds.). Association for Computational Linguistics, 65--72."},{"volume-title":"Pearson Correlation Coefficient","author":"Benesty Jacob","unstructured":"Jacob Benesty, Jingdong Chen, Yiteng Huang, and Israel Cohen. 2009. Pearson Correlation Coefficient. Springer Berlin Heidelberg, Berlin, Heidelberg, 1--4.","key":"e_1_3_2_1_4_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1109\/ICASSP39728.2021.9413907"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_6_1","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_7_1","volume-title":"High Fidelity Neural Audio Compression. CoRR","author":"D\u00e9fossez Alexandre","year":"2022","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2022. High Fidelity Neural Audio Compression. CoRR, Vol. abs\/2210.13438 (2022)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2--7, 2019, Volume 1 (Long and Short Papers), Jill Burstein, Christy Doran, and Thamar Solorio (Eds.). Association for Computational Linguistics, 4171--4186."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/TASLP.2022.3171965"},{"key":"e_1_3_2_1_10_1","volume-title":"Prompttts: Controllable Text-To-Speech With Text Descriptions. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"Guo Zhifang","year":"2023","unstructured":"Zhifang Guo, Yichong Leng, Yihan Wu, Sheng Zhao, and Xu Tan. 2023. Prompttts: Controllable Text-To-Speech With Text Descriptions. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4--10, 2023. IEEE, 1--5."},{"key":"e_1_3_2_1_11_1","volume-title":"Multiple Acoustic Features Speech Emotion Recognition Using Cross-Attention Transformer. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"He Yurun","year":"2023","unstructured":"Yurun He, Nobuaki Minematsu, and Daisuke Saito. 2023. Multiple Acoustic Features Speech Emotion Recognition Using Cross-Attention Transformer. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4--10, 2023. IEEE, 1--5."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_13_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In The Tenth International Conference on Learning Representations, ICLR 2022","author":"Hu Edward J.","year":"2022","unstructured":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25--29, 2022. OpenReview.net."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.1016\/j.bspc.2020.101894"},{"key":"e_1_3_2_1_15_1","volume-title":"TextrolSpeech: A Text Style Control Speech Corpus With Codec Language Text-to-Speech Models. CoRR","author":"Ji Shengpeng","year":"2023","unstructured":"Shengpeng Ji, Jialong Zuo, Minghui Fang, Ziyue Jiang, Feiyang Chen, Xinyu Duan, Baoxing Huai, and Zhou Zhao. 2023. TextrolSpeech: A Text Style Control Speech Corpus With Codec Language Text-to-Speech Models. CoRR, Vol. abs\/2308.14430 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"e_1_3_2_1_17_1","volume-title":"Libriheavy: a 50, 000 hours ASR corpus with punctuation casing and context. CoRR","author":"Kang Wei","year":"2023","unstructured":"Wei Kang, Xiaoyu Yang, Zengwei Yao, Fangjun Kuang, Yifan Yang, Liyong Guo, Long Lin, and Daniel Povey. 2023. Libriheavy: a 50, 000 hours ASR corpus with punctuation casing and context. CoRR, Vol. abs\/2309.08105 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"SC VALL-E: Style-Controllable Zero-Shot Text to Speech Synthesizer. CoRR","author":"Kim Daegyeom","year":"2023","unstructured":"Daegyeom Kim, Seongho Hong, and Yong-Hoon Choi. 2023. SC VALL-E: Style-Controllable Zero-Shot Text to Speech Synthesizer. CoRR, Vol. abs\/2307.10550 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24","volume":"5540","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 5530--5540."},{"key":"e_1_3_2_1_20_1","volume-title":"22nd Annual Conference of the International Speech Communication Association, Brno","author":"Kim Minchan","year":"2021","unstructured":"Minchan Kim, Sung Jun Cheon, Byoung Jin Choi, Jong Jin Kim, and Nam Soo Kim. 2021. Expressive Text-to-Speech Using Style Tag. In Interspeech 2021, 22nd Annual Conference of the International Speech Communication Association, Brno, Czechia, 30 August - 3 September 2021, Hynek Hermansky, Honza Cernock\u00fd, Luk\u00e1s Burget, Lori Lamel, Odette Scharenborg, and Petr Motl\u00edcek (Eds.). ISCA, 4663--4667."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.1109\/TASLP.2022.3145293"},{"key":"e_1_3_2_1_22_1","volume-title":"PromptTTS 2: Describing and Generating Voices with Text Prompt. CoRR","author":"Leng Yichong","year":"2023","unstructured":"Yichong Leng, Zhifang Guo, Kai Shen, Xu Tan, Zeqian Ju, Yanqing Liu, Yufei Liu, Dongchao Yang, Leying Zhang, Kaitao Song, Lei He, Xiang-Yang Li, Sheng Zhao, Tao Qin, and Jiang Bian. 2023. PromptTTS 2: Describing and Generating Voices with Text Prompt. CoRR, Vol. abs\/2309.02285 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning, ICML 2023, 23--29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730--19742."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1109\/TASLP.2022.3164181"},{"key":"e_1_3_2_1_25_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. ACL, 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. ACL, 74--81."},{"key":"e_1_3_2_1_26_1","volume-title":"David Cheng-Han Chiang, and Hung-yi Lee","author":"Lin Guan-Ting","year":"2024","unstructured":"Guan-Ting Lin, David Cheng-Han Chiang, and Hung-yi Lee. 2024. Advancing Large Language Models to Capture Varied Speaking Styles and Respond Properly in Spoken Conversations. CoRR, Vol. abs\/2402.12786 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.21437\/Interspeech.2023-1779"},{"key":"e_1_3_2_1_28_1","volume-title":"Referee: Towards Reference-Free Cross-Speaker Style Transfer with Low-Quality Data for Expressive Speech Synthesis","author":"Liu Songxiang","year":"2022","unstructured":"Songxiang Liu, Shan Yang, Dan Su, and Dong Yu. 2022. Referee: Towards Reference-Free Cross-Speaker Style Transfer with Low-Quality Data for Expressive Speech Synthesis. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2022, Virtual and Singapore, 23--27 May 2022. IEEE, 6307--6311."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1109\/TASLP.2023.3282092"},{"key":"e_1_3_2_1_30_1","volume-title":"Natural language guidance of high-fidelity text-to-speech with synthetic annotations. CoRR","author":"Lyth Daniel","year":"1912","unstructured":"Daniel Lyth and Simon King. 2024. Natural language guidance of high-fidelity text-to-speech with synthetic annotations. CoRR, Vol. abs\/2402.01912 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Neural TTS Stylization with Adversarial and Collaborative Games. In 7th International Conference on Learning Representations, ICLR 2019","author":"Ma Shuang","year":"2019","unstructured":"Shuang Ma, Daniel McDuff, and Yale Song. 2019. Neural TTS Stylization with Adversarial and Collaborative Games. In 7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May 6--9, 2019. OpenReview.net."},{"key":"e_1_3_2_1_32_1","volume-title":"emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation. CoRR","author":"Ma Ziyang","year":"2023","unstructured":"Ziyang Ma, Zhisheng Zheng, Jiaxin Ye, Jinchao Li, Zhifu Gao, Shiliang Zhang, and Xie Chen. 2023. emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation. CoRR, Vol. abs\/2312.15185 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, July 6--12","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, July 6--12, 2002, Philadelphia, PA, USA. ACL, 311--318."},{"key":"e_1_3_2_1_34_1","volume-title":"Compact Graph Architecture for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2021","author":"Shirian Amir","year":"2021","unstructured":"Amir Shirian and Tanaya Guha. 2021. Compact Graph Architecture for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2021, Toronto, ON, Canada, June 6--11, 2021. IEEE, 6284--6288."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 35th International Conference on Machine Learning, ICML 2018, Stockholmsm\u00e4ssan","author":"Skerry-Ryan R. J.","year":"2018","unstructured":"R. J. Skerry-Ryan, Eric Battenberg, Ying Xiao, Yuxuan Wang, Daisy Stanton, Joel Shor, Ron J. Weiss, Rob Clark, and Rif A. Saurous. 2018. Towards End-to-End Prosody Transfer for Expressive Speech Synthesis with Tacotron. In Proceedings of the 35th International Conference on Machine Learning, ICML 2018, Stockholmsm\u00e4ssan, Stockholm, Sweden, July 10--15, 2018 (Proceedings of Machine Learning Research, Vol. 80), Jennifer G. Dy and Andreas Krause (Eds.). PMLR, 4700--4709."},{"key":"e_1_3_2_1_36_1","volume-title":"21st Annual Conference of the International Speech Communication Association, Virtual Event","author":"Sorin Alexander","year":"2020","unstructured":"Alexander Sorin, Slava Shechtman, and Ron Hoory. 2020. Principal Style Components: Expressive Style Control and Cross-Speaker Transfer in Neural TTS. In Interspeech 2020, 21st Annual Conference of the International Speech Communication Association, Virtual Event, Shanghai, China, 25--29 October 2020, Helen Meng, Bo Xu, and Thomas Fang Zheng (Eds.). ISCA, 3411--3415."},{"key":"e_1_3_2_1_37_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR, Vol. abs\/2302.13971 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_38_1","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_39_1","volume-title":"Audiobox: Unified Audio Generation with Natural Language Prompts. CoRR","author":"Vyas Apoorv","year":"2023","unstructured":"Apoorv Vyas, Bowen Shi, Matthew Le, Andros Tjandra, Yi-Chiao Wu, Baishan Guo, Jiemin Zhang, Xinyue Zhang, Robert Adkins, William Ngan, Jeff Wang, Ivan Cruz, Bapi Akula, Akinniyi Akinyemi, Brian Ellis, Rashel Moritz, Yael Yungster, Alice Rakotoarison, Liang Tan, Chris Summers, Carleigh Wood, Joshua Lane, Mary Williamson, and Wei-Ning Hsu. 2023. Audiobox: Unified Audio Generation with Natural Language Prompts. CoRR, Vol. abs\/2312.15821 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. CoRR","author":"Wang Chengyi","year":"2023","unstructured":"Chengyi Wang, Sanyuan Chen, Yu Wu, Ziqiang Zhang, Long Zhou, Shujie Liu, Zhuo Chen, Yanqing Liu, Huaming Wang, Jinyu Li, Lei He, Sheng Zhao, and Furu Wei. 2023. Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. CoRR, Vol. abs\/2301.02111 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Audiodec: An Open-Source Streaming High-Fidelity Neural Audio Codec. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"Wu Yi-Chiao","year":"2023","unstructured":"Yi-Chiao Wu, Israel D. Gebru, Dejan Markovic, and Alexander Richard. 2023. Audiodec: An Open-Source Streaming High-Fidelity Neural Audio Codec. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4--10, 2023. IEEE, 1--5."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.1609\/aaai.v38i17.29902"},{"key":"e_1_3_2_1_43_1","volume-title":"StyleCap: Automatic Speaking-Style Captioning from Speech Based on Speech and Language Self-supervised Learning Models. CoRR","author":"Yamauchi Kazuki","year":"2023","unstructured":"Kazuki Yamauchi, Yusuke Ijima, and Yuki Saito. 2023. StyleCap: Automatic Speaking-Style Captioning from Speech Based on Speech and Language Self-supervised Learning Models. CoRR, Vol. abs\/2311.16509 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"InstructTTS: Modelling Expressive TTS in Discrete Latent Space with Natural Language Style Prompt. CoRR","author":"Yang Dongchao","year":"2023","unstructured":"Dongchao Yang, Songxiang Liu, Rongjie Huang, Guangzhi Lei, Chao Weng, Helen Meng, and Dong Yu. 2023. InstructTTS: Modelling Expressive TTS in Discrete Latent Space with Natural Language Style Prompt. CoRR, Vol. abs\/2301.13662 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Prosodyspeech: Towards Advanced Prosody Model for Neural Text-to-Speech. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2022","author":"Yi Yuanhao","year":"2022","unstructured":"Yuanhao Yi, Lei He, Shifeng Pan, Xi Wang, and Yujia Xiao. 2022. Prosodyspeech: Towards Advanced Prosody Model for Neural Text-to-Speech. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2022, Virtual and Singapore, 23--27 May 2022. IEEE, 7582--7586."},{"key":"e_1_3_2_1_46_1","volume-title":"Learning Latent Representations for Style Control and Transfer in End-to-end Speech Synthesis. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2019","author":"Zhang Yajie","year":"2019","unstructured":"Yajie Zhang, Shifeng Pan, Lei He, and Zhen-Hua Ling. 2019. Learning Latent Representations for Style Control and Transfer in End-to-end Speech Synthesis. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2019, Brighton, United Kingdom, May 12--17, 2019. IEEE, 6945--6949."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_47_1","DOI":"10.1109\/TASLP.2024.3363444"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_48_1","DOI":"10.1109\/ICASSP49357.2023.10095776"},{"key":"e_1_3_2_1_49_1","volume-title":"Vec-Tok Speech: speech vectorization and tokenization for neural speech generation. CoRR","author":"Zhu Xinfa","year":"2023","unstructured":"Xinfa Zhu, Yuanjun Lv, Yi Lei, Tao Li, Wendi He, Hongbin Zhou, Heng Lu, and Lei Xie. 2023. Vec-Tok Speech: speech vectorization and tokenization for neural speech generation. CoRR, Vol. abs\/2310.07246 (2023)."}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '24","name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681465","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681465","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681465"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":49,"alternative-id":["10.1145\/3664647.3681465","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681465","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}