{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T18:47:02Z","timestamp":1770144422523,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681697","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"4187-4196","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Generative Expressive Conversational Speech Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4524-7413","authenticated-orcid":false,"given":"Rui","family":"Liu","sequence":"first","affiliation":[{"name":"Inner Mongolia University, Hohhot, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2276-1456","authenticated-orcid":false,"given":"Yifan","family":"Hu","sequence":"additional","affiliation":[{"name":"Inner Mongolia University, Hohhot, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9160-3848","authenticated-orcid":false,"given":"Yi","family":"Ren","sequence":"additional","affiliation":[{"name":"ByteDance, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0472-2783","authenticated-orcid":false,"given":"Xiang","family":"Yin","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"SRIBD, School of Data Science, The Chinese University of Hong Kong &amp; National University of Singapore, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635856"},{"key":"e_1_3_2_1_2_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6--12, 2020, virtual, Hugo Larochelle, Marc'Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html"},{"key":"e_1_3_2_1_3_1","volume-title":"Pheme: Efficient and Conversational Speech Generation. arXiv preprint arXiv:2401.02839","author":"Budzianowski Pawe\u0142","year":"2024","unstructured":"Pawe\u0142 Budzianowski, Taras Sereda, Tomasz Cichy, and Ivan Vuli\u0107. 2024. Pheme: Efficient and Conversational Speech Generation. arXiv preprint arXiv:2401.02839 (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/S10579-008--9076--6"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1455"},{"key":"e_1_3_2_1_6_1","volume-title":"CodeT: Code Generation with Generated Tests. In The Eleventh International Conference on Learning Representations, ICLR 2023","author":"Chen Bei","year":"2023","unstructured":"Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen. 2023. CodeT: Code Generation with Generated Tests. In The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1--5, 2023. OpenReview.net. https:\/\/openreview.net\/pdf?id=ktrw68Cmu9c"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26488"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2205.14727"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2312.10358"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612565"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612565"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446132"},{"key":"e_1_3_2_1_13_1","volume-title":"Virtual reality and augmented reality. Rheumatology teaching: the art and science of medical education","author":"Miedany Yasser El","year":"2019","unstructured":"Yasser El Miedany and Yasser El Miedany. 2019. Virtual reality and augmented reality. Rheumatology teaching: the art and science of medical education (2019), 403--427."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-020-09548-1"},{"key":"e_1_3_2_1_15_1","volume-title":"PAARTH PRASAD, DHEERAJ KUMAR, SAMYAK JAIN, and JATIN CHOPRA.","author":"ARCHISMITA","year":"2023","unstructured":"ARCHISMITA GHOSH, GADDAM PRATHIK KUMAR, PAARTH PRASAD, DHEERAJ KUMAR, SAMYAK JAIN, and JATIN CHOPRA. 2023. Synergizing Generative Intelligence: Advancements in Artificial Intelligence for Intelligent Vehicle Systems and Vehicular Networks. (2023)."},{"key":"e_1_3_2_1_16_1","unstructured":"GPT-SoVITS. 2024. https:\/\/github.com\/RVC-Boss\/GPT-SoVITS."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383460"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2210.15360"},{"key":"e_1_3_2_1_20_1","volume-title":"The lj speech dataset","author":"Ito Keith","year":"2017","unstructured":"Keith Ito and Linda Johnson. 2017. The lj speech dataset. 2017. URL https:\/\/keithito. com\/LJ-Speech-Dataset (2017)."},{"key":"e_1_3_2_1_21_1","volume-title":"Muhammad Azizi Aswad Hisham, Wan Najmi Wan Mohd Noh, and Zul Adib Izzuddin Razali.","author":"Jenal Mahyuzie","year":"2022","unstructured":"Mahyuzie Jenal, Athira Nabilla Omar, Muhammad Azizi Aswad Hisham, Wan Najmi Wan Mohd Noh, and Zul Adib Izzuddin Razali. 2022. Smart home controlling system. Journal of Electronic Voltage and Application, Vol. 3, 1 (2022), 92--104."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2307.07218"},{"key":"e_1_3_2_1_23_1","volume-title":"Unified Speech-Text Pretraining for Spoken Dialog Modeling. arXiv preprint arXiv:2402.05706","author":"Kim Heeseung","year":"2024","unstructured":"Heeseung Kim, Soonshin Seo, Kyeongseok Jeong, Ohsung Kwon, Jungwhan Kim, Jaehong Lee, Eunwoo Song, Myungwoo Oh, Sungroh Yoon, and Kang Min Yoo. 2024. Unified Speech-Text Pretraining for Spoken Dialog Modeling. arXiv preprint arXiv:2402.05706 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2402.05706"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 5530--5540","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In International Conference on Machine Learning. PMLR, 5530--5540."},{"key":"e_1_3_2_1_26_1","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia Kushal","year":"2021","unstructured":"Kushal Lakhotia, Eugene Kharitonov, Wei-Ning Hsu, Yossi Adi, Adam Polyak, Benjamin Bolte, Tu-Anh Nguyen, Jade Copet, Alexei Baevski, Abdelrahman Mohamed, et al. 2021. On generative spoken language modeling from raw audio. Transactions of the Association for Computational Linguistics, Vol. 9 (2021), 1336--1354.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095751"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747837"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547831"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the Eighth International Joint Conference on Natural Language Processing, IJCNLP 2017","volume":"995","author":"Li Yanran","year":"2017","unstructured":"Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. 2017. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. In Proceedings of the Eighth International Joint Conference on Natural Language Processing, IJCNLP 2017, Taipei, Taiwan, November 27 - December 1, 2017 - Volume 1: Long Papers, Greg Kondrak and Taro Watanabe (Eds.). Asian Federation of Natural Language Processing, 986--995. https:\/\/aclanthology.org\/I17--1099\/"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2312.15316"},{"key":"e_1_3_2_1_32_1","volume-title":"Advancing Large Language Models to Capture Varied Speaking Styles and Respond Properly in Spoken Conversations. arXiv preprint arXiv:2402.12786","author":"Lin Guan-Ting","year":"2024","unstructured":"Guan-Ting Lin, Cheng-Han Chiang, and Hung-yi Lee. 2024. Advancing Large Language Models to Capture Varied Speaking Styles and Respond Properly in Spoken Conversations. arXiv preprint arXiv:2402.12786 (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446933"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29833"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613823"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the 24th Annual Conference of the European Association for Machine Translation, EAMT 2023","author":"Moslem Yasmin","year":"2023","unstructured":"Yasmin Moslem, Rejwanul Haque, John D. Kelleher, and Andy Way. 2023. Adaptive Machine Translation with Large Language Models. In Proceedings of the 24th Annual Conference of the European Association for Machine Translation, EAMT 2023, Tampere, Finland, 12--15 June 2023, Mary Nurminen, Judith Brenner, Maarit Koponen, Sirkku Latomaa, Mikhail Mikhailov, Frederike Schierl, Tharindu Ranasinghe, Eva Vanmassenhove, Sergi Alvarez Vidal, Nora Aranberri, Mara Nunziatini, Carla Parra Escart\u00edn, Mikel L. Forcada, Maja Popovic, Carolina Scarton, and Helena Moniz (Eds.). European Association for Machine Translation, 227--237. https:\/\/aclanthology.org\/2023.eamt-1.22"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.5555\/1324818"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2203.16502"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-403"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2021-36"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Learning Representations.","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2305.13713"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2022--300"},{"key":"e_1_3_2_1_45_1","volume-title":"AISHELL-3: A Multi-speaker Mandarin TTS Corpus and the Baselines. CoRR","author":"Shi Yao","year":"2020","unstructured":"Yao Shi, Hui Bu, Xin Xu, Shaoji Zhang, and Ming Li. 2020. AISHELL-3: A Multi-speaker Mandarin TTS Corpus and the Baselines. CoRR, Vol. abs\/2010.11567 (2020). showeprint[arXiv]2010.11567 https:\/\/arxiv.org\/abs\/2010.11567"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-demos.29"},{"key":"e_1_3_2_1_47_1","volume-title":"GPTVoiceTasker: LLM-Powered Virtual Assistant for Smartphone. arXiv preprint arXiv:2401.14268","author":"Vu Minh Duc","year":"2024","unstructured":"Minh Duc Vu, Han Wang, Zhuang Li, Jieshan Chen, Shengdong Zhao, Zhenchang Xing, and Chunyang Chen. 2024. GPTVoiceTasker: LLM-Powered Virtual Assistant for Smartphone. arXiv preprint arXiv:2401.14268 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2301.02111"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2310.04673"},{"key":"e_1_3_2_1_50_1","volume-title":"Fine-grained Emotion and Intent Learning in Movie Dialogues. CoRR","author":"Welivita Anuradha","year":"2020","unstructured":"Anuradha Welivita, Yubo Xie, and Pearl Pu. 2020. Fine-grained Emotion and Intent Learning in Movie Dialogues. CoRR, Vol. abs\/2012.13624 (2020). showeprint[arXiv]2012.13624 https:\/\/arxiv.org\/abs\/2012.13624"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389705"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2401.00475"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096905"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2021--341"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2019--2441"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"e_1_3_2_1_57_1","volume-title":"SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities. In Findings of the Association for Computational Linguistics: EMNLP 2023","author":"Zhang Dong","year":"2023","unstructured":"Dong Zhang, Shimin Li, Xin Zhang, Jun Zhan, Pengyu Wang, Yaqian Zhou, and Xipeng Qiu. 2023. SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities. In Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, December 6--10, 2023, Houda Bouamor, Juan Pino, and Kalika Bali (Eds.). Association for Computational Linguistics, 15757--15773. https:\/\/aclanthology.org\/2023.findings-emnlp.1055"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.391"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681697","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681697","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681697"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":59,"alternative-id":["10.1145\/3664647.3681697","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681697","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}