{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T10:52:35Z","timestamp":1769597555410,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613823","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"5966-5974","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Emotionally Situated Text-to-Speech Synthesis in User-Agent Conversation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6873-7530","authenticated-orcid":false,"given":"Yuchen","family":"Liu","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9646-9802","authenticated-orcid":false,"given":"Haoyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7218-3877","authenticated-orcid":false,"given":"Shichao","family":"Liu","sequence":"additional","affiliation":[{"name":"Bytedance, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0472-2783","authenticated-orcid":false,"given":"Xiang","family":"Yin","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5508-1328","authenticated-orcid":false,"given":"Zejun","family":"Ma","sequence":"additional","affiliation":[{"name":"Bytedance, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Controllable context-aware conversational speech synthesis. arXiv preprint arXiv:2106.10828","author":"Cong Jian","year":"2021","unstructured":"Jian Cong, Shan Yang, Na Hu, Guangzhi Li, Lei Xie, and Dan Su. 2021. Controllable context-aware conversational speech synthesis. arXiv preprint arXiv:2106.10828 (2021)."},{"key":"e_1_3_2_1_2_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_3_1","unstructured":"Deepanway Ghosal Navonil Majumder Soujanya Poria Niyati Chhaya and Alexander Gelbukh. [n. d.]. DialogueGCN: A Graph Convolutional Neural Network for Emotion Recognition in Conversation. ([n. d.])."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383460"},{"key":"e_1_3_2_1_5_1","volume-title":"an introduction to voice assistants. Medical reference services quarterly 37, 1","author":"Hoy Matthew B","year":"2018","unstructured":"Matthew B Hoy. 2018. Alexa, Siri, Cortana, and more: an introduction to voice assistants. Medical reference services quarterly 37, 1 (2018), 81--88."},{"key":"e_1_3_2_1_6_1","volume-title":"Mmgcn: Multimodal fusion via deep graph convolution network for emotion recognition in conversation. arXiv preprint arXiv:2107.06779","author":"Hu Jingwen","year":"2021","unstructured":"Jingwen Hu, Yuchen Liu, Jinming Zhao, and Qin Jin. 2021. Mmgcn: Multimodal fusion via deep graph convolution network for emotion recognition in conversation. arXiv preprint arXiv:2107.06779 (2021)."},{"key":"e_1_3_2_1_7_1","volume-title":"FCTalker: Fine and Coarse Grained Context Modeling for Expressive Conversational Speech Synthesis. arXiv e-prints","author":"Hu Yifan","year":"2022","unstructured":"Yifan Hu, Rui Liu, Guanglai Gao, and Haizhou Li. 2022. FCTalker: Fine and Coarse Grained Context Modeling for Expressive Conversational Speech Synthesis. arXiv e-prints (2022), arXiv-2210."},{"key":"e_1_3_2_1_8_1","volume-title":"International conference on machine learning. PMLR, 4651--4664","author":"Jaegle Andrew","year":"2021","unstructured":"Andrew Jaegle, Felix Gimeno, Andy Brock, Oriol Vinyals, Andrew Zisserman, and Joao Carreira. 2021. Perceiver: General perception with iterative attention. In International conference on machine learning. PMLR, 4651--4664."},{"key":"e_1_3_2_1_9_1","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems 33 (2020), 17022--17033.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_10_1","volume-title":"DailyTalk: Spoken Dialogue Dataset for Conversational Text-to-Speech. arXiv e-prints","author":"Lee Keon","year":"2022","unstructured":"Keon Lee, Kyumin Park, and Daeyoung Kim. 2022. DailyTalk: Spoken Dialogue Dataset for Conversational Text-to-Speech. arXiv e-prints (2022), arXiv-2207."},{"key":"e_1_3_2_1_11_1","volume-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Li Jingbei","unstructured":"Jingbei Li, Yi Meng, Chenyi Li, Zhiyong Wu, Helen Meng, Chao Weng, and Dan Su. 2022. Enhancing Speaking Styles in Conversational Text-to-Speech Synthesis with Graph-Based Multi-Modal Context Modeling. In ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 7917--7921."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547831"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 986--995","author":"Li Yanran","year":"2017","unstructured":"Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. 2017. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. In Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 986--995."},{"key":"e_1_3_2_1_14_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 684--693","author":"Liu Yuchen","year":"2022","unstructured":"Yuchen Liu, Jinming Zhao, Jingwen Hu, Ruichen Li, and Qin Jin. 2022. DialogueEIN: Emotion Interaction Network for Dialogue Affective Analysis. In Proceedings of the 29th International Conference on Computational Linguistics. 684--693."},{"key":"e_1_3_2_1_16_1","unstructured":"Ilya Loshchilov and Frank Hutter. [n. d.]. DECOUPLED WEIGHT DECAY REGULARIZATION. ([n. d.])."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"e_1_3_2_1_18_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499","author":"van den Oord Aaron","year":"2016","unstructured":"Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. 2016. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)."},{"key":"e_1_3_2_1_19_1","volume-title":"International Conference on Learning Representations.","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","volume-title":"Fastspeech: Fast, robust and controllable text to speech. Advances in Neural Information Processing Systems 32","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. Fastspeech: Fast, robust and controllable text to speech. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"International journal of computers and communications 8","author":"Rojc Matej","year":"2014","unstructured":"Matej Rojc, Marko Presker, Zdravko Ka\u010di\u0107, and Izidor Mlakar. 2014. TTS-driven expressive embodied conversation agent EVA for UMB-SmartTV. International journal of computers and communications 8 (2014), 57--66."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_23_1","volume-title":"DialogXL: All-in-one XLNet for multi-party conversation emotion recognition. arXiv preprint arXiv:2012.08695","author":"Shen Weizhou","year":"2020","unstructured":"Weizhou Shen, Junqing Chen, Xiaojun Quan, and Zhixian Xie. 2020. DialogXL: All-in-one XLNet for multi-party conversation emotion recognition. arXiv preprint arXiv:2012.08695 (2020)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.1700826"},{"key":"e_1_3_2_1_25_1","unstructured":"Xu Tan Tao Qin Frank Soong and Tie-Yan Liu. [n. d.]. A Survey on Neural Speech Synthesis. ([n. d.])."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the conference. Association for Computational Linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for Computational Linguistics. Meeting, Vol. 2019. NIH Public Access, 6558."},{"key":"e_1_3_2_1_27_1","volume-title":"Ying Xiao, Zhifeng Chen, Samy Bengio, Quoc Le, et al.","author":"Wang Yuxuan","year":"2017","unstructured":"Yuxuan Wang, RJ Skerry-Ryan, Daisy Stanton, Yonghui Wu, Ron J Weiss, Zongheng Yang Jaitly, Ying Xiao, Zhifeng Chen, Samy Bengio, Quoc Le, et al. 2017. Tacotron: Towards End-to-End Speech Synthesis. (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 5180--5189","author":"Wang Yuxuan","year":"2018","unstructured":"Yuxuan Wang, Daisy Stanton, Yu Zhang, RJ-Skerry Ryan, Eric Battenberg, Joel Shor, Ying Xiao, Ye Jia, Fei Ren, and Rif A Saurous. 2018. Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis. In International Conference on Machine Learning. PMLR, 5180--5189."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023186"},{"key":"e_1_3_2_1_30_1","volume-title":"Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems 32","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ R Salakhutdinov, and Quoc V Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1162\/coli_a_00368"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613823","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613823","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:31Z","timestamp":1755820831000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613823"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":31,"alternative-id":["10.1145\/3581783.3613823","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613823","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}