{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T11:34:57Z","timestamp":1774006497241,"version":"3.50.1"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100010097","name":"China Association for Science and Technology","doi-asserted-by":"publisher","award":["2024QNRC001"],"award-info":[{"award-number":["2024QNRC001"]}],"id":[{"id":"10.13039\/100010097","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004763","name":"Natural Science Foundation of Inner Mongolia Autonomous Region","doi-asserted-by":"publisher","award":["2025JQ011"],"award-info":[{"award-number":["2025JQ011"]}],"id":[{"id":"10.13039\/501100004763","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012272","name":"Shenzhen Technology Development Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012272","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100012541","name":"Guangdong Innovative and Entrepreneurial Research Team Program","doi-asserted-by":"publisher","award":["2023ZT10X044"],"award-info":[{"award-number":["2023ZT10X044"]}],"id":[{"id":"10.13039\/100012541","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.specom.2026.103353","type":"journal-article","created":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T17:01:31Z","timestamp":1769533291000},"page":"103353","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Emphasis rendering for conversational text-to-speech with multi-modal multi-scale context modeling"],"prefix":"10.1016","volume":"178","author":[{"given":"Rui","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0494-3669","authenticated-orcid":false,"given":"Jia","family":"Zhenqi","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yifan","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.specom.2026.103353_b1","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.specom.2026.103353_b2","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","article-title":"IEMOCAP: Interactive emotional dyadic motion capture database","volume":"42","author":"Busso","year":"2008","journal-title":"Lang. Resour. Eval."},{"key":"10.1016\/j.specom.2026.103353_b3","series-title":"11th International Conference on Language Resources and Evaluation","first-page":"1597","article-title":"Emotionlines: An emotion corpus of multi-party conversations","author":"Chen","year":"2019"},{"key":"10.1016\/j.specom.2026.103353_b4","series-title":"2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference","first-page":"2409","article-title":"Multi-granularity semantic and acoustic stress prediction for expressive TTS","author":"Chi","year":"2023"},{"key":"10.1016\/j.specom.2026.103353_b5","series-title":"Listening for sound, listening for meaning: Task effects on prosodic transcription","author":"Cole","year":"2014"},{"key":"10.1016\/j.specom.2026.103353_b6","doi-asserted-by":"crossref","first-page":"300","DOI":"10.1016\/j.csl.2017.02.008","article-title":"Crowd-sourcing prosodic annotation","volume":"45","author":"Cole","year":"2017","journal-title":"Comput. Speech Lang."},{"key":"10.1016\/j.specom.2026.103353_b7","series-title":"Statistical Methods for Rates and Proportions","author":"Fleiss","year":"2013"},{"key":"10.1016\/j.specom.2026.103353_b8","doi-asserted-by":"crossref","unstructured":"Ghosal, Deepanway, Majumder, Navonil, Poria, Soujanya, Chhaya, Niyati, Gelbukh, Alexander, 2019. DialogueGCN: A Graph Convolutional Neural Network for Emotion Recognition in Conversation. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing. EMNLP-IJCNLP, pp. 154\u2013164.","DOI":"10.18653\/v1\/D19-1015"},{"key":"10.1016\/j.specom.2026.103353_b9","series-title":"2021 IEEE Spoken Language Technology Workshop","first-page":"403","article-title":"Conversational end-to-end tts for voice agents","author":"Guo","year":"2021"},{"key":"10.1016\/j.specom.2026.103353_b10","series-title":"National Conference on Man-Machine Speech Communication","first-page":"306","article-title":"Automatic stress annotation and prediction for expressive mandarin TTS","author":"He","year":"2022"},{"key":"10.1016\/j.specom.2026.103353_b11","series-title":"FCTalker: Fine and coarse grained context modeling for expressive conversational speech synthesis","author":"Hu","year":"2022"},{"key":"10.1016\/j.specom.2026.103353_b12","doi-asserted-by":"crossref","unstructured":"Hu, Yifan, Liu, Rui, Ren, Yi, Yin, Xiang, Li, Haizhou, 2025. UniTalker: Conversational Speech-Visual Synthesis. In: Proceedings of the 33rd ACM International Conference on Multimedia. pp. 10248\u201310257.","DOI":"10.1145\/3746027.3755502"},{"key":"10.1016\/j.specom.2026.103353_b13","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Intra-and inter-modal context interaction modeling for conversational speech synthesis","author":"Jia","year":"2025"},{"key":"10.1016\/j.specom.2026.103353_b14","doi-asserted-by":"crossref","unstructured":"Jia, Zhenqi, Liu, Rui, Sisman, Berrak, Li, Haizhou, 2025. Multimodal Fine-grained Context Interaction Graph Modeling for Conversational Speech Synthesis. In: Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing. pp. 8863\u20138869.","DOI":"10.18653\/v1\/2025.emnlp-main.448"},{"key":"10.1016\/j.specom.2026.103353_b15","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.specom.2026.103353_b16","series-title":"BASE TTS: Lessons from building a billion-parameter text-to-speech model on 100 K hours of data","author":"\u0141ajszczak","year":"2024"},{"key":"10.1016\/j.specom.2026.103353_b17","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Dailytalk: Spoken dialogue dataset for conversational text-to-speech","author":"Lee","year":"2023"},{"key":"10.1016\/j.specom.2026.103353_b18","doi-asserted-by":"crossref","DOI":"10.1109\/TASLP.2023.3301217","article-title":"MSStyleTTS: Multi-scale style modeling with hierarchical context information for expressive speech synthesis","author":"Lei","year":"2023","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2026.103353_b19","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7917","article-title":"Enhancing speaking styles in conversational text-to-speech synthesis with graph-based multi-modal context modeling","author":"Li","year":"2022"},{"key":"10.1016\/j.specom.2026.103353_b20","doi-asserted-by":"crossref","unstructured":"Li, Jingbei, Meng, Yi, Wu, Xixin, Wu, Zhiyong, Jia, Jia, Meng, Helen, Tian, Qiao, Wang, Yuping, Wang, Yuxuan, 2022b. Inferring speaking styles from multi-modal conversational context by multi-scale relational graph convolutional networks. In: Proceedings of the 30th ACM International Conference on Multimedia. pp. 5811\u20135820.","DOI":"10.1145\/3503161.3547831"},{"key":"10.1016\/j.specom.2026.103353_b21","doi-asserted-by":"crossref","unstructured":"Liu, Rui, He, Shuwei, Hu, Yifan, Li, Haizhou, 2025a. Multi-modal and multi-scale spatial environment understanding for immersive visual text-to-speech. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol. 39, pp. 24632\u201324640, 23.","DOI":"10.1609\/aaai.v39i23.34643"},{"key":"10.1016\/j.specom.2026.103353_b22","doi-asserted-by":"crossref","unstructured":"Liu, Rui, Hu, Yifan, Ren, Yi, Yin, Xiang, Li, Haizhou, 2024a. Emotion Rendering for Conversational Speech Synthesis with Heterogeneous Graph-Based Context Modeling. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 1\u20139.","DOI":"10.1609\/aaai.v38i17.29833"},{"key":"10.1016\/j.specom.2026.103353_b23","series-title":"2021 IEEE Spoken Language Technology Workshop","first-page":"410","article-title":"Controllable emphatic speech synthesis based on forward attention for expressive speech synthesis","author":"Liu","year":"2021"},{"key":"10.1016\/j.specom.2026.103353_b24","doi-asserted-by":"crossref","first-page":"1075","DOI":"10.1109\/TASLP.2023.3348762","article-title":"Text-to-speech for low-resource agglutinative language with morphology-aware language model pre-training","volume":"32","author":"Liu","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2026.103353_b25","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.102948","article-title":"Retrieval-augmented dialogue knowledge aggregation for expressive conversational speech synthesis","volume":"118","author":"Liu","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.specom.2026.103353_b26","doi-asserted-by":"crossref","first-page":"2188","DOI":"10.1109\/TASLP.2024.3378110","article-title":"Controllable accented text-to-speech synthesis with fine and coarse-grained intensity rendering","volume":"32","author":"Liu","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2026.103353_b27","doi-asserted-by":"crossref","DOI":"10.1109\/TASLPRO.2025.3624913","article-title":"FluentEditor2: Text-based speech editing by modeling multi-scale acoustic and prosody consistency","author":"Liu","year":"2025","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2026.103353_b28","doi-asserted-by":"crossref","unstructured":"Liu, Yuchen, Zhang, Haoyu, Liu, Shichao, Yin, Xiang, Ma, Zejun, Jin, Qin, 2023. Emotionally Situated Text-to-Speech Synthesis in User-Agent Conversation. In: Proceedings of the 31st ACM International Conference on Multimedia. pp. 5966\u20135974.","DOI":"10.1145\/3581783.3613823"},{"issue":"4","key":"10.1016\/j.specom.2026.103353_b29","doi-asserted-by":"crossref","first-page":"1856","DOI":"10.1109\/TAFFC.2024.3378570","article-title":"Contrastive learning based modality-invariant feature acquisition for robust multimodal emotion recognition with missing modalities","volume":"15","author":"Liu","year":"2024","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.specom.2026.103353_b30","doi-asserted-by":"crossref","unstructured":"Liu, Rui, Zuo, Haolin, Lian, Zheng, Yuan, Hongyu, Fan, Qi, 2025d. Hardness-Aware Dynamic Curriculum Learning for Robust Multimodal Emotion Recognition with Missing Modalities. In: Proceedings of the 33rd ACM International Conference on Multimedia. pp. 5755\u20135764.","DOI":"10.1145\/3746027.3755605"},{"key":"10.1016\/j.specom.2026.103353_b31","doi-asserted-by":"crossref","unstructured":"Mass, Yosi, Shechtman, Slava, Mordechay, Moran, Hoory, Ron, Sar Shalom, Oren, Lev, Guy, Konopnicki, David, 2018. Word Emphasis Prediction for Expressive Text to Speech. In: Proc. Interspeech 2018. pp. 2868\u20132872.","DOI":"10.21437\/Interspeech.2018-1159"},{"key":"10.1016\/j.specom.2026.103353_b32","doi-asserted-by":"crossref","unstructured":"McAuliffe, Michael, Socolof, Michaela, Mihuc, Sarah, Wagner, Michael, Sonderegger, Morgan, 2017. Montreal Forced Aligner: Trainable Text-Speech Alignment Using Kaldi. In: Proc. Interspeech 2017. pp. 498\u2013502.","DOI":"10.21437\/Interspeech.2017-1386"},{"issue":"3","key":"10.1016\/j.specom.2026.103353_b33","first-page":"1","article-title":"Conversational AI: Dialogue systems, conversational agents, and chatbots","volume":"13","author":"McTear","year":"2020","journal-title":"Synth. Lect. Hum. Lang. Technol."},{"key":"10.1016\/j.specom.2026.103353_b34","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"12281","article-title":"Crowdsourced and automatic speech prominence estimation","author":"Morrison","year":"2024"},{"key":"10.1016\/j.specom.2026.103353_b35","series-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","article-title":"MELD: A multimodal multi-party dataset for emotion recognition in conversations","author":"Poria","year":"2019"},{"key":"10.1016\/j.specom.2026.103353_b36","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7587","article-title":"Hierarchical prosody modeling and control in non-autoregressive parallel neural TTS","author":"Raitio","year":"2022"},{"key":"10.1016\/j.specom.2026.103353_b37","doi-asserted-by":"crossref","unstructured":"Reimers, Nils, Gurevych, Iryna, 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing. EMNLP-IJCNLP, pp. 3982\u20133992.","DOI":"10.18653\/v1\/D19-1410"},{"key":"10.1016\/j.specom.2026.103353_b38","series-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","author":"Ren","year":"2020"},{"key":"10.1016\/j.specom.2026.103353_b39","doi-asserted-by":"crossref","unstructured":"Sang, Erik Tjong Kim, De Meulder, Fien, 2003. Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition. In: Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003. pp. 142\u2013147.","DOI":"10.3115\/1119176.1119195"},{"issue":"4","key":"10.1016\/j.specom.2026.103353_b40","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3386867","article-title":"Voice in human\u2013agent interaction: A survey","volume":"54","author":"Seaborn","year":"2021","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.specom.2026.103353_b41","series-title":"Emphasis control for parallel neural TTS","author":"Seshadri","year":"2021"},{"issue":"2","key":"10.1016\/j.specom.2026.103353_b42","doi-asserted-by":"crossref","first-page":"868","DOI":"10.3390\/app13020868","article-title":"HierTTS: Expressive end-to-end text-to-waveform using a multi-scale hierarchical variational auto-encoder","volume":"13","author":"Shang","year":"2023","journal-title":"Appl. Sci."},{"key":"10.1016\/j.specom.2026.103353_b43","doi-asserted-by":"crossref","unstructured":"Shirani, Amirreza, Dernoncourt, Franck, Asente, Paul, Lipka, Nedim, Kim, Seokhwan, Echevarria, Jose, Solorio, Thamar, 2019. Learning emphasis selection for written text in visual media from crowd-sourced label distributions. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics. pp. 1167\u20131172.","DOI":"10.18653\/v1\/P19-1112"},{"issue":"2","key":"10.1016\/j.specom.2026.103353_b44","doi-asserted-by":"crossref","first-page":"213","DOI":"10.1007\/s00530-014-0446-1","article-title":"Mean opinion score (MOS) revisited: methods and applications, limitations and alternatives","volume":"22","author":"Streijl","year":"2016","journal-title":"Multimedia Syst."},{"key":"10.1016\/j.specom.2026.103353_b45","article-title":"Connecting cross-modal representations for compact and robust multimodal sentiment analysis with sentiment word substitution error","author":"Sun","year":"2024","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.specom.2026.103353_b46","series-title":"Advances in Signal Processing and Intelligent Recognition Systems: 4th International Symposium SIRS 2018, Bangalore, India, September 19\u201322, 2018, Revised Selected Papers 4","first-page":"190","article-title":"Survey on virtual assistant: Google assistant, siri, cortana, alexa","author":"Tulshan","year":"2019"},{"key":"10.1016\/j.specom.2026.103353_b47","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing","first-page":"917","article-title":"TOD-BERT: Pre-trained natural language understanding for task-oriented dialogue","author":"Wu","year":"2020"},{"key":"10.1016\/j.specom.2026.103353_b48","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"M 2-CTTS: End-to-end multi-scale multi-modal conversational text-to-speech synthesis","author":"Xue","year":"2023"},{"key":"10.1016\/j.specom.2026.103353_b49","series-title":"Open source magicdata-ramc: A rich annotated mandarin conversational (ramc) speech dataset","author":"Yang","year":"2022"},{"key":"10.1016\/j.specom.2026.103353_b50","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","volume":"32","author":"Yang","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.specom.2026.103353_b51","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Towards expressive video dubbing with multiscale multimodal context interaction","author":"Zhao","year":"2025"},{"key":"10.1016\/j.specom.2026.103353_b52","doi-asserted-by":"crossref","unstructured":"Zhao, Jinming, Zhang, Tenggan, Hu, Jingwen, Liu, Yuchen, Jin, Qin, Wang, Xinchao, Li, Haizhou, 2022. M3ED: Multi-modal Multi-scene Multi-label Emotional Dialogue Database. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). pp. 5699\u20135710.","DOI":"10.18653\/v1\/2022.acl-long.391"},{"key":"10.1016\/j.specom.2026.103353_b53","series-title":"EE-TTS: Emphatic expressive TTS with linguistic information","author":"Zhong","year":"2023"}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000014?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000014?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T09:21:09Z","timestamp":1773998469000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639326000014"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":53,"alternative-id":["S0167639326000014"],"URL":"https:\/\/doi.org\/10.1016\/j.specom.2026.103353","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Emphasis rendering for conversational text-to-speech with multi-modal multi-scale context modeling","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2026.103353","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103353"}}