{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T18:23:24Z","timestamp":1776277404094,"version":"3.50.1"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,7]]},"DOI":"10.1109\/iscslp63861.2024.10800495","type":"proceedings-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T19:11:17Z","timestamp":1734981077000},"page":"611-615","source":"Crossref","is-referenced-by-count":1,"title":["Expressive Text-to-Speech with Contextual Background for ICAGC 2024"],"prefix":"10.1109","author":[{"given":"Yu","family":"Jiang","sequence":"first","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianrui","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Gong","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiuyu","family":"Liu","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zikang","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Longbiao","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianwu","family":"Dang","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"crossref","DOI":"10.1109\/ISCSLP63861.2024.10800374","article-title":"Inspirational and Convincing Audio Generation Challenge 2024 ICAGC 2024","volume-title":"The 14th International Symposium on Chinese Spoken Lan-guage Processing (ISCSLP 2024)","author":"Fu"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2024.3451951"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446203"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3485485"},{"key":"ref5","article-title":"Viola: Unified codec language models for speech recognition, synthesis, and translation","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2023.3250266"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681688"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"ref9","first-page":"1877","article-title":"Lan-guage models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref10","first-page":"5180","article-title":"Style tokens: Un-supervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"International conference on machine learning","author":"Wang"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446467"},{"key":"ref12","first-page":"6840","article-title":"Denoising diffusion probabilis-tic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2860246"},{"key":"ref14","first-page":"1","article-title":"Speech emotion di-arization: Which emotion appears when?","volume-title":"2023 IEEE Auto-matic Speech Recognition and Understanding Workshop (ASRU)","author":"Wang"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1405"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3268730"},{"key":"ref17","article-title":"Audiogen: Textu-ally guided audio generation","author":"Kreuk","year":"2022","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"Auto-encoding variational bayes","author":"Kingma","year":"2013","journal-title":"arXiv preprint"},{"key":"ref19","article-title":"Audioldm: Text-to-audio generation with latent diffusion models","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref20","first-page":"916","article-title":"Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models","volume-title":"International Conference on Machine Learning","volume":"13","author":"Huang","year":"2023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"issue":"8","key":"ref23","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref24","article-title":"Bert: Pretraining of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref26","article-title":"Neural discrete representation learning","volume":"30","author":"Van","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref27","first-page":"arXiv-2207","article-title":"Masked autoencoders that listen","author":"Xu","year":"2022","journal-title":"arXiv e-prints"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-014-0446-1"}],"event":{"name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","location":"Beijing, China","start":{"date-parts":[[2024,11,7]]},"end":{"date-parts":[[2024,11,10]]}},"container-title":["2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799944\/10799969\/10800495.pdf?arnumber=10800495","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,24]],"date-time":"2024-12-24T06:26:31Z","timestamp":1735021591000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800495\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,7]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/iscslp63861.2024.10800495","relation":{},"subject":[],"published":{"date-parts":[[2024,11,7]]}}}