{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T19:06:50Z","timestamp":1764270410035,"version":"3.46.0"},"reference-count":82,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271432"],"award-info":[{"award-number":["62271432"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Science and Technology Program","award":["ZDSYS20230626091302006"],"award-info":[{"award-number":["ZDSYS20230626091302006"]}]},{"name":"Program for Guangdong Introducing Innovative and Entrepreneurial Teams","award":["2023ZT10X044"],"award-info":[{"award-number":["2023ZT10X044"]}]},{"name":"Open Project of the Key Laboratory of Artificial Intelligence, Ministry of Education","award":["AI202405"],"award-info":[{"award-number":["AI202405"]}]},{"name":"CCF-NetEase ThunderFire Innovation Research Funding","award":["CCF-Netease 202302"],"award-info":[{"award-number":["CCF-Netease 202302"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Affective Comput."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/taffc.2025.3582715","type":"journal-article","created":{"date-parts":[[2025,6,24]],"date-time":"2025-06-24T13:34:38Z","timestamp":1750772078000},"page":"3316-3328","source":"Crossref","is-referenced-by-count":2,"title":["Hierarchical Control of Emotion Rendering in Speech Synthesis"],"prefix":"10.1109","volume":"16","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7498-8611","authenticated-orcid":false,"given":"Sho","family":"Inoue","sequence":"first","affiliation":[{"name":"School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7869-4474","authenticated-orcid":false,"given":"Kun","family":"Zhou","sequence":"additional","affiliation":[{"name":"Tongyi Speech Lab, Alibaba Group, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1523-9631","authenticated-orcid":false,"given":"Shuai","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Intelligence Science and Technology, Nanjing University, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2023.3250266"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2018.3620963"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-90-481-3129-7"},{"article-title":"Emotion modelling for speech generation","year":"2023","author":"Zhou","key":"ref4"},{"issue":"10","key":"ref5","first-page":"1355","article-title":"An overview of affective speech synthesis and conversion in the deep learning ERA","volume-title":"Proc. IEEE","volume":"111","author":"Triantafyllopoulos","year":"2022"},{"article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-359"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3348762"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.2001-150"},{"article-title":"A survey on neural speech synthesis","year":"2021","author":"Tan","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1002\/9780470756959.ch23"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1093\/scan\/nst124"},{"key":"ref13","first-page":"1","volume-title":"Vocal Communication of Emotion","author":"Laukka","year":"2017"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383526"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2014"},{"key":"ref16","first-page":"1459","article-title":"Opensmile\u2013the munich versatile and fast open-source audio feature extractor","volume-title":"Proc. 18th ACM Int. Conf. Multimedia","author":"Eyben","year":"2010"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"15747","DOI":"10.18653\/v1\/2024.findings-acl.931","article-title":"emotion2vec: Self-supervised pre-training for speech emotion representation","volume-title":"Proc. Findings Assoc. Comput. Linguistics: ACL 2024","author":"Ma","year":"2024"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413907"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023186"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362069"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1405"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3268571"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3164181"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126281"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3175578"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003829"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3233324"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3145293"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445996"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC63619.2025.10848721"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4614-5143-3_5"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-33"},{"issue":"1","key":"ref35","doi-asserted-by":"crossref","first-page":"e25","DOI":"10.1016\/j.jvoice.2010.02.002","article-title":"Intonation and emotion: Influence of pitch levels and contour type on creating emotions","volume":"25","author":"Rodero","year":"2011","journal-title":"J. Voice"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.3758\/s13428-012-0314-x"},{"issue":"2","key":"ref37","doi-asserted-by":"crossref","first-page":"241","DOI":"10.1016\/S0095-4470(19)30625-4","article-title":"The influence of pitch range, duration, amplitude and spectral features on the interpretation of the rise-fall-rise intonation contour in english","volume":"20","author":"Hirschberg","year":"1992","journal-title":"J. Phonetics"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/S0749-596X(02)00519-3"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2008.2009578"},{"issue":"3","key":"ref40","doi-asserted-by":"crossref","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","article-title":"Survey on speech emotion recognition: Features, classification schemes, and databases","volume":"44","author":"Ayadi","year":"2011","journal-title":"Pattern Recognit."},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1121\/1.420109"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3378110"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/taffc.2024.3490694"},{"key":"ref44","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2018"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11336"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413398"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11133"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1129"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2477"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-947"},{"key":"ref51","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.3390\/app13042225"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"ref54","first-page":"6994","article-title":"FastDiff 2: Revisiting and incorporating GANs and diffusion models in high-fidelity speech synthesis","volume-title":"Proc. Findings Assoc. Comput. Linguistics","author":"Huang","year":"2023"},{"article-title":"BDDM: Bilateral denoising diffusion models for fast and high-quality speech synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lam","key":"ref55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"article-title":"Flow matching for generative modeling","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Lipman","key":"ref57"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413391"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2021.11.006"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref62","first-page":"8067","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Kim","year":"2020"},{"article-title":"Vocos: Closing the gap between time-domain and fourier-based neural vocoders for high-quality audio synthesis","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Siuzdak","key":"ref63"},{"article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","key":"ref64"},{"key":"ref65","first-page":"1180","article-title":"Unsupervised domain adaptation by backpropagation","volume-title":"Proc. 32nd Int. Conf. Mach. Learn.","volume":"37","author":"Ganin"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3290795"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2860246"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053581"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ACII.2017.8273610"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1155\/2015\/394083"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096626"},{"key":"ref74","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref75","first-page":"465","article-title":"Best-worst scaling more reliable than rating scales: A case study on sentiment intensity annotation","volume-title":"Proc. 55th Annu. Meeting Assoc. Comput. Linguistics","author":"Kiritchenko","year":"2017"},{"key":"ref76","doi-asserted-by":"crossref","first-page":"8747","DOI":"10.1109\/TNNLS.2022.3218982","article-title":"Measuring disentanglement: A review of metrics","volume":"35","author":"Carbonneau","year":"2024","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"article-title":"Isolating sources of disentanglement in VAEs","year":"2018","author":"Chen","key":"ref77"},{"article-title":"A framework for the quantitative evaluation of disentangled representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Eastwood","key":"ref78"},{"key":"ref79","first-page":"185","article-title":"Learning deep disentangled embeddings with the f-statistic loss","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Ridgeway","year":"2018"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3129340"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2024.3378570"}],"container-title":["IEEE Transactions on Affective Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5165369\/11269911\/11049047.pdf?arnumber=11049047","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:58:23Z","timestamp":1764269903000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11049047\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":82,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/taffc.2025.3582715","relation":{},"ISSN":["1949-3045","2371-9850"],"issn-type":[{"type":"electronic","value":"1949-3045"},{"type":"electronic","value":"2371-9850"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}