{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:24:23Z","timestamp":1780673063092,"version":"3.54.1"},"reference-count":198,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001659","name":"DFG\u2019s Reinhart Koselleck Project","doi-asserted-by":"publisher","award":["442218748 (AUDI0NOMOUS)"],"award-info":[{"award-number":["442218748 (AUDI0NOMOUS)"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Proc. IEEE"],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1109\/jproc.2023.3250266","type":"journal-article","created":{"date-parts":[[2023,3,10]],"date-time":"2023-03-10T18:24:33Z","timestamp":1678472673000},"page":"1355-1381","source":"Crossref","is-referenced-by-count":78,"title":["An Overview of Affective Speech Synthesis and Conversion in the Deep Learning Era"],"prefix":"10.1109","volume":"111","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8338-617X","authenticated-orcid":false,"given":"Andreas","family":"Triantafyllopoulos","sequence":"first","affiliation":[{"name":"Chair of Embedded Intelligence for Health Care and Wellbeing, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6478-8699","authenticated-orcid":false,"given":"Bj\u00f6rn W.","family":"Schuller","sequence":"additional","affiliation":[{"name":"Chair of Embedded Intelligence for Health Care and Wellbeing, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6458-395X","authenticated-orcid":false,"given":"G\u00f6k\u00e7e","family":"\u0130ymen","sequence":"additional","affiliation":[{"name":"KUIS AI Laboratory, College of Engineering, Ko&#x00E7; University, Istanbul, Turkey"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Metin","family":"Sezgin","sequence":"additional","affiliation":[{"name":"KUIS AI Laboratory, College of Engineering, Ko&#x00E7; University, Istanbul, Turkey"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiangheng","family":"He","sequence":"additional","affiliation":[{"name":"Chair of Embedded Intelligence for Health Care and Wellbeing, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1610-1869","authenticated-orcid":false,"given":"Zijiang","family":"Yang","sequence":"additional","affiliation":[{"name":"Chair of Embedded Intelligence for Health Care and Wellbeing, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9449-5339","authenticated-orcid":false,"given":"Panagiotis","family":"Tzirakis","sequence":"additional","affiliation":[{"name":"Group on Language, Audio, &#x0026; Music (GLAM), Imperial College London, London, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8133-8588","authenticated-orcid":false,"given":"Shuo","family":"Liu","sequence":"additional","affiliation":[{"name":"Chair of Embedded Intelligence for Health Care and Wellbeing, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5230-5218","authenticated-orcid":false,"given":"Silvan","family":"Mertes","sequence":"additional","affiliation":[{"name":"Chair of Human-Centered Artificial Intelligence, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2367-162X","authenticated-orcid":false,"given":"Elisabeth","family":"Andr\u00e9","sequence":"additional","affiliation":[{"name":"Chair of Human-Centered Artificial Intelligence, University of Augsburg, Augsburg, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9598-1881","authenticated-orcid":false,"given":"Ruibo","family":"Fu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9344-6428","authenticated-orcid":false,"given":"Jianhua","family":"Tao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1121\/1.395275"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1121\/1.2017051"},{"key":"ref59","first-page":"1","article-title":"Concatenative speech synthesis: A review","volume":"136","author":"khan","year":"2016","journal-title":"Int J Comput Appl"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1037\/amp0000399"},{"key":"ref52","article-title":"The ICML 2022 expressive vocalizations workshop and competition: Recognizing, generating, and personalizing vocal bursts","author":"baird","year":"2022","journal-title":"arXiv 2205 01780"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/PROC.1976.10154"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1121\/1.1906583"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2018.2828429"},{"key":"ref50","article-title":"Emotion intensity and its control for emotional voice conversion","author":"zhou","year":"2022","journal-title":"IEEE Trans Affect Comput"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1037\/0022-3514.70.3.614"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.ps.41.020190.002221"},{"key":"ref48","first-page":"2417","article-title":"Voice quality: The 4th prosodic dimension","author":"campbell","year":"2003","journal-title":"Proc Int Congr Phonetic Sci"},{"key":"ref47","first-page":"220","article-title":"Vocal communication of emotion","volume":"2 2000","author":"johnstone","year":"0","journal-title":"Handbook of Emotions"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2018.06.006"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.876118"},{"key":"ref44","first-page":"137","article-title":"Psychological models of emotion","volume":"137","author":"scherer","year":"2000","journal-title":"The Neuropsychology of Emotion"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2018.8706656"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3065460"},{"key":"ref8","article-title":"Generating expression in synthesized speech","author":"cahn","year":"1989"},{"key":"ref7","article-title":"Simulating emotion in synthetic speech","author":"murray","year":"1989"},{"key":"ref9","first-page":"1","article-title":"The generation of affect in synthesized speech","volume":"8","author":"cahn","year":"1990","journal-title":"J American Voice I\/O Society"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/1-4020-2637-4_3"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/1140.001.0001"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1002\/9781118706664"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960401"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00916"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1016\/0092-6566(77)90037-X"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11955"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5044"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1037\/\/0096-3445.113.3.464"},{"key":"ref36","article-title":"AffectON: Incorporating affect into dialog generation","author":"bucinca","year":"2020","journal-title":"IEEE Trans Affect Comput"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1002\/0470013494.ch30"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-49127-9_25"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3446390"},{"key":"ref32","author":"ellsworth","year":"2003","journal-title":"Appraisal Processes in Emotion"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1080\/02699939208411068"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2019.2905209"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413466"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2011.34"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref25","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"lakhotia","year":"2021","journal-title":"Trans Assoc Comput Linguistics"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383537"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2317187"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2988781"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2021.108439"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.593"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-1894-4"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3052688"},{"key":"ref129","article-title":"Textless speech emotion conversion using discrete and decomposed representations","author":"kreuk","year":"2021","journal-title":"arXiv 2111 07402"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9687906"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2412"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413907"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2021-11"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPA.2018.8659628"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1236"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3076369"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3145293"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref131","article-title":"iEmoTTS: Toward robust cross-speaker emotion transfer and control for speech synthesis based on disentanglement between prosody and timbre","author":"zhang","year":"2022","journal-title":"arXiv 2206 14866"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3164181"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3222646"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1659961"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1131"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-33"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"ref137","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"dauphin","year":"2017","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"ref138","first-page":"1","article-title":"Unsupervised cross-domain image generation","author":"taigman","year":"2017","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(89)90041-1"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859769"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.08.009"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2017.8282282"},{"key":"ref82","first-page":"1","article-title":"Adversarial audio synthesis","author":"donahue","year":"2018","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3203888"},{"key":"ref81","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"kong","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362098"},{"key":"ref84","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","author":"popov","year":"2021","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1253"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413391"},{"key":"ref140","article-title":"Emotional voice conversion with cycle-consistent adversarial network","author":"liu","year":"2020","journal-title":"arXiv 2004 03781"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2022.09.002"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref79","first-page":"14881","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","author":"kumar","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1053"},{"key":"ref78","first-page":"1","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref109","article-title":"Emotional end-to-end neural speech synthesizer","author":"lee","year":"2017","journal-title":"arXiv 1711 05447"},{"key":"ref106","first-page":"413","article-title":"HMM-based speech synthesis with various speaking styles using model interpolation","author":"tachibana","year":"2004","journal-title":"Proc Int Conf Speech Prosody"},{"key":"ref107","doi-asserted-by":"crossref","first-page":"1145","DOI":"10.1109\/TASL.2006.876113","article-title":"Prosody conversion from neutral speech to emotional speech","volume":"14","author":"tao","year":"2006","journal-title":"IEEE Trans Audio Speech Language Process"},{"key":"ref75","first-page":"1","article-title":"Generative adversarial nets","volume":"27","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.2003-473"},{"key":"ref74","first-page":"1","article-title":"Auto-encoding variational Bayes","author":"kingma","year":"2013","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(02)00081-X"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref102","first-page":"836","article-title":"StarGAN-based emotional voice conversion for Japanese phrases","author":"moritani","year":"2021","journal-title":"Proc Asia&#x2013;Pacific Signal Inf Process Assoc Annu Summit Conf (APSIPA ASC)"},{"key":"ref76","first-page":"3918","article-title":"Parallel WaveNet: Fast high-fidelity speech synthesis","author":"oord","year":"2018","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref103","first-page":"1","article-title":"Verification of acoustical correlates of emotional speech using formant-synthesis","author":"burkhardt","year":"2000","journal-title":"Proc ISCA Tutorial Res Workshop (ITRW) Speech Emotion"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003859"},{"key":"ref70","first-page":"7586","article-title":"Non-autoregressive neural text-to-speech","author":"peng","year":"2020","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683682"},{"key":"ref73","first-page":"1","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"2020","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref72","first-page":"1","article-title":"FastSpeech: Fast, robust and controllable text to speech","volume":"32","author":"ren","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2018.03.002"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053678"},{"key":"ref67","first-page":"1","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","volume":"30","author":"gibiansky","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683865"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2878"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1016\/S0166-4115(97)80111-2"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2293"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1250\/ast.27.349"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2923951"},{"key":"ref66","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","volume":"70","author":"arik","year":"2017","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/ITC-CSCC.2019.8793393"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2512"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1647"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383524"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053255"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.04.004"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054579"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-014-0446-1"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-10-3734-4_3"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1111\/j.1745-459X.1998.tb00085.x"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2005-446"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746291"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1123"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref173","first-page":"366","article-title":"Perceptual objective listening quality assessment (POLQA), the third generation ITU-T standard for end-to-end speech quality measurement. Part I&#x2014;Temporal alignment","volume":"61","author":"beerends","year":"2013","journal-title":"J Audio Eng Soc"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.883177"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/49.138987"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10371"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15184-2_12"},{"key":"ref181","first-page":"1753","article-title":"Deep fakes: A looming challenge for privacy, democracy, and national security","volume":"107","author":"chesney","year":"2019","journal-title":"California Law Rev"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-86"},{"key":"ref189","first-page":"150","article-title":"Integrating models of personality and emotions into lifelike characters","author":"andr\u00e9","year":"1999","journal-title":"Proc Int Workshop Affect Interact"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1121\/1.1913571"},{"key":"ref187","article-title":"Myers&#x2013;Briggs personality classification and personality-specific language generation using pre-trained language models","author":"keh","year":"2019","journal-title":"arXiv 1907 06333"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.3021015"},{"key":"ref185","article-title":"Generating diverse vocal bursts with StyleGAN2 and MEL-spectrograms","author":"jiralerspong","year":"2022","journal-title":"arXiv 2206 12563"},{"key":"ref182","article-title":"I&#x2019;d blush if I could: Closing gender divides in digital skills through education","author":"west","year":"2019"},{"key":"ref183","article-title":"White paper on artificial intelligence: A European approach to excellence and trust","year":"2020"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2457417"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/IALP54817.2021.9675192"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1121\/1.392466"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1002\/ejsp.2420080405"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053732"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2931673"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/ICSP54964.2022.9778768"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3156093"},{"key":"ref151","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","author":"wang","year":"2018","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023186"},{"key":"ref150","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","author":"skerry-ryan","year":"2018","journal-title":"Proc Int Conf Mach Learn (ICML)"},{"key":"ref159","article-title":"Dawn of the transformer era in speech emotion recognition: Closing the valence gap","author":"wagner","year":"2022","journal-title":"arXiv 2203 07378"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/MIUCC55081.2022.9781718"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639682"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054534"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746994"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-017-0116-2"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2019.3"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413398"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874246"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/ACII.2017.8273608"},{"key":"ref161","first-page":"1","article-title":"Dynamic routing between capsules","volume":"30","author":"sabour","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3129340"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1121\/1.405558"},{"key":"ref15","article-title":"A survey on neural speech synthesis","author":"tan","year":"2021","journal-title":"arXiv 2106 15561"},{"key":"ref14","first-page":"125","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"Proc 9th ISCA Speech Synthesis Workshop"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(92)90040-E"},{"key":"ref10","first-page":"155","article-title":"Prosodic control to express emotions for man-machine speech interaction","volume":"75","author":"kitahara","year":"1992","journal-title":"IEICE Trans Fundam Electron Commun Comput Sci"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2021.11.006"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.2001-150"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461748"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10636"},{"key":"ref2","author":"minsky","year":"1988","journal-title":"Society of Mind"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173857"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2019.8901785"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428217"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/ACII.2019.8925487"},{"key":"ref197","doi-asserted-by":"crossref","first-page":"613","DOI":"10.1145\/3462244.3479944","article-title":"Engagement rewarded actor-critic with conservative Q-learning for speech-driven laughter backchannel generation","author":"bayramo?lu","year":"2021","journal-title":"Proc Int Conf Multimodal Interact"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3190233"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1177\/1754073919898526"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/ROMAN.2017.8172330"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414698"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.3389\/frobt.2019.00116"}],"container-title":["Proceedings of the IEEE"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5\/10283866\/10065433.pdf?arnumber=10065433","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,16]],"date-time":"2024-10-16T04:38:18Z","timestamp":1729053498000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10065433\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10]]},"references-count":198,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/jproc.2023.3250266","relation":{},"ISSN":["0018-9219","1558-2256"],"issn-type":[{"value":"0018-9219","type":"print"},{"value":"1558-2256","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10]]}}}