{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T05:40:58Z","timestamp":1764222058308,"version":"3.46.0"},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/sped67700.2025.11251681","type":"proceedings-article","created":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T19:00:27Z","timestamp":1764183627000},"page":"75-80","source":"Crossref","is-referenced-by-count":0,"title":["Style-Controlled VALL-E for Few-Shot Emotional German TTS"],"prefix":"10.1109","author":[{"given":"Rami","family":"Kammoun","sequence":"first","affiliation":[{"name":"Budapest University of Technology and Economics,Department of Telecommunications and Artificial Intelligence,Budapest,Hungary"}]},{"given":"Mohammed Salah","family":"Al-Radhi","sequence":"additional","affiliation":[{"name":"Budapest University of Technology and Economics,Department of Telecommunications and Artificial Intelligence,Budapest,Hungary"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Adaptive control for singularly perturbed systems examples","author":"Eves","year":"2023","journal-title":"Code Ocean"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49660.2025.10888737"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-68456-7_22"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.2018.8461368"},{"article-title":"FastSpeech 2: Fast and High-Quality End-to-End Text to Speech","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Ren","key":"ref5"},{"key":"ref6","article-title":"Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech","author":"Kim","year":"2021","journal-title":"arXiv preprint arXiv:2106.06103"},{"key":"ref7","article-title":"SC VALLE: Style-Controllable Zero-Shot Text to Speech Synthesizer","author":"Kim","year":"2023","journal-title":"ArXiv"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-024-00329-7"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3530270"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1236"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2022.3164181"},{"key":"ref12","article-title":"E3-VITS: Emotional End-to-End TTS with Cross-speaker Style Transfer","author":"Jung","year":"2023","journal-title":"Published online"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-1452"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-2121"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-367"},{"key":"ref16","first-page":"2280","article-title":"Word-level Text Markup for Prosody Control in Speech Synthesis","volume-title":"Proceedings of Interspeech 2024","author":"Korotkova"},{"volume-title":"VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers","year":"2025","author":"Chen","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/syndata4genai.2024-4"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10094994"},{"volume-title":"Exploring Transfer Learning for Low Resource Emotional TTS","year":"2019","author":"Tits","key":"ref20"},{"key":"ref21","first-page":"26","article-title":"Neural Speech Synthesis in German","volume-title":"Proceedings of CENTRIC 2021","author":"Wirth"}],"event":{"name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","start":{"date-parts":[[2025,10,19]]},"location":"Cluj-Napoca, Romania","end":{"date-parts":[[2025,10,22]]}},"container-title":["2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11251505\/11251597\/11251681.pdf?arnumber=11251681","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T05:37:22Z","timestamp":1764221842000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11251681\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/sped67700.2025.11251681","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}