{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T16:38:56Z","timestamp":1772642336240,"version":"3.50.1"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T00:00:00Z","timestamp":1673222400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,1,9]]},"DOI":"10.1109\/slt54892.2023.10022433","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:54:03Z","timestamp":1674845643000},"page":"962-969","source":"Crossref","is-referenced-by-count":7,"title":["Exact Prosody Cloning in Zero-Shot Multispeaker Text-to-Speech"],"prefix":"10.1109","author":[{"given":"Florian","family":"Lux","sequence":"first","affiliation":[{"name":"University of Stuttgart, Institute for Natural Language Processing,Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Julia","family":"Koch","sequence":"additional","affiliation":[{"name":"University of Stuttgart, Institute for Natural Language Processing,Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ngoc Thang","family":"Vu","sequence":"additional","affiliation":[{"name":"University of Stuttgart, Institute for Natural Language Processing,Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis","volume":"31","author":"Jia","year":"2018","journal-title":"NeurIPS"},{"key":"ref2","article-title":"Neural voice 
cloning with a few samples","volume":"31","author":"Arik","year":"2018","journal-title":"NeurIPS"},{"key":"ref3","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"ICML","author":"Wang","year":"2018"},{"key":"ref4","first-page":"6588","article-title":"FastPitch: Parallel text-to-speech with pitch prediction","volume-title":"ICASSP","author":"\u0141a\u0144cucki","year":"2021"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-15"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2096"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1774"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-49"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2861"},{"key":"ref11","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"ICML","author":"Skerry-Ryan","year":"2018"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2571"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1583"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1610"},{"key":"ref15","article-title":"FastSpeech: fast, robust and controllable text to speech","author":"Ren","year":"2019","journal-title":"NeurIPS"},{"key":"ref16","article-title":"FastSpeech 2: Fast and High-Quality End to End Text to 
Speech","author":"Ren","year":"2020","journal-title":"ICLR"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/blizzard.2023-4"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref20","article-title":"ESPnet2-TTS: Extending the Edge of TTS Research","author":"Hayashi","year":"2021","journal-title":"arXiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10703"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1929"},{"key":"ref26","author":"Ravanelli","year":"2021","journal-title":"SpeechBrain: A General-Purpose Speech Toolkit"},{"key":"ref27","article-title":"HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis","volume":"33","author":"Kong","year":"2020","journal-title":"NeurIPS"},{"issue":"9","key":"ref28","first-page":"341","article-title":"Praat, a system for doing phonetics by computer","volume":"5","author":"Boersma","year":"2001","journal-title":"Glot. Int."},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref30","article-title":"VRAIN-UPV MLLPs system for the Blizzard Challenge 2021","volume-title":"Proc. 
Blizzard Challenge Workshop","volume":"2021","author":"Martos"},{"key":"ref31","first-page":"8067","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume":"33","author":"Kim","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.472"},{"key":"ref33","article-title":"The Blizzard Challenge 2011","volume-title":"Proc. Blizzard Challenge Workshop","volume":"2011","author":"King"},{"key":"ref34","volume-title":"The LJ Speech Dataset","author":"Ito","year":"2017"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2441"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1599"},{"key":"ref37","author":"Veaux","year":"2017","journal-title":"Superseded-CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87626-5_15"},{"key":"ref39","volume-title":"Thorsten-Open German Voice (Neutral) Dataset","author":"Muller","year":"2021"},{"key":"ref40","doi-asserted-by":"crossref","DOI":"10.21437\/Blizzard.2021-1","article-title":"The Blizzard Challenge 2021","volume-title":"Proc. 
Blizzard Challenge Workshop","author":"Ling"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1500"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref44","first-page":"170","article-title":"IMS-speech: A speech to text tool","author":"Denisov","year":"2019","journal-title":"Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2019"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref46","first-page":"359","article-title":"Using dynamic time warping to find patterns in time series","volume-title":"KDD workshop","author":"Berndt"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.09.003"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-58"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"issue":"11","key":"ref50","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"Journal of machine learning research"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2022.101362"}],"event":{"name":"2022 IEEE Spoken Language Technology Workshop (SLT)","location":"Doha, Qatar","start":{"date-parts":[[2023,1,9]]},"end":{"date-parts":[[2023,1,12]]}},"container-title":["2022 IEEE Spoken Language Technology Workshop 
(SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10022052\/10022330\/10022433.pdf?arnumber=10022433","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T05:07:43Z","timestamp":1728796063000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10022433\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,9]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/slt54892.2023.10022433","relation":{},"subject":[],"published":{"date-parts":[[2023,1,9]]}}}