{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T05:24:56Z","timestamp":1747373096626,"version":"3.37.3"},"reference-count":26,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Institute of Information and communications Technology Planning and Evaluation"},{"name":"Korea government","award":["2021-0-00456"],"award-info":[{"award-number":["2021-0-00456"]}]},{"name":"Development of Ultra-high Speech Quality Technology for Remote Multi-speaker Conference"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/lsp.2024.3377588","type":"journal-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T20:33:29Z","timestamp":1710794009000},"page":"899-903","source":"Crossref","is-referenced-by-count":1,"title":["Variable-Length Speaker Conditioning in Flow-Based Text-to-Speech"],"prefix":"10.1109","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1319-8215","authenticated-orcid":false,"given":"Byoung Jin","family":"Choi","sequence":"first","affiliation":[{"name":"Department of Electrical and Computer Engineering, Institute of New Media and Communications, Seoul National University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4689-3110","authenticated-orcid":false,"given":"Myeonghun","family":"Jeong","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Institute of New Media and Communications, Seoul National University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8150-765X","authenticated-orcid":false,"given":"Minchan","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Institute of New Media and Communications, Seoul National University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0568-4902","authenticated-orcid":false,"given":"Nam Soo","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Institute of New Media and Communications, Seoul National University, Seoul, South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054301"},{"key":"ref2","article-title":"AdaSpeech: Adaptive text to speech for custom voice","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen","year":"2021"},{"article-title":"Residual adapters for few-shot text-to-speech speaker adaptation","year":"2022","author":"Morioka","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3167258"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP57327.2022.10037585"},{"key":"ref6","first-page":"7748","article-title":"Meta-StyleSpeech: Multi-speaker adaptive text-to-speech generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Min","year":"2021"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9979900"},{"article-title":"Mega-TTS: Zero-shot text-to-speech at scale with intrinsic inductive bias","year":"2023","author":"Jiang","key":"ref8"},{"article-title":"Mega-TTS 2: Zero-shot text-to-speech with arbitrary length speech prompts","year":"2023","author":"Jiang","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-368"},{"key":"ref11","first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Casanova","year":"2022"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3226655"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3277786"},{"key":"ref14","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim","year":"2021"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2096"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"author":"Yamagishi","key":"ref17","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref19","article-title":"Auto-encoding variational Bayes","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma","year":"2014"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"ref21","article-title":"Semi-conditional normalizing flows for semi-supervised learning","volume-title":"Proc. ICML Workshop Invertible Neural Nets Normalizing Flows","author":"Atanov","year":"2019"},{"key":"ref22","first-page":"125","article-title":"WaveNet: A generative model for raw audio","volume-title":"Proc. 9th ISCA Speech Synth. Workshop","author":"Oord","year":"2016"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref24","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2018"},{"article-title":"SpeechBrain: A general-purpose speech toolkit","year":"2021","author":"Ravanelli","key":"ref25"},{"key":"ref26","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2023"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/97\/10380231\/10472607.pdf?arnumber=10472607","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T20:12:35Z","timestamp":1712002355000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10472607\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/lsp.2024.3377588","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"type":"print","value":"1070-9908"},{"type":"electronic","value":"1558-2361"}],"subject":[],"published":{"date-parts":[[2024]]}}}