{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T06:07:57Z","timestamp":1762841277398,"version":"build-2065373602"},"reference-count":40,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/lsp.2025.3618764","type":"journal-article","created":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T17:58:40Z","timestamp":1759859920000},"page":"4199-4203","source":"Crossref","is-referenced-by-count":0,"title":["Zero-Shot Voice Conversion via Content-Aware Timbre Ensemble and Conditional Flow Matching"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-4995-0689","authenticated-orcid":false,"given":"Yu","family":"Pan","sequence":"first","affiliation":[{"name":"Department of Information Science and Electrical Engineering, Kyushu University, Fukuoka, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuguang","family":"Yang","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jixun","family":"Yao","sequence":"additional","affiliation":[{"name":"ByteDance Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8621-2420","authenticated-orcid":false,"given":"Lei","family":"Ma","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Tokyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8083-4352","authenticated-orcid":false,"given":"Jianjun","family":"Zhao","sequence":"additional","affiliation":[{"name":"Department of Information Science and Electrical Engineering, Kyushu University, Fukuoka, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053868"},{"article-title":"Takin: A cohort of superior quality zero-shot speech generation models","year":"2024","author":"Chen","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414975"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747625"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096220"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref7","first-page":"9361","article-title":"SqueezeFormer: An efficient transformer for automatic speech recognition","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Kim","year":"2022"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1513"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448394"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/slt61566.2024.10832339"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094850"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446160"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747743"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1443"},{"key":"ref18","first-page":"1731","article-title":"Takin-VC: Zero-shot voice conversion via jointly hybrid content and memory-augmented context-aware timbre modeling","volume-title":"Proc. 63rd Annu. Meeting Assoc. Comput. Linguistics","author":"Yang","year":"2025"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3439996"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2023.08.012"},{"article-title":"Qwen technical report","year":"2023","author":"Bai","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3308474"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.396"},{"article-title":"High fidelity neural audio compression","year":"2022","author":"Dfossez","key":"ref24"},{"article-title":"PromptCodec: High-fidelity neural speech codec using disentangled representation learning based adaptive feature-aware prompt encoders","year":"2024","author":"Pan","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096467"},{"article-title":"Improving and generalizing flow-based generative models with minibatch optimal transport","year":"2023","author":"Tong","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2025-203"},{"article-title":"BigVGAN: A universal neural vocoder with large-scale training","year":"2022","author":"Lee","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1294"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2116"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"article-title":"Conditional flow matching: Simulation-free dynamic optimal transport","year":"2023","author":"Tong","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref35","first-page":"271","article-title":"CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)","author":"Yamagishi","year":"2019"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2021.11.006"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34758"},{"article-title":"Diffusion-based voice conversion with fast maximum likelihood sampling scheme","year":"2021","author":"Popov","key":"ref38"},{"article-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","year":"2023","author":"Shen","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3530270"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/97\/10802935\/11194707.pdf?arnumber=11194707","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,11]],"date-time":"2025-11-11T06:03:34Z","timestamp":1762841014000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11194707\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/lsp.2025.3618764","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"type":"print","value":"1070-9908"},{"type":"electronic","value":"1558-2361"}],"subject":[],"published":{"date-parts":[[2025]]}}}