{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:43:59Z","timestamp":1763192639275,"version":"3.45.0"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230986","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["DiTVC: One-Shot Voice Conversion via Diffusion Transformer with Environment and Speaking Rate Cloning"],"prefix":"10.1109","author":[{"given":"Yunyun","family":"Wang","sequence":"first","affiliation":[{"name":"Princeton University"}]},{"given":"Jiaqi","family":"Su","sequence":"additional","affiliation":[{"name":"Adobe Research"}]},{"given":"Adam","family":"Finkelstein","sequence":"additional","affiliation":[{"name":"Princeton University"}]},{"given":"Rithesh","family":"Kumar","sequence":"additional","affiliation":[{"name":"Adobe Research"}]},{"given":"Ke","family":"Chen","sequence":"additional","affiliation":[{"name":"Adobe Research"}]},{"given":"Zeyu","family":"Jin","sequence":"additional","affiliation":[{"name":"Adobe Research"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054734"},{"key":"ref2","first-page":"2709","article-title":"Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone","volume-title":"Proc. ICML","author":"Casanova"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448232"},{"article-title":"vec2wav 2.0: Advancing voice conversion via discrete token vocoders","year":"2024","author":"Guo","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095191"},{"key":"ref7","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. NeurIPS","volume":"33","author":"Kong"},{"article-title":"Bigvgan: A universal neural vocoder with large-scale training","year":"2022","author":"Lee","key":"ref8"},{"key":"ref9","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. NeurIPS","volume":"33","author":"Ho"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"article-title":"Imagen video: High definition video generation with diffusion models","year":"2022","author":"Ho","key":"ref11"},{"article-title":"Simple-tts: End-to-end text-to-speech synthesis with latent diffusion","year":"2023","author":"Lovelace","key":"ref12"},{"article-title":"Diffusion-based voice conversion with fast maximum likelihood sampling scheme","year":"2021","author":"Popov","key":"ref13"},{"article-title":"Codiff-vc: A codec-assisted diffusion model for zero-shot voice conversion","year":"2024","author":"Li","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547855"},{"article-title":"Ditto-tts: Efficient and scalable zero-shot text-to-speech with diffusion transformer","year":"2024","author":"Lee","key":"ref16"},{"article-title":"Dmdspeech: Distilled diffusion model surpassing the teacher in zero-shot speech synthesis via direct metric optimization","year":"2024","author":"Li","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.3362\/0262-8104.2002.009"},{"article-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","year":"2023","author":"Shen","key":"ref19"},{"key":"ref20","article-title":"High-fidelity audio compression with improved rvqgan","volume-title":"Proc. NeurIPS","volume":"36","author":"Kumar"},{"key":"ref21","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. NeurIPS","volume":"33","author":"Baevski"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref23","first-page":"18003","article-title":"Contentvec: An improved self-supervised speech representation by disentangling speakers","volume-title":"Proc. ICML","author":"Qian"},{"article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Proc. ICML","author":"Esser","key":"ref24"},{"article-title":"Progressive distillation for fast sampling of diffusion models","year":"2022","author":"Salimans","key":"ref25"},{"article-title":"Ditse: High-fidelity generative speech enhancement via latent diffusion transformers","year":"2025","author":"Guimar\u00e3es","key":"ref26"},{"article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. ICLR","author":"Radford","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1108\/RR-08-2013-0197"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"article-title":"Common voice: A massively-multilingual speech corpus","year":"2019","author":"Ardila","key":"ref30"},{"article-title":"Cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit","year":"2019","author":"Yamagishi","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2014.2379648"},{"key":"ref33","first-page":"10937","article-title":"Unispeech: Unified speech representation learning with labeled and unlabeled data","volume-title":"Proc. ICML","author":"Wang"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230986.pdf?arnumber=11230986","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:41:39Z","timestamp":1763192499000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230986\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230986","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}