{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:21:30Z","timestamp":1776885690350,"version":"3.51.2"},"reference-count":62,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100011038","name":"Office of the Director of National Intelligence","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100011038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100011039","name":"Intelligence Advanced Research Projects Activity","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100011039","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434735","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":2,"title":["GenVC: Self-Supervised Zero-Shot Voice Conversion"],"prefix":"10.1109","author":[{"given":"Zexin","family":"Cai","sequence":"first","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Henry Li","family":"Xinyuan","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Ashi","family":"Garg","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Leibny Paola","family":"Garc\u00eda-Perera","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Kevin","family":"Duh","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Sanjeev","family":"Khudanpur","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Matthew","family":"Wiesner","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]},{"given":"Nicholas","family":"Andrews","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Human Language Technology Center of Excellence,Baltimore,United States"}]}],"member":"263","reference":[{"key":"ref1","first-page":"5210","article-title":"AutoVC: Zero-Shot Voice Style Transfer with Only Autoencoder Loss","volume-title":"International Conference on Machine Learning","volume":"97","author":"Qian"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1710"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746048"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2006-170"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095191"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832292"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2024.3439469"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096733"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832351"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414975"},{"key":"ref12","first-page":"2709","article-title":"YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for Everyone","volume-title":"International Conference on Machine Learning","author":"Casanova"},{"key":"ref13","first-page":"16 251","article-title":"Neural Analysis and Synthesis: Reconstructing Speech from Self-Supervised Representations","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Choi","year":"2021"},{"key":"ref14","article-title":"NANSY++: Unified Voice Synthesis with Neural Analysis and Synthesis","volume-title":"International Conference on Learning Representations","author":"Choi"},{"key":"ref15","article-title":"Better Speech Synthesis through Scaling","volume-title":"arXiv preprint arXiv:2305.07243","author":"Betker","year":"2023"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3308474"},{"key":"ref17","article-title":"AudioGen: Textually Guided Audio Generation","volume-title":"International Conference on Learning Representations","author":"Kreuk"},{"key":"ref18","article-title":"Towards Audio Language Modeling-An Overview","author":"Wu","year":"2024","journal-title":"arXiv preprint arXiv:2402.13236"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref23","article-title":"CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models","volume-title":"arXiv preprint arXiv:2412.10117","author":"Du","year":"2024"},{"key":"ref24","article-title":"CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Posttraining","author":"Du","year":"2025","journal-title":"arXiv preprint arXiv:2505.17589"},{"key":"ref25","article-title":"Seed-TTS: A Family of High-Quality Versatile Speech Generation Models","author":"Anastassiou","year":"2024","journal-title":"arXiv preprint arXiv:2406.02430"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3530270"},{"key":"ref27","article-title":"High Fidelity Neural Audio Compression","author":"D\u00e9fossez","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref28","article-title":"Speak Foreign Languages with Your Own Voice: Cross-Lingual Neural Codec Language Modeling","author":"Zhang","year":"2023","journal-title":"arXiv preprint arXiv:2303.03926"},{"key":"ref29","article-title":"VALL-E 2: Neural Codec Language Models are Human Parity Zero-Shot Text to Speech Synthesizers","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2406.05370"},{"key":"ref30","article-title":"CosyVoice: A Scalable Multilingual Zero-Shot Text-to-Speech Synthesizer Based on Supervised Semantic Tokens","author":"Du","year":"2024","journal-title":"arXiv preprint arXiv:2407.05407"},{"key":"ref31","article-title":"MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer","volume-title":"International Conference on Learning Representations","author":"Wang"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2066"},{"key":"ref33","article-title":"Moshi: A Speech-Text Foundation Model for Real-Time Dialogue","author":"D\u00e9fossez","year":"2024","journal-title":"arXiv preprint arXiv:2410.00037"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.396"},{"key":"ref36","article-title":"Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement","volume-title":"International Conference on Learning Representations","author":"Zhang"},{"key":"ref37","article-title":"The VoicePrivacy 2024 Challenge Evaluation Plan","author":"Tomashenko","year":"2024","journal-title":"arXiv preprint arXiv:2404.02677"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1157"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1356"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746430"},{"key":"ref41","article-title":"Neural Discrete Representation Learning","volume":"30","author":"Van Den Oord","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref44","article-title":"NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers","volume-title":"International Conference on Learning Representations","author":"Shen"},{"key":"ref45","first-page":"17 022","article-title":"HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Kong","year":"2020"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref47","first-page":"4218","article-title":"Common Voice: A Massively-Multilingual Speech Corpus","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","author":"Ardila"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref49","article-title":"CMU ARCTIC Databases for Speech Synthesis","author":"Kominek","year":"2003"},{"key":"ref50","article-title":"The EMIME Mandarin Bilingual Database","volume-title":"The University of Edinburgh, Tech. Rep.","author":"Wester","year":"2011"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref54","first-page":"18 003","article-title":"ContentVec: An Improved Self-Supervised Speech Representation by Disentangling Speakers","volume-title":"International Conference on Machine Learning","author":"Qian"},{"issue":"8","key":"ref55","first-page":"9","article-title":"Language Models Are Unsupervised Multitask Learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448436"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832315"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-479"},{"key":"ref60","article-title":"Good Practices for Evaluation of Synthesized Speech","author":"Cooper","year":"2025","journal-title":"arXiv preprint arXiv:2503.03250"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref62","article-title":"Scaling Laws for Neural Language Models","author":"Kaplan","year":"2020","journal-title":"arXiv preprint arXiv:2001.08361"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434735.pdf?arnumber=11434735","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:03Z","timestamp":1775192343000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434735\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434735","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}