{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T08:03:53Z","timestamp":1764403433809,"version":"3.33.0"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832319","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"758-765","source":"Crossref","is-referenced-by-count":2,"title":["Leveraging Diverse Semantic-Based Audio Pretrained Models for Singing Voice Conversion"],"prefix":"10.1109","author":[{"given":"Xueyao","family":"Zhang","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Zihao","family":"Fang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Yicheng","family":"Gu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Haopeng","family":"Chen","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Lexiao","family":"Zou","sequence":"additional","affiliation":[{"name":"Shenzhen Research Institute of Big Data"}]},{"given":"Junan","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Liumeng","family":"Xue","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Zhizheng","family":"Wu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389671"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832365"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1761"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR.2019.00059"},{"key":"ref6","first-page":"5210","article-title":"Autovc: Zero-shot voice style transfer with only autoencoder loss","volume":"97","author":"Qian","journal-title":"ICML. 2019"},{"key":"ref7","first-page":"70","article-title":"Zero-shot singing voice conversion","author":"Nercessian","year":"2020","journal-title":"ISMIR"},{"key":"ref8","first-page":"7073","article-title":"Ppg-based singing voice con-version with adversarial representation learning","author":"Li","year":"2021","journal-title":"ICASSP."},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3193761"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097147"},{"key":"ref11","article-title":"Robust one-shot singing voice conversion","volume":"abs\/2210.11096","author":"Takahashi","year":"2022","journal-title":"arXiv"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448184"},{"key":"ref13","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref15","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume":"202","author":"Radford","journal-title":"ICML. 2023"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10316"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3313424"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446066"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1983"},{"key":"ref20","first-page":"18003","article-title":"Contentvec: An improved self-supervised speech representation by disentangling speakers","volume":"162","author":"Qian","journal-title":"in ICML. 2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"journal-title":"in ICLR. 2024, OpenReview.net","article-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","author":"Shen","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2009.4960404"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-539"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-580"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054199"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389740"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688219"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-48"},{"key":"ref33","article-title":"M4singer: A multi-style, multi-singer and musical score provided mandarin singing corpus","author":"Zhang","year":"2022","journal-title":"NeurIPS"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/PACRIM.1993.407206"},{"key":"ref35","article-title":"Mega-tts: Zero-shot text-to-speech at scale with intrinsic inductive bias","volume":"abs\/2306.03509","author":"Jiang","year":"2023","journal-title":"arXiv"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref37","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-toend text-to-speech","volume":"139","author":"Kim","journal-title":"ICML. 2021"},{"key":"ref38","article-title":"Diffwave: A versatile diffusion model for audio synthesis","author":"Kong","year":"2021","journal-title":"ICLR"},{"key":"ref39","first-page":"125","article-title":"Wavenet: A generative model for raw audio","volume":"2016","author":"Oord","journal-title":"SSW."},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1016\/j.wocn.2018.07.001"},{"journal-title":"ICLR. 2021, OpenReview.net","article-title":"Fastspeech 2: Fast and high-quality end-toend text to speech","author":"Ren","key":"ref41"},{"key":"ref42","article-title":"Amphion: An open-source audio, music and speech generation toolkit","volume":"abs\/2312.09911","author":"Zhang","year":"2023","journal-title":"arXiv"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2024,12,2]]},"location":"Macao","end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832319.pdf?arnumber=10832319","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:50:36Z","timestamp":1737100236000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832319\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832319","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}