{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:48:25Z","timestamp":1776883705716,"version":"3.51.2"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832313","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"719-726","source":"Crossref","is-referenced-by-count":3,"title":["Visinger2+: End-to-End Singing Voice Synthesis Augmented by Self-Supervised Learning Representation"],"prefix":"10.1109","author":[{"given":"Yifeng","family":"Yu","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiatong","family":"Shi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuning","family":"Wu","sequence":"additional","affiliation":[{"name":"Renmin University of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxun","family":"Tang","sequence":"additional","affiliation":[{"name":"Renmin University of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"4009","article-title":"Vocaloid commercial singing synthesizer based on sample concatenation","volume-title":"Proc. Interspeech","author":"Kenmochi"},{"key":"ref3","article-title":"Singing voice synthesis combining excitation plus resonance and sinusoidal plus residual models","volume-title":"Proc. ICMC","author":"Bonada"},{"key":"ref4","first-page":"1","article-title":"Samplebased singing voice synthesizer by spectral concatenation","volume-title":"Proceedings of Stockholm Music Acoustics Conference","author":"Bonada"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1410"},{"key":"ref6","article-title":"Hifisinger: Towards high-fidelity neural singing voice synthesis","author":"Chen","year":"2020","journal-title":"arXiv preprint arXiv:2009.01776"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747664"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-391"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414348"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-119"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/iscslp63861.2024.10799952"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-978"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-33"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref16","article-title":"MERT: Acoustic music understanding model with large-scale self-supervised training","volume-title":"Proc. ICLR","author":"Yizhi"},{"key":"ref17","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. ICML. PMLR","author":"Baevski"},{"key":"ref18","article-title":"Multi-resolution HuBERT: Multiresolution speech self-supervised learning with masked unit prediction","volume-title":"Proc. ICLR","author":"Shi"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref20","first-page":"884","article-title":"MLSUPERB: Multilingual Speech Universal PERformance Benchmark","volume-title":"Proc. Interspeech","author":"Shi"},{"key":"ref21","article-title":"MARBLE: Music audio representation benchmark for universal evaluation","volume-title":"Proc. NeurIPS","volume":"36","author":"Yuan"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW62465.2024.10626580"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447809"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2023-26"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2574"},{"key":"ref26","article-title":"ParrotTTS: Text-to-speech synthesis exploiting disentangled self-supervised representations","volume-title":"Proc. EACL","author":"Shah"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"key":"ref28","article-title":"ZMM-TTS: Zero-shot multilingual and multispeaker speech synthesis conditioned on self-supervised discrete speech representations","author":"Gong","year":"2023","journal-title":"arXiv preprint arXiv:2312.14398"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1016"},{"key":"ref32","article-title":"HiFiGAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. NeurIPS","author":"Kong"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-48"},{"key":"ref34","article-title":"DB Production: Futon P","author":"Futon","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10039"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685000"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389671"},{"key":"ref38","article-title":"Singmos: An extensive open-source singing voice dataset for mos prediction","author":"Tang","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1011"},{"key":"ref40","article-title":"Espnet-spk: full pipeline speaker embedding toolkit with reproducible recipes, self-supervised frontends, and off-the-shelf models","author":"Jung","year":"2024","journal-title":"arXiv preprint arXiv:2401.17230"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","location":"Macao","start":{"date-parts":[[2024,12,2]]},"end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832313.pdf?arnumber=10832313","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T18:39:22Z","timestamp":1737398362000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832313\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832313","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}