{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:22:43Z","timestamp":1776885763802,"version":"3.51.2"},"reference-count":102,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100002465","name":"Delta","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100002465","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832289","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"562-569","source":"Crossref","is-referenced-by-count":15,"title":["ESPnet-Codec: Comprehensive Training and Evaluation of Neural Codecs For Audio, Music, and Speech"],"prefix":"10.1109","author":[{"given":"Jiatong","family":"Shi","sequence":"first","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinchuan","family":"Tian","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yihan","family":"Wu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jee-Weon","family":"Jung","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia Qi","family":"Yip","sequence":"additional","affiliation":[{"name":"Nanyang Technological University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yoshiki","family":"Masuyama","sequence":"additional","affiliation":[{"name":"Tokyo Metropolitan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"William","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuning","family":"Wu","sequence":"additional","affiliation":[{"name":"Renmin University of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxun","family":"Tang","sequence":"additional","affiliation":[{"name":"Renmin University of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Massa","family":"Baali","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dareen","family":"Alharthi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruifan","family":"Deng","sequence":"additional","affiliation":[{"name":"University of Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tejes","family":"Srivastava","sequence":"additional","affiliation":[{"name":"National Taiwan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haibin","family":"Wu","sequence":"additional","affiliation":[{"name":"National Taiwan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bhiksha","family":"Raj","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruihua","family":"Song","sequence":"additional","affiliation":[{"name":"Renmin University of China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref3","article-title":"Self-supervised learning with random-projection quantizer for speech recognition","volume-title":"Proc. ICML","author":"Chiu"},{"key":"ref4","article-title":"Multi-resolution HuBERT: Multi-resolution speech self-supervised learning with masked unit prediction","volume-title":"Proc. ICLR","author":"Shi"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1316"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9747490"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2051"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447929"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096827"},{"key":"ref11","article-title":"Discretalk: Text-to-speech as a machine translation problem","author":"Hayashi","year":"2020","journal-title":"arXiv:2005.05525"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095973"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-demo.38"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"key":"ref16","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv:2301.02111"},{"key":"ref17","article-title":"Simple and controllable music generation","volume-title":"Proc. NeurIPS","author":"Copet"},{"key":"ref18","article-title":"SpeechTokenizer: Unified speech tokenizer for speech language models","volume-title":"Proc. ICLR","author":"Zhang"},{"key":"ref19","article-title":"Uniaudio: An audio foundation model toward universal audio generation","author":"Yang","year":"2024","journal-title":"ICML"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1878"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2021.3129994"},{"key":"ref23","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2023","journal-title":"TMLR"},{"key":"ref24","article-title":"High-fidelity audio compression with improved RVQGAN","volume-title":"Proc. NeurIPS","author":"Kumar"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1456"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref27","article-title":"ESPnet2-TTS: Extending the edge of TTS research","author":"Hayashi","year":"2021","journal-title":"arXiv:2110.07840"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10039"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1345"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10727"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383615"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1176"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096509"},{"key":"ref34","article-title":"HiFi-codec: Group-residual vector quantization for high fidelity audio codec","volume-title":"arXiv:2305.02765","author":"Yang"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447523"},{"key":"ref36","doi-asserted-by":"crossref","DOI":"10.1109\/SLT61566.2024.10832255","article-title":"Amphion: An open-source audio, music and speech generation toolkit","volume-title":"arXiv:2312.09911","author":"Zhang"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"ref39","article-title":"SDR-half-baked or well done?","volume-title":"Proc. ICASSP","author":"Le Roux"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-015-0054-9"},{"key":"ref41","article-title":"A high fidelity and low complexity neural audio coding","volume-title":"arXiv:2310.10992","author":"Liu"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413439"},{"key":"ref43","article-title":"Generating diverse high-fidelity images with VQ-VAE-2","volume-title":"Proc. NeurIPS","author":"Razavi"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413605"},{"key":"ref46","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. NeurIPS","author":"Kong"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414661"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1508"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414878"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-439"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1532"},{"key":"ref52","article-title":"SingMOS: An extensive open-source singing voice dataset for mos prediction","author":"Tang","year":"2024","journal-title":"arXiv:2406.10911"},{"key":"ref53","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. ICML","author":"Radford"},{"key":"ref54","article-title":"Viola: Unified codec language models for speech recognition, synthesis, and translation","author":"Wang","year":"2023","journal-title":"arXiv:2305.16107"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447296"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3347148"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2251"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-565"},{"key":"ref59","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. ICLR","author":"Ren"},{"key":"ref60","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. ICML","author":"Kim"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2024.3419418"},{"key":"ref63","article-title":"Seed-TTS: A family of high-quality versatile speech generation models","author":"Anastassiou","year":"2024","journal-title":"arXiv:2406.02430"},{"key":"ref64","article-title":"Naturalspeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","author":"Ju","year":"2024","journal-title":"ICML"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1392"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-126"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446998"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2023.3304482"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3285241"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445985"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447030"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023356"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-337"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2023.3282097"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2360"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2291"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-391"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414348"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389700"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1510"},{"key":"ref82","article-title":"Towards efficient self-supervised representation learning in speech processing","volume-title":"Proc. EACL","author":"Lugo"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389778"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-755"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.americasnlp-1.7"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.96"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475437"},{"key":"ref89","article-title":"Singstyle111: A multilingual singing dataset with style transfer","volume-title":"Proc. ISMIR","author":"Dai"},{"key":"ref90","article-title":"M4singer: A multi-style, multi-singer and musical score provided mandarin singing corpus","volume-title":"Proc. NeurIPS","author":"Zhang"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1250\/ast.42.140"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-48"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-33"},{"key":"ref94","article-title":"PJS: Phoneme-balanced japanese singing-voice corpus","volume-title":"Proc. APSIPA ASC","author":"Koguchi"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1250\/ast.41.761"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.616"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref99","volume-title":"The lj speech dataset","author":"Ito","year":"2017"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.588"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","location":"Macao","start":{"date-parts":[[2024,12,2]]},"end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832289.pdf?arnumber=10832289","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:50:28Z","timestamp":1737100228000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832289\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":102,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832289","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}