{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:25:07Z","timestamp":1775229907919,"version":"3.50.1"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832198","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"415-422","source":"Crossref","is-referenced-by-count":2,"title":["Estimating the Completeness of Discrete Speech Units"],"prefix":"10.1109","author":[{"given":"Sung-Lin","family":"Yeh","sequence":"first","affiliation":[{"name":"University of Edinburgh,School of Informatics"}]},{"given":"Hao","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Edinburgh,School of Informatics"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807:03748"},{"key":"ref2","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1228"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2051"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10884"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475"},{"key":"ref10","article-title":"On generative spoken language modeling from raw audio","author":"Lakhotia","year":"2021","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.769"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29747"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023234"},{"key":"ref15","article-title":"Informationtheoretic probing for linguistic structure","author":"Pimentel","year":"2020","journal-title":"ACL"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref19","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv preprint arXiv:2301.02111"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref21","article-title":"Speechtokenizer: Unified speech tokenizer for speech language models","author":"Zhang","year":"2024"},{"key":"ref22","article-title":"Adaptive density estimation for generative models","author":"Lucas","year":"2019","journal-title":"NeurIPS"},{"key":"ref23","article-title":"Hallucinations in neural automatic speech recognition: Identifying errors and hallucinatory models","author":"Frieske","year":"2024","journal-title":"arXiv preprint arXiv:2401.01572"},{"key":"ref24","article-title":"Formal limitations on the measurement of mutual information","volume-title":"International Conference on Artificial Intelligence and Statistics","author":"McAllester"},{"key":"ref25","article-title":"Selfsupervised speech representations are more phonetic than semantic","author":"Choi","year":"2024","journal-title":"arXiv preprint arXiv:2406.08619"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-419"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/icassp.1982.1171604"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.606"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447758"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3203608"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023428"},{"key":"ref32","article-title":"Speech self-supervised representations benchmarking: a case for larger probing heads","author":"Zaiem","year":"2023","journal-title":"arXiv preprint arXiv:2308.14456"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.14"},{"key":"ref34","article-title":"Contentvec: An improved self-supervised speech representation by disentangling speakers","author":"Qian","year":"2022","journal-title":"ICML. PMLR"},{"key":"ref35","article-title":"Self-supervised neural factor analysis for disentangling utterance-level speech representations","author":"Lin","year":"2023","journal-title":"ICML"},{"key":"ref36","article-title":"Understanding hallucinations in diffusion models through mode interpolation","author":"Aithal","year":"2024","journal-title":"arXiv preprint arXiv:2406.09358"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"ref38","article-title":"Hifi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","author":"Kong","year":"2020","journal-title":"NeurIPS"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2005.851256"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3632410.3633297"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref42","article-title":"Vocos: Closing the gap between time-domain and fourier-based neural vocoders for high-quality audio synthesis","author":"Siuzdak","year":"2023","journal-title":"arXiv preprint arXiv:2306.00814"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6853678"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref47","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2022","journal-title":"arXiv preprint arXiv:2210.13438"},{"key":"ref48","article-title":"Categorical reparameterization with Gumbel-softmax","author":"Jang","year":"2017","journal-title":"ICLR"},{"key":"ref49","article-title":"The concrete distribution: A continuous relaxation of discrete random variables","author":"Maddison","year":"2017","journal-title":"ICLR"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","location":"Macao","start":{"date-parts":[[2024,12,2]]},"end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832198.pdf?arnumber=10832198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:50:06Z","timestamp":1737100206000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832198\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832198","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}