{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T13:22:33Z","timestamp":1767705753956,"version":"3.33.0"},"reference-count":35,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832321","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"247-254","source":"Crossref","is-referenced-by-count":1,"title":["Fusion Of Discrete Representations and Self-Augmented Representations for Multilingual Automatic Speech Recognition"],"prefix":"10.1109","author":[{"given":"Shih-Heng","family":"Wang","sequence":"first","affiliation":[{"name":"National Taiwan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiatong","family":"Shi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chien-Yu","family":"Huang","sequence":"additional","affiliation":[{"name":"National Taiwan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref2","first-page":"5071","article-title":"A Survey of Multilingual Models for Automatic Speech Recognition","author":"Yadav","year":"2022","journal-title":"LREC"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC47483.2019.9023195"},{"key":"ref4","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-143"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"article-title":"Multi-resolution huBERT: Multi-resolution speech self-supervised learning with masked unit prediction","volume-title":"Proc. ICLR","author":"Shi","key":"ref9"},{"key":"ref10","article-title":"Scaling speech technology to 1,000+ languages","author":"Pratap","year":"2023","journal-title":"arXiv"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1316"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/icassp48485.2024.10447929"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10610"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446063"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447112"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2069"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3352388"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01409"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2051"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2199"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10917"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10796"},{"article-title":"Many-to-many spoken language translation via unified speech and text representation learning with unit-to-unit translation","year":"2023","author":"Kim","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-194"},{"article-title":"Tmt: Tri-modal translation between speech, image, and text by processing different modalities as different languages","year":"2024","author":"Kim","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446888"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1878"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref32","first-page":"1505","article-title":"Wavlm: Large-scale self-supervised pretraining for full stack speech processing","volume-title":"IEEE Journal of Selected Topics in Signal Processing","volume":"16","author":"Chen"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2018-1456"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.25080\/majora-7b98e3ed-003"},{"key":"ref35","article-title":"Adam: A method for stochastic optimization","volume":"abs\/1412.6980","author":"Kingma","year":"2014","journal-title":"CoRR"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2024,12,2]]},"location":"Macao","end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832321.pdf?arnumber=10832321","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T18:39:27Z","timestamp":1737398367000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832321\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832321","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}