{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:10:12Z","timestamp":1776885012289,"version":"3.51.2"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434631","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-7","source":"Crossref","is-referenced-by-count":1,"title":["Bridging the Modality Gap: Softly Discretizing Audio Representation for LLM-based Automatic Speech Recognition"],"prefix":"10.1109","author":[{"given":"Mu","family":"Yang","sequence":"first","affiliation":[{"name":"University of Texas at Dallas,Centor for Robust Speech Systems (CRSS),USA"}]},{"given":"Szu-Jui","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas,Centor for Robust Speech Systems (CRSS),USA"}]},{"given":"Jiamin","family":"Xie","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas,Centor for Robust Speech Systems (CRSS),USA"}]},{"given":"H. L.","family":"John Hansen","sequence":"additional","affiliation":[{"name":"University of Texas at Dallas,Centor for Robust Speech Systems (CRSS),USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"key":"ref2","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref3","article-title":"Qwen2. 5-coder technical report","volume-title":"arXiv preprint arXiv:2409.12186","author":"Hui","year":"2024"},{"key":"ref4","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90 % chatgpt quality","author":"Chiang","year":"2023"},{"key":"ref5","article-title":"Listen, think, and understand","volume-title":"International Conference on Learning Representations","author":"Gong"},{"key":"ref6","article-title":"Salmonn: Towards generic hearing abilities for large language models","author":"Tang","year":"2024"},{"key":"ref7","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref8","first-page":"5522","article-title":"AudioChatLlama: Towards general-purpose speech abilities for LLMs","volume-title":"Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Fathullah"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-209"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447605"},{"key":"ref11","article-title":"An embarrassingly simple approach for llm with strong asr capacity","author":"Ma","year":"2024","journal-title":"arXiv preprint arXiv:2402.08846"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2025-2245"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389705"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445874"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.298"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW65056.2025.11010998"},{"key":"ref17","article-title":"Soundwave: Less is more for speech-text alignment in 11 ms","author":"Zhang","year":"2025","journal-title":"arXiv preprint arXiv:2502.12900"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1533"},{"key":"ref19","article-title":"A comparative study of discrete speech tokens for semantic-related tasks with large language models","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv:2411.08742"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3347148"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389673"},{"key":"ref22","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"International Conference on Learning Representations","author":"Hu"},{"key":"ref23","article-title":"Neural discrete representation learning","volume":"30","author":"Van Den Oord","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref24","article-title":"Estimating or propagating gradients through stochastic neurons for conditional computation","author":"Bengio","year":"2013","journal-title":"arXiv preprint arXiv:1308.3432"},{"key":"ref25","article-title":"Categorical reparameterization with gumbel-softmax","volume-title":"International Conference on Learning Representations","author":"Jang"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref27","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","volume-title":"International Conference on Learning Representations.","author":"Baevski"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref29","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2019","journal-title":"arXiv preprint arXiv:1912.06670"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref31","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International conference on machine learning. PMLR","author":"Radford"},{"issue":"11","key":"ref32","article-title":"Visualizing data using t-sne","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"Journal of machine learning research"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434631.pdf?arnumber=11434631","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:57:33Z","timestamp":1775192253000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434631\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434631","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}