{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:21:16Z","timestamp":1776889276196,"version":"3.51.2"},"reference-count":45,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434774","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-7","source":"Crossref","is-referenced-by-count":4,"title":["Efficient Scaling for LLM-based ASR"],"prefix":"10.1109","author":[{"given":"Bingshen","family":"Mu","sequence":"first","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Yiwen","family":"Shao","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Kun","family":"Wei","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Dong","family":"Yu","sequence":"additional","affiliation":[{"name":"Tencent AI Lab"}]},{"given":"Lei","family":"Xie","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404790"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref6","first-page":"6079","article-title":"CIF: Continuous Integrate-And-Fire for End-ToEnd Speech Recognition","volume-title":"Proc. ICASSP","author":"Dong"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1211.3711"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2763455"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447563"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2024.3432275"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888133"},{"key":"ref12","article-title":"GPT-4 Technical Report","volume-title":"arXiv preprint arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref13","article-title":"LLaMA: Open and Efficient Foundation Language Models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"key":"ref14","article-title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2307.09288"},{"key":"ref15","article-title":"The Llama 3 Herd of Models","author":"Grattafiori","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref16","article-title":"Qwen Technical Report","volume-title":"arXiv preprint arXiv:2309.16609","author":"Bai","year":"2023"},{"key":"ref17","article-title":"Qwen3 Technical Report","volume-title":"arXiv preprint arXiv:2505.09388","author":"Yang","year":"2025"},{"key":"ref18","article-title":"SALMONN: Towards Generic Hearing Abilities for Large Language Models","volume-title":"Proc. ICLR","author":"Tang"},{"key":"ref19","article-title":"AudioPaLM: A Large Language Model That Can Speak and Listen","author":"Rubenstein","year":"2023","journal-title":"arXiv preprint arXiv:2306.12925"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"ref21","article-title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models","author":"Chu","year":"2023","journal-title":"arXiv preprint arXiv:2311.07919"},{"key":"ref22","article-title":"Qwen2-Audio Technical Report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref23","article-title":"An Embarrassingly Simple Approach for LLM with Strong ASR Capacity","author":"Ma","year":"2024","journal-title":"arXiv preprint arXiv:2402.08846"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP63861.2024.10800077"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3589856"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832362"},{"key":"ref27","first-page":"28492","article-title":"Robust Speech Recognition via Large-Scale Weak Supervision","volume-title":"Proc. ICML","author":"Radford"},{"key":"ref28","article-title":"LoRA: Low-Rank Adaptation of Large Language Models","volume-title":"Proc. ICLR","author":"Hu"},{"key":"ref29","article-title":"LauraGPT: Listen, Attend, Understand, and Regenerate Audio with GPT","author":"Du","year":"2023","journal-title":"arXiv preprint arXiv:2310.04673"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref31","article-title":"FireRedASR: Open-Source Industrial-Grade Mandarin Speech Recognition Models from EncoderDecoder to LLM Integration","author":"Xu","year":"2025","journal-title":"arXiv preprint arXiv:2501.14350"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref33","article-title":"Deep Learning Scaling is Predictable, Empirically","author":"Hestness","year":"2017","journal-title":"arXiv preprint arXiv:1712.00409"},{"key":"ref34","article-title":"Scaling Laws for Neural Machine Translation","volume-title":"Proc. ICLR","author":"Ghorbani"},{"key":"ref35","first-page":"10053","article-title":"Scaling Laws for Multilingual Neural Machine Translation","volume-title":"Proc. ICML","author":"Fernandes"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2128"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1644"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.21"},{"key":"ref39","article-title":"OWLS: Scaling Laws for Multilingual Speech Recognition and Translation Models","author":"Chen","year":"2025","journal-title":"arXiv preprint arXiv:2502.10373"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746682"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref42","article-title":"AISHELL-2: Transforming Mandarin ASR Research Into Industrial Scale","author":"Du","year":"2018","journal-title":"arXiv preprint arXiv:1808.10583"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1397"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746465"},{"key":"ref45","article-title":"KeSpeech: An Open Source Speech Dataset of Mandarin and Its Eight Subdialects","volume-title":"Proc. NeurIPS","author":"Tang"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434774.pdf?arnumber=11434774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:36Z","timestamp":1775192376000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434774\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434774","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}