{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:02:18Z","timestamp":1775199738575,"version":"3.50.1"},"reference-count":53,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434645","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Omni-Router: Sharing Routing Decisions in Sparse Mixture-of-Experts for Speech Recognition"],"prefix":"10.1109","author":[{"given":"Zijin","family":"Gu","sequence":"first","affiliation":[{"name":"Apple Inc."}]},{"given":"Tatiana","family":"Likhomanenko","sequence":"additional","affiliation":[{"name":"Apple Inc."}]},{"given":"Navdeep","family":"Jaitly","sequence":"additional","affiliation":[{"name":"Apple Inc."}]}],"member":"263","reference":[{"key":"ref1","article-title":"Attention is all you need","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani"},{"key":"ref2","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2023","journal-title":"ICML."},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref5","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in english and mandarin","volume-title":"International conference on machine learning.","author":"Amodei"},{"key":"ref6","article-title":"End-to-end asr: from supervised to semi-supervised learning with modern architectures","volume-title":"ICML 2020 Workshop on Selfsupervision in Audio and Speech","author":"Synnaeve"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1616"},{"key":"ref8","article-title":"Fully convolutional speech recognition","author":"Zeghidour","year":"2018","journal-title":"arXiv preprint arXiv:1812.06864"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683745"},{"key":"ref10","article-title":"Denoising lm: Pushing the limits of error correction models for speech recognition","author":"Gu","year":"2024","journal-title":"arXiv preprint arXiv:2405.15216"},{"key":"ref11","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"NeurIPS"},{"key":"ref12","first-page":"30016","article-title":"Training compute-optimal large language models","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Hoffmann"},{"key":"ref13","article-title":"Owls: Scaling laws for multilingual speech recognition and translation models","author":"Chen","year":"2025","journal-title":"arXiv preprint arXiv:2502.10373"},{"key":"ref14","article-title":"Upcycling large language models into mixture of experts","author":"He","year":"2024","journal-title":"arXiv preprint arXiv:2410.07524"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"ref16","article-title":"Outrageously large neural networks: The sparselygated mixture-of-experts layer","author":"Shazeer","year":"2017","journal-title":"arXiv preprint arXiv:1701.06538"},{"key":"ref17","article-title":"Mixture of parrots: Experts improve memorization more than reasoning","author":"Jelassi","year":"2024","journal-title":"arXiv preprint arXiv:2410.19034"},{"key":"ref18","article-title":"Parameters vs flops: Scaling laws for optimal sparsity for mixture-of-experts language models","author":"Abnar","year":"2025","journal-title":"arXiv preprint arXiv:2501.12370"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0515"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.489"},{"key":"ref21","article-title":"St-moe: Designing stable and transferable sparse expert models","author":"Zoph","year":"2022","journal-title":"arXiv preprint arXiv:2202.08906"},{"key":"ref22","doi-asserted-by":"crossref","DOI":"10.1609\/aaai.v39i24.34708","article-title":"Transformer layers as painters","author":"Sun","year":"2025"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1994.6.2.181"},{"issue":"120","key":"ref25","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"Journal of Machine Learning Research"},{"key":"ref26","first-page":"6265","article-title":"Base layers: Simplifying training of large, sparse models","volume-title":"International Conference on Machine Learning. PMLR","author":"Lewis"},{"key":"ref27","first-page":"17","article-title":"Hash layers for large sparse models","volume":"34","author":"Roller","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref28","first-page":"5547","article-title":"Glam: Efficient scaling of language models with mixture-of-experts","author":"Du","year":"2022"},{"key":"ref29","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume":"34","author":"Riquelme","year":"2021","journal-title":"NeurIPS"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-478"},{"key":"ref31","first-page":"7217","article-title":"Speechmoe2: Mixture-of-experts model with improved routing","year":"2022","journal-title":"ICASSP. IEEE"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-480"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096227"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN60899.2024.10651105"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0897"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i15.29622"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.571"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-78"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-740"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-007-9040-x"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1992.225858"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/CHiME.2020-1"},{"key":"ref44","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2019","journal-title":"arXiv preprint arXiv:1912.06670"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-99579-3_21"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2_7"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447120"},{"key":"ref53","first-page":"69","article-title":"The fisher corpus: A resource for the next generations of speech-to-text","volume":"4","author":"Cieri","year":"2004","journal-title":"LREC"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434645.pdf?arnumber=11434645","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:57:40Z","timestamp":1775192260000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434645\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434645","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}