{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T13:16:05Z","timestamp":1769260565234,"version":"3.49.0"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"NSF Institute for Foundations of Machine Learning","award":["2019844"],"award-info":[{"award-number":["2019844"]}]},{"name":"NSF AI Institute for Foundations of Machine Learning"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Inform. Theory"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1109\/tit.2025.3647061","type":"journal-article","created":{"date-parts":[[2025,12,22]],"date-time":"2025-12-22T18:40:59Z","timestamp":1766428859000},"page":"1276-1304","source":"Crossref","is-referenced-by-count":0,"title":["Convergence Rates for Softmax Gating Mixture of Experts"],"prefix":"10.1109","volume":"72","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5605-1584","authenticated-orcid":false,"given":"Huy","family":"Nguyen","sequence":"first","affiliation":[{"name":"Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA"}]},{"given":"Nhat","family":"Ho","sequence":"additional","affiliation":[{"name":"Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA"}]},{"given":"Alessandro","family":"Rinaldo","sequence":"additional","affiliation":[{"name":"Department of Statistics and Data Sciences, The University of Texas at Austin, Austin, TX, USA"}]}],"member":"263","reference":[{"key":"ref1","first-page":"37707","article-title":"On least square estimation in softmax gating mixture of experts","volume-title":"Proc. ICML","author":"Nguyen"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1214\/cbms\/1462106013"},{"key":"ref4","first-page":"23049","article-title":"Towards understanding the mixture-of-experts layer in deep learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Chen"},{"key":"ref5","first-page":"37617","article-title":"A general theory for softmax gating multinomial logistic mixture of experts","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Nguyen"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1080\/00949650802590261"},{"key":"ref7","first-page":"878","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Shazeer"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/NNSP.1994.366050"},{"key":"ref10","first-page":"29335","article-title":"DSelect-K: Differentiable selection in the mixture of experts with applications to multi-task learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Hazimeh"},{"key":"ref11","article-title":"Sparsely activated mixture-of-experts are robust multi-task learners","author":"Gupta","year":"2022","journal-title":"arXiv:2204.07689"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1080\/01621459.1996.10476965"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-478"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1186\/1471-2105-8-S10-S6"},{"key":"ref15","article-title":"Estimating or propagating gradients through stochastic neurons for conditional computation","author":"Bengio","year":"2013","journal-title":"arXiv:1308.3432"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref17","article-title":"Mixtral of experts","author":"Jiang","year":"2024","journal-title":"arXiv:2401.04088"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2167"},{"key":"ref19","first-page":"98782","article-title":"Flex-MoE: Modeling arbitrary modality combination via the flexible mixture-of-experts","volume-title":"Proc. 38th Annu. Conf. Neural Inf. Process. Syst.","author":"Yun"},{"key":"ref20","article-title":"DeepSeek-V3 technical report","volume-title":"arXiv:2412.19437","author":"Liu","year":"2024"},{"key":"ref21","article-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref22","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"arXiv:2403.05530"},{"key":"ref23","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Riquelme"},{"key":"ref24","first-page":"28441","article-title":"M3ViT: Mixture-of-experts vision transformer for efficient multi-task learning with model-accelerator co-design","volume-title":"Proc. NeurIPS","author":"Liang"},{"key":"ref25","article-title":"Sparse mixture-of-experts are domain generalizable learners","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Li"},{"key":"ref26","article-title":"Statistical advantages of perturbing cosine router in mixture of experts","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Nguyen"},{"key":"ref27","first-page":"38520","article-title":"Mixtures of experts unlock parameter scaling for deep RL","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","author":"Obando-Ceron"},{"key":"ref28","article-title":"A mixture-of-expert approach to RL-based dialogue management","volume-title":"Proc. 11th Int. Conf. Learn. Represent.","author":"Chow"},{"key":"ref29","article-title":"EvoMoE: An evolutional mixture-of-experts training framework via dense-to-sparse gate","author":"Nie","year":"2021","journal-title":"arXiv:2112.14397"},{"key":"ref30","first-page":"34600","article-title":"On the representation collapse of sparse mixture of experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Chi"},{"key":"ref31","first-page":"37570","article-title":"Is temperature sample efficient for softmax Gaussian mixture of experts?","volume-title":"Proc. ICML","author":"Nguyen"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.49"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1994.6.2.181"},{"key":"ref34","article-title":"On expert estimation in hierarchical mixture of experts: Beyond softmax gating functions","author":"Nguyen","year":"2024","journal-title":"arXiv:2410.02935"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.08.052"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1002\/wrcr.20150"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1002\/2015WR018266"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/18.669150"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1214\/aos\/1018031265"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00354"},{"issue":"323","key":"ref41","first-page":"1","article-title":"Convergence rates for Gaussian mixtures of experts","volume":"23","author":"Ho","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref42","first-page":"4624","article-title":"Demystifying softmax gating function in Gaussian mixture of experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Nguyen"},{"key":"ref43","first-page":"45583","article-title":"Statistical perspective of top-K sparse softmax gating mixture of experts","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Nguyen"},{"key":"ref44","first-page":"53022","article-title":"Multilinear mixture of experts: Scalable expert specialization through factorization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Oldfield"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/S0893-6080(99)00066-0"},{"key":"ref46","article-title":"Gaussian error linear units (GELUs)","author":"Hendrycks","year":"2016","journal-title":"arXiv:1606.08415"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1214\/12-AOS1065"},{"key":"ref48","first-page":"14979","article-title":"Refined convergence rates for maximum likelihood estimation under finite mixture models","volume-title":"Proc. 39th Int. Conf. Mach. Learn.","author":"Manole"},{"key":"ref49","volume-title":"Empirical Processes in M-Estimation","author":"van de Geer","year":"2000"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-1880-7_29"}],"container-title":["IEEE Transactions on Information Theory"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/18\/11361350\/11311504.pdf?arnumber=11311504","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T20:58:10Z","timestamp":1769201890000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11311504\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":50,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tit.2025.3647061","relation":{},"ISSN":["0018-9448","1557-9654"],"issn-type":[{"value":"0018-9448","type":"print"},{"value":"1557-9654","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]}}}