{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T10:13:28Z","timestamp":1773137608113,"version":"3.50.1"},"reference-count":37,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Institute of Information and Communications Technology Planning and Evaluation (IITP) grant funded by Korean Government (MSIT), Development of an On-Device Integrated Edge AI Server System","award":["RS-2025-25441574"],"award-info":[{"award-number":["RS-2025-25441574"]}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea (NRF) grant funded by Korean Government","doi-asserted-by":"publisher","award":["RS-2023-00213118"],"award-info":[{"award-number":["RS-2023-00213118"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/access.2026.3665697","type":"journal-article","created":{"date-parts":[[2026,2,17]],"date-time":"2026-02-17T21:09:07Z","timestamp":1771362547000},"page":"33610-33624","source":"Crossref","is-referenced-by-count":0,"title":["Two-Stage Expert Offloading for Domain-Aware MoE Inference"],"prefix":"10.1109","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-6416-2741","authenticated-orcid":false,"given":"Hangyeol","family":"Kim","sequence":"first","affiliation":[{"name":"Department of Electrical Computer Engineering, Sungkyunkwan University, Suwon, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6948-3440","authenticated-orcid":false,"given":"Honguk","family":"Woo","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Sungkyunkwan University, Suwon, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Younghwan","family":"Kim","sequence":"additional","affiliation":[{"name":"Intelligent IDC Project Office, Korea Electronics Technology Institute, Seongnam-si, Gyeonggi-do, South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"issue":"8","key":"ref2","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref3","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017","journal-title":"arXiv:1701.06538"},{"issue":"120","key":"ref4","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"J. Mach. Learn. Res."},{"key":"ref5","first-page":"1","article-title":"BASE layers: Simplifying training of large, sparse models","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Lewis"},{"key":"ref6","article-title":"GShard: Scaling giant models with conditional computation and automatic sharding","author":"Lepikhin","year":"2020","journal-title":"arXiv:2006.16668"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"ref8","first-page":"1","article-title":"Efficient large-scale language model training on GPU clusters using megatron-LM","volume-title":"Proc. Int. Conf. High Perform. Comput., Netw., Storage Anal.","author":"Narayanan"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1525\/9780520940420-020"},{"key":"ref11","article-title":"MoE-ERAS: Expert residency aware scheduling for efficient MoE inference","volume-title":"Proc. 4th Conf. Mach. Learn. Syst. (MLSys)","author":"Luo"},{"key":"ref12","first-page":"1","article-title":"Efficiently scaling transformer inference","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Pope"},{"key":"ref13","first-page":"560","article-title":"vLLM: Easy, fast, and cheap llm serving with pagedattention","volume-title":"Proc. 29th Symp. Operating Syst. Princ.","author":"Kwon"},{"key":"ref14","volume-title":"DeepSeek-MoE: Scaling Large Language Models With Mixture-of-Experts","year":"2024"},{"key":"ref15","article-title":"ST-MoE: Designing stable and transferable sparse expert models","author":"Zoph","year":"2022","journal-title":"arXiv:2202.08906"},{"key":"ref16","article-title":"GLaM: Efficient scaling of language models with mixture-of-experts","author":"Du","year":"2021","journal-title":"arXiv:2112.06905"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/sc41405.2020.00024"},{"key":"ref18","first-page":"10488","article-title":"QMoE: Practical sub-4-bit quantization for mixture-of-experts","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Frantar"},{"key":"ref19","article-title":"Sarathi: Efficient LLM inference by piggybacking preemption with pausing","volume-title":"Proc. 29th Symp. Operating Syst. Princ.","author":"Sheng"},{"key":"ref20","article-title":"On the hoop conjecture and the weak cosmic censorship conjecture for the axisymmetric einstein-vlasov system","author":"Ames","year":"2023","journal-title":"arXiv:2305.04360"},{"key":"ref21","article-title":"Mixture-of-experts with expert choice routing","author":"Zhou","year":"2022","journal-title":"arXiv:2202.09368"},{"key":"ref22","article-title":"A chat about boring problems: Studying GPT-based text normalization","author":"Zhang","year":"2023","journal-title":"arXiv:2309.13426"},{"key":"ref23","article-title":"Vision MoE: Scaling mixture of experts for efficient transfer learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Riquelme"},{"key":"ref24","article-title":"Fast inference from transformers via speculative decoding","author":"Leviathan","year":"2022","journal-title":"arXiv:2211.17192"},{"key":"ref25","article-title":"Lookahead decoding: An exploration of what is being speculated","volume-title":"Proc. 62nd Annu. Meeting Assoc. Comput. Linguistics","author":"Yang"},{"key":"ref26","article-title":"MoE-infinity: Scaling sparsely activated models to infinity","volume-title":"Proc. 37th Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Mazur"},{"key":"ref27","article-title":"Medusa: Simple framework for accelerating transformer inference","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Cai"},{"key":"ref28","article-title":"Recurrent drafting for fast autoregressive inference","volume-title":"Proc. Findings Assoc. Comput. Linguistics (ACL)","author":"Zhang"},{"key":"ref29","article-title":"FlashAttention-2: Faster attention with better parallelism and memory usage","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Dao"},{"key":"ref30","article-title":"Orca: Progressive learning from complex explanation traces of GPT-4","author":"Mukherjee","year":"2023","journal-title":"arXiv:2306.02707"},{"key":"ref31","article-title":"Tutel: Adaptive mixture-of-experts for efficient inference","volume-title":"Proc. Mach. Learn. Syst. (MLSys)","author":"He"},{"key":"ref32","volume-title":"Computer Architecture: A Quantitative Approach","author":"Hennessy","year":"2017"},{"key":"ref33","volume-title":"Operating System Concepts","author":"Silberschatz","year":"2018"},{"key":"ref34","volume-title":"Accelerate: Training and Inference at Scale Made Simple, Efficient and Adaptable","author":"Gugger","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2025.3546466"},{"key":"ref36","article-title":"Branch-train-merge: Embarrassingly parallel training of expert models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref37","article-title":"Fast, controllable, and interpretable generation of high-fidelity audio with latent diffusion models","author":"Chen","year":"2023","journal-title":"IEEE\/ACM Trans. Audio, Speech, Language Process."}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/11323511\/11397596.pdf?arnumber=11397596","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T19:59:53Z","timestamp":1773086393000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11397596\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/access.2026.3665697","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}