{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T21:17:38Z","timestamp":1773263858191,"version":"3.50.1"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T00:00:00Z","timestamp":1768780800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T00:00:00Z","timestamp":1768780800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,19]]},"DOI":"10.1109\/asp-dac66049.2026.11420335","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T19:51:15Z","timestamp":1773172275000},"page":"333-340","source":"Crossref","is-referenced-by-count":0,"title":["Efficient CPU-GPU Collaborative Inference for MoE-based LLMs on Memory-Limited Systems"],"prefix":"10.1109","author":[{"given":"En-Ming","family":"Huang","sequence":"first","affiliation":[{"name":"National Taiwan University,Taipei,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li-Shang","family":"Lin","sequence":"additional","affiliation":[{"name":"National Tsing Hua University,Hsinchu,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chun-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taipei,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Advances in Neural Information Processing Systems (NIPS)","volume":"33","author":"Brown"},{"key":"ref2","volume-title":"Gpt-4 technical report","author":"Achiam","year":"2024"},{"key":"ref3","volume-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"ref4","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017"},{"key":"ref5","volume-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","author":"Fedus","year":"2022"},{"key":"ref6","volume-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"ref7","volume-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"Abdin","year":"2024"},{"key":"ref8","volume-title":"Introducing DBRX: A new state-of-the-art open llm","author":"Team","year":"2024"},{"key":"ref9","volume-title":"Qwen2 technical report","author":"Yang","year":"2024"},{"key":"ref10","volume-title":"Cheaper, better, faster, stronger","author":"AI","year":"2024"},{"key":"ref11","volume-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref14","volume-title":"Fast inference of mixture-of-experts language models with offloading","author":"Eliseev","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676741"},{"key":"ref17","volume-title":"Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference","author":"He","year":"2024"},{"key":"ref18","article-title":"MoE-Infinity: Offloading-efficient MoE model serving","author":"Xue","year":"2024"},{"key":"ref19","first-page":"31094","article-title":"FlexGen: High-throughput generative inference of large language models with a single GPU","volume-title":"Proc. Int. Conf. on Machine Learning (ICML)","author":"Sheng"},{"key":"ref20","first-page":"155","article-title":"InfiniGen: Efficient generative inference of large language models with dynamic KV cache management","volume-title":"Proc. USENIX Symp. on Operating Systems Design and Implementation (OSDI)","author":"Lee"},{"key":"ref21","first-page":"2024","volume-title":"Transformers","author":"Face","year":"2024"},{"key":"ref22","article-title":"Measuring massive multitask language understanding","volume-title":"Proc. Int. Conf. on Learning Representations (ICLR)","author":"Hendrycks"},{"key":"ref23","article-title":"Aligning AI with shared human values","volume-title":"Proc. Int. Conf. on Learning Representations (ICLR)","author":"Hendrycks"},{"key":"ref24","article-title":"Pytorch: An imperative style, highperformance deep learning library","author":"Paszke","year":"2019"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/99.660313"},{"key":"ref26","article-title":"Fiddler: CPUGPU orchestration for fast inference of Mixture-of-Experts models","volume-title":"Proc. Int. Conf. on Learning Representations (ICLR)","author":"Kamahori"},{"key":"ref27","article-title":"EdgeMoE: Fast on-device inference of MoE-based large language models","author":"Yi","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.363"},{"key":"ref29","volume-title":"mistral-inference","year":"2024"}],"event":{"name":"2026 31st Asia and South Pacific Design Automation Conference (ASP-DAC)","location":"Lantau, Hong Kong","start":{"date-parts":[[2026,1,19]]},"end":{"date-parts":[[2026,1,22]]}},"container-title":["2026 31st Asia and South Pacific Design Automation Conference (ASP-DAC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11420221\/11420229\/11420335.pdf?arnumber=11420335","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T05:17:37Z","timestamp":1773206257000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11420335\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,19]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/asp-dac66049.2026.11420335","relation":{},"subject":[],"published":{"date-parts":[[2026,1,19]]}}}