{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T03:57:45Z","timestamp":1776916665805,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707267","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"715-730","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["MoE-L\n            <scp>ightning<\/scp>\n            : High-Throughput MoE Inference on Memory-constrained GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6834-375X","authenticated-orcid":false,"given":"Shiyi","family":"Cao","sequence":"first","affiliation":[{"name":"UC Berkeley, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0825-6178","authenticated-orcid":false,"given":"Shu","family":"Liu","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4655-0229","authenticated-orcid":false,"given":"Tyler","family":"Griggs","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9865-0456","authenticated-orcid":false,"given":"Peter","family":"Schafhalter","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4968-1027","authenticated-orcid":false,"given":"Xiaoxuan","family":"Liu","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1883-2126","authenticated-orcid":false,"given":"Ying","family":"Sheng","sequence":"additional","affiliation":[{"name":"Stanford University, Palo Alto, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2921-956X","authenticated-orcid":false,"given":"Joseph E.","family":"Gonzalez","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7547-7204","authenticated-orcid":false,"given":"Matei","family":"Zaharia","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5373-0088","authenticated-orcid":false,"given":"Ion","family":"Stoica","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Flashinfer: Kernel library for llm serving. https:\/\/github.com\/flashinfer-ai\/flashinfer","author":"Flashinfer","year":"2024","unstructured":"Flashinfer AI. Flashinfer: Kernel library for llm serving. https:\/\/github.com\/flashinfer-ai\/flashinfer, 2024. Accessed: 2024-05-20."},{"key":"e_1_3_2_1_2_1","volume-title":"Gqa: Training generalized multiquery transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 
Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245, 2023."},{"key":"e_1_3_2_1_3_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar. Llm in a flash: Efficient large language model inference with limited memory","author":"Alizadeh Keivan","year":"2024","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. Llm in a flash: Efficient large language model inference with limited memory, 2024."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_5_1","volume-title":"Accelerating large language model decoding with speculative sampling","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. Accelerating large language model decoding with speculative sampling, 2023."},{"key":"e_1_3_2_1_6_1","first-page":"5383","volume-title":"International Conference on Machine Learning","author":"Chen Wuyang","year":"2023","unstructured":"Wuyang Chen, Yanqi Zhou, Nan Du, Yanping Huang, James Laudon, Zhifeng Chen, and Claire Cui. Lifelong language pretraining with distribution-specialized experts. In International Conference on Machine Learning, pages 5383--5395. PMLR, 2023."},{"key":"e_1_3_2_1_7_1","first-page":"1661","volume-title":"International Conference on Machine Learning","author":"Chen Xinyun","year":"2021","unstructured":"Xinyun Chen, Petros Maniatis, Rishabh Singh, Charles Sutton, Hanjun Dai, Max Lin, and Denny Zhou. Spreadsheetcoder: Formula prediction from semi-structured context. In International Conference on Machine Learning, pages 1661--1672. PMLR, 2021."},{"key":"e_1_3_2_1_8_1","volume-title":"Tianle Li, Dacheng Li, Hao Zhang, Banghua Zhu, Michael Jordan, Joseph E Gonzalez, et al. Chatbot arena: An open platform for evaluating llms by human preference. arXiv preprint arXiv:2403.04132","author":"Chiang Wei-Lin","year":"2024","unstructured":"Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Hao Zhang, Banghua Zhu, Michael Jordan, Joseph E Gonzalez, et al. Chatbot arena: An open platform for evaluating llms by human preference. arXiv preprint arXiv:2403.04132, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"Generating long sequences with sparse transformers","author":"Child Rewon","year":"2019","unstructured":"Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. Generating long sequences with sparse transformers, 2019."},{"key":"e_1_3_2_1_10_1","volume-title":"Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. CoRR, abs\/2401.06066","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, R. X. Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Y. Wu, Zhenda Xie, Y. K. Li, Panpan Huang, Fuli Luo, Chong Ruan, Zhifang Sui, and Wenfeng Liang. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. CoRR, abs\/2401.06066, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. FlashAttention-2: Faster attention with better parallelism and work partitioning. 
In International Conference on Learning Representations (ICLR), 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. FlashAttention: Fast and memory-efficient exact attention with IO-awareness. In Advances in Neural Information Processing Systems (NeurIPS), 2022."},{"key":"e_1_3_2_1_13_1","first-page":"5547","volume-title":"International Conference on Machine Learning","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning, pages 5547--5569. PMLR, 2022."},{"key":"e_1_3_2_1_14_1","volume-title":"The llama 3 herd of models. arXiv preprint arXiv:2407.21783","author":"Dubey Abhimanyu","year":"2024","unstructured":"Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783, 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"Fast inference of mixture-of-experts language models with offloading","author":"Eliseev Artyom","year":"2023","unstructured":"Artyom Eliseev and Denis Mazur. Fast inference of mixture-of-experts language models with offloading, 2023."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_17_1","volume-title":"Fastdecode: High-throughput gpu-efficient llm serving using heterogeneous pipelines","author":"He Jiaao","year":"2024","unstructured":"Jiaao He and Jidong Zhai. Fastdecode: High-throughput gpu-efficient llm serving using heterogeneous pipelines, 2024."},{"key":"e_1_3_2_1_18_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_19_1","volume-title":"Hugging face accelerate. https:\/\/huggingface.co\/docs\/accelerate\/index","year":"2022","unstructured":"HuggingFace. Hugging face accelerate. https:\/\/huggingface.co\/docs\/accelerate\/index, 2022."},{"key":"e_1_3_2_1_20_1","volume-title":"https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/onemkl.html","year":"2024","unstructured":"Intel. Intel(r) oneapi math kernel library (onemkl). https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/onemkl.html, 2024."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"e_1_3_2_1_22_1","unstructured":"Albert Q. 
Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de Las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00e9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00e9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00e9e Lacroix, and William El Sayed. Mixtral of experts. CoRR, abs\/2401.04088, 2024."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1994.6.2.181"},{"key":"e_1_3_2_1_24_1","volume-title":"Hydragen: High-throughput llm inference with shared prefixes","author":"Juravsky Jordan","year":"2024","unstructured":"Jordan Juravsky, Bradley Brown, Ryan Ehrlich, Daniel Y. Fu, Christopher R\u00e9, and Azalia Mirhoseini. Hydragen: High-throughput llm inference with shared prefixes, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Fiddler: Cpu-gpu orchestration for fast inference of mixture-of-experts models","author":"Kamahori Keisuke","year":"2024","unstructured":"Keisuke Kamahori, Yile Gu, Kan Zhu, and Baris Kasikci. Fiddler: Cpu-gpu orchestration for fast inference of mixture-of-experts models, 2024."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_27_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668, 2020."},{"key":"e_1_3_2_1_28_1","volume-title":"Fast inference from transformers via speculative decoding","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. Fast inference from transformers via speculative decoding, 2023."},{"key":"e_1_3_2_1_29_1","volume-title":"Holistic evaluation of language models. arXiv preprint arXiv:2211.09110","author":"Liang Percy","year":"2022","unstructured":"Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan, Yuhuai Wu, Ananya Kumar, et al. Holistic evaluation of language models. arXiv preprint arXiv:2211.09110, 2022."},{"key":"e_1_3_2_1_30_1","volume-title":"Qserve: W4a8kv4 quantization and system co-design for efficient llm serving","author":"Lin Yujun","year":"2024","unstructured":"Yujun Lin, Haotian Tang, Shang Yang, Zhekai Zhang, Guangxuan Xiao, Chuang Gan, and Song Han. Qserve: W4a8kv4 quantization and system co-design for efficient llm serving, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"Optimizing llm queries in relational workloads","author":"Liu Shu","year":"2024","unstructured":"Shu Liu, Asim Biswal, Audrey Cheng, Xiangxi Mo, Shiyi Cao, Joseph E. Gonzalez, Ion Stoica, and Matei Zaharia. Optimizing llm queries in relational workloads, 2024."},{"key":"e_1_3_2_1_32_1","author":"MistralAI","year":"2024","unstructured":"MistralAI. https:\/\/mistral.ai\/news\/mixtral-8x22b\/, April 2024."},{"key":"e_1_3_2_1_33_1","volume-title":"Can foundation models wrangle your data? arXiv preprint arXiv:2205.09911","author":"Narayan Avanika","year":"2022","unstructured":"Avanika Narayan, Ines Chami, Laurel Orr, and Christopher R\u00e9. Can foundation models wrangle your data? 
arXiv preprint arXiv:2205.09911, 2022."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_36_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_37_1","first-page":"5","article-title":"Efficiently scaling transformer inference","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems, 5, 2023.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_38_1","volume-title":"Int4 decoding gqa cuda optimizations for llm inference. https:\/\/pytorch.org\/blog\/int4-decoding\/","author":"Pumma Sarunya","year":"2024","unstructured":"Sarunya Pumma, Jongsoo Park, Jianyu Huang, Amy Yang, Jaewon Lee, Daniel Haziza, Grigory Sizov, Jeremy Reizenstein, Jeff Johnson, and Ying Zhang. Int4 decoding gqa cuda optimizations for llm inference. https:\/\/pytorch.org\/blog\/int4-decoding\/, 2024."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.689"},{"key":"e_1_3_2_1_40_1","volume-title":"Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100","author":"Scao Teven Le","year":"2022","unstructured":"Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100, 2022."},{"key":"e_1_3_2_1_41_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538, 2017."},{"key":"e_1_3_2_1_42_1","first-page":"31094","volume-title":"International Conference on Machine Learning","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning, pages 31094--31116. PMLR, 2023."},{"key":"e_1_3_2_1_43_1","volume-title":"Powerinfer: Fast large language model serving with a consumer-grade gpu","author":"Song Yixin","year":"2023","unstructured":"Yixin Song, Zeyu Mi, Haotong Xie, and Haibo Chen. 
Powerinfer: Fast large language model serving with a consumer-grade gpu, 2023."},{"key":"e_1_3_2_1_44_1","volume-title":"Blockwise parallel decoding for deep autoregressive models","author":"Stern Mitchell","year":"2018","unstructured":"Mitchell Stern, Noam Shazeer, and Jakob Uszkoreit. Blockwise parallel decoding for deep autoregressive models, 2018."},{"key":"e_1_3_2_1_45_1","volume-title":"Quest: Query-aware sparsity for efficient long-context llm inference. arXiv preprint arXiv:2406.10774","author":"Tang Jiaming","year":"2024","unstructured":"Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, and Song Han. Quest: Query-aware sparsity for efficient long-context llm inference. arXiv preprint arXiv:2406.10774, 2024."},{"key":"e_1_3_2_1_46_1","volume-title":"Introducing dbrx: A new state-of-the-art open llm","author":"Team Mosaic Research","year":"2024","unstructured":"Mosaic Research Team. Introducing dbrx: A new state-of-the-art open llm, 2024. https:\/\/www.databricks.com\/blog\/introducing-dbrx-new-state-art-open-llm, March 2024. Accessed 2024-06-20."},{"key":"e_1_3_2_1_47_1","volume-title":"Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_49_1","first-page":"162","article-title":"Hetegen: Efficient heterogeneous parallel inference for large language models on resource-constrained devices","volume":"6","author":"Zhao Xuanlei","year":"2024","unstructured":"Xuanlei Zhao, Bin Jia, Haotian Zhou, Ziming Liu, Shenggan Cheng, and Yang You. Hetegen: Efficient heterogeneous parallel inference for large language models on resource-constrained devices. Proceedings of Machine Learning and Systems, 6:162--172, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_50_1","volume-title":"Moe-infinity: Activation-aware expert offloading for efficient moe serving","author":"Xue Leyang","year":"2024","unstructured":"Leyang Xue, Yao Fu, Zhan Lu, Luo Mai, and Mahesh Marina. Moe-infinity: Activation-aware expert offloading for efficient moe serving, 2024."},{"key":"e_1_3_2_1_51_1","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521--538, 2022."},{"key":"e_1_3_2_1_52_1","volume-title":"Yan Yan, Beidi Chen, Guangyu Sun, and Kurt Keutzer. Llm inference unveiled: Survey and roofline model insights","author":"Yuan Zhihang","year":"2024","unstructured":"Zhihang Yuan, Yuzhang Shang, Yang Zhou, Zhen Dong, Chenhao Xue, Bingzhe Wu, Zhikai Li, Qingyi Gu, Yong Jae Lee, Yan Yan, Beidi Chen, Guangyu Sun, and Kurt Keutzer. Llm inference unveiled: Survey and roofline model insights, 2024."},{"key":"e_1_3_2_1_53_1","volume-title":"Xi Victoria Lin, et al. 
Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068, 2022."},{"key":"e_1_3_2_1_54_1","first-page":"36","article-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models","author":"Zhang Zhenyu","year":"2024","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_55_1","first-page":"36","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_56_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. Sglang: Efficient execution of structured language model programs","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. Sglang: Efficient execution of structured language model programs, 2024."},{"key":"e_1_3_2_1_57_1","first-page":"7103","article-title":"Mixture-of-experts with expert choice routing","volume":"35","author":"Zhou Yanqi","year":"2022","unstructured":"Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, Quoc V Le, James Laudon, et al. Mixture-of-experts with expert choice routing. 
Advances in Neural Information Processing Systems, 35:7103--7114, 2022.","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam, Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707267","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707267","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:46:58Z","timestamp":1755787618000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707267"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":57,"alternative-id":["10.1145\/3669940.3707267","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707267","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}