{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:57:57Z","timestamp":1773248277744,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676741","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T13:21:20Z","timestamp":1744204880000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["AdapMoE: Adaptive Sensitivity-based Expert Gating and Management for Efficient MoE Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5478-3604","authenticated-orcid":false,"given":"Shuzhang","family":"Zhong","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuit, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8534-6494","authenticated-orcid":false,"given":"Ling","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuit, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4951-4286","authenticated-orcid":false,"given":"Yuan","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7514-0767","authenticated-orcid":false,"given":"Runsheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8146-4821","authenticated-orcid":false,"given":"Ru","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7212-2264","authenticated-orcid":false,"given":"Meng","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_2_1","unstructured":"Hicham Badri and Appu Shaji. 2023. Half-Quadratic Quantization of Large Machine Learning Models. https:\/\/mobiusml.github.io\/hqq_blog\/"},{"key":"e_1_3_2_1_3_1","volume-title":"Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457","author":"Clark Peter","year":"2018","unstructured":"Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. 2018. Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457 (2018)."},{"key":"e_1_3_2_1_4_1","unstructured":"Marta R Costa-juss\u00e0 James Cross Onur \u00c7elebi Maha Elbayad Kenneth Heafield Kevin Heffernan Elahe Kalbassi Janice Lam Daniel Licht Jean Maillard et al. 2022. No language left behind: Scaling human-centered machine translation. arXiv preprint arXiv:2207.04672 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238","author":"Eliseev Artyom","year":"2023","unstructured":"Artyom Eliseev and Denis Mazur. 2023. Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238 (2023)."},{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23, 120 (2022), 1--39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_7_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable Mixture-of-Expert Inference. arXiv preprint arXiv:2308.12066","author":"Hwang Ranggi","year":"2023","unstructured":"Ranggi Hwang, Jianyu Wei, Shijie Cao, Changho Hwang, Xiaohu Tang, Ting Cao, Mao Yang, and Minsoo Rhu. 2023. Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable Mixture-of-Expert Inference. arXiv preprint arXiv:2308.12066 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Squeezellm: Dense-and-sparse quantization. arXiv preprint arXiv:2306.07629","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Coleman Hooper, Amir Gholami, Zhen Dong, Xiuyu Li, Sheng Shen, Michael W Mahoney, and Kurt Keutzer. 2023. Squeezellm: Dense-and-sparse quantization. arXiv preprint arXiv:2306.07629 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Adaptive gating in mixture-of-experts based language models. arXiv preprint arXiv:2310.07188","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Qiang Su, Yitao Yang, Yimin Jiang, Cong Wang, and Hong Xu. 2023. Adaptive gating in mixture-of-experts based language models. arXiv preprint arXiv:2310.07188 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-012-9338-y"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_14_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. {Zero-offload}: Democratizing {billion-scale} model training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 551--564."},{"key":"e_1_3_2_1_15_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR, 31094--31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning. PMLR, 31094--31116."},{"key":"e_1_3_2_1_17_1","volume-title":"MoE-Infinity: Activation-Aware Expert Offloading for Efficient MoE Serving. arXiv preprint arXiv:2401.14361","author":"Xue Leyang","year":"2024","unstructured":"Leyang Xue, Yao Fu, Zhan Lu, Luo Mai, and Mahesh Marina. 2024. MoE-Infinity: Activation-Aware Expert Offloading for Efficient MoE Serving. arXiv preprint arXiv:2401.14361 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData50022.2020.9378171"},{"key":"e_1_3_2_1_19_1","volume-title":"Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. Edgemoe: Fast on-device inference of moe-based large language models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_1_20_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et al. 2024. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems 36 (2024)."}],"event":{"name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","location":"Newark Liberty International Airport Marriott New York NY USA","acronym":"ICCAD '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"]},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676741","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676741","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:43:57Z","timestamp":1750290237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676741"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":20,"alternative-id":["10.1145\/3676536.3676741","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676741","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}