{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T10:09:54Z","timestamp":1769162994357,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T00:00:00Z","timestamp":1763164800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302420"],"award-info":[{"award-number":["62302420"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000015","name":"U.S. Department of Energy","doi-asserted-by":"publisher","award":["Office of Science, Advanced Scientific Computing Research (ASCR), DE-AC02-06CH11357"],"award-info":[{"award-number":["Office of Science, Advanced Scientific Computing Research (ASCR), DE-AC02-06CH11357"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Hong Kong Innovation and Technology Commission","award":["Innovation and Technology Support Programme (ITP), ITP\/012\/25LP"],"award-info":[{"award-number":["Innovation and Technology Support Programme (ITP), ITP\/012\/25LP"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767377","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:20:02Z","timestamp":1762532402000},"page":"339-348","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Compression Error Sensitivity Analysis for Different Experts in MoE Model 
Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1956-096X","authenticated-orcid":false,"given":"Songkai","family":"Ma","sequence":"first","affiliation":[{"name":"Department of Computing, Hong Kong Polytechnic University, Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0284-1113","authenticated-orcid":false,"given":"Zhaorui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computing, The Hong Kong Polytechnic University, Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9935-5674","authenticated-orcid":false,"given":"Sheng","family":"Di","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory, Argonne National Laboratory (ANL), DuPage, Illinois, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8300-9740","authenticated-orcid":false,"given":"Benben","family":"Liu","sequence":"additional","affiliation":[{"name":"LSCM, The University of Hong Kong, Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2432-6171","authenticated-orcid":false,"given":"Xiaodong","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7581-8905","authenticated-orcid":false,"given":"Xiaoyi","family":"Lu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of California Merced, Merced, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0921-2726","authenticated-orcid":false,"given":"Dan","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computing, Hong Kong Polytechnic University, Hong Kong, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Marah Abdin Jyoti Aneja Harkirat Behl S\u00e9bastien Bubeck Ronen Eldan 
Suriya Gunasekar Michael Harrison Russell\u00a0J Hewett Mojan Javaheripi Piero Kauffmann et\u00a0al. 2024. Phi-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.08905 (2024)."},{"key":"e_1_3_3_2_3_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707267"},{"key":"e_1_3_3_2_5_2","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano et\u00a0al. 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2110.14168 (2021)."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Sheng Di Jinyang Liu Kai Zhao Xin Liang Robert Underwood Zhaorui Zhang Milan Shah Yafan Huang Jiajun Huang Xiaodong Yu et\u00a0al. 2025. A survey on error-bounded lossy compression for scientific datasets. ACM computing surveys 57 11 (2025) 1\u201338.","DOI":"10.1145\/3733104"},{"key":"e_1_3_3_2_7_2","unstructured":"Haojie Duanmu Xiuhong Li Zhihang Yuan Size Zheng Jiangfei Duan Xingcheng Zhang and Dahua Lin. 2025. MxMoE: Mixed-precision Quantization for MoE with Accuracy and Performance Co-Design. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.05799 (2025)."},{"key":"e_1_3_3_2_8_2","unstructured":"Artyom Eliseev and Denis Mazur. 2023. Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.17238 (2023)."},{"key":"e_1_3_3_2_9_2","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et\u00a0al. 2025. 
Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.12948 (2025)."},{"key":"e_1_3_3_2_10_2","unstructured":"Xin He Shunkang Zhang Yuxin Wang Haiyan Yin Zihao Zeng Shaohuai Shi Zhenheng Tang Xiaowen Chu Ivor Tsang and Ong\u00a0Yew Soon. 2024. Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.17954 (2024)."},{"key":"e_1_3_3_2_11_2","unstructured":"Dan Hendrycks Collin Burns Saurav Kadavath Akul Arora Steven Basart Eric Tang Dawn Song and Jacob Steinhardt. 2021. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2103.03874 (2021)."},{"key":"e_1_3_3_2_12_2","unstructured":"Xing Hu Zhixuan Chen Dawei Yang Zukang Xu Chen Xu Zhihang Yuan Sifan Zhou and Jiangyong Yu. 2025. MoEQuant: Enhancing Quantization for Mixture-of-Experts Large Language Models via Expert-Balanced Sampling and Affinity Guidance. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.03804 (2025)."},{"key":"e_1_3_3_2_13_2","unstructured":"Beichen Huang Yueming Yuan Zelei Shao and Minjia Zhang. 2025. MiLo: Efficient Quantized MoE Inference with Mixture of Low-Rank Compensators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.02658 (2025)."},{"key":"e_1_3_3_2_14_2","unstructured":"Jiajun Huang Sheng Di Xiaodong Yu Yujia Zhai Jinyang Liu Ken Raffenetti Hui Zhou Kai Zhao Zizhong Chen Franck Cappello et\u00a0al. 2023. C-Coll: Introducing error-bounded lossy compression into MPI collectives. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.03890 (2023)."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00072"},{"key":"e_1_3_3_2_16_2","unstructured":"Jiajun Huang Sheng Di Xiaodong Yu Yujia Zhai Zhaorui Zhang Jinyang Liu Xiaoyi Lu Ken Raffenetti Hui Zhou Kai Zhao et\u00a0al. 2025. 
ZCCL: Significantly improving collective communication with error-bounded lossy compression. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.18554 (2025)."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00021"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607048"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"e_1_3_3_2_20_2","unstructured":"Albert\u00a0Q Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Emma\u00a0Bou Hanna Florian Bressand et\u00a0al. 2024. Mixtral of experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.04088 (2024)."},{"key":"e_1_3_3_2_21_2","unstructured":"Rui Kong Yuanchun Li Qingtian Feng Weijun Wang Xiaozhou Ye Ye Ouyang Linghe Kong and Yunxin Liu. 2023. SwapMoE: Serving off-the-shelf MoE-based large language models with tunable memory budget. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.15030 (2023)."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Xin Liang Kai Zhao Sheng Di Sihuan Li Robert Underwood Ali\u00a0M Gok Jiannan Tian Junjing Deng Jon\u00a0C Calhoun Dingwen Tao et\u00a0al. 2022. Sz3: A modular framework for composing prediction-based error-bounded lossy compressors. IEEE Transactions on Big Data 9 2 (2022) 485\u2013498.","DOI":"10.1109\/TBDATA.2022.3201176"},{"key":"e_1_3_3_2_23_2","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et\u00a0al. 2024. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.04434 (2024)."},{"key":"e_1_3_3_2_24_2","unstructured":"Jingyuan Liu Jianlin Su Xingcheng Yao Zhejun Jiang Guokun Lai Yulun Du Yidao Qin Weixin Xu Enzhe Lu Junjie Yan et\u00a0al. 2025. Muon is scalable for LLM training. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.16982 (2025)."},{"key":"e_1_3_3_2_25_2","unstructured":"Jiacheng Liu Peng Tang Wenfeng Wang Yuhang Ren Xiaofeng Hou Pheng-Ann Heng Minyi Guo and Chao Li. 2024. A survey on inference optimization techniques for mixture of experts models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14219 (2024)."},{"key":"e_1_3_3_2_26_2","unstructured":"Qianli Liu Zhaorui Zhang Xin Yao and Benben Liu. 2025. HLoRA: Efficient federated learning system for LLM heterogeneous fine-tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.00813 (2025)."},{"key":"e_1_3_3_2_27_2","unstructured":"Yuanjian Liu Sheng Di Jiajun Huang Zhaorui Zhang Kyle Chard and Ian Foster. 2025. Ocelot: An Interactive Efficient Distributed Compression-As-a-Service Platform With Optimized Data Compression Techniques. IEEE Transactions on Parallel and Distributed Systems (2025)."},{"key":"e_1_3_3_2_28_2","unstructured":"Xudong Lu Qi Liu Yuhui Xu Aojun Zhou Siyuan Huang Bo Zhang Junchi Yan and Hongsheng Li. 2024. Not all experts are equal: Efficient expert pruning and skipping for mixture-of-experts large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.14800 (2024)."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD57390.2023.10323651"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE64628.2025.10992997"},{"key":"e_1_3_3_2_31_2","unstructured":"Xiaoniu Song Zihang Zhong Rong Chen and Haibo Chen. 2024. Promoe: Fast moe-based llm serving using proactive caching. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.22134 (2024)."},{"key":"e_1_3_3_2_32_2","unstructured":"Peng Tang Jiacheng Liu Xiaofeng Hou Yifei Pu Jing Wang Pheng-Ann Heng Chao Li and Minyi Guo. 2024. Hobbit: A mixed precision expert offloading system for fast moe inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.01433 (2024)."},{"key":"e_1_3_3_2_33_2","unstructured":"Qwen Team. 2024. 
Qwen2 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.10671 (2024)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3679240.3734604"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1386"},{"key":"e_1_3_3_2_36_2","unstructured":"Haotian Xu Zhaorui Zhang Sheng Di Benben Liu Khalid\u00a0Ayed Alharthi and Jiannong Cao. 2024. Fedfa: a fully asynchronous training paradigm for federated learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.11015 (2024)."},{"key":"e_1_3_3_2_37_2","unstructured":"Tairan Xu Leyang Xue Zhan Lu Adrian Jackson and Luo Mai. 2025. MoE-Gen: High-Throughput MoE Inference on a Single GPU with Module-Based Batching. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.09716 (2025)."},{"key":"e_1_3_3_2_38_2","unstructured":"Leyang Xue Yao Fu Zhan Lu Luo Mai and Mahesh Marina. 2024. Moe-infinity: Offloading-efficient moe model serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.14361 (2024)."},{"key":"e_1_3_3_2_39_2","unstructured":"Ping Zhang Zhaorui Zhang Sheng Di Yao Xin and Benben Liu. 2025. CLLoRA: An approach to measure the effects of the context length for LLM fine-tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.18910 (2025)."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Zhaorui Zhang Sheng Di Benben Liu Zhuoran Ji Guanpeng Li Xiaoyi Lu Amelie\u00a0Chi Zhou Khalid\u00a0Ayed Alharthi and Jiannong Cao. 2025. FedEFsz: Fair Cross-Silo Federated Learning System with Error-Bounded Lossy Compression. IEEE Transactions on Parallel and Distributed Systems (2025).","DOI":"10.1109\/TPDS.2025.3593896"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Zhaorui Zhang Sheng Di Kai Zhao Sian Jin Dingwen Tao Zhuoran Ji Benben Liu Khalid\u00a0Ayed Alharthi Jiannong Cao and Franck Cappello. 2025. FedCSpc: A Cross-Silo Federated Learning System with Error-Bounded Lossy Parameter Compression. 
IEEE Transactions on Parallel and Distributed Systems (2025).","DOI":"10.1109\/TPDS.2025.3564736"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"crossref","unstructured":"Zhaorui Zhang Zhuoran Ji and Choli Wang. 2022. Momentum-driven adaptive synchronization model for distributed DNN training on HPC clusters. J. Parallel and Distrib. Comput. 159 (2022) 65\u201384.","DOI":"10.1016\/j.jpdc.2021.09.007"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Zhaorui Zhang and Choli Wang. 2021. SaPus: Self-adaptive parameter update strategy for DNN training on Multi-GPU clusters. IEEE Transactions on Parallel and Distributed Systems 33 7 (2021) 1569\u20131580.","DOI":"10.1109\/TPDS.2021.3118609"},{"key":"e_1_3_3_2_44_2","unstructured":"Zhaorui Zhang and Choli Wang. 2022. MIPD: An adaptive gradient sparsification framework for distributed DNNs training. IEEE Transactions on Parallel and Distributed Systems 33 11 (2022) 3053\u20133066."}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and 
Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3731599.3767377","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767377","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767377","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:36:13Z","timestamp":1767987373000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767377"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":43,"alternative-id":["10.1145\/3731599.3767377","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767377","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}