{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T21:05:38Z","timestamp":1773090338817,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3666025.3699355","type":"proceedings-article","created":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T18:48:26Z","timestamp":1730746106000},"page":"521-534","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["LiteMoE: Customizing On-device LLM Serving via Proxy Submodel Tuning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0702-5093","authenticated-orcid":false,"given":"Yan","family":"Zhuang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5094-5331","authenticated-orcid":false,"given":"Zhenzhe","family":"Zheng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0965-9058","authenticated-orcid":false,"given":"Fan","family":"Wu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6934-1685","authenticated-orcid":false,"given":"Guihai","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,11,4]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. AICore. https:\/\/developer.android.com\/ml\/aicore."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. HuggingFace models. https:\/\/huggingface.co\/models."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. Introducing Apple's On-Device and Server Foundation Models. https:\/\/machinelearning.apple.com\/research\/introducing-apple-foundation-models."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. PyTorch. https:\/\/pytorch.org\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Snapdragon 8 gen 3 mobile platform product brief. https:\/\/docs.qualcomm.com\/bundle\/publicresource\/87-71408-1_REV_C_Snapdragon_8_gen_3_Mobile_Platform_Product_Brief.pdf."},{"key":"e_1_3_2_1_7_1","volume-title":"International Joint Conference on Artificial Intelligence (IJCAI). 35--41","author":"Ahmed Faez","unstructured":"Faez Ahmed, John P. Dickerson, and Mark D. Fuge. 2017. Diverse Weighted Bipartite b-Matching. In International Joint Conference on Artificial Intelligence (IJCAI). 35--41."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3592505"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2016.2527003"},{"key":"e_1_3_2_1_10_1","volume-title":"Task-Specific Expert Pruning for Sparse Mixture-of-Experts. arXiv preprint arXiv:2206.00277","author":"Chen Tianyu","year":"2022","unstructured":"Tianyu Chen, Shaohan Huang, Yuan Xie, Binxing Jiao, Daxin Jiang, Haoyi Zhou, Jianxin Li, and Furu Wei. 2022. Task-Specific Expert Pruning for Sparse Mixture-of-Experts. arXiv preprint arXiv:2206.00277 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Sinkhorn Distances: Lightspeed Computation of Optimal Transport. In Advances in Neural Information Processing Systems (NeurIPS). 1--9.","author":"Cuturi Marco","year":"2013","unstructured":"Marco Cuturi. 2013. Sinkhorn Distances: Lightspeed Computation of Optimal Transport. In Advances in Neural Information Processing Systems (NeurIPS). 1--9."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00626-4"},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning (ICML). 5547--5569","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. 2022. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning (ICML). 5547--5569."},{"key":"e_1_3_2_1_15_1","volume-title":"Fast Inference of Mixture-of-Experts Language Models with Offloading. arXiv preprint arXiv:2312.17238","author":"Eliseev Artyom","year":"2023","unstructured":"Artyom Eliseev and Denis Mazur. 2023. Fast Inference of Mixture-of-Experts Language Models with Offloading. arXiv preprint arXiv:2312.17238 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241559"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5555\/3586589.3586709"},{"key":"e_1_3_2_1_18_1","volume-title":"QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models. arXiv preprint arXiv:2310.16795","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models. arXiv preprint arXiv:2310.16795 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 27th Annual International Conference on Mobile Computing and Networking (MobiCom). 406--419","author":"Han Rui","unstructured":"Rui Han, Qinglong Zhang, Chi Harold Liu, Guoren Wang, Jian Tang, and Lydia Y. Chen. 2021. LegoDNN: block-grained scaling of deep neural networks for mobile vision. In Proceedings of the 27th Annual International Conference on Mobile Computing and Networking (MobiCom). 406--419."},{"key":"e_1_3_2_1_20_1","volume-title":"Trained Quantization and Huffman Coding. In 4th International Conference on Learning Representations (ICLR). 1--14","author":"Han Song","unstructured":"Song Han, Huizi Mao, and William J. Dally. 2016. Deep Compression: Compressing Deep Neural Network with Pruning, Trained Quantization and Huffman Coding. In 4th International Conference on Learning Representations (ICLR). 1--14."},{"key":"e_1_3_2_1_21_1","volume-title":"Dally","author":"Han Song","year":"2015","unstructured":"Song Han, Jeff Pool, John Tran, and William J. Dally. 2015. Learning both Weights and Connections for Efficient Neural Network. In Advances in Neural Information Processing Systems (NeurIPS). 1135--1143."},{"key":"e_1_3_2_1_22_1","volume-title":"Distilling the Knowledge in a Neural Network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey E.","year":"2015","unstructured":"Geoffrey E. Hinton, Oriol Vinyals, and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning (ICML). 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International conference on machine learning (ICML). 2790--2799."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"e_1_3_2_1_25_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations (ICLR). 1--13","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, yelong shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations (ICLR). 1--13."},{"key":"e_1_3_2_1_26_1","volume-title":"Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable Mixture-of-Expert Inference. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA). 1018--1031","author":"Hwang Ranggi","year":"2024","unstructured":"Ranggi Hwang, Jianyu Wei, Shijie Cao, Changho Hwang, Xiaohu Tang, Ting Cao, and Mao Yang. 2024. Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable Mixture-of-Expert Inference. In 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA). 1018--1031."},{"key":"e_1_3_2_1_27_1","volume-title":"Gemma: Introducing new state-of-the-art open models. https:\/\/blog.google\/technology\/developers\/gemma-openmodels\/.","author":"Google Inc.","year":"2024","unstructured":"Google Inc. 2024. Gemma: Introducing new state-of-the-art open models. https:\/\/blog.google\/technology\/developers\/gemma-openmodels\/."},{"key":"e_1_3_2_1_28_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_30_1","volume-title":"Serving MoE Models on Resource-constrained Edge Devices via Dynamic Expert Swapping. arXiv preprint arXiv:2308.15030","author":"Kong Rui","year":"2023","unstructured":"Rui Kong, Yuanchun Li, Qingtian Feng, Weijun Wang, L. Kong, and Yunxin Liu. 2023. Serving MoE Models on Resource-constrained Edge Devices via Dynamic Expert Swapping. arXiv preprint arXiv:2308.15030 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.363"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_33_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_1_34_1","volume-title":"Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security. arXiv preprint arXiv:2401.05459","author":"Li Yuanchun","year":"2024","unstructured":"Yuanchun Li, Hao Wen, Weijun Wang, Xiangyu Li, Yizhen Yuan, Guohong Liu, Jiacheng Liu, Wenxing Xu, Xiang Wang, Yi Sun, Rui Kong, Yile Wang, Hanfei Geng, Jian Luan, Xuefeng Jin, Zi-Liang Ye, Guanjing Xiong, Fan Zhang, Xiang Li, Mengwei Xu, Zhijun Li, Peng Li, Yang Liu, Yaqiong Zhang, and Yunxin Liu. 2024. Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security. arXiv preprint arXiv:2401.05459 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"MoE-LLaVA: Mixture of Experts for Large Vision-Language Models. arXiv preprint arXiv:2401.15947","author":"Lin Bin","year":"2024","unstructured":"Bin Lin, Zhenyu Tang, Yang Ye, Jiaxi Cui, Bin Zhu, Peng Jin, Jinfa Huang, Junwu Zhang, Yatian Pang, Munan Ning, and Li Yuan. 2024. MoE-LLaVA: Mixture of Experts for Large Vision-Language Models. arXiv preprint arXiv:2401.15947 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"MobileLLM: Optimizing Sub-billion Parameter Language Models for On-Device Use Cases. arXiv preprint arXiv:2402.14905","author":"Liu Zechun","year":"2024","unstructured":"Zechun Liu, Changsheng Zhao, Forrest N. Iandola, Chen Lai, Yuandong Tian, Igor Fedorov, Yunyang Xiong, Ernie Chang, Yangyang Shi, Raghuraman Krishnamoorthi, Liangzhen Lai, and Vikas Chandra. 2024. MobileLLM: Optimizing Sub-billion Parameter Language Models for On-Device Use Cases. arXiv preprint arXiv:2402.14905 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design. arXiv preprint arXiv:1807.11164","author":"Ma Ningning","year":"2018","unstructured":"Ningning Ma, Xiangyu Zhang, Haitao Zheng, and Jian Sun. 2018. ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design. arXiv preprint arXiv:1807.11164 (2018)."},{"key":"e_1_3_2_1_38_1","volume-title":"PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods. https:\/\/github.com\/huggingface\/peft.","author":"Mangrulkar Sourab","year":"2022","unstructured":"Sourab Mangrulkar, Sylvain Gugger, Lysandre Debut, Younes Belkada, Sayak Paul, and Benjamin Bossan. 2022. PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods. https:\/\/github.com\/huggingface\/peft."},{"key":"e_1_3_2_1_39_1","unstructured":"Michael Matena and Colin Raffel. 2022. Merging models with fisher-weighted averaging. In Advances in Neural Information Processing Systems (NeurIPS). 17703--17716."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS). 1273--1282","author":"McMahan Brendan","year":"2017","unstructured":"Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise Aguera y Arcas. 2017. Communication-Efficient Learning of Deep Networks from Decentralized Data. In Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS). 1273--1282."},{"key":"e_1_3_2_1_41_1","unstructured":"Sean Huver Nigel Nelson and Mostafa Toloui. 2023. Deploy Large Language Models at the Edge with NVIDIA IGX Orin Developer Kit. https:\/\/developer.nvidia.com\/blog\/deploy-large-language-models-at-the-edge-with-nvidia-igx-orin-developer-kit\/."},{"key":"e_1_3_2_1_42_1","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/blog\/chatgpt\/."},{"key":"e_1_3_2_1_43_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). 973--994","author":"Padmanabhan Arthi","year":"2023","unstructured":"Arthi Padmanabhan, Neil Agarwal, Anand Iyer, Ganesh Ananthanarayanan, Yuanchao Shu, Nikolaos Karianakis, Guoqing Harry Xu, and Ravi Netravali. 2023. Gemel: Model Merging for Memory-Efficient, Real-Time Video Analytics at the Edge. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI). 973--994."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459199"},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Machine Learning (ICML). 18332--18346","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International Conference on Machine Learning (ICML). 18332--18346."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_48_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_49_1","volume-title":"Efficient LLM Inference on CPUs. arXiv preprint arXiv:2311.00502","author":"Shen Haihao","year":"2023","unstructured":"Haihao Shen, Hanwen Chang, Bo Dong, Yu Luo, and Hengyu Meng. 2023. Efficient LLM Inference on CPUs. arXiv preprint arXiv:2311.00502 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"JetMoE: Reaching Llama2 Performance with 0.1M Dollars. arXiv preprint arXiv:2404.07413","author":"Shen Yikang","year":"2024","unstructured":"Yikang Shen, Zhen Guo, Tianle Cai, and Zengyi Qin. 2024. JetMoE: Reaching Llama2 Performance with 0.1M Dollars. arXiv preprint arXiv:2404.07413 (2024)."},{"key":"e_1_3_2_1_51_1","unstructured":"Sidak Pal Singh and Martin Jaggi. 2020. Model fusion via optimal transport. In Advances in Neural Information Processing Systems (NeurIPS). 22045--22055."},{"key":"e_1_3_2_1_52_1","unstructured":"Virginia Smith Chao-Kai Chiang Maziar Sanjabi and Ameet Talwalkar. 2017. Federated Multi-Task Learning. In Advances in Neural Information Processing Systems (NeurIPS). 1--11."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML). 10096--10106","author":"Tan Mingxing","year":"2021","unstructured":"Mingxing Tan and Quoc Le. 2021. EfficientNetV2: Smaller Models and Faster Training. In Proceedings of the 38th International Conference on Machine Learning (ICML). 10096--10106."},{"key":"e_1_3_2_1_54_1","volume-title":"Le","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc V. Le. 2019. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. arXiv preprint arXiv:1905.11946 (2019)."},{"key":"e_1_3_2_1_55_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805","author":"Team Gemini","year":"2023","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Yonghui Wu, Jean-Baptiste Alayrac, Jiahui Yu, et al. 2023. Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_56_1","unstructured":"LLaMA-MoE Team. 2024. LLaMA-MoE: Building Mixture-of-Experts from LLaMA with Continual Pre-training. https:\/\/github.com\/pjlab-sys4nlp\/llama-moe\/blob\/main\/docs\/LLaMA_MoE.pdf."},{"key":"e_1_3_2_1_57_1","volume-title":"Baby Llama: knowledge distillation from an ensemble of teachers trained on a small dataset with no performance penalty. arXiv preprint arXiv:2308.02019","author":"Timiryasov Inar","year":"2023","unstructured":"Inar Timiryasov and Jean-Loup Tastet. 2023. Baby Llama: knowledge distillation from an ensemble of teachers trained on a small dataset with no performance penalty. arXiv preprint arXiv:2308.02019 (2023)."},{"key":"e_1_3_2_1_58_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In International Conference on Learning Representations (ICLR).","author":"Wang Alex","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. 2019. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_60_1","volume-title":"Federated Learning with Matched Averaging. In International Conference on Learning Representations (ICLR). 1--16","author":"Wang Hongyi","year":"2020","unstructured":"Hongyi Wang, Mikhail Yurochkin, Yuekai Sun, Dimitris S. Papailiopoulos, and Yasaman Khazaeni. 2020. Federated Learning with Matched Averaging. In International Conference on Learning Representations (ICLR). 1--16."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3592529"},{"key":"e_1_3_2_1_64_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (ICML). 38087--38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. In Proceedings of the 40th International Conference on Machine Learning (ICML). 38087--38099."},{"key":"e_1_3_2_1_65_1","volume-title":"One Student Knows All Experts Know: From Sparse to Dense. arXiv preprint arXiv:2201.10890","author":"Xue Fuzhao","year":"2022","unstructured":"Fuzhao Xue, Xiaoxin He, Xiaozhe Ren, Yuxuan Lou, and Yang You. 2022. One Student Knows All Experts Know: From Sparse to Dense. arXiv preprint arXiv:2201.10890 (2022)."},{"key":"e_1_3_2_1_66_1","volume-title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models. arXiv preprint arXiv:2308.14352","author":"Yi Rongjie","year":"2023","unstructured":"Rongjie Yi, Liwei Guo, Shiyun Wei, Ao Zhou, Shangguang Wang, and Mengwei Xu. 2023. EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models. arXiv preprint arXiv:2308.14352 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"LLM as a System Service on Mobile Devices. arXiv preprint arXiv:2403.11805","author":"Yin Wangsong","year":"2024","unstructured":"Wangsong Yin, Mengwei Xu, Yuanchun Li, and Xuanzhe Liu. 2024. LLM as a System Service on Mobile Devices. arXiv preprint arXiv:2403.11805 (2024)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649361"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","unstructured":"Zhengyan Zhang Yankai Lin Zhiyuan Liu Peng Li Maosong Sun and Jie Zhou. 2022. MoEfication: Transformer Feed-forward Layers are Mixtures of Experts. In Findings of the Association for Computational Linguistics (ACL). 877--890.","DOI":"10.18653\/v1\/2022.findings-acl.71"},{"key":"e_1_3_2_1_70_1","volume-title":"Learn To be Efficient: Build Structured Sparsity in Large Language Models. arXiv preprint arXiv:2402.06126","author":"Zheng Haizhong","year":"2024","unstructured":"Haizhong Zheng, Xiaoyan Bai, Beidi Chen, Fan Lai, and Atul Prakash. 2024. Learn To be Efficient: Build Structured Sparsity in Large Language Models. arXiv preprint arXiv:2402.06126 (2024)."},{"key":"e_1_3_2_1_71_1","volume-title":"PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (ATC). 489--504","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. 2022. PetS: A Unified Framework for Parameter-Efficient Transformers Serving. In 2022 USENIX Annual Technical Conference (ATC). 489--504."},{"key":"e_1_3_2_1_72_1","volume-title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models. arXiv preprint arXiv:2202.08906","author":"Zoph Barret","year":"2022","unstructured":"Barret Zoph, Irwan Bello, Sameer Kumar, Nan Du, Yanping Huang, Jeff Dean, Noam M. Shazeer, and William Fedus. 2022. ST-MoE: Designing Stable and Transferable Sparse Expert Models. arXiv preprint arXiv:2202.08906 (2022)."}],"event":{"name":"SenSys '24: 22nd ACM Conference on Embedded Networked Sensor Systems","location":"Hangzhou China","acronym":"SenSys '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 22nd ACM Conference on Embedded Networked Sensor Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3666025.3699355","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3666025.3699355","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:10Z","timestamp":1750295890000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3666025.3699355"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":71,"alternative-id":["10.1145\/3666025.3699355","10.1145\/3666025"],"URL":"https:\/\/doi.org\/10.1145\/3666025.3699355","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}