{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T15:51:11Z","timestamp":1778860271733,"version":"3.51.4"},"reference-count":101,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFB4502400"],"award-info":[{"award-number":["2023YFB4502400"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"China NSF","award":["62322206"],"award-info":[{"award-number":["62322206"]}]},{"name":"China NSF","award":["62432007"],"award-info":[{"award-number":["62432007"]}]},{"name":"China NSF","award":["62441236"],"award-info":[{"award-number":["62441236"]}]},{"name":"China NSF","award":["62132018"],"award-info":[{"award-number":["62132018"]}]},{"name":"China NSF","award":["62025204"],"award-info":[{"award-number":["62025204"]}]},{"name":"China NSF","award":["U2268204"],"award-info":[{"award-number":["U2268204"]}]},{"name":"China NSF","award":["62272307"],"award-info":[{"award-number":["62272307"]}]},{"name":"China NSF","award":["62372296"],"award-info":[{"award-number":["62372296"]}]},{"name":"Science and Technology Innovation 2030 \u2013New Generation Artificial Intelligence Major","award":["2022ZD0119100"],"award-info":[{"award-number":["2022ZD0119100"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Netw."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/ton.2026.3658387","type":"journal-article","created":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:58:44Z","timestamp":1769633924000},"page":"3687-3703","source":"Crossref","is-referenced-by-count":1,"title":["Resource-Efficient LLM Customization on Mobile Devices Through Proxy Submodel Tuning"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0702-5093","authenticated-orcid":false,"given":"Yan","family":"Zhuang","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Key Laboratory of Scalable Computing and Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0333-6418","authenticated-orcid":false,"given":"Chen","family":"Gong","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Key Laboratory of Scalable Computing and Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3447-5349","authenticated-orcid":false,"given":"Zhenzhe","family":"Zheng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Key Laboratory of Scalable Computing and Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0965-9058","authenticated-orcid":false,"given":"Fan","family":"Wu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Key Laboratory of Scalable Computing and Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6934-1685","authenticated-orcid":false,"given":"Guihai","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Key Laboratory of Scalable Computing and Systems, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"ChatGPT","year":"2022"},{"key":"ref2","article-title":"Personal LLM agents: Insights and survey about the capability, efficiency and security","author":"Li","year":"2024","journal-title":"arXiv:2401.05459"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"ref4","volume-title":"Google: Get Started With Gemini Nano on Android (on Device)","year":"2024"},{"key":"ref5","volume-title":"Introducing Apple\u2019s On-Device and Server Foundation Models","year":"2024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649361"},{"key":"ref7","first-page":"32431","article-title":"MobileLLM: Optimizing sub-billion parameter language models for on-device use cases","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Liu"},{"key":"ref8","article-title":"LLM as a system service on mobile devices","author":"Yin","year":"2024","journal-title":"arXiv:2403.11805"},{"key":"ref9","volume-title":"Aicore","year":"2024"},{"key":"ref10","article-title":"Mixtral of experts","author":"Jiang","year":"2024","journal-title":"arXiv:2401.04088"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2025.3546466"},{"key":"ref12","article-title":"SwapMoE: Serving off-the-shelf MoE-based large language models with tunable memory budget","author":"Kong","year":"2023","journal-title":"arXiv:2308.15030"},{"key":"ref13","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Houlsby"},{"key":"ref14","first-page":"489","article-title":"PetS: A unified framework for Parameter-Efficient transformers serving","volume-title":"Proc. USENIX Annu. Tech. Conf. (ATC)","author":"Zhou"},{"key":"ref15","volume-title":"Peft: State-of-the-art Parameter-Efficient Fine-Tuning Methods","author":"Mangrulkar","year":"2022"},{"key":"ref16","first-page":"1","article-title":"Deep compression: Compressing deep neural network with pruning, trained quantization and Huffman coding","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Han"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1503.02531"},{"key":"ref18","article-title":"One student knows all experts know: From sparse to dense","author":"Xue","year":"2022","journal-title":"arXiv:2201.10890"},{"key":"ref19","article-title":"QMoE: Practical sub-1-bit compression of trillion-parameter models","author":"Frantar","year":"2023","journal-title":"arXiv:2310.16795"},{"key":"ref20","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017","journal-title":"arXiv:1701.06538"},{"key":"ref21","first-page":"5232","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref22","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref23","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref24","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref25","volume-title":"Gemma: Introducing New State-of-the-art Open Models","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.conll-babylm.24"},{"key":"ref27","volume-title":"Snapdragon 8 Gen 3 Mobile Platform Product Brief","year":"2024"},{"key":"ref28","volume-title":"Deploy Large Language Models at the Edge With NVIDIA IGX Orin Developer Kit","author":"Nigel Nelson","year":"2023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-85747-8_3"},{"key":"ref30","first-page":"1","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Represen.","author":"Hu"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00626-4"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3592505"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref35","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv:2001.08361"},{"key":"ref36","first-page":"5547","article-title":"GLaM: Efficient scaling of language models with mixture-of-experts","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"162","author":"Du"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1287"},{"key":"ref38","first-page":"1273","article-title":"Communication-efficient learning of deep networks from decentralized data","volume-title":"Proc. 20th Int. Conf. Artif. Intell. Statist.","volume":"54","author":"McMahan"},{"key":"ref39","article-title":"ST-MoE: Designing stable and transferable sparse expert models","author":"Zoph","year":"2022","journal-title":"arXiv:2202.08906"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/6"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2016.2527003"},{"key":"ref42","first-page":"1","article-title":"Federated multi-task learning","volume-title":"Proc. NIPS","author":"Smith"},{"key":"ref43","first-page":"973","article-title":"GEMEL: Model merging for memory-efficient, real-time video analytics at the edge","volume-title":"Proc. 20th USENIX Symp. Networked Syst. Design Implement. (NSDI)","author":"Padmanabhan"},{"key":"ref44","first-page":"1","article-title":"Federated learning with matched averaging","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Wang"},{"key":"ref45","first-page":"22045","article-title":"Model fusion via optimal transport","volume-title":"Proc. NIPS","volume":"33","author":"Singh"},{"key":"ref46","first-page":"2292","article-title":"Sinkhorn distances: Lightspeed computation of optimal transport","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"26","author":"Cuturi"},{"key":"ref47","first-page":"769","article-title":"ModelKeeper: Accelerating DNN training via automated training warmup","volume-title":"Proc. 20th USENIX Symp. Networked Syst. Design Implement. (NSDI)","author":"Lai"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220007"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3690701"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3613297"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.388"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"ref53","volume-title":"Pytorch","year":"2024"},{"key":"ref54","volume-title":"Huggingface Models","year":"2024"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/w18-5446"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.13"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459199"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0441"},{"key":"ref60","article-title":"SlimMoE: Structured compression of large MoE models via expert slimming and distillation","author":"Li","year":"2025","journal-title":"arXiv:2506.18349"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1206"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5409"},{"key":"ref63","article-title":"JetMoE: Reaching Llama2 performance with 0.1M dollars","author":"Shen","year":"2024","journal-title":"arXiv:2404.07413"},{"key":"ref64","first-page":"1","article-title":"GShard: Scaling giant models with conditional computation and automatic sharding","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Lepikhin"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"ref66","volume-title":"Llama-Moe: Building Mixture-of-Experts From Llama With Continual Pre-Training","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.71"},{"key":"ref68","article-title":"MoE-LLaVA: Mixture of experts for large vision-language models","author":"Lin","year":"2024","journal-title":"arXiv:2401.15947"},{"key":"ref69","article-title":"Learn to be efficient: Build structured sparsity in large language models","author":"Zheng","year":"2024","journal-title":"arXiv:2402.06126"},{"key":"ref70","article-title":"Task-specific expert pruning for sparse mixture-of-experts","author":"Chen","year":"2022","journal-title":"arXiv:2206.00277"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.304"},{"key":"ref72","first-page":"18332","article-title":"DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Rajbhandari"},{"key":"ref73","article-title":"Fast inference of mixture-of-experts language models with offloading","author":"Eliseev","year":"2023","journal-title":"arXiv:2312.17238"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.363"},{"key":"ref76","first-page":"7103","article-title":"Mixture-of-experts with expert choice routing","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Zhou"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.334"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.361"},{"key":"ref79","article-title":"Delta tuning: A comprehensive study of parameter efficient methods for pre-trained language models","author":"Ding","year":"2022","journal-title":"arXiv:2203.06904"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_41"},{"key":"ref81","first-page":"12991","article-title":"LST: Ladder side-tuning for parameter and memory efficient transfer learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Sung"},{"key":"ref82","first-page":"53038","article-title":"Fine-tuning language models with just forward passes","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Malladi"},{"key":"ref83","first-page":"61121","article-title":"GaLore: Memory-efficient LLM training by gradient low-rank projection","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Zhao"},{"key":"ref84","first-page":"29768","article-title":"SparseLoRA: Accelerating LLM fine-tuning with contextual sparsity","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Samir"},{"key":"ref85","first-page":"1","article-title":"QA-LoRA: Quantization-aware low-rank adaptation of large language models","volume-title":"Proc. 12th Int. Conf. Learn. Represent.","author":"Xu"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.1"},{"key":"ref87","first-page":"1","article-title":"LoftQ: LoRA-fine-tuning-aware quantization for large language models","volume-title":"Proc. 12th Int. Conf. Learn. Represent.","author":"Li"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1569"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1168"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3658473"},{"key":"ref91","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"Tan","year":"2019","journal-title":"arXiv:1905.11946"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref94","first-page":"10096","article-title":"EfficientNetV2: Smaller models and faster training","volume-title":"Proc. 38th Int. Conf. Mach. Learn. (ICML)","author":"Tan"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref96","first-page":"38087","article-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Xiao"},{"key":"ref97","first-page":"1135","article-title":"Learning both weights and connections for efficient neural network","volume-title":"Proc. NIPS","author":"Han"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3592529"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3447993.3483249"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1145\/3241539.3241559"},{"key":"ref101","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume-title":"Proc. Mach. Learn. Syst. (MLSys)","author":"Korthikanti"}],"container-title":["IEEE Transactions on Networking"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10723154\/11317935\/11367102.pdf?arnumber=11367102","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T20:42:28Z","timestamp":1772743348000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11367102\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":101,"URL":"https:\/\/doi.org\/10.1109\/ton.2026.3658387","relation":{},"ISSN":["2998-4157"],"issn-type":[{"value":"2998-4157","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}