{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T19:14:13Z","timestamp":1774120453228,"version":"3.50.1"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T00:00:00Z","timestamp":1716163200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T00:00:00Z","timestamp":1716163200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,20]]},"DOI":"10.1109\/infocom52122.2024.10621327","type":"proceedings-article","created":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T17:25:41Z","timestamp":1723483541000},"page":"1880-1889","source":"Crossref","is-referenced-by-count":9,"title":["Parm: Efficient Training of Large Sparsely-Activated Models with Dedicated Schedules"],"prefix":"10.1109","author":[{"given":"Xinglin","family":"Pan","sequence":"first","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou),Data Science and Analytics Thrust"}]},{"given":"Wenxiang","family":"Lin","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology,School of Computer Science and Technology,Shenzhen"}]},{"given":"Shaohuai","family":"Shi","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology,School of Computer Science and Technology,Shenzhen"}]},{"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou),Data Science and Analytics Thrust"}]},{"given":"Weinong","family":"Sun","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology,Department of Computer Science and Engineering"}]},{"given":"Bo","family":"Li","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology,Department of Computer Science and Engineering"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"issue":"8","key":"ref2","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"issue":"240","key":"ref4","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref5","article-title":"Palm-e: An embodied multimodal language model","author":"Driess","year":"2023"},{"key":"ref6","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","volume-title":"International Conference on Learning Representations","author":"Shazeer"},{"key":"ref7","article-title":"Gshard: Scaling giant models with conditional computation and automatic sharding","volume-title":"International Conference on Learning Representations","author":"Lepikhin"},{"issue":"1","key":"ref8","first-page":"5232","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"The Journal of Machine Learning Research"},{"key":"ref9","first-page":"1","article-title":"Efficient large-scale language model training on GPU clusters using Megatron-LM","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Narayanan"},{"key":"ref10","article-title":"Large scale distributed deep networks","volume":"25","author":"Dean","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref12","first-page":"18 332","article-title":"Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale","volume-title":"International Conference on Machine Learning","author":"Rajbhandari"},{"key":"ref13","article-title":"Tutel: Adaptive mixture-of-experts at scale","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Hwang"},{"key":"ref14","first-page":"13 782","article-title":"Gating dropout: Communication-efficient regularization for sparsely activated transformers","volume-title":"International Conference on Machine Learning","author":"Liu"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508417"},{"key":"ref16","article-title":"Hetumoe: An effi-cient trillion-scale mixture-of-expert distributed training system","author":"Nie","year":"2022"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563487"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM53939.2023.10228874"},{"key":"ref19","first-page":"945","article-title":"Accelerating distributed {MoE} training and inference with lina","volume-title":"USENIX Annual Technical Conference","author":"Li"},{"key":"ref20","first-page":"559","article-title":"Alpa: Automating inter-and Intra-Operator} parallelism for distributed deep learning","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation","author":"Zheng"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/SHPCC.1994.296665"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24685-5_1"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/505202.505215"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356222"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"key":"ref26","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of NAACL-HLT","author":"Kenton"},{"key":"ref27","first-page":"6265","article-title":"BASE layers: Simplifying training of large, sparse models","volume-title":"International Conference on Machine Learning","author":"Lewis"},{"key":"ref28","article-title":"Mixture-of-experts with expert choice routing","author":"Zhou","year":"2022"},{"key":"ref29","article-title":"SComoe: Efficient mixtures of experts with structured communication","volume-title":"International Conference on Learning Representations","author":"Zeng"},{"key":"ref30","first-page":"11","article-title":"Pangu-\u03a3: Towards trillion parameter language model with sparse heterogeneous computing","volume":"10","author":"Ren","year":"2023"},{"key":"ref31","article-title":"Taming sparsely activated transformer with stochastic experts","volume-title":"International Conference on Learning Representations","author":"Zuo"},{"key":"ref32","first-page":"13 782","article-title":"Gating dropout: Communication-efficient regularization for sparsely activated transformers","volume-title":"International Conference on Learning Representations","author":"Liu"},{"key":"ref33","article-title":"The lazy neuron phenomenon: On emergence of activation sparsity in transformers","volume-title":"International Conference on Learning Representations","author":"Li"},{"key":"ref34","article-title":"Doubling all2all performance with nvidia collective communication library 2.12"},{"key":"ref35","article-title":"SE-MoE: A scalable and efficient mixture-of-experts distributed training and inference system","author":"Shen","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref37","first-page":"961","article-title":"{SmartMoE} : Efficiently training {Sparsely-Activated} models through combining offline and online parallelization","volume-title":"USENIX Annual Technical Conference","author":"Zhai"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"}],"event":{"name":"IEEE INFOCOM 2024 - IEEE Conference on Computer Communications","location":"Vancouver, BC, Canada","start":{"date-parts":[[2024,5,20]]},"end":{"date-parts":[[2024,5,23]]}},"container-title":["IEEE INFOCOM 2024 - IEEE Conference on Computer Communications"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10621050\/10621073\/10621327.pdf?arnumber=10621327","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,13]],"date-time":"2024-08-13T05:17:05Z","timestamp":1723526225000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10621327\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,20]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/infocom52122.2024.10621327","relation":{},"subject":[],"published":{"date-parts":[[2024,5,20]]}}}