{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T10:21:25Z","timestamp":1766053285128,"version":"3.48.0"},"reference-count":84,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T00:00:00Z","timestamp":1762128000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T00:00:00Z","timestamp":1762128000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001321","name":"National Research Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001321","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003661","name":"Korea Institute for Advancement of Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003661","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,11,3]]},"DOI":"10.1109\/pact65351.2025.00015","type":"proceedings-article","created":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T18:30:30Z","timestamp":1765909830000},"page":"30-42","source":"Crossref","is-referenced-by-count":0,"title":["ScaleMoE: A Fast and Scalable Distributed Training Framework for Large-Scale Mixture-of-Experts Models"],"prefix":"10.1109","author":[{"given":"Seohong","family":"Choi","sequence":"first","affiliation":[{"name":"Sungkyunkwan University,Suwon,South Korea"}]},{"given":"Huize","family":"Hong","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University,Suwon,South Korea"}]},{"given":"Tae Hee","family":"Han","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University,Suwon,South Korea"}]},{"given":"Joonsung","family":"Kim","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University,Suwon,South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9891914"},{"key":"ref2","article-title":"Machine learning model sizes and the parameter gap","author":"Villalobos","year":"2022","journal-title":"arXiv preprint arXiv:2207.02852"},{"key":"ref3","article-title":"Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model","volume":"20","author":"Smith","journal-title":"arXiv preprint arXiv:2201.11990"},{"key":"ref4","article-title":"Deepvit: Towards deeper vision transformer","author":"Zhou","year":"2021","journal-title":"arXiv preprint arXiv:2103.11886"},{"key":"ref5","first-page":"5958","article-title":"Train big, then compress: Rethinking model size for efficient training and inference of transformers","volume-title":"International Conference on machine learning.","author":"Li"},{"key":"ref6","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv preprint arXiv:1909.08053"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"ref8","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017","journal-title":"arXiv preprint arXiv:1701.06538"},{"key":"ref9","article-title":"Mixture-of-experts meets instruction tuning: A winning combination for large language models","author":"Shen","year":"2023","journal-title":"arXiv preprint arXiv:2305.14705"},{"key":"ref10","article-title":"Mixtral of experts","author":"Jiang","year":"2024","journal-title":"arXiv preprint arXiv:2401.04088"},{"key":"ref11","article-title":"Jamba: A hybrid transformer-mamba language model","author":"Lieber","year":"2024","journal-title":"arXiv preprint arXiv:2403.19887"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/979-8-8688-0444-1_13"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"ref14","article-title":"Moma: Efficient early-fusion pre-training with mixture of modality-aware experts","author":"Lin","year":"2024","journal-title":"arXiv preprint arXiv:2407.21770"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3588964"},{"issue":"120","key":"ref16","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"Journal of Machine Learning Research"},{"key":"ref17","article-title":"Dense training, sparse inference: Rethinking training of mixture-of-experts language models","author":"Pan","year":"2024","journal-title":"arXiv preprint arXiv:2404.05567"},{"key":"ref18","article-title":"Skywork-moe: A deep dive into training techniques for mixture-of-experts language models","author":"Wei","year":"2024","journal-title":"arXiv preprint arXiv:2406.06563"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.890"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3385639"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"ref22","article-title":"Moe-infinity: Activationaware expert offloading for efficient moe serving","author":"Xue","year":"2024","journal-title":"arXiv preprint arXiv:2401.14361"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM53939.2023.10228874"},{"year":"2024","key":"ref24","article-title":"DeepSpeed"},{"year":"2024","key":"ref25","article-title":"Amazon elastic compute cloud"},{"key":"ref26","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Improving language understanding by generative pretraining","year":"2018","author":"Radford","key":"ref27"},{"key":"ref28","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint arXiv:1810.04805"},{"issue":"140","key":"ref29","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of machine learning research"},{"key":"ref30","article-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"Lewis","year":"2019","journal-title":"arXiv preprint arXiv:1910.13461"},{"issue":"8","key":"ref31","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref32","article-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019","journal-title":"arXiv preprint arXiv:1907.11692"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.804"},{"key":"ref34","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref35","article-title":"Visualbert: A simple and performant baseline for vision and language","author":"Li","year":"2019","journal-title":"arXiv preprint arXiv:1908.03557"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref37","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume":"34","author":"Riquelme","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref38","first-page":"28 441","article-title":"M3 vit: Mixture-of-experts vision transformer for efficient multitask learning with model-accelerator co-design","volume":"35","author":"Fan","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref40","article-title":"Graph transformer networks","volume":"32","author":"Yun","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref41","article-title":"A generalization of transformer networks to graphs","author":"Dwivedi","year":"2020","journal-title":"arXiv preprint arXiv:2012.09699"},{"key":"ref42","article-title":"Transformer for graphs: An overview from architecture perspective","author":"Min","year":"2022","journal-title":"arXiv preprint arXiv:2202.08455"},{"key":"ref43","article-title":"Opt: Open pre-trained transformer language models","author":"Zhang","year":"2022","journal-title":"arXiv preprint arXiv:2205.01068"},{"article-title":"Bigscience large open-science open-access multilingual language model","year":"2022","author":"AI","key":"ref44"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3152828"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3516946"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"ref48","article-title":"Proprophet: Systematic load balancing method for efficient parallel training of large-scale moe models","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv:2411.10003"},{"key":"ref49","article-title":"Locmoe: A low-overhead moe for large language model training","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv:2401.13920"},{"key":"ref50","article-title":"Auxiliary-lossfree load balancing strategy for mixture-of-experts","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv:2408.15664"},{"article-title":"Gshard: Scaling giant models with conditional computation and automatic sharding","volume-title":"International Conference on Learning Representations","author":"Lepikhin","key":"ref51"},{"key":"ref52","first-page":"269","article-title":"Tutel: Adaptive mixture-of-experts at scale","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Hwang"},{"key":"ref53","first-page":"6265","article-title":"Base layers: Simplifying training of large, sparse models","volume-title":"International Conference on Machine Learning.","author":"Lewis"},{"key":"ref54","article-title":"Expertflow: Optimized expert activation and token allocation for efficient mixture-of-experts inference","author":"He","year":"2024","journal-title":"arXiv preprint arXiv:2410.17954"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3300"},{"key":"ref56","first-page":"288","article-title":"Megablocks: Efficient sparse training with mixture-of-experts","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Gale"},{"key":"ref57","article-title":"Deepseek-v3 technical report","volume-title":"arXiv preprint arXiv:2412.19437","author":"Liu","year":"2024"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"ref59","article-title":"Openmoe: An early effort on open mixture-of-experts language models","author":"Xue","year":"2024","journal-title":"arXiv preprint arXiv:2402.01739"},{"key":"ref60","article-title":"St-moe: Designing stable and transferable sparse expert models","author":"Zoph","year":"2022","journal-title":"arXiv preprint arXiv:2202.08906"},{"key":"ref61","first-page":"5547","article-title":"Glam: Efficient scaling of language models with mixture-of-experts","volume-title":"International Conference on Machine Learning.","author":"Du"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.1115"},{"key":"ref63","article-title":"Olmoe: Open mixture-of-experts language models","author":"Muennighoff","year":"2024","journal-title":"arXiv preprint arXiv:2409.02060"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895480"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s00450-011-0157-1"},{"key":"ref67","article-title":"Ultra ethernet consortium: Optimizing ethernet for ai and hpc workloads","volume":"20","year":"2024","journal-title":"Linux Foundation"},{"year":"2025","key":"ref68","article-title":"Amazon EC2 Instance Network Bandwidth"},{"journal-title":"IEEE Standard for Ethernet (IEEE 802.3)","year":"2018","key":"ref69"},{"key":"ref70","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650083"},{"key":"ref72","first-page":"945","article-title":"Accelerating distributed MoE training and inference with lina","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC","volume":"23","author":"Li"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604869"},{"article-title":"Netmoe: Accelerating moe training through dynamic sample placement","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Liu","key":"ref74"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2670"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392771"},{"year":"2024","key":"ref77","article-title":"Collective Communications Library (NCCL)"},{"key":"ref78","first-page":"593","article-title":"TACCL: Guiding collective algorithm synthesis using communication sketches","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Shah"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575724"},{"year":"2025","key":"ref80","article-title":"Huawei collective communication library (hccl) documentation"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472904"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126456"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747408"},{"article-title":"Efficient distributed training through gradient compression with sparsification and quantization techniques","year":"2021","author":"Aragani","key":"ref84"}],"event":{"name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","start":{"date-parts":[[2025,11,3]]},"location":"Irvine, CA, USA","end":{"date-parts":[[2025,11,6]]}},"container-title":["2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11282903\/11282500\/11282919.pdf?arnumber=11282919","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T10:18:21Z","timestamp":1766053101000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11282919\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,3]]},"references-count":84,"URL":"https:\/\/doi.org\/10.1109\/pact65351.2025.00015","relation":{},"subject":[],"published":{"date-parts":[[2025,11,3]]}}}