{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:53:10Z","timestamp":1777063990551,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3803587","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"675-692","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MegaScale-Omni: A Hyper-Scale, Workload-Resilient System for MultiModal LLM Training in Production"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9272-1732","authenticated-orcid":false,"given":"Chunyu","family":"Xue","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6682-1783","authenticated-orcid":false,"given":"Yangrui","family":"Chen","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8684-8509","authenticated-orcid":false,"given":"Jianyu","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1147-6984","authenticated-orcid":false,"given":"Ningxin","family":"Zheng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3664-6615","authenticated-orcid":false,"given":"Junda","family":"Feng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4135-4684","authenticated-orcid":false,"given":"Jingji","family":"Chen","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1643-2583","authenticated-orcid":false,"given":"Shixiong","family":"Zhao","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3006-8217","authenticated-orcid":false,"given":"Shen","family":"Yan","sequence":"additional","affiliation":[{"name":"Bytedance seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6591-5863","authenticated-orcid":false,"given":"Yi","family":"Lin","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2392-5300","authenticated-orcid":false,"given":"Lei","family":"Shi","sequence":"additional","affiliation":[{"name":"Bytedance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0037-7720","authenticated-orcid":false,"given":"Zanbo","family":"Wang","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0313-9423","authenticated-orcid":false,"given":"Lishu","family":"Luo","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4256-3277","authenticated-orcid":false,"given":"Faming","family":"Wu","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4879-5335","authenticated-orcid":false,"given":"Haibin","family":"Lin","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3989-4358","authenticated-orcid":false,"given":"Yanghua","family":"Peng","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8346-3323","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Seed, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2022. Nvidia Transformer Engine. https:\/\/github.com\/NVIDIA\/TransformerEngine."},{"key":"e_1_3_2_1_2_1","unstructured":"2023. InternLM Technical Report. https:\/\/github.com\/InternLM\/InternLM-techreport."},{"key":"e_1_3_2_1_3_1","unstructured":"2025. Gemini 2.5 Flash. https:\/\/deepmind.google\/models\/gemini\/flash\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2025. Nvidia Context Parallel Package. https:\/\/docs.nvidia.com\/megatron-core\/developer-guide\/latest\/api-guide\/context_parallel.html."},{"key":"e_1_3_2_1_5_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang Humen Zhong Yuanzhi Zhu Mingkun Yang Zhaohai Li Jianqiang Wan Pengfei Wang Wei Ding Zheren Fu Yiheng Xu Jiabo Ye Xi Zhang Tianbao Xie Zesen Cheng Hang Zhang Zhibo Yang Haiyang Xu and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv:2502.13923 [cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1140\/epjb\/e2008-00320-9"},{"key":"e_1_3_2_1_7_1","unstructured":"Florian Bordes Richard Yuanzhe Pang Anurag Ajay Alexander C. Li Adrien Bardes Suzanne Petryk Oscar Ma\u00f1as Zhiqiu Lin Anas Mahmoud Bargav Jayaraman Mark Ibrahim Melissa Hall Yunyang Xiong Jonathan Lebensold Candace Ross Srihari Jayakumar Chuan Guo Diane Bouchacourt Haider Al-Tahan Karthik Padthe Vasu Sharma Hu Xu Xiaoqing Ellen Tan Megan Richards Samuel Lavoie Pietro Astolfi Reyhane Askari Hemmat Jun Chen Kushal Tirumala Rim Assouel Mazda Moayeri Arjang Talattof Kamalika Chaudhuri Zechun Liu Xilun Chen Quentin Garrido Karen Ullrich Aishwarya Agrawal Kate Saenko Asli Celikyilmaz and Vikas Chandra. 2024. An Introduction to Vision-Language Modeling. arXiv:2405.17247 [cs.LG] https:\/\/arxiv.org\/abs\/2405.17247"},{"key":"e_1_3_2_1_8_1","unstructured":"Zheng Cai Maosong Cao Haojiong Chen Kai Chen Keyu Chen Xin Chen Xun Chen Zehui Chen Zhi Chen Pei Chu Xiaoyi Dong Haodong Duan Qi Fan Zhaoye Fei Yang Gao Jiaye Ge Chenya Gu Yuzhe Gu Tao Gui Aijia Guo Qipeng Guo Conghui He Yingfan Hu Ting Huang Tao Jiang Penglong Jiao Zhenjiang Jin Zhikai Lei Jiaxing Li Jingwen Li Linyang Li Shuaibin Li Wei Li Yining Li Hongwei Liu Jiangning Liu Jiawei Hong Kaiwen Liu Kuikun Liu Xiaoran Liu Chengqi Lv Haijun Lv Kai Lv Li Ma Runyuan Ma Zerun Ma Wenchang Ning Linke Ouyang Jiantao Qiu Yuan Qu Fukai Shang Yunfan Shao Demin Song Zifan Song Zhihao Sui Peng Sun Yu Sun Huanze Tang Bin Wang Guoteng Wang Jiaqi Wang Jiayu Wang Rui Wang Yudong Wang Ziyi Wang Xingjian Wei Qizhen Weng Fan Wu Yingtong Xiong Chao Xu Ruiliang Xu Hang Yan Yirong Yan Xiaogui Yang Haochen Ye Huaiyuan Ying Jia Yu Jing Yu Yuhang Zang Chuyu Zhang Li Zhang Pan Zhang Peng Zhang Ruijie Zhang Shuo Zhang Songyang Zhang Wenjian Zhang Wenwei Zhang Xingcheng Zhang Xinyue Zhang Hui Zhao Qian Zhao Xiaomeng Zhao Fengzhe Zhou Zaida Zhou Jingming Zhuo Yicheng Zou Xipeng Qiu Yu Qiao and Dahua Lin. 2024. InternLM2 Technical Report. arXiv:2403.17297 [cs.CL] https:\/\/arxiv.org\/abs\/2403.17297"},{"key":"e_1_3_2_1_9_1","volume-title":"FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arXiv preprint arXiv:2406.06858","author":"Chang Liwen","year":"2024","unstructured":"Liwen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Ziheng Jiang, Haibin Lin, et al. 2024. FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arXiv preprint arXiv:2406.06858 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. arXiv:2307.08691 [cs.LG] https:\/\/arxiv.org\/abs\/2307.08691"},{"key":"e_1_3_2_1_11_1","unstructured":"Mostafa Dehghani Basil Mustafa Josip Djolonga Jonathan Heek Matthias Minderer Mathilde Caron Andreas Steiner Joan Puigcerver Robert Geirhos Ibrahim Alabdulmohsin Avital Oliver Piotr Padlewski Alexey Gritsenko Mario Lu\u010di\u0107 and Neil Houlsby. 2023. Patch n' Pack: NaViT a Vision Transformer for any Aspect Ratio and Resolution. arXiv:2307.06304 [cs.CV] https:\/\/arxiv.org\/abs\/2307.06304"},{"key":"e_1_3_2_1_12_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv:2010.11929 [cs.CV] https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_2_1_13_1","volume-title":"Optimus: Accelerating Large-Scale Multi-Modal LLM Training by Bubble Exploitation. arXiv preprint arXiv:2408.03505","author":"Feng Weiqi","year":"2024","unstructured":"Weiqi Feng, Yangrui Chen, Shaoyu Wang, Yanghua Peng, Haibin Lin, and Minlan Yu. 2024. Optimus: Accelerating Large-Scale Multi-Modal LLM Training by Bubble Exploitation. arXiv preprint arXiv:2408.03505 (2024)."},{"key":"e_1_3_2_1_14_1","unstructured":"Yu Gao Lixue Gong Qiushan Guo Xiaoxia Hou Zhichao Lai Fanshi Li Liang Li Xiaochen Lian Chao Liao Liyang Liu Wei Liu Yichun Shi Shiqi Sun Yu Tian Zhi Tian Peng Wang Rui Wang Xuanda Wang Xun Wang Ye Wang Guofeng Wu Jie Wu Xin Xia Xuefeng Xiao Zhonghua Zhai Xinyu Zhang Qi Zhang Yuwei Zhang Shijia Zhao Jianchao Yang and Weilin Huang. 2025. Seedream 3.0 Technical Report. arXiv:2504.11346 [cs.CV] https:\/\/arxiv.org\/abs\/2504.11346"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Hao Ge Junda Feng Qi Huang Fangcheng Fu Xiaonan Nie Lei Zuo Haibin Lin Bin Cui and Xin Liu. 2025. ByteScale: Efficient Scaling of LLM Training with a 2048K Context Length on More Than 12 000 GPUs. arXiv:2502.21231 [cs.DC] https:\/\/arxiv.org\/abs\/2502.21231","DOI":"10.1145\/3718958.3754352"},{"key":"e_1_3_2_1_16_1","unstructured":"Dong Guo Faming Wu Feida Zhu Fuxing Leng Guang Shi Haobin Chen Haoqi Fan Jian Wang Jianyu Jiang Jiawei Wang Jingji Chen Jingjia Huang Kang Lei Liping Yuan Lishu Luo Pengfei Liu Qinghao Ye Rui Qian Shen Yan Shixiong Zhao Shuai Peng Shuangye Li Sihang Yuan Sijin Wu Tianheng Cheng Weiwei Liu Wenqian Wang Xianhan Zeng Xiao Liu Xiaobo Qin Xiaohan Ding Xiaojun Xiao Xiaoying Zhang Xuanwei Zhang Xuehan Xiong Yanghua Peng Yangrui Chen Yanwei Li Yanxu Hu Yi Lin Yiyuan Hu Yiyuan Zhang Youbin Wu Yu Li Yudong Liu Yue Ling Yujia Qin Zanbo Wang Zhiwu He Aoxue Zhang Bairen Yi Bencheng Liao Can Huang Can Zhang Chaorui Deng Chaoyi Deng Cheng Lin Cheng Yuan Chenggang Li Chenhui Gou Chenwei Lou Chengzhi Wei Chundian Liu Chunyuan Li Deyao Zhu Donghong Zhong Feng Li Feng Zhang Gang Wu Guodong Li Guohong Xiao Haibin Lin Haihua Yang Haoming Wang Heng Ji Hongxiang Hao Hui Shen Huixia Li Jiahao Li Jialong Wu Jianhua Zhu Jianpeng Jiao Jiashi Feng Jiaze Chen Jianhui Duan Jihao Liu Jin Zeng Jingqun Tang Jingyu Sun Joya Chen Jun Long Junda Feng Junfeng Zhan Junjie Fang Junting Lu Kai Hua Kai Liu Kai Shen Kaiyuan Zhang Ke Shen Ke Wang Keyu Pan Kun Zhang Kunchang Li Lanxin Li Lei Li Lei Shi Li Han Liang Xiang Liangqiang Chen Lin Chen Lin Li Lin Yan Liying Chi Longxiang Liu Mengfei Du Mingxuan Wang Ningxin Pan Peibin Chen Pengfei Chen Pengfei Wu Qingqing Yuan Qingyao Shuai Qiuyan Tao Renjie Zheng Renrui Zhang Ru Zhang Rui Wang Rui Yang Rui Zhao Shaoqiang Xu Shihao Liang Shipeng Yan Shu Zhong Shuaishuai Cao Shuangzhi Wu Shufan Liu Shuhan Chang Songhua Cai Tenglong Ao Tianhao Yang Tingting Zhang Wanjun Zhong Wei Jia Wei Weng Weihao Yu Wenhao Huang Wenjia Zhu Wenli Yang Wenzhi Wang Xiang Long XiangRui Yin Xiao Li Xiaolei Zhu Xiaoying Jia Xijin Zhang Xin Liu Xinchen Zhang Xinyu Yang Xiongcai Luo Xiuli Chen Xuantong Zhong Xuefeng Xiao Xujing Li Yan Wu Yawei Wen Yifan Du Yihao Zhang Yining Ye Yonghui Wu Yu Liu Yu Yue Yufeng Zhou Yufeng Yuan Yuhang Xu Yuhong Yang Yun Zhang Yunhao Fang Yuntao Li Yurui Ren Yuwen Xiong Zehua Hong Zehua Wang Zewei Sun Zeyu Wang Zhao Cai Zhaoyue Zha Zhecheng An Zhehui Zhao Zhengzhuo Xu Zhipeng Chen Zhiyong Wu Zhuofan Zheng Zihao Wang Zilong Huang Ziyu Zhu and Zuquan Song. 2025. Seed1.5-VL Technical Report. arXiv:2505.07062 [cs.CV] https:\/\/arxiv.org\/abs\/2505.07062"},{"key":"e_1_3_2_1_17_1","unstructured":"Ailin Huang Boyong Wu Bruce Wang Chao Yan Chen Hu Chengli Feng Fei Tian Feiyu Shen Jingbei Li Mingrui Chen Peng Liu Ruihang Miao Wang You Xi Chen Xuerui Yang Yechang Huang Yuxiang Zhang Zheng Gong Zixin Zhang Hongyu Zhou Jianjian Sun Brian Li Chengting Feng Changyi Wan Hanpeng Hu Jianchang Wu Jiangjie Zhen Ranchen Ming Song Yuan Xuelin Zhang Yu Zhou Bingxin Li Buyun Ma Hongyuan Wang Kang An Wei Ji Wen Li Xuan Wen Xiangwen Kong Yuankai Ma Yuanwei Liang Yun Mou Bahtiyar Ahmidi Bin Wang Bo Li Changxin Miao Chen Xu Chenrun Wang Dapeng Shi Deshan Sun Dingyuan Hu Dula Sai Enle Liu Guanzhe Huang Gulin Yan Heng Wang Haonan Jia Haoyang Zhang Jiahao Gong Junjing Guo Jiashuai Liu Jiahong Liu Jie Feng Jie Wu Jiaoren Wu Jie Yang Jinguo Wang Jingyang Zhang Junzhe Lin Kaixiang Li Lei Xia Li Zhou Liang Zhao Longlong Gu Mei Chen Menglin Wu Ming Li Mingxiao Li Mingliang Li Mingyao Liang Na Wang Nie Hao Qiling Wu Qinyuan Tan Ran Sun Shuai Shuai Shaoliang Pang Shiliang Yang Shuli Gao Shanshan Yuan Siqi Liu Shihong Deng Shilei Jiang Sitong Liu Tiancheng Cao Tianyu Wang Wenjin Deng Wuxun Xie Weipeng Ming Wenqing He Wen Sun Xin Han Xin Huang Xiaomin Deng Xiaojia Liu Xin Wu Xu Zhao Yanan Wei Yanbo Yu Yang Cao Yangguang Li Yangzhen Ma Yanming Xu Yaoyu Wang Yaqiang Shi Yilei Wang Yizhuang Zhou Yinmin Zhong Yang Zhang Yaoben Wei Yu Luo Yuanwei Lu Yuhe Yin Yuchu Luo Yuanhao Ding Yuting Yan Yaqi Dai Yuxiang Yang Zhe Xie Zheng Ge Zheng Sun Zhewei Huang Zhichao Chang Zhisheng Guan Zidong Yang Zili Zhang Binxing Jiao Daxin Jiang Heung-Yeung Shum Jiansheng Chen Jing Li Shuchang Zhou Xiangyu Zhang Xinhao Zhang and Yibo Zhu. 2025. Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction. arXiv:2502.11946 [cs.CL] https:\/\/arxiv.org\/abs\/2502.11946"},{"key":"e_1_3_2_1_18_1","volume-title":"DISTMM: Accelerating Distributed Multimodal Model Training. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Huang Jun","year":"2024","unstructured":"Jun Huang, Zhen Zhang, Shuai Zheng, Feng Qin, and Yida Wang. 2024. DISTMM: Accelerating Distributed Multimodal Model Training. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 1157\u20131171. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/huang"},{"key":"e_1_3_2_1_19_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_20_1","volume-title":"Samyam Rajbhandari, and Yuxiong He.","author":"Jacobs Sam Ade","year":"2023","unstructured":"Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Shuaiwen Leon Song, Samyam Rajbhandari, and Yuxiong He. 2023. DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models. arXiv:2309.14509 [cs.LG] https:\/\/arxiv.org\/abs\/2309.14509"},{"key":"e_1_3_2_1_21_1","unstructured":"Byungsoo Jeon Mengdi Wu Shiyi Cao Sunghyun Kim Sunghyun Park Neeraj Aggarwal Colin Unger Daiyaan Arfeen Peiyuan Liao Xupeng Miao Mohammad Alizadeh Gregory R. Ganger Tianqi Chen and Zhihao Jia. 2024. GraphPipe: Improving Performance and Scalability of DNN Training with Graph Pipeline Parallelism. arXiv:2406.17145 [cs.DC] https:\/\/arxiv.org\/abs\/2406.17145"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems. 542\u2013559","author":"Jiang Chenyu","year":"2024","unstructured":"Chenyu Jiang, Zhen Jia, Shuai Zheng, Yida Wang, and Chuan Wu. 2024. DynaPipe: Optimizing multi-task training through dynamic pipelines. In Proceedings of the Nineteenth European Conference on Computer Systems. 542\u2013559."},{"key":"e_1_3_2_1_23_1","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong Qi Huang Yangrui Chen Zhi Zhang Yanghua Peng Xiang Li Cong Xie Shibiao Nong Yulu Jia Sun He Hongmin Chen Zhihao Bai Qi Hou Shipeng Yan Ding Zhou Yiyao Sheng Zhuo Jiang Haohan Xu Haoran Wei Zhang Zhang Pengfei Nie Leqi Zou Sida Zhao Liang Xiang Zherui Liu Zhe Li Xiaoying Jia Jianxi Ye Xin Jin and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10 000 GPUs. arXiv:2402.15627 [cs.LG] https:\/\/arxiv.org\/abs\/2402.15627"},{"key":"e_1_3_2_1_24_1","unstructured":"Diederik P Kingma and Max Welling. 2022. Auto-Encoding Variational Bayes. arXiv:1312.6114 [stat.ML] https:\/\/arxiv.org\/abs\/1312.6114"},{"key":"e_1_3_2_1_25_1","unstructured":"Vijay Korthikanti Jared Casper Sangkug Lym Lawrence McAfee Michael Andersch Mohammad Shoeybi and Bryan Catanzaro. 2022. Reducing Activation Recomputation in Large Transformer Models. arXiv:2205.05198 [cs.LG] https:\/\/arxiv.org\/abs\/2205.05198"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"e_1_3_2_1_27_1","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania et al. 2020. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 6543\u20136552","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. 2021. Terapipe: Token-level pipeline parallelism for training large-scale language models. In International Conference on Machine Learning. PMLR, 6543\u20136552."},{"key":"e_1_3_2_1_29_1","volume-title":"Mogao: An Omni Foundation Model for Interleaved Multi-Modal Generation. arXiv:2505.05472 [cs.CV] https:\/\/arxiv.org\/abs\/2505.05472","author":"Liao Chao","year":"2025","unstructured":"Chao Liao, Liyang Liu, Xun Wang, Zhengxiong Luo, Xinyu Zhang, Wenliang Zhao, Jie Wu, Liang Li, Zhi Tian, and Weilin Huang. 2025. Mogao: An Omni Foundation Model for Interleaved Multi-Modal Generation. arXiv:2505.05472 [cs.CV] https:\/\/arxiv.org\/abs\/2505.05472"},{"key":"e_1_3_2_1_30_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lin Zhiqi","year":"2024","unstructured":"Zhiqi Lin, Youshan Miao, Quanlu Zhang, Fan Yang, Yi Zhu, Cheng Li, Saeed Maleki, Xu Cao, Ning Shang, Yilei Yang, et al. 2024. {nnScaler}:{Constraint-Guided} Parallelization Plan Generation for Deep Learning Training. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 347\u2013363."},{"key":"e_1_3_2_1_31_1","unstructured":"Ze Liu Jia Ning Yue Cao Yixuan Wei Zheng Zhang Stephen Lin and Han Hu. 2021. Video Swin Transformer. arXiv:2106.13230 [cs.CV] https:\/\/arxiv.org\/abs\/2106.13230"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles. 1\u201315","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil R Devanur, Gregory R Ganger, Phillip B Gibbons, and Matei Zaharia. 2019. PipeDream: Generalized pipeline parallelism for DNN training. In Proceedings of the 27th ACM Symposium on Operating Systems Principles. 1\u201315."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201315","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al. 2021. Efficient large-scale language model training on gpu clusters using megatron-lm. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1\u201315."},{"key":"e_1_3_2_1_34_1","unstructured":"OpenAI Aaron Hurst et al. 2024. GPT-4o System Card. arXiv:2410.21276 [cs.CL] https:\/\/arxiv.org\/abs\/2410.21276"},{"key":"e_1_3_2_1_35_1","volume-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 5206\u20135210","author":"Panayotov Vassil","year":"2015","unstructured":"Vassil Panayotov, Guoguo Chen, Daniel Povey, and Sanjeev Khudanpur. 2015. Librispeech: An ASR corpus based on public domain audio books. In 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 5206\u20135210. 10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_36_1","volume-title":"Zero bubble pipeline parallelism. arXiv preprint arXiv:2401.10241","author":"Qi Penghui","year":"2023","unstructured":"Penghui Qi, Xinyi Wan, Guangxing Huang, and Min Lin. 2023. Zero bubble pipeline parallelism. arXiv preprint arXiv:2401.10241 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He.","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. arXiv:2201.05596 [cs.LG] https:\/\/arxiv.org\/abs\/2201.05596"},{"key":"e_1_3_2_1_38_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201316."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","author":"Rasley Jeff","year":"2020","unstructured":"Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. 2020. DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (Virtual Event, CA, USA) (KDD '20). Association for Computing Machinery, New York, NY, USA, 3505\u20133506. 10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_40_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multibillion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_41_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et al. 2022. Unity: Accelerating {DNN} training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 267\u2013284."},{"key":"e_1_3_2_1_43_1","volume-title":"ByteCheckpoint: A Unified Checkpointing System for LLM Development. arXiv preprint arXiv:2407.20143","author":"Wan Borui","year":"2024","unstructured":"Borui Wan, Mingji Han, Yiyao Sheng, Zhichao Lai, Mofan Zhang, Junda Zhang, Yanghua Peng, Haibin Lin, Xin Liu, and Chuan Wu. 2024. ByteCheckpoint: A Unified Checkpointing System for LLM Development. arXiv preprint arXiv:2407.20143 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"Dingdong Wang Mingyu Cui Dongchao Yang Xueyuan Chen and Helen Meng. 2024. A Comparative Study of Discrete Speech Tokens for Semantic-Related Tasks with Large Language Models. arXiv:2411.08742 [cs.CL] https:\/\/arxiv.org\/abs\/2411.08742"},{"key":"e_1_3_2_1_45_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Yang Fan Kai Dang Mengfei Du Xuancheng Ren Rui Men Dayiheng Liu Chang Zhou Jingren Zhou and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv:2409.12191 [cs.CV] https:\/\/arxiv.org\/abs\/2409.12191"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Wang Shibo","year":"2022","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, et al. 2022. Overlap communication with dependent computation via decomposition in large deep learning models. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 93\u2013106."},{"key":"e_1_3_2_1_47_1","volume-title":"Spindle: Efficient Distributed Training of Multi-Task Large Models via Wavefront Scheduling. arXiv:2409.03365 [cs.DC] https:\/\/arxiv.org\/abs\/2409.03365","author":"Wang Yujie","year":"2025","unstructured":"Yujie Wang, Shenhan Zhu, Fangcheng Fu, Xupeng Miao, Jie Zhang, Juan Zhu, Fan Hong, Yong Li, and Bin Cui. 2025. Spindle: Efficient Distributed Training of Multi-Task Large Models via Wavefront Scheduling. arXiv:2409.03365 [cs.DC] https:\/\/arxiv.org\/abs\/2409.03365"},{"key":"e_1_3_2_1_48_1","unstructured":"Zheng Wang Anna Cai Xinfeng Xie Zaifeng Pan Yue Guan Weiwei Chu Jie Wang Shikai Li Jianyu Huang Chris Cai Yuchen Hao and Yufei Ding. 2025. WLB-LLM: Workload-Balanced 4D Parallelism for Large Language Model Training. arXiv:2503.17924 [cs.DC] https:\/\/arxiv.org\/abs\/2503.17924"},{"key":"e_1_3_2_1_49_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Perric Cistac, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. Association for Computational Linguistics, 38\u201345. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_50_1","volume-title":"Adaptra: Straggler-Resilient Hybrid-Parallel Training with Pipeline Adaptation. arXiv:2504.19232 [cs.DC] https:\/\/arxiv.org\/abs\/2504.19232","author":"Wu Tianyuan","year":"2025","unstructured":"Tianyuan Wu, Lunxi Cao, Hanfeng Lu, Xiaoxiao Jiang, Yinghao Yu, Siran Yang, Guodong Yang, Jiamang Wang, Lin Qu, Liping Zhang, and Wei Wang. 2025. Adaptra: Straggler-Resilient Hybrid-Parallel Training with Pipeline Adaptation. arXiv:2504.19232 [cs.DC] https:\/\/arxiv.org\/abs\/2504.19232"},{"key":"e_1_3_2_1_51_1","unstructured":"Zhenliang Xue Hanpeng Hu Xing Chen Yimin Jiang Yixin Song Zeyu Mi Yibo Zhu Daxin Jiang Yubin Xia and Haibo Chen. 2025. PipeWeaver: Addressing Data Dynamicity in Large Multimodal Model Training with Dynamic Interleaved Pipeline. arXiv:2504.14145 [cs.DC] https:\/\/arxiv.org\/abs\/2504.14145"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Yifan Yang Zheshu Song Jianheng Zhuo Mingyu Cui Jinpeng Li Bo Yang Yexing Du Ziyang Ma Xunying Liu Ziyuan Wang Ke Li Shuai Fan Kai Yu Wei-Qiang Zhang Guoguo Chen and Xie Chen. 2025. GigaSpeech 2: An Evolving Large-Scale and Multi-domain ASR Corpus for Low-Resource Languages with Automated Crawling Transcription and Refinement. arXiv:2406.11546 [eess.AS] https:\/\/arxiv.org\/abs\/2406.11546","DOI":"10.18653\/v1\/2025.acl-long.135"},{"key":"e_1_3_2_1_53_1","volume-title":"Berg","author":"Yu Licheng","year":"2016","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C. Berg, and Tamara L. Berg. 2016. Modeling Context in Referring Expressions. arXiv:1608.00272 [cs.CV] https:\/\/arxiv.org\/abs\/1608.00272"},{"key":"e_1_3_2_1_54_1","unstructured":"Yu Zhang Wei Han James Qin Yongqiang Wang Ankur Bapna Zhehuai Chen Nanxin Chen Bo Li Vera Axelrod Gary Wang Zhong Meng Ke Hu Andrew Rosenberg Rohit Prabhavalkar Daniel S. Park Parisa Haghani Jason Riesa Ginger Perng Hagen Soltau Trevor Strohman Bhuvana Ramabhadran Tara Sainath Pedro Moreno Chung-Cheng Chiu Johan Schalkwyk Fran\u00e7oise Beaufays and Yonghui Wu. 2023. Google USM: Scaling Automatic Speech Recognition Beyond 100 Languages. arXiv:2303.01037 [cs.CL] https:\/\/arxiv.org\/abs\/2303.01037"},{"key":"e_1_3_2_1_55_1","volume-title":"Disttrain: Addressing model and data heterogeneity with disaggregated training for multimodal large language models. arXiv preprint arXiv:2408.04275","author":"Zhang Zili","year":"2024","unstructured":"Zili Zhang, Yinmin Zhong, Ranchen Ming, Hanpeng Hu, Jianjian Sun, Zheng Ge, Yibo Zhu, and Xin Jin. 2024. Disttrain: Addressing model and data heterogeneity with disaggregated training for multimodal large language models. arXiv preprint arXiv:2408.04275 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"OVERLORD: Ultimate Scaling of DataLoader for Multi-Source Large Foundation Model Training. arXiv:2504.09844 [cs.DC] https:\/\/arxiv.org\/abs\/2504.09844","author":"Zhao Juntao","year":"2025","unstructured":"Juntao Zhao, Qi Lu, Wei Jia, Borui Wan, Lei Zuo, Junda Feng, Jianyu Jiang, Yangrui Chen, Shuaishuai Cao, Jialing He, Kaihua Jiang, Yuanzhe Hu, Shibiao Nong, Yanghua Peng, Haibin Lin, Xin Liu, and Chuan Wu. 2025. OVERLORD: Ultimate Scaling of DataLoader for Multi-Source Large Foundation Model Training. arXiv:2504.09844 [cs.DC] https:\/\/arxiv.org\/abs\/2504.09844"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer et al. 2023. Pytorch fsdp: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_58_1","volume-title":"Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559\u2013578."}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:26:19Z","timestamp":1777062379000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3803587"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":58,"alternative-id":["10.1145\/3767295.3803587","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3803587","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}