{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:32:12Z","timestamp":1777105932535,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":79,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB4500700"],"award-info":[{"award-number":["2022YFB4500700"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Scientific Research Innovation Capability Support Project for Young Faculty","award":["ZYGXQNJSKYCXNLZCXM-I1"],"award-info":[{"award-number":["ZYGXQNJSKYCXNLZCXM-I1"]}]},{"name":"Fundamental Research Funds for the Central Universities, Peking University"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172008"],"award-info":[{"award-number":["62172008"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3718958.3750472","type":"proceedings-article","created":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:54:11Z","timestamp":1756313651000},"page":"24-38","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["DistTrain: Addressing Model and Data Heterogeneity with Disaggregated Training for Multimodal Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4209-9451","authenticated-orcid":false,"given":"Zili","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2504-7652","authenticated-orcid":false,"given":"Yinmin","family":"Zhong","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0049-873X","authenticated-orcid":false,"given":"Yimin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Independent Researcher, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5787-5226","authenticated-orcid":false,"given":"Hanpeng","family":"Hu","sequence":"additional","affiliation":[{"name":"StepFun, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1216-9626","authenticated-orcid":false,"given":"Jianjian","family":"Sun","sequence":"additional","affiliation":[{"name":"StepFun, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8630-8270","authenticated-orcid":false,"given":"Zheng","family":"Ge","sequence":"additional","affiliation":[{"name":"StepFun, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9113-2660","authenticated-orcid":false,"given":"Yibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"StepFun, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6657-5806","authenticated-orcid":false,"given":"Daxin","family":"Jiang","sequence":"additional","affiliation":[{"name":"StepFun, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8741-5847","authenticated-orcid":false,"given":"Xin","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, 
China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2022. Techniques and Systems to Train and Serve Bigger Models. https:\/\/icml.cc\/virtual\/2022\/tutorial\/18440."},{"key":"e_1_3_2_1_2_1","unstructured":"2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/vicuna.lmsys.org\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. CVXPY 1.5. https:\/\/www.cvxpy.org\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. Google ViT-Huge. https:\/\/huggingface.co\/google\/vit-huge-patch14-224-in21k."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Introducing Gemini: our largest and most capable AI model. https:\/\/blog.google\/technology\/ai\/google-gemini-ai\/."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. Meta Llama3. https:\/\/llama.meta.com\/."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. NVIDIA Transformer Engine. https:\/\/github.com\/NVIDIA\/TransformerEngine."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. PyTorch Distributed Overview. https:\/\/pytorch.org\/tutorials\/beginner\/dist_overview.html."},{"key":"e_1_3_2_1_10_1","volume-title":"arXiv preprint arXiv:2412.15115","year":"2024","unstructured":"2024. Qwen2.5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. Stable Diffusion 2.1. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-2-1\/."},{"key":"e_1_3_2_1_12_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_13_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_14_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_15_1","volume-title":"Approximation algorithms for maximin fair division. ACM Transactions on Economics and Computation (TEAC)","author":"Barman Siddharth","year":"2020","unstructured":"Siddharth Barman and Sanath Kumar Krishnamurthy. 2020. Approximation algorithms for maximin fair division. ACM Transactions on Economics and Computation (TEAC) (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Zal\u00e1n Borsos Rapha\u00ebl Marinier Damien Vincent Eugene Kharitonov Olivier Pietquin Matt Sharifi Dominik Roblek Olivier Teboul David Grangier Marco Tagliasacchi et al. 2023. Audiolm: a language modeling approach to audio generation. IEEE\/ACM transactions on audio speech and language processing (2023).","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Brock Andy","year":"2021","unstructured":"Andy Brock, Soham De, Samuel L Smith, and Karen Simonyan. 2021. High-performance large-scale image recognition without normalization. 
In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_18_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_19_1","volume-title":"FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arXiv:2406.06858 [cs.LG]","author":"Chang Li-Wen","year":"2024","unstructured":"Li-Wen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Ziheng Jiang, Haibin Lin, Xin Jin, and Xin Liu. 2024. FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arXiv:2406.06858 [cs.LG]"},{"key":"e_1_3_2_1_20_1","volume-title":"Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning. In ACM ASPLOS.","author":"Chen Chang","year":"2024","unstructured":"Chang Chen, Xiuhong Li, Qianchao Zhu, Jiangfei Duan, Peng Sun, Xingcheng Zhang, and Chao Yang. 2024. Centauri: Enabling Efficient Scheduling for Communication-Computation Overlap in Large Model Training via Communication Partitioning. In ACM ASPLOS."},{"key":"e_1_3_2_1_21_1","volume-title":"Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, and Furu Wei. 2022. Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Janus-pro: Unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811","author":"Chen Xiaokang","year":"2025","unstructured":"Xiaokang Chen, Zhiyu Wu, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, and Chong Ruan. 2025. Janus-pro: Unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811 (2025)."},{"key":"e_1_3_2_1_23_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al.","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research (2023)."},{"key":"e_1_3_2_1_24_1","unstructured":"Chaorui Deng Deyao Zhu Kunchang Li Chenhui Gou Feng Li Zeyu Wang Shu Zhong Weihao Yu Xiaonan Nie Ziang Song et al. 2025. Emerging properties in unified multimodal pretraining. arXiv preprint arXiv:2505.14683 (2025)."},{"key":"e_1_3_2_1_25_1","volume-title":"Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499","author":"Dong Runpei","year":"2023","unstructured":"Runpei Dong, Chunrui Han, Yuang Peng, Zekun Qi, Zheng Ge, Jinrong Yang, Liang Zhao, Jianjian Sun, Hongyu Zhou, Haoran Wei, et al. 2023. Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. 
arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al. 2023. Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. 2022. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_29_1","volume-title":"DAPPLE: A pipelined data parallel approach for training large models. In ACM PPoPP.","author":"Fan Shiqing","year":"2021","unstructured":"Shiqing Fan, Yi Rong, Chen Meng, Zongyan Cao, Siyu Wang, Zhen Zheng, Chuan Wu, Guoping Long, Jun Yang, Lixue Xia, et al. 2021. DAPPLE: A pipelined data parallel approach for training large models. In ACM PPoPP."},{"key":"e_1_3_2_1_30_1","volume-title":"CVX: Matlab software for disciplined convex programming, version 2.1.","author":"Grant Michael","year":"2014","unstructured":"Michael Grant and Stephen Boyd. 2014. CVX: Matlab software for disciplined convex programming, version 2.1."},{"key":"e_1_3_2_1_31_1","unstructured":"Michael Grant Stephen Boyd and Yinyu Ye. 2006. Disciplined convex programming."},{"key":"e_1_3_2_1_32_1","volume-title":"DISTMM: Accelerating Distributed Multimodal Model Training. In USENIX NSDI.","author":"Huang Jun","year":"2024","unstructured":"Jun Huang, Zhen Zhang, Shuai Zheng, Feng Qin, and Yida Wang. 2024. DISTMM: Accelerating Distributed Multimodal Model Training. In USENIX NSDI."},{"key":"e_1_3_2_1_33_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. In Advances in Neural Information Processing Systems.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_34_1","volume-title":"Conference on Machine Learning and Systems","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. 2019. Beyond data and model parallelism for deep neural networks. Conference on Machine Learning and Systems (2019)."},{"key":"e_1_3_2_1_35_1","unstructured":"Ziheng Jiang Haibin Lin Yinmin Zhong Qi Huang Yangrui Chen Zhi Zhang Yanghua Peng Xiang Li Cong Xie Shibiao Nong et al. 2024. MegaScale: Scaling large language model training to more than 10 000 GPUs. In USENIX NSDI."},{"key":"e_1_3_2_1_36_1","unstructured":"Diederik P Kingma Max Welling et al. 2013. Auto-encoding variational bayes."},{"key":"e_1_3_2_1_37_1","volume-title":"Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125","author":"Kondratyuk Dan","year":"2023","unstructured":"Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jos\u00e9 Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming-Chang Chiu, et al. 2023. 
Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125 (2023)."},{"key":"e_1_3_2_1_38_1","unstructured":"Richard Earl Korf. 2009. Multi-way number partitioning. In Twenty-first international joint conference on artificial intelligence."},{"key":"e_1_3_2_1_39_1","volume-title":"Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120","author":"Li Shenggui","year":"2021","unstructured":"Shenggui Li, Fuzhao Xue, Chaitanya Baranwal, Yongbin Li, and Yang You. 2021. Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120 (2021)."},{"key":"e_1_3_2_1_40_1","volume-title":"Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Visual instruction tuning. Advances in Neural Information Processing Systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in Neural Information Processing Systems (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"World model on million-length video and language with blockwise ringattention. arXiv preprint arXiv:2402.08268","author":"Liu Hao","year":"2024","unstructured":"Hao Liu, Wilson Yan, Matei Zaharia, and Pieter Abbeel. 2024. World model on million-length video and language with blockwise ringattention. arXiv preprint arXiv:2402.08268 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Jessie Hui Wang, and Yimin Jiang","author":"Liu Juncai","year":"2023","unstructured":"Juncai Liu, Jessie Hui Wang, and Yimin Jiang. 2023. Janus: A unified distributed training framework for sparse mixture-of-experts models. In ACM SIGCOMM."},{"key":"e_1_3_2_1_44_1","unstructured":"Shiyu Liu Yucheng Han Peng Xing Fukun Yin Rui Wang Wei Cheng Jiaqi Liao Yingming Wang Honghao Fu Chunrui Han et al. 2025. Step1x-edit: A practical framework for general image editing. arXiv preprint arXiv:2504.17761 (2025)."},{"key":"e_1_3_2_1_45_1","unstructured":"Paulius Micikevicius Sharan Narang Jonah Alben Gregory Diamos Erich Elsen David Garcia Boris Ginsburg Michael Houston Oleksii Kuchaiev Ganesh Venkatesh et al. 2017. Mixed precision training. arXiv preprint arXiv:1710.03740 (2017)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Deepak Narayanan Mohammad Shoeybi Jared Casper Patrick LeGresley Mostofa Patwary Vijay Korthikanti Dmitri Vainbrand Prethvi Kashinkunti Julie Bernauer Bryan Catanzaro et al. 2021. Efficient large-scale language model training on gpu clusters using megatron-lm. In SC.","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_47_1","volume-title":"Splitwise: Efficient generative llm inference using phase splitting","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. 2024. Splitwise: Efficient generative llm inference using phase splitting. 
In ACM\/IEEE ISCA."},{"key":"e_1_3_2_1_48_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_49_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_50_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog (2019)."},{"key":"e_1_3_2_1_51_1","volume-title":"International Conference for High Performance Computing, Networking, Storage and Analysis.","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. Zero: Memory optimizations toward training trillion parameter models. In International Conference for High Performance Computing, Networking, Storage and Analysis."},{"key":"e_1_3_2_1_52_1","volume-title":"Enabling Compute-Communication Overlap in Distributed Deep Learning Training Platforms","author":"Rashidi Saeed","unstructured":"Saeed Rashidi, Matthew Denton, Srinivas Sridharan, Sudarshan Srinivasan, Amoghavarsha Suresh, Jade Nie, and Tushar Krishna. 2021. Enabling Compute-Communication Overlap in Distributed Deep Learning Training Platforms. In ACM\/IEEE ISCA."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_54_1","volume-title":"Ankur Bapna, Zal\u00e1n Borsos, F\u00e9lix de Chaumont Quitry, Peter Chen, Dalia El Badawy, Wei Han, Eugene Kharitonov, et al.","author":"Rubenstein Paul K","year":"2023","unstructured":"Paul K Rubenstein, Chulayuth Asawaroengchai, Duc Dung Nguyen, Ankur Bapna, Zal\u00e1n Borsos, F\u00e9lix de Chaumont Quitry, Peter Chen, Dalia El Badawy, Wei Han, Eugene Kharitonov, et al. 2023. Audiopalm: A large language model that can speak and listen. arXiv preprint arXiv:2306.12925 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_56_1","unstructured":"Yizhou Shan Yutong Huang Yilun Chen and Yiying Zhang. 2018. LegoOS: A disseminated distributed OS for hardware resource disaggregation. In USENIX OSDI."},{"key":"e_1_3_2_1_57_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. 
arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_58_1","unstructured":"Shaden Smith Mostofa Patwary Brandon Norick Patrick LeGresley Samyam Rajbhandari Jared Casper Zhun Liu Shrimai Prabhumoye George Zerveas Vijay Korthikanti et al. 2022. Using deepspeed and megatron to train megatron-turing nlg 530b a large-scale generative language model. arXiv preprint arXiv:2201.11990 (2022)."},{"key":"e_1_3_2_1_59_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Qiying Yu, Yufeng Cui, Fan Zhang, Xiaosong Zhang, Yueze Wang, Hongcheng Gao, Jingjing Liu, Tiejun Huang, and Xinlong Wang. 2023. Emu: Generative pretraining in multimodality. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_61_1","volume-title":"Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818","author":"Team Chameleon","year":"2024","unstructured":"Chameleon Team. 2024. Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_63_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_64_1","unstructured":"Pauli Virtanen Ralf Gommers Travis E Oliphant Matt Haberland Tyler Reddy David Cournapeau Evgeni Burovski Pearu Peterson Warren Weckesser Jonathan Bright et al. 2020. SciPy 1.0: fundamental algorithms for scientific computing in Python. Nature methods (2020)."},{"key":"e_1_3_2_1_65_1","unstructured":"Chenxi Wang Haoran Ma Shi Liu Yifan Qiao Jonathan Eyolfson Christian Navasca Shan Lu and Guoqing Harry Xu. 2022. MemLiner: Lining up Tracing and Application for a Far-Memory-Friendly Runtime. In USENIX OSDI."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Minjie Wang Chien-chin Huang and Jinyang Li. 2019. Supporting very large models using automatic dataflow graph partitioning. In EuroSys.","DOI":"10.1145\/3302424.3303953"},{"key":"e_1_3_2_1_67_1","volume-title":"Marcello Maggioni, Qiao Zhang, et al.","author":"Wang Shibo","year":"2022","unstructured":"Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao Zhang, et al. 2022. Overlap Communication with Dependent Computation via Decomposition in Large Deep Learning Models. In ACM ASPLOS."},{"key":"e_1_3_2_1_68_1","volume-title":"Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. 
In Advances in Neural Information Processing Systems.","author":"Wang Wenhai","year":"2024","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, et al. 2024. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_69_1","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang et al. 2024. DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding. arXiv preprint arXiv:2412.10302 (2024)."},{"key":"e_1_3_2_1_70_1","volume-title":"A survey on multimodal large language models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_71_1","unstructured":"Lijun Yu Jos\u00e9 Lezama Nitesh B Gundavarapu Luca Versari Kihyuk Sohn David Minnen Yong Cheng Vighnesh Birodkar Agrim Gupta Xiuye Gu et al. 2023. Language Model Beats Diffusion-Tokenizer is Key to Visual Generation. arXiv preprint arXiv:2310.05737 (2023)."},{"key":"e_1_3_2_1_72_1","volume-title":"Soundstream: An end-to-end neural audio codec","author":"Zeghidour Neil","year":"2021","unstructured":"Neil Zeghidour, Alejandro Luebs, Ahmed Omran, Jan Skoglund, and Marco Tagliasacchi. 2021. Soundstream: An end-to-end neural audio codec. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2021)."},{"key":"e_1_3_2_1_73_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition.","author":"Zhai Xiaohua","year":"2022","unstructured":"Xiaohua Zhai, Xiao Wang, Basil Mustafa, Andreas Steiner, Daniel Keysers, Alexander Kolesnikov, and Lucas Beyer. 2022. Lit: Zero-shot transfer with locked-image text tuning. In IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.birob.2023.100131"},{"key":"e_1_3_2_1_75_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Chenxing Li, Jiahua Dong, Dan Su, Chenhui Chu, and Dong Yu. 2024. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_1_76_1","volume-title":"Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Hang Yan, et al.","author":"Zhang Pan","year":"2023","unstructured":"Pan Zhang, Xiaoyi Dong Bin Wang, Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Shuangrui Ding, Songyang Zhang, Haodong Duan, Hang Yan, et al. 2023. Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer et al. 2023. Pytorch fsdp: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_78_1","volume-title":"Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. 
In USENIX OSDI.","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. In USENIX OSDI."},{"key":"e_1_3_2_1_79_1","unstructured":"Ruidong Zhu Ziheng Jiang Chao Jin Peng Wu Cesar A Stuardo Dongyang Wang Xinlei Zhang Huaping Zhou Haoran Wei Yang Cheng et al. 2025. MegaScale-Infer: Serving Mixture-of-Experts at Scale with Disaggregated Expert Parallelism. arXiv preprint arXiv:2504.02263 (2025)."}],"event":{"name":"SIGCOMM '25: ACM SIGCOMM 2025 Conference","location":"S\u00e3o Francisco Convent Coimbra Portugal","acronym":"SIGCOMM '25","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the ACM SIGCOMM 2025 Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3718958.3750472","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:58:31Z","timestamp":1756313911000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3718958.3750472"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,27]]},"references-count":79,"alternative-id":["10.1145\/3718958.3750472","10.1145\/3718958"],"URL":"https:\/\/doi.org\/10.1145\/3718958.3750472","relation":{},"subject":[],"published":{"date-parts":[[2025,8,27]]},"assertion":[{"value":"2025-08-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}