{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:51:04Z","timestamp":1777063864670,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":79,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3803568","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MegaScale-Data: Scaling DataLoader for Multisource Large Foundation Model Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3376-0607","authenticated-orcid":false,"given":"Juntao","family":"Zhao","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6926-5787","authenticated-orcid":false,"given":"Qi","family":"Lu","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8596-5877","authenticated-orcid":false,"given":"Wei","family":"Jia","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5902-1611","authenticated-orcid":false,"given":"Borui","family":"Wan","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7423-1675","authenticated-orcid":false,"given":"Lei","family":"Zuo","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3664-6615","authenticated-orcid":false,"given":"Junda","family":"Feng","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8684-8509","authenticated-orcid":false,"given":"Jianyu","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6682-1783","authenticated-orcid":false,"given":"Yangrui","family":"Chen","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6870-8898","authenticated-orcid":false,"given":"Shuaishuai","family":"Cao","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7858-8728","authenticated-orcid":false,"given":"Jialing","family":"He","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4598-3807","authenticated-orcid":false,"given":"Kaihua","family":"Jiang","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6943-6090","authenticated-orcid":false,"given":"Yuanzhe","family":"Hu","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0653-767X","authenticated-orcid":false,"given":"Shibiao","family":"Nong","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3989-4358","authenticated-orcid":false,"given":"Yanghua","family":"Peng","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4879-5335","authenticated-orcid":false,"given":"Haibin","family":"Lin","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3144-4398","authenticated-orcid":false,"given":"Chuan","family":"Wu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Amazon Web Services. 2025. Amazon S3 (Simple Storage Service). https:\/\/docs.aws.amazon.com\/zh_cn\/emr\/latest\/ReleaseGuide\/emr-hbase-s3.html Accessed: 2025-03-22."},{"key":"e_1_3_2_1_2_1","unstructured":"Apache Software Foundation. 2025. Apache Parquet Documentation: File Format Configurations. https:\/\/parquet.apache.org\/docs\/file-format\/configurations\/ Accessed: 2025-03-22."},{"key":"e_1_3_2_1_3_1","unstructured":"Apache Software Foundation. 2025. Hadoop Distributed File System (HDFS). https:\/\/docs.aws.amazon.com\/zh_cn\/emr\/latest\/ReleaseGuide\/emr-encryption-tdehdfs.html Accessed: 2025-03-22."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.2174\/2213275911666180719111118"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 2023 ACM Symposium on Cloud Computing (SoCC '23)","author":"Audibert Andrew","unstructured":"Andrew Audibert, Yang Chen, Dan Graur, Ana Klimovic, Ji\u0159\u00ed \u0160im\u0161a, and Chandramohan A. Thekkath. 2023. tf.data service: A Case for Disaggregating ML Input Data Processing. In Proceedings of the 2023 ACM Symposium on Cloud Computing (SoCC '23). Association for Computing Machinery, New York, NY, USA, 358\u2013375."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, Jeff Dean, Sanjay Ghemawat, Steven Hand, Daniel Hurt, Michael Isard, Hyeontaek Lim, Ruoming Pang, Sudip Roy, Brennan Saeta, Parker Schuh, Ryan Sepassi, Laurent Shafey, Chandu Thekkath, and Yonghui Wu. 2022. Pathways: Asynchronous Distributed Dataflow for ML. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. MLSys Organization, Santa Clara, CA, USA, 430\u2013449."},{"key":"e_1_3_2_1_7_1","unstructured":"Xiao Bi Deli Chen Guanting Chen Shanhuang Chen Damai Dai Chengqi Deng Honghui Ding Kai Dong Qiushi Du Zhe Fu et al. 2024. Deepseek llm: Scaling open-source language models with longtermism. arXiv:2401.02954 [cs.CL] https:\/\/arxiv.org\/abs\/2401.02954"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1140\/epjb\/e2008-00320-9"},{"key":"e_1_3_2_1_9_1","unstructured":"Minwoo Byeon Beomhee Park Haecheon Kim Sungjun Lee Woonhyuk Baek and Saehoon Kim. 2022. COYO-700M: Image-Text Pair Dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset."},{"key":"e_1_3_2_1_10_1","unstructured":"Mayee F. Chen Nicholas Roberts Kush Bhatia Jue Wang Ce Zhang Frederic Sala and Christopher R\u00e9. 2023. Skill-it! A Data-Driven Skills Framework for Understanding and Training Language Models. arXiv:2307.14430 [cs.CL] https:\/\/arxiv.org\/abs\/2307.14430"},{"key":"e_1_3_2_1_11_1","unstructured":"Shouyuan Chen Sherman Wong Liangjian Chen and Yuandong Tian. 2023. Extending Context Window of Large Language Models via Positional Interpolation. arXiv:2306.15595 [cs.CL] https:\/\/arxiv.org\/abs\/2306.15595"},{"key":"e_1_3_2_1_12_1","volume-title":"Evans and contributors","author":"Clark","year":"2024","unstructured":"Clark C. Evans and contributors. 2024. Pillow Library. https:\/\/pillow.readthedocs.io\/en\/stable\/. Python Imaging Library (PIL) Fork."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"e_1_3_2_1_14_1","volume-title":"Weinberger (Eds.)","volume":"25","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Marc' aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, Quoc Le, and Andrew Ng. 2012. Large Scale Distributed Deep Networks. In Advances in Neural Information Processing Systems, F. Pereira, C.J. Burges, L. Bottou, and K.Q. Weinberger (Eds.), Vol. 25. Curran Associates, Inc."},{"key":"e_1_3_2_1_15_1","volume-title":"Levine (Eds.)","volume":"36","author":"Dehghani Mostafa","year":"2023","unstructured":"Mostafa Dehghani, Basil Mustafa, Josip Djolonga, Jonathan Heek, Matthias Minderer, Mathilde Caron, Andreas Steiner, Joan Puigcerver, Robert Geirhos, Ibrahim M Alabdulmohsin, Avital Oliver, Piotr Padlewski, Alexey Gritsenko, Mario Lucic, and Neil Houlsby. 2023. Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 2252\u20132274. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/06ea400b9b7cfce6428ec27a371632eb-Paper-Conference.pdf"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_17_1","volume-title":"Mycroft: Tracing Dependencies in Collective Communication Towards Reliable LLM Training. arXiv preprint arXiv:2509.03018","author":"Deng Yangtao","year":"2025","unstructured":"Yangtao Deng, Lei Zhang, Qinlong Wang, Xiaoyun Zhi, Xinlei Zhang, Zhuo Jiang, Haohan Xu, Lei Wang, Zuquan Song, Gaohong Liu, et al. 2025. Mycroft: Tracing Dependencies in Collective Communication Towards Reliable LLM Training. arXiv preprint arXiv:2509.03018 (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"Evolution of Aegis: Fault Diagnosis for AI Model Training Service in Production. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Dong Jianbo","year":"2025","unstructured":"Jianbo Dong, Kun Qian, Pengcheng Zhang, Zhilong Zheng, Liang Chen, Fei Feng, Yichi Xu, Yikai Zhu, Gang Lu, Xue Li, Zhihui Ren, Zhicheng Wang, Bin Luo, Peng Zhang, Yang Liu, Yanqing Chen, Yu Guan, Weicheng Wang, Chaojie Yang, Yang Zhang, Man Yuan, Hanyu Zhao, Yong Li, Zihan Zhao, Shan Li, Xianlong Zeng, Zhiping Yao, Binzhang Fu, Ennan Zhai, Wei Lin, Chao Wang, and Dennis Cai. 2025. Evolution of Aegis: Fault Diagnosis for AI Model Training Service in Production. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). USENIX Association, Philadelphia, PA, 865\u2013881. https:\/\/www.usenix.org\/conference\/nsdi25\/presentation\/dong"},{"key":"e_1_3_2_1_19_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale."},{"key":"e_1_3_2_1_20_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv:2407.21783 [cs.AI]"},{"key":"e_1_3_2_1_21_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: a Check-pointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 929\u2013943. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/eisenman"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3478329"},{"key":"e_1_3_2_1_23_1","volume-title":"Optimus: Accelerating Large-Scale Multi-Modal LLM Training by Bubble Exploitation. arXiv:2408.03505 [cs.CL] https:\/\/arxiv.org\/abs\/2408.03505","author":"Feng Weiqi","year":"2024","unstructured":"Weiqi Feng, Yangrui Chen, Shaoyu Wang, Yanghua Peng, Haibin Lin, and Minlan Yu. 2024. Optimus: Accelerating Large-Scale Multi-Modal LLM Training by Bubble Exploitation. arXiv:2408.03505 [cs.CL] https:\/\/arxiv.org\/abs\/2408.03505"},{"key":"e_1_3_2_1_24_1","unstructured":"Common Crawl Foundation. 2014. Common Crawl. https:\/\/commoncrawl.org."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1186\/s41044-016-0020-2"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Hao Ge Junda Feng Qi Huang Fangcheng Fu Xiaonan Nie Lei Zuo Haibin Lin Bin Cui and Xin Liu. 2025. ByteScale: Efficient Scaling of LLM Training with a 2048K Context Length on More Than 12 000 GPUs. arXiv:2502.21231 [cs.DC] https:\/\/arxiv.org\/abs\/2502.21231","DOI":"10.1145\/3718958.3754352"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/69.273032"},{"key":"e_1_3_2_1_28_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Graur Dan","year":"2022","unstructured":"Dan Graur, Damien Aymon, Dan Kluser, Tanguy Albrici, Chandramohan A. Thekkath, and Ana Klimovic. 2022. Cachew: Machine Learning Input Data Processing as a Service. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 689\u2013706."},{"key":"e_1_3_2_1_29_1","volume-title":"Pecan: Cost-Efficient ML Data Preprocessing with Automatic Transformation Ordering and Hybrid Placement. In 2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Graur Dan","year":"2024","unstructured":"Dan Graur, Oto Mraz, Muyu Li, Sepehr Pourghannad, Chandramohan A. Thekkath, and Ana Klimovic. 2024. Pecan: Cost-Efficient ML Data Preprocessing with Automatic Transformation Ordering and Hybrid Placement. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 649\u2013665. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/graur"},{"key":"e_1_3_2_1_30_1","unstructured":"Qinghao Hu Zhisheng Ye Zerui Wang Guoteng Wang Meng Zhang Qiaoling Chen Peng Sun Dahua Lin Xiaolin Wang Yingwei Luo Yonggang Wen and Tianwei Zhang. 2024. Characterization of Large Language Model Development in the Datacenter. arXiv:2403.07648 [cs.DC] https:\/\/arxiv.org\/abs\/2403.07648"},{"key":"e_1_3_2_1_31_1","volume-title":"Characterization of Large Language Model Development in the Datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Qinghao","year":"2024","unstructured":"Qinghao Hu, Zhisheng Ye, Zerui Wang, Guoteng Wang, Meng Zhang, Qiaoling Chen, Peng Sun, Dahua Lin, Xiaolin Wang, Yingwei Luo, Yonggang Wen, and Tianwei Zhang. 2024. Characterization of Large Language Model Development in the Datacenter. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 709\u2013729. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/hu"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 21st USENIX Symposium on Networked Systems Design and Implementation","author":"Huang Jun","year":"2024","unstructured":"Jun Huang, Zhen Zhang, Shuai Zheng, Feng Qin, and Yida Wang. 2024. DISTMM: accelerating distributed multimodal model training. In Proceedings of the 21st USENIX Symposium on Networked Systems Design and Implementation (Santa Clara, CA, USA) (NSDI'24). USENIX Association, USA, Article 64, 15 pages."},{"key":"e_1_3_2_1_33_1","volume-title":"Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3662158.3662806"},{"key":"e_1_3_2_1_35_1","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra Singh Chaplot Diego de las Casas Emma Bou Hanna Florian Bressand Gianna Lengyel Guillaume Bour Guillaume Lample L\u00e9lio Renard Lavaud Lucile Saulnier MarieAnne Lachaux Pierre Stock Sandeep Subramanian Sophia Yang Szymon Antoniak Teven Le Scao Theophile Gervet Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2024. Mixtral of Experts. arXiv:2401.04088 [cs.LG] https:\/\/arxiv.org\/abs\/2401.04088"},{"key":"e_1_3_2_1_36_1","unstructured":"Yiding Jiang Allan Zhou Zhili Feng Sadhika Malladi and J. Zico Kolter. 2024. Adaptive Data Optimization: Dynamic Sample Selection with Scaling Laws. arXiv:2410.11820 [cs.LG] https:\/\/arxiv.org\/abs\/2410.11820"},{"key":"e_1_3_2_1_37_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. 2024. {MegaScale}: Scaling large language model training to more than 10,000 {GPUs}. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, USA, 745\u2013760."},{"key":"e_1_3_2_1_38_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, Yulu Jia, Sun He, Hongmin Chen, Zhihao Bai, Qi Hou, Shipeng Yan, Ding Zhou, Yiyao Sheng, Zhuo Jiang, Haohan Xu, Haoran Wei, Zhang Zhang, Pengfei Nie, Leqi Zou, Sida Zhao, Liang Xiang, Zherui Liu, Zhe Li, Xiaoying Jia, Jianxi Ye, Xin Jin, and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 745\u2013760. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/jiang-ziheng"},{"key":"e_1_3_2_1_39_1","unstructured":"Vijay Korthikanti Jared Casper Sangkug Lym Lawrence McAfee Michael Andersch Mohammad Shoeybi and Bryan Catanzaro. 2022. Reducing Activation Recomputation in Large Transformer Models. arXiv:2205.05198 [cs.LG] https:\/\/arxiv.org\/abs\/2205.05198"},{"key":"e_1_3_2_1_40_1","unstructured":"Mario Michael Krell Matej Kosec Sergio P. Perez and Andrew Fitzgibbon. 2022. Efficient Sequence Packing without Cross-contamination: Accelerating Large Language Models without Impacting Performance. arXiv:2107.02027 [cs.CL] https:\/\/arxiv.org\/abs\/2107.02027"},{"key":"e_1_3_2_1_41_1","unstructured":"Kubernetes. 2024. Sidecar Containers. https:\/\/kubernetes.io\/docs\/concepts\/workloads\/pods\/sidecar-containers\/ Kubernetes Documentation v1.29."},{"key":"e_1_3_2_1_42_1","unstructured":"Conglong Li Minjia Zhang and Yuxiong He. 2022. The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. arXiv:2108.06084 [cs.LG] https:\/\/arxiv.org\/abs\/2108.06084"},{"key":"e_1_3_2_1_43_1","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv:2006.15704 [cs.DC] https:\/\/arxiv.org\/abs\/2006.15704"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_45_1","unstructured":"Bin Lin Chen Zhang Tao Peng Hanyu Zhao Wencong Xiao Minmin Sun Anmin Liu Zhipeng Zhang Lanbo Li Xiafei Qiu Shen Li Zhigang Ji Tao Xie Yong Li and Wei Lin. 2024. Infinite-LLM: Efficient LLM Service for Long Context with DistAttention and Distributed KVCache. arXiv:2401.02669 [cs.DC] https:\/\/arxiv.org\/abs\/2401.02669"},{"key":"e_1_3_2_1_46_1","unstructured":"Hao Liu Matei Zaharia and Pieter Abbeel. 2023. Ring Attention with Blockwise Transformers for Near-Infinite Context."},{"key":"e_1_3_2_1_47_1","volume-title":"Charlotte Lin, Amog Kamsetty, Hao Chen, Cheng Su, Balaji Veeramani, Scott Lee, SangBin Cho, Clark Zinzow, Eric Liang, Ion Stoica, and Stephanie Wang.","author":"Luan Frank Sifei","year":"2025","unstructured":"Frank Sifei Luan, Ziming Mao, Ron Yifeng Wang, Charlotte Lin, Amog Kamsetty, Hao Chen, Cheng Su, Balaji Veeramani, Scott Lee, SangBin Cho, Clark Zinzow, Eric Liang, Ion Stoica, and Stephanie Wang. 2025. The Streaming Batch Model for Efficient and Fault-Tolerant Heterogeneous Execution. arXiv:2501.12407 [cs.DC] https:\/\/arxiv.org\/abs\/2501.12407"},{"key":"e_1_3_2_1_48_1","unstructured":"Meta AI. 2025. The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation. https:\/\/ai.meta.com\/blog\/llama-4-multimodal-intelligence\/ Accessed: 2025-04-06."},{"key":"e_1_3_2_1_49_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 203\u2013216. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/mohan"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation","author":"Moritz Philipp","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I. Jordan, and Ion Stoica.2018. Ray: a distributed framework for emerging AI applications. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (Carlsbad, CA, USA) (OSDI'18). USENIX Association, USA, 561\u2013577."},{"key":"e_1_3_2_1_51_1","unstructured":"Derek G. Murray Jiri Simsa Ana Klimovic and Ihor Indyk. 2021. tf.data: A Machine Learning Data Processing Framework. arXiv:2101.12127 [cs.LG] https:\/\/arxiv.org\/abs\/2101.12127"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_54_1","unstructured":"PyTorch contributors. 2024. torch.utils.data \u2014 PyTorch 2.4 documentation. https:\/\/pytorch.org\/docs\/stable\/data.html Accessed: [Insert access date]."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_56_1","unstructured":"Christoph Schuhmann Richard Vencu Romain Beaumont Robert Kaczmarczyk Clayton Mullis Aarush Katta Theo Coombes Jenia Jitsev and Aran Komatsuzaki. 2021. LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs. arXiv:2111.02114 [cs.CV]"},{"key":"e_1_3_2_1_57_1","unstructured":"John Schulman Filip Wolski Prafulla Dhariwal Alec Radford and Oleg Klimov. 2017. Proximal Policy Optimization Algorithms. arXiv:1707.06347 [cs.LG]"},{"key":"e_1_3_2_1_58_1","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv:1802.05799 [cs.LG] https:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_59_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv:1909.08053 [cs.CL]"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TEL-NET.2017.8343593"},{"key":"e_1_3_2_1_61_1","volume-title":"Paolo Rota, and Nicu Sebe.","author":"Soviany Petru","year":"2022","unstructured":"Petru Soviany, Radu Tudor Ionescu, Paolo Rota, and Nicu Sebe. 2022. Curriculum Learning: A Survey. arXiv:2101.10382 [cs.LG] https:\/\/arxiv.org\/abs\/2101.10382"},{"key":"e_1_3_2_1_62_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2025. Gemini: a family of highly capable multimodal models. arXiv:2312.11805 [cs.CL] https:\/\/arxiv.org\/abs\/2312.11805"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.14778\/3579075.3579083"},{"key":"e_1_3_2_1_64_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2023. Attention Is All You Need. arXiv:1706.03762 [cs.CL] https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_2_1_66_1","unstructured":"Borui Wan Mingji Han Yiyao Sheng Zhichao Lai Mofan Zhang Junda Zhang Yanghua Peng Haibin Lin Xin Liu and Chuan Wu. 2024. ByteCheckpoint: A Unified Checkpointing System for LLM Development. arXiv:2407.20143 [cs.AI] https:\/\/arxiv.org\/abs\/2407.20143"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Borui Wan Gaohong Liu Zuquan Song Jun Wang Yun Zhang Guangming Sheng Shuguang Wang Houmin Wei Chenyuan Wang Weiqiang Lou Xi Yang Mofan Zhang Kaihua Jiang Cheng Ren Xiaoyun Zhi Menghan Yu Zhe Nan Zhuolin Zheng Baoquan Zhong Qinlong Wang Huan Yu Jinxin Chi Wang Zhang Yuhan Li Zixian Du Sida Zhao Yongqiang Zhang Jingzhe Tang Zherui Liu Chuan Wu Yanghua Peng Haibin Lin Wencong Xiao Xin Liu and Liang Xiang. 2025. Robust LLM Training Infrastructure at ByteDance. arXiv:2509.16293 [cs.LG] https:\/\/arxiv.org\/abs\/2509.16293","DOI":"10.1145\/3731569.3764838"},{"key":"e_1_3_2_1_68_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Yang Fan Kai Dang Mengfei Du Xuancheng Ren Rui Men Dayiheng Liu Chang Zhou Jingren Zhou and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv:2409.12191 [cs.CV] https:\/\/arxiv.org\/abs\/2409.12191"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_70_1","volume-title":"Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation. arXiv:2410.13848 [cs.CV] https:\/\/arxiv.org\/abs\/2410.13848","author":"Wu Chengyue","year":"2024","unstructured":"Chengyue Wu, Xiaokang Chen, Zhiyu Wu, Yiyang Ma, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, Chong Ruan, and Ping Luo. 2024. Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation. arXiv:2410.13848 [cs.CV] https:\/\/arxiv.org\/abs\/2410.13848"},{"key":"e_1_3_2_1_71_1","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang Zhenda Xie Yu Wu Kai Hu Jiawei Wang Yaofeng Sun Yukun Li Yishi Piao Kang Guan Aixin Liu Xin Xie Yuxiang You Kai Dong Xingkai Yu Haowei Zhang Liang Zhao Yisong Wang and Chong Ruan. 2024. DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding. arXiv:2412.10302 [cs.CV] https:\/\/arxiv.org\/abs\/2412.10302"},{"key":"e_1_3_2_1_72_1","unstructured":"Jiasheng Ye Peiju Liu Tianxiang Sun Yunhua Zhou Jun Zhan and Xipeng Qiu. 2024. Data Mixing Laws: Optimizing Data Mixtures by Predicting Language Modeling Performance. arXiv:2403.16952 [cs.CL] https:\/\/arxiv.org\/abs\/2403.16952"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.14778\/3626292.3626298"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3718958.3750472"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.14778\/3705829.3705861"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_78_1","unstructured":"Yu Zhu Wenqi Jiang and Gustavo Alonso. 2025. Multi-Tenant SmartNICs for In-Network Preprocessing of Recommender Systems. arXiv:2501.12032 [cs.AR] https:\/\/arxiv.org\/abs\/2501.12032"},{"key":"e_1_3_2_1_79_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Zu Yazhou","year":"2024","unstructured":"Yazhou Zu, Alireza Ghaffarkhah, Hoang-Vu Dang, Brian Towles, Steven Hand, Safeen Huda, Adekunle Bello, Alexander Kolbasov, Arash Rezaei, Dayou Du, Steve Lacy, Hang Wang, Aaron Wisner, Chris Lewis, and Henri Bahini. 2024. Resiliency at Scale: Managing Google's TPUv4 Machine Learning Supercomputer. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 761\u2013774. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/zu"}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:21:12Z","timestamp":1777062072000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3803568"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":79,"alternative-id":["10.1145\/3767295.3803568","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3803568","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}