{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:31:38Z","timestamp":1773588698153,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["62372287"],"award-info":[{"award-number":["62372287"]}]},{"name":"National Natural Science Foundation of China","award":["U24A20235"],"award-info":[{"award-number":["U24A20235"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790154","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"618-632","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DIP: Efficient Large Multimodal Model Training with Dynamic Interleaved Pipeline"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4904-0078","authenticated-orcid":false,"given":"Zhenliang","family":"Xue","sequence":"first","affiliation":[{"name":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5787-5226","authenticated-orcid":false,"given":"Hanpeng","family":"Hu","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6453-5026","authenticated-orcid":false,"given":"Xing","family":"Chen","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0049-873X","authenticated-orcid":false,"given":"Yimin","family":"Jiang","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4605-7382","authenticated-orcid":false,"given":"Yixin","family":"Song","sequence":"additional","affiliation":[{"name":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8395-1319","authenticated-orcid":false,"given":"Zeyu","family":"Mi","sequence":"additional","affiliation":[{"name":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9113-2660","authenticated-orcid":false,"given":"Yibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6657-5806","authenticated-orcid":false,"given":"Daxin","family":"Jiang","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6558-5298","authenticated-orcid":false,"given":"Yubin","family":"Xia","sequence":"additional","affiliation":[{"name":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9720-0361","authenticated-orcid":false,"given":"Haibo","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/357456.357458"},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2025. Qwen2.5-VL Technical Report. arXiv. arXiv:2502.13923 [cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_2_1_3_1","unstructured":"Minwoo Byeon Beomhee Park Haecheon Kim Sungjun Lee Woonhyuk Baek et al. 2022. COYO-700M: Image-Text Pair Dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Lin Chen Xilin Wei Jinsong Li Xiaoyi Dong Pan Zhang et al. 2024. ShareGPT4Video: Improving Video Understanding and Generation with Better Captions. arXiv. arXiv:2406.04325 [cs.CV] https:\/\/arxiv.org\/abs\/2406.04325","DOI":"10.52202\/079017-0614"},{"key":"e_1_3_2_1_5_1","volume-title":"SPPO: Efficient Long-Sequence LLM Training via Adaptive Sequence Pipeline Parallel Offloading. arXiv:2503.10377 [cs.DC] https:\/\/arxiv.org\/abs\/2503.10377","author":"Chen Qiaoling","year":"2025","unstructured":"Qiaoling Chen, Shenggui Li, Wei Gao, Peng Sun, Yonggang Wen, et al., 2025. SPPO: Efficient Long-Sequence LLM Training via Adaptive Sequence Pipeline Parallel Offloading. arXiv:2503.10377 [cs.DC] https:\/\/arxiv.org\/abs\/2503.10377"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 37th International Conference on Machine Learning (ICML'20). JMLR.org, Virtual conference, Article 149","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A Simple Framework for Contrastive Learning of Visual Representations. In Proceedings of the 37th International Conference on Machine Learning (ICML'20). JMLR.org, Virtual conference, Article 149, 11 pages."},{"key":"e_1_3_2_1_7_1","unstructured":"Xiaowei Chi Yatian Wang Aosong Cheng Pengjun Fang Zeyue Tian et al. 2024. MMTrail: A Multimodal Trailer Video Dataset with Language and Music Descriptions. arXiv. arXiv:2407.20962 [cs.CV] https:\/\/arxiv.org\/abs\/2407.20962"},{"key":"e_1_3_2_1_8_1","volume-title":"Chaoqun Liu, Maojia Song, et al.","author":"Chia Yew Ken","year":"2024","unstructured":"Yew Ken Chia, Liying Cheng, Hou Pong Chan, Chaoqun Liu, Maojia Song, et al., 2024. M-Longdoc: A Benchmark For Multimodal Super-Long Document Understanding And A Retrieval-Aware Tuning Framework. arXiv. arXiv:2411.06176 [cs.CL] https:\/\/arxiv.org\/abs\/2411.06176"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/1777826.1777833"},{"key":"e_1_3_2_1_10_1","unstructured":"DeepMind. 2025. Gemma 3. https:\/\/blog.google\/technology\/developers\/gemma-3\/."},{"key":"e_1_3_2_1_11_1","unstructured":"DeepSeek-AI Aixin Liu Bei Feng Bing Xue Bingxuan Wang et al. 2025. DeepSeek-V3 Technical Report. arXiv:2412.19437 [cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_2_1_12_1","unstructured":"Mostafa Dehghani Josip Djolonga Basil Mustafa Piotr Padlewski Jonathan Heek et al. 2023. Scaling Vision Transformers to 22 Billion Parameters. arXiv. arXiv:2302.05442 [cs.CV] https:\/\/arxiv.org\/abs\/2302.05442"},{"key":"e_1_3_2_1_13_1","unstructured":"Z3 developers. 2025. The Z3 Theorem Prover. https:\/\/github.com\/Z3Prover\/z3."},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, et al., 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_15_1","first-page":"161","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Feng Weiqi","year":"2025","unstructured":"Weiqi Feng, Yangrui Chen, Shaoyu Wang, Yanghua Peng, Haibin Lin, et al., 2025. Optimus: Accelerating Large-Scale Multi-Modal LLM Training by Bubble Exploitation. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). USENIX Association, Boston, MA, USA, 161-178."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695960"},{"key":"e_1_3_2_1_17_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_18_1","unstructured":"Ailin Huang Boyong Wu Bruce Wang Chao Yan Chen Hu et al. 2025. Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction. arXiv. arXiv:2502.11946 [cs.CL] https:\/\/arxiv.org\/abs\/2502.11946"},{"key":"e_1_3_2_1_19_1","first-page":"1157","volume-title":"DISTMM: Accelerating Distributed Multimodal Model Training. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Huang Jun","year":"2024","unstructured":"Jun Huang, Zhen Zhang, Shuai Zheng, Feng Qin, and Yida Wang. 2024b. DISTMM: Accelerating Distributed Multimodal Model Training. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 1157-1171. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/huang"},{"key":"e_1_3_2_1_20_1","unstructured":"Minbin Huang Yanxin Long Xinchi Deng Ruihang Chu Jiangfeng Xiong et al. 2024a. DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation. arXiv. arXiv:2403.08857 [cs.CV] https:\/\/arxiv.org\/abs\/2403.08857"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1460833.1460872"},{"key":"e_1_3_2_1_22_1","volume-title":"Cornstarch: Distributed Multimodal Training Must Be Multimodality-Aware. arXiv. arXiv:2503.11367 [cs.DC] https:\/\/arxiv.org\/abs\/2503.11367","author":"Jang Insu","year":"2025","unstructured":"Insu Jang, Runyu Lu, Nikhil Bansal, Ang Chen, and Mosharaf Chowdhury. 2025. Cornstarch: Distributed Multimodal Training Must Be Multimodality-Aware. arXiv. arXiv:2503.11367 [cs.DC] https:\/\/arxiv.org\/abs\/2503.11367"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707220"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629585"},{"key":"e_1_3_2_1_25_1","volume-title":"OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. arXiv. arXiv:2306.16527 [cs.IR] https:\/\/arxiv.org\/abs\/2306.16527","author":"Lauren\u00e7on Hugo","year":"2023","unstructured":"Hugo Lauren\u00e7on, Lucile Saulnier, L\u00e9o Tronchon, Stas Bekman, Amanpreet Singh, et al., 2023. OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. arXiv. arXiv:2306.16527 [cs.IR] https:\/\/arxiv.org\/abs\/2306.16527"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_27_1","unstructured":"Zhimin Li Jianwei Zhang Qin Lin Jiangfeng Xiong Yanxin Long et al. 2024. Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding. arXiv. arXiv:2405.08748 [cs.CV] https:\/\/arxiv.org\/abs\/2405.08748"},{"key":"e_1_3_2_1_28_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lin Zhiqi","year":"2024","unstructured":"Zhiqi Lin, Youshan Miao, Quanlu Zhang, Fan Yang, Yi Zhu, et al., 2024. nnScaler: Constraint-Guided Parallelization Plan Generation for Deep Learning Training. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607073"},{"key":"e_1_3_2_1_30_1","volume-title":"The 36th Conference on Neural Information Processing Systems (NeurIPS).","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tony Xia, Liang Qiu, Kai-Wei Chang, et al., 2022. Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering. In The 36th Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_31_1","volume-title":"Technical Report: The Practice, Challenges, and Future of Video Foundation Model. arXiv. arXiv:2502.10248 [cs.CV] https:\/\/arxiv.org\/abs\/2502.10248","author":"Ma Guoqing","year":"2025","unstructured":"Guoqing Ma, Haoyang Huang, Kun Yan, Liangyu Chen, Nan Duan, et al., 2025. Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model. arXiv. arXiv:2502.10248 [cs.CV] https:\/\/arxiv.org\/abs\/2502.10248"},{"key":"e_1_3_2_1_32_1","unstructured":"OpenAI. 2024. GPT-4o System Card. https:\/\/openai.com\/index\/gpt-4o-system-card\/."},{"key":"e_1_3_2_1_33_1","unstructured":"OpenAI. 2025. Introducing 4o Image Generation. https:\/\/openai.com\/index\/introducing-4o-image-generation\/."},{"key":"e_1_3_2_1_34_1","unstructured":"Gurobi Optimization. 2025. Gurobi. https:\/\/www.gurobi.com\/."},{"key":"e_1_3_2_1_35_1","volume-title":"Movie Gen: A Cast of Media Foundation Models. arXiv. arXiv:2410.13720 [cs.CV] https:\/\/arxiv.org\/abs\/2410.13720","author":"Polyak Adam","year":"2024","unstructured":"Adam Polyak, Amit Zohar, Andrew Brown, Andros Tjandra, Animesh Sinha, et al., 2024. Movie Gen: A Cast of Media Foundation Models. arXiv. arXiv:2410.13720 [cs.CV] https:\/\/arxiv.org\/abs\/2410.13720"},{"key":"e_1_3_2_1_36_1","volume-title":"Zero Bubble Pipeline Parallelism. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=tuzTN0eIO5","author":"Qi Penghui","year":"2024","unstructured":"Penghui Qi, Xinyi Wan, Guangxing Huang, and Min Lin. 2024. Zero Bubble Pipeline Parallelism. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=tuzTN0eIO5"},{"key":"e_1_3_2_1_37_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, et al., 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv. arXiv:2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, et al., 2022. LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 1833, 17 pages."},{"key":"e_1_3_2_1_39_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper et al. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv. arXiv:1909.08053 [cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_40_1","volume-title":"Modalities: Survey on Multimodal Large Language Model. arXiv. arXiv:2311.07594 [cs.CL] https:\/\/arxiv.org\/abs\/2311.07594","author":"Song Shezheng","year":"2025","unstructured":"Shezheng Song, Xiaopeng Li, Shasha Li, Shan Zhao, Jie Yu, et al., 2025. How to Bridge the Gap between Modalities: Survey on Multimodal Large Language Model. arXiv. arXiv:2311.07594 [cs.CL] https:\/\/arxiv.org\/abs\/2311.07594"},{"key":"e_1_3_2_1_41_1","unstructured":"HiGHS Team. 2025. HiGHS: Linear Optimization Software. https:\/\/github.com\/ERGO-Code\/HiGHS."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of Machine Learning and Systems.","author":"Tian Ye","year":"2024","unstructured":"Ye Tian, Zhen Jia, Ziyue Luo, Yida Wang, and Chuan Wu. 2024. DiffusionPipe: Training Large Diffusion Models with Efficient Pipelines. In Proceedings of Machine Learning and Systems."},{"key":"e_1_3_2_1_43_1","volume-title":"Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, et al., 2022. Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)."},{"key":"e_1_3_2_1_44_1","volume-title":"Conference on Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, et al., 2017. Attention is All You Need. In Conference on Neural Information Processing Systems (Long Beach, California, USA) (NIPS'17). Curran Associates Inc., Red Hook, NY, USA, 6000\u20136010."},{"key":"e_1_3_2_1_45_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan et al. 2024a. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv. arXiv:2409.12191 [cs.CV] https:\/\/arxiv.org\/abs\/2409.12191"},{"key":"e_1_3_2_1_46_1","unstructured":"Yi Wang Yinan He Yizhuo Li Kunchang Li Jiashuo Yu et al. 2024b. InternVid: A Large-Scale Video-Text Dataset for Multimodal Understanding and Generation. arXiv. arXiv:2307.06942 [cs.CV] https:\/\/arxiv.org\/abs\/2307.06942"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3715992"},{"key":"e_1_3_2_1_48_1","volume-title":"19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)","author":"Wang Zheng","year":"2025","unstructured":"Zheng Wang, Anna Cai, Xinfeng Xie, Zaifeng Pan, Yue Guan, et al., 2025a. WLB-LLM: Workload-Balanced 4D Parallelism for Large Language Model Training. In 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_50_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu et al. 2024. Qwen2 Technical Report. arXiv. arXiv:2407.10671 [cs.CL] https:\/\/arxiv.org\/abs\/2407.10671"},{"key":"e_1_3_2_1_51_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng et al. 2025. Qwen2.5 Technical Report. arXiv. arXiv:2412.15115 [cs.CL] https:\/\/arxiv.org\/abs\/2412.15115"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00042"},{"key":"e_1_3_2_1_53_1","unstructured":"Zili Zhang Yinmin Zhong Ranchen Ming Hanpeng Hu Jianjian Sun et al. 2024. DistTrain: Addressing Model and Data Heterogeneity with Disaggregated Training for Multimodal Large Language Models. arXiv. arXiv:2408.04275 [cs.DC] https:\/\/arxiv.org\/abs\/2408.04275"},{"key":"e_1_3_2_1_54_1","first-page":"143","volume-title":"FlexPipe: Maximizing Training Efficiency for Transformer-Based Models with Variable-Length Inputs. In 2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Zhao Hairui","year":"2025","unstructured":"Hairui Zhao, Qi Tian, Hongliang Li, and Zizhong Chen. 2025. FlexPipe: Maximizing Training Efficiency for Transformer-Based Models with Variable-Length Inputs. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). USENIX Association, Boston, MA, USA, 143-159."},{"key":"e_1_3_2_1_55_1","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang et al. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv. arXiv:2304.11277 [cs.DC] https:\/\/arxiv.org\/abs\/2304.11277"},{"key":"e_1_3_2_1_56_1","volume-title":"Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, et al., 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)."},{"key":"e_1_3_2_1_57_1","unstructured":"Yijie Zheng Bangjun Xiao Lei Shi Xiaoyang Li Faming Wu et al. 2025. Orchestrate Multimodal Data with Batch Post-Balancing to Accelerate Multimodal Large Language Model Training. arXiv. arXiv:2503.23830 [cs.DC] https:\/\/arxiv.org\/abs\/2503.23830"},{"key":"e_1_3_2_1_58_1","volume-title":"Jesse Dodge, et al.","author":"Zhu Wanrong","year":"2023","unstructured":"Wanrong Zhu, Jack Hessel, Anas Awadalla, Samir Yitzhak Gadre, Jesse Dodge, et al., 2023. Multimodal C4: An Open, Billion-Scale Corpus of Images Interleaved with Text. arXiv. arXiv:2304.06939 [cs.CV] https:\/\/arxiv.org\/abs\/2304.06939"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:07:18Z","timestamp":1773583638000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":58,"alternative-id":["10.1145\/3779212.3790154","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790154","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}