{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T14:42:37Z","timestamp":1775745757931,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3694715.3695975","type":"proceedings-article","created":{"date-parts":[[2024,11,15]],"date-time":"2024-11-15T19:28:18Z","timestamp":1731698898000},"page":"195-210","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Tenplex: Dynamic Parallelism for Deep Learning using Parallelizable Tensor Collections"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0594-4138","authenticated-orcid":false,"given":"Marcel","family":"Wagenl\u00e4nder","sequence":"first","affiliation":[{"name":"Imperial College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2467-8448","authenticated-orcid":false,"given":"Guo","family":"Li","sequence":"additional","affiliation":[{"name":"Imperial College London, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0768-3444","authenticated-orcid":false,"given":"Bo","family":"Zhao","sequence":"additional","affiliation":[{"name":"Aalto University, Helsinki, Finland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3594-1092","authenticated-orcid":false,"given":"Luo","family":"Mai","sequence":"additional","affiliation":[{"name":"University of Edinburgh, Edinburgh, Scotland Uk"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6963-5640","authenticated-orcid":false,"given":"Peter","family":"Pietzuch","sequence":"additional","affiliation":[{"name":"Imperial College London, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2024,11,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, and Michael Isard. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_2_1","volume-title":"A Hybrid Parallelization Approach for Distributed and Scalable Deep Learning","author":"Akintoye Samson B.","year":"2022","unstructured":"Samson B. Akintoye, Liangxiu Han, Xin Zhang, Haoming Chen, and Daoqiang Zhang. 2022. A Hybrid Parallelization Approach for Distributed and Scalable Deep Learning. IEEE Access (2022)."},{"key":"e_1_3_2_1_3_1","unstructured":"Amazon. 2024. Cloud Object Storage - Amazon S3. https:\/\/aws.amazon.com\/pm\/serv-s3\/."},{"key":"e_1_3_2_1_4_1","volume-title":"Low-Cost Training of Massive Deep Learning Models. In Seventeenth European Conference on Computer Systems (EuroSys)","author":"Athlur Sanjith","year":"2022","unstructured":"Sanjith Athlur, Nitika Saran, Muthian Sivathanu, Ramachandran Ramjee, and Nipun Kwatra. 2022. Varuna: Scalable, Low-Cost Training of Massive Deep Learning Models. In Seventeenth European Conference on Computer Systems (EuroSys) (Rennes, France)."},{"key":"e_1_3_2_1_5_1","unstructured":"The Kubernetes Authors. 2020. Kubernetes. https:\/\/kubernetes.io."},{"key":"e_1_3_2_1_6_1","unstructured":"AWS. 2024. SageMaker Distributed Model Parallelism Best Practices. https:\/\/docs.aws.amazon.com\/sagemaker\/latest\/dg\/model-parallel-best-practices.html."},{"key":"e_1_3_2_1_7_1","volume-title":"Pathways: Asynchronous distributed dataflow for ML. Machine Learning and Systems (MLSys).","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, Jeff Dean, Sanjay Ghemawat, Steven Hand, Daniel Hurt, Michael Isard, Hyeontaek Lim, Ruoming Pang, Sudip Roy, et al. 2022. Pathways: Asynchronous distributed dataflow for ML. Machine Learning and Systems (MLSys)."},{"key":"e_1_3_2_1_8_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_10_1","volume-title":"Large Scale Distributed Deep Networks. In Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems (NeurIPS)","author":"Dean Jeffrey","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Quoc V. Le, Mark Z. Mao, Marc'Aurelio Ranzato, Andrew W. Senior, Paul A. Tucker, Ke Yang, and Andrew Y. Ng. 2012. Large Scale Distributed Deep Networks. In Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems (NeurIPS) (Lake Tahoe, Nevada, United States)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1327452.1327492"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_14_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: a Checkpointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI) (Renton, WA, USA)."},{"key":"e_1_3_2_1_15_1","volume-title":"Neural Acceleration for General-Purpose Approximate Programs. In 45th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Esmaeilzadeh Hadi","year":"2012","unstructured":"Hadi Esmaeilzadeh, Adrian Sampson, Luis Ceze, and Doug Burger. 2012. Neural Acceleration for General-Purpose Approximate Programs. In 45th Annual IEEE\/ACM International Symposium on Microarchitecture (Austin, Texas, USA)."},{"key":"e_1_3_2_1_16_1","unstructured":"Wikimedia Foundation. 2024. Wikimedia Downloads. https:\/\/dumps.wikimedia.org."},{"key":"e_1_3_2_1_17_1","volume-title":"Matthew James Johnson, and Chris Leary","author":"Frostig Roy","year":"2018","unstructured":"Roy Frostig, Matthew James Johnson, and Chris Leary. 2018. Compiling machine learning programs via high-level tracing. Systems for Machine Learning (2018)."},{"key":"e_1_3_2_1_18_1","volume-title":"ServerlessLLM: Low-Latency Serverless Inference for Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: Low-Latency Serverless Inference for Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 135--153."},{"key":"e_1_3_2_1_19_1","unstructured":"Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText Corpus. http:\/\/Skylion007.github.io\/OpenWebTextCorpus."},{"key":"e_1_3_2_1_20_1","unstructured":"Google. 2024. Google Cloud. https:\/\/cloud.google.com\/."},{"key":"e_1_3_2_1_21_1","volume-title":"Large Minibatch SGD: Training ImageNet in 1 Hour. CoRR abs\/1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross B. Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour. CoRR abs\/1706.02677 (2017). arXiv:1706.02677 http:\/\/arxiv.org\/abs\/1706.02677"},{"key":"e_1_3_2_1_22_1","volume-title":"26th International Joint Conference on Artificial Intelligence (IJCAI)","author":"Guo Huifeng","year":"2017","unstructured":"Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, and Xiuqiang He. 2017. DeepFM: a factorization-machine based neural network for CTR prediction. In 26th International Joint Conference on Artificial Intelligence (IJCAI) (Melbourne, Australia)."},{"key":"e_1_3_2_1_23_1","volume-title":"Hydrozoa: Dynamic Hybrid-Parallel DNN Training on Serverless Containers. In Machine Learning and Systems (MLSys) (Santa Clara, CA, USA).","author":"Guo Runsheng","year":"2022","unstructured":"Runsheng Guo, Victor Guo, Antonio Kim, Josh Hildred, and Khuzaima Daudjee. 2022. Hydrozoa: Dynamic Hybrid-Parallel DNN Training on Serverless Containers. In Machine Learning and Systems (MLSys) (Santa Clara, CA, USA)."},{"key":"e_1_3_2_1_24_1","volume-title":"Ralf Gommers, Pauli Virtanen, David Cournapeau, Eric Wieser, Julian Taylor","author":"Harris Charles R","year":"2020","unstructured":"Charles R Harris, K Jarrod Millman, St\u00e9fan J Van Der Walt, Ralf Gommers, Pauli Virtanen, David Cournapeau, Eric Wieser, Julian Taylor, Sebastian Berg, Nathaniel J Smith, et al. 2020. Array programming with NumPy. Nature 585 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Hayashi Tomoki","year":"2020","unstructured":"Tomoki Hayashi, Ryuichi Yamamoto, Katsuki Inoue, Takenori Yoshimura, Shinji Watanabe, Tomoki Toda, Kazuya Takeda, Yu Zhang, and Xu Tan. 2020. Espnet-TTS: Unified, Reproducible, and Integratable Open Source End-to-End Text-to-Speech Toolkit. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_26_1","volume-title":"Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (Las Vegas, NV, USA)."},{"key":"e_1_3_2_1_27_1","unstructured":"Marius Hobbhahn and Tamay Besiroglu. 2022. Trends in GPU Price-Performance. https:\/\/epochai.org\/blog\/trends-in-gpu-price-performance"},{"key":"e_1_3_2_1_28_1","unstructured":"Horovod. 2024. Elastic Horovod. https:\/\/horovod.readthedocs.io\/en\/latest\/elastic_include.html."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems (NeurIPS)","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Xu Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems (NeurIPS) (Vancouver, BC, Canada)."},{"key":"e_1_3_2_1_30_1","volume-title":"Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates. In 29th Symposium on Operating Systems Principles, (SOSP)","author":"Jang Insu","year":"2023","unstructured":"Insu Jang, Zhenning Yang, Zhen Zhang, Xin Jin, and Mosharaf Chowdhury. 2023. Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates. In 29th Symposium on Operating Systems Principles, (SOSP) (Koblenz, Germany)."},{"key":"e_1_3_2_1_31_1","volume-title":"29th Symposium on Operating Systems Principles (SOSP)","author":"Subramanya Suhas Jayaram","unstructured":"Suhas Jayaram Subramanya, Daiyaan Arfeen, Shouxu Lin, Aurick Qiao, Zhihao Jia, and Gregory R. Ganger. 2023. Sia: Heterogeneity-aware, goodput-optimized ML-cluster scheduling. In 29th Symposium on Operating Systems Principles (SOSP) (Koblenz, Germany)."},{"key":"e_1_3_2_1_32_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In USENIX Annual Technical Conference (ATC)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In USENIX Annual Technical Conference (ATC) (Renton, WA, USA)."},{"key":"e_1_3_2_1_33_1","volume-title":"15th Workshop on Hot Topics in Operating Systems (HotOS)","author":"Jeong Joo Seong","year":"2015","unstructured":"Joo Seong Jeong, Woo-Yeon Lee, Yunseong Lee, Youngseok Yang, Brian Cho, and Byung-Gon Chun. 2015. Elastic Memory: Bring Elasticity Back to In-Memory Big Data Analytics. In 15th Workshop on Hot Topics in Operating Systems (HotOS) (Kartause Ittingen, Switzerland)."},{"key":"e_1_3_2_1_34_1","volume-title":"Beyond Data and Model Parallelism for Deep Neural Networks. Machine Learning and Systems (MLSys)","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. 2019. Beyond Data and Model Parallelism for Deep Neural Networks. Machine Learning and Systems (MLSys) (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_36_1","volume-title":"ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems (NeurIPS)","author":"Krizhevsky Alex","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems 25: 26th Annual Conference on Neural Information Processing Systems (NeurIPS) (Lake Tahoe, Nevada, United States)."},{"key":"e_1_3_2_1_37_1","volume-title":"Automating System Configuration of Distributed Machine Learning. In IEEE 39th International Conference on Distributed Computing Systems (ICDCS)","author":"Lee Woo-Yeon","year":"2019","unstructured":"Woo-Yeon Lee, Yunseong Lee, Joo Seong Jeong, Gyeong-In Yu, Joo Yeon Kim, Ho Jin Park, Beomyeol Jeon, Wonwook Song, Gunhee Kim, Markus Weimer, Brian Cho, and Byung-Gon Chun. 2019. Automating System Configuration of Distributed Machine Learning. In IEEE 39th International Conference on Distributed Computing Systems (ICDCS) (Dallas, TX, USA)."},{"key":"e_1_3_2_1_38_1","volume-title":"Aryl: An Elastic Cluster Scheduler for Deep Learning. arXiv:2202.07896 [cs.DC] https:\/\/arxiv.org\/abs\/2202.07896","author":"Li Jiamin","year":"2022","unstructured":"Jiamin Li, Hong Xu, Yibo Zhu, Zherui Liu, Chuanxiong Guo, and Cong Wang. 2022. Aryl: An Elastic Cluster Scheduler for Deep Learning. arXiv:2202.07896 [cs.DC] https:\/\/arxiv.org\/abs\/2202.07896"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_2_1_40_1","volume-title":"EasyScale: Elastic Training with Consistent Accuracy and Improved Utilization on GPUs. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC)","author":"Li Mingzhen","year":"2023","unstructured":"Mingzhen Li, Wencong Xiao, Hailong Yang, Biao Sun, Hanyu Zhao, Shiru Ren, Zhongzhi Luan, Xianyan Jia, Yi Liu, Yong Li, Wei Lin, and Depei Qian. 2023. EasyScale: Elastic Training with Consistent Accuracy and Improved Utilization on GPUs. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC) (Denver, CO, USA)."},{"key":"e_1_3_2_1_41_1","volume-title":"Sequence Parallelism: Long Sequence Training from System Perspective. arXiv:2105.13120 [cs.LG] https:\/\/arxiv.org\/abs\/2105.13120","author":"Li Shenggui","year":"2022","unstructured":"Shenggui Li, Fuzhao Xue, Chaitanya Baranwal, Yongbin Li, and Yang You. 2022. Sequence Parallelism: Long Sequence Training from System Perspective. arXiv:2105.13120 [cs.LG] https:\/\/arxiv.org\/abs\/2105.13120"},{"key":"e_1_3_2_1_42_1","volume-title":"PyTorch distributed: experiences on accelerating data parallel training. VLDB Endowment","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, and Soumith Chintala. 2020. PyTorch distributed: experiences on accelerating data parallel training. VLDB Endowment (2020)."},{"key":"e_1_3_2_1_43_1","volume-title":"KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Mai Luo","year":"2020","unstructured":"Luo Mai, Guo Li, Marcel Wagenl\u00e4nder, Konstantinos Fertakis, Andrei-Octavian Brabete, and Peter Pietzuch. 2020. KungFu: Making Training in Distributed Machine Learning Adaptive. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_44_1","unstructured":"Microsoft. 2024. Microsoft Azure. https:\/\/azure.microsoft.com\/."},{"key":"e_1_3_2_1_45_1","unstructured":"MindSpore. 2020. Mindspore Deep Learning Training\/Inference Framework. https:\/\/github.com\/mindspore-ai\/mindspore."},{"key":"e_1_3_2_1_46_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST).","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST)."},{"key":"e_1_3_2_1_47_1","volume-title":"Ray: A Distributed Framework for Emerging AI Applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I. Jordan, and Ion Stoica. 2018. Ray: A Distributed Framework for Emerging AI Applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI) (Carlsbad, CA, USA)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_50_1","volume-title":"DeepFreeze: Towards Scalable Asynchronous Checkpointing of Deep Learning Models. In 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)","author":"Nicolae Bogdan","year":"2020","unstructured":"Bogdan Nicolae, Jiali Li, Justin M. Wozniak, George Bosilca, Matthieu Dorier, and Franck Cappello. 2020. DeepFreeze: Towards Scalable Asynchronous Checkpointing of Deep Learning Models. In 20th IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID) (Melbourne, Australia)."},{"key":"e_1_3_2_1_51_1","unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt. (2022)."},{"key":"e_1_3_2_1_52_1","unstructured":"Andrew Or Haoyu Zhang and Michael None Freedman. 2022. VirtualFlow: Decoupling Deep Learning Models from the Underlying Hardware. In Machine Learning and Systems (MLSys) (Santa Clara CA USA)."},{"key":"e_1_3_2_1_53_1","unstructured":"Seo Jin Park Joshua Fried Sunghyun Kim Mohammad Alizadeh and Adam Belay. 2022. Efficient Strong Scaling Through Burst Parallel Training. In Machine Learning and Systems (MLSys) (Santa Clara CA USA)."},{"key":"e_1_3_2_1_54_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems (NeurIPS) (Vancouver, BC, Canada)."},{"key":"e_1_3_2_1_55_1","volume-title":"Optimus: An Efficient Dynamic Resource Scheduler for Deep Learning Clusters. In Thirteenth EuroSys Conference (EuroSys)","author":"Peng Yanghua","year":"2018","unstructured":"Yanghua Peng, Yixin Bao, Yangrui Chen, Chuan Wu, and Chuanxiong Guo. 2018. Optimus: An Efficient Dynamic Resource Scheduler for Deep Learning Clusters. In Thirteenth EuroSys Conference (EuroSys) (Porto, Portugal)."},{"key":"e_1_3_2_1_56_1","unstructured":"Shashank Prasanna. 2019. Train Deep Learning Models on GPUs using Amazon EC2 Spot Instances. https:\/\/aws.amazon.com\/blogs\/machine-learning\/train-deep-learning-models-on-gpus-using-amazon-ec2-spot-instances\/."},{"key":"e_1_3_2_1_57_1","unstructured":"PyTorch. 2024. Torch Elastic. https:\/\/pytorch.org\/elastic\/latest\/."},{"key":"e_1_3_2_1_58_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_59_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. https:\/\/cdn.openai.com\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf. (2019)."},{"key":"e_1_3_2_1_60_1","volume-title":"26th Annual International Conference on Machine Learning (ICML) (Montreal","author":"Raina Rajat","unstructured":"Rajat Raina, Anand Madhavan, and Andrew Y. Ng. 2009. Large-scale deep unsupervised learning using graphics processors. In 26th Annual International Conference on Machine Learning (ICML) (Montreal, Quebec, Canada)."},{"key":"e_1_3_2_1_61_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In International Conference on Machine Learning (ICML) (Baltimore, Maryland, USA)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_64_1","volume-title":"XLA: Compiling Machine Learning for Peak Performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne. 2020. XLA: Compiling Machine Learning for Peak Performance."},{"key":"e_1_3_2_1_65_1","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv:1802.05799 [cs.LG] https:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_66_1","unstructured":"Chris Shallue and George Dahl. 2019. Measuring the Limits of Data Parallel Training for Neural Networks. https:\/\/blog.research.google\/2019\/03\/measuring-limits-of-data-parallel.html."},{"key":"e_1_3_2_1_67_1","unstructured":"Noam Shazeer Azalia Mirhoseini Krzysztof Maziarz Andy Davis Quoc Le Geoffrey Hinton and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. arXiv:1701.06538 [cs.LG] https:\/\/arxiv.org\/abs\/1701.06538"},{"key":"e_1_3_2_1_68_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv:1909.08053 [cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_69_1","volume-title":"Singularity: Planet-Scale, Preemptive and Elastic Scheduling of AI Workloads. arXiv:2202.07848 [cs.DC] https:\/\/arxiv.org\/abs\/2202.07848","author":"Shukla Dharma","year":"2022","unstructured":"Dharma Shukla, Muthian Sivathanu, Srinidhi Viswanatha, Bhargav Gulavani, Rimma Nehme, Amey Agrawal, Chen Chen, Nipun Kwatra, Ramachandran Ramjee, Pankaj Sharma, et al. 2022. Singularity: Planet-Scale, Preemptive and Elastic Scheduling of AI Workloads. arXiv:2202.07848 [cs.DC] https:\/\/arxiv.org\/abs\/2202.07848"},{"key":"e_1_3_2_1_70_1","volume-title":"Ekko: A Large-Scale Deep Learning Recommender System with Low-Latency Model Update. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Sima Chijun","year":"2022","unstructured":"Chijun Sima, Yao Fu, Man-Kit Sit, Liyi Guo, Xuri Gong, Feng Lin, Junyu Wu, Yongsheng Li, Haidong Rong, Pierre-Louis Aublin, and Luo Mai. 2022. Ekko: A Large-Scale Deep Learning Recommender System with Low-Latency Model Update. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22) (Carlsbad, CA)."},{"key":"e_1_3_2_1_71_1","unstructured":"Shaden Smith Mostofa Patwary Brandon Norick Patrick LeGresley Samyam Rajbhandari Jared Casper Zhun Liu Shrimai Prabhumoye George Zerveas Vijay Korthikanti et al. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B A Large-Scale Generative Language Model. arXiv:2201.11990 [cs.CL] https:\/\/arxiv.org\/abs\/2201.11990"},{"key":"e_1_3_2_1_72_1","volume-title":"Le","author":"Smith Samuel L.","year":"2018","unstructured":"Samuel L. Smith and Quoc V. Le. 2018. A Bayesian Perspective on Generalization and Stochastic Gradient Descent. arXiv:1710.06451 [cs.LG] https:\/\/arxiv.org\/abs\/1710.06451"},{"key":"e_1_3_2_1_73_1","volume-title":"Rethinking the Inception Architecture for Computer Vision. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Szegedy Christian","year":"2016","unstructured":"Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. 2016. Rethinking the Inception Architecture for Computer Vision. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (Las Vegas, NV, USA)."},{"key":"e_1_3_2_1_74_1","volume-title":"Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Patrick S. McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. 2022. Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI) (Carlsbad, CA, USA)."},{"key":"e_1_3_2_1_75_1","volume-title":"15th USENIX Conference on File and Storage Technologies (FAST)","author":"Reddy Vangoor Bharath Kumar","year":"2017","unstructured":"Bharath Kumar Reddy Vangoor, Vasily Tarasov, and Erez Zadok. 2017. To FUSE or Not to FUSE: Performance of User-Space File Systems. In 15th USENIX Conference on File and Storage Technologies (FAST) (Santa Clara, CA, USA)."},{"key":"e_1_3_2_1_76_1","volume-title":"Spotnik: Designing Distributed Machine Learning for Transient Cloud Resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)","author":"Wagenl\u00e4nder Marcel","year":"2020","unstructured":"Marcel Wagenl\u00e4nder, Luo Mai, Guo Li, and Peter Pietzuch. 2020. Spotnik: Designing Distributed Machine Learning for Transient Cloud Resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)."},{"key":"e_1_3_2_1_77_1","volume-title":"GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In 29th Symposium on Operating Systems Principles (SOSP)","author":"Wang Zhuang","year":"2023","unstructured":"Zhuang Wang, Zhen Jia, Shuai Zheng, Zhen Zhang, Xinwei Fu, T. S. Eugene Ng, and Yida Wang. 2023. GEMINI: Fast Failure Recovery in Distributed Training with In-Memory Checkpoints. In 29th Symposium on Operating Systems Principles (SOSP) (Koblenz, Germany)."},{"key":"e_1_3_2_1_78_1","volume-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI) (Renton, WA, USA)."},{"key":"e_1_3_2_1_79_1","volume-title":"Beware of Fragmentation: Scheduling GPU-Sharing Workloads with Fragmentation Gradient Descent. In USENIX Annual Technical Conference (ATC)","author":"Weng Qizhen","year":"2023","unstructured":"Qizhen Weng, Lingyun Yang, Yinghao Yu, Wei Wang, Xiaochuan Tang, Guodong Yang, and Liping Zhang. 2023. Beware of Fragmentation: Scheduling GPU-Sharing Workloads with Fragmentation Gradient Descent. In USENIX Annual Technical Conference (ATC) (Boston, MA)."},{"key":"e_1_3_2_1_80_1","volume-title":"Elastic Deep Learning in Multi-Tenant GPU Clusters","author":"Wu Yidi","year":"2022","unstructured":"Yidi Wu, Kaihao Ma, Xiao Yan, Zhi Liu, Zhenkun Cai, Yuzhen Huang, James Cheng, Han Yuan, and Fan Yu. 2022. Elastic Deep Learning in Multi-Tenant GPU Clusters. IEEE Transactions on Parallel and Distributed Systems (TPDS) (2022)."},{"key":"e_1_3_2_1_81_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI) (Carlsbad, CA, USA)."},{"key":"e_1_3_2_1_82_1","volume-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_83_1","volume-title":"9th USENIX Conference on Networked Systems Design and Implementation (NSDI)","author":"Zaharia Matei","year":"2012","unstructured":"Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauley, Michael J. Franklin, Scott Shenker, and Ion Stoica. 2012. Resilient distributed datasets: a fault-tolerant abstraction for in-memory cluster computing. In 9th USENIX Conference on Networked Systems Design and Implementation (NSDI) (San Jose, CA, USA)."},{"key":"e_1_3_2_1_84_1","volume-title":"Accelerating Large-Scale Distributed Neural Network Training with SPMD Parallelism. In 13th Symposium on Cloud Computing (SoCC)","author":"Zhang Shiwei","year":"2022","unstructured":"Shiwei Zhang, Lansong Diao, Chuan Wu, Siyu Wang, and Wei Lin. 2022. Accelerating Large-Scale Distributed Neural Network Training with SPMD Parallelism. In 13th Symposium on Cloud Computing (SoCC) (San Francisco, California, USA)."},{"key":"e_1_3_2_1_85_1","volume-title":"Goldminer: Elastic scaling of training data pre-processing pipelines for deep learning. ACM on Management of Data","author":"Zhao Hanyu","year":"2023","unstructured":"Hanyu Zhao, Zhi Yang, Yu Cheng, Chao Tian, Shiru Ren, Wencong Xiao, Man Yuan, Langshi Chen, Kaibo Liu, Yang Zhang, et al. 2023. Goldminer: Elastic scaling of training data pre-processing pipelines for deep learning. ACM on Management of Data (2023)."},{"key":"e_1_3_2_1_86_1","volume-title":"Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI) (Carlsbad, CA, USA)."},{"key":"e_1_3_2_1_87_1","volume-title":"Deep Interest Network for Click-Through Rate Prediction. In 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (SIGKDD)","author":"Zhou Guorui","year":"2018","unstructured":"Guorui Zhou, Xiaoqiang Zhu, Chengru Song, Ying Fan, Han Zhu, Xiao Ma, Yanghui Yan, Junqi Jin, Han Li, and Kun Gai. 2018. Deep Interest Network for Click-Through Rate Prediction. In 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (SIGKDD) (London, UK)."}],"event":{"name":"SOSP '24: ACM SIGOPS 30th Symposium on Operating Systems Principles","location":"Austin TX USA","acronym":"SOSP '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695975","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694715.3695975","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:48Z","timestamp":1750291548000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695975"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":87,"alternative-id":["10.1145\/3694715.3695975","10.1145\/3694715"],"URL":"https:\/\/doi.org\/10.1145\/3694715.3695975","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}