{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T15:54:01Z","timestamp":1776182041759,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,5,8]],"date-time":"2023-05-08T00:00:00Z","timestamp":1683504000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Research Grants Council of Hong Kong","award":["N_CityU139\/21"],"award-info":[{"award-number":["N_CityU139\/21"]}]},{"name":"Research Grants Council of Hong Kong","award":["C2004-21GF"],"award-info":[{"award-number":["C2004-21GF"]}]},{"name":"Research Grants Council of Hong Kong","award":["R1012-21"],"award-info":[{"award-number":["R1012-21"]}]},{"name":"Research Grants Council of Hong Kong","award":["R6021-20F"],"award-info":[{"award-number":["R6021-20F"]}]},{"name":"Research Grants Council of Hong Kong","award":["11209520"],"award-info":[{"award-number":["11209520"]}]},{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["4937007"],"award-info":[{"award-number":["4937007"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["4937008"],"award-info":[{"award-number":["4937008"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["5501329"],"award-info":[{"award-number":["5501329"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["5501517"],"award-info":[{"award-number":["5501517"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004853","name":"Chinese University of Hong Kong","doi-asserted-by":"publisher","award":["8601677"],"award-info":[{"award-number":["8601677"]}],"id":[{"id":"10.13039\/501100004853","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,5,8]]},"DOI":"10.1145\/3552326.3587445","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:33:02Z","timestamp":1683307982000},"page":"835-850","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":51,"title":["Lyra: Elastic Scheduling for Deep Learning Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8110-2436","authenticated-orcid":false,"given":"Jiamin","family":"Li","sequence":"first","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9359-9571","authenticated-orcid":false,"given":"Hong","family":"Xu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9113-2660","authenticated-orcid":false,"given":"Yibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"Google, Kirkland, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4910-6095","authenticated-orcid":false,"given":"Zherui","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0730-8468","authenticated-orcid":false,"given":"Chuanxiong","family":"Guo","sequence":"additional","affiliation":[{"name":"Non affiliated, Bellevue, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0547-315X","authenticated-orcid":false,"given":"Cong","family":"Wang","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2023,5,8]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2966884.2966912"},{"key":"e_1_3_2_1_2_1","volume-title":"Proc. USENIX OSDI.","author":"Bai Zhihao","year":"2020","unstructured":"Zhihao Bai , Zhen Zhang , Yibo Zhu , and Xin Jin . 2020 . PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications . In Proc. USENIX OSDI. Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. USENIX OSDI.","author":"Chilimbi Trishul","year":"2014","unstructured":"Trishul Chilimbi , Yutaka Suzue , Johnson Apacible , and Karthik Kalyanaraman . 2014 . Project adam: Building an efficient and scalable deep learning training system . In Proc. USENIX OSDI. Trishul Chilimbi, Yutaka Suzue, Johnson Apacible, and Karthik Kalyanaraman. 2014. Project adam: Building an efficient and scalable deep learning training system. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Edward G. Coffman  Jr. J\u00e1nos Csirik G\u00e1bor Galambos Silvano Martello and Daniele Vigo. 2013. Bin Packing Approximation Algorithms: Survey and Classification. In Handbook of Combinatorial Optimization.  Edward G. Coffman Jr. J\u00e1nos Csirik G\u00e1bor Galambos Silvano Martello and Daniele Vigo. 2013. Bin Packing Approximation Algorithms: Survey and Classification. In Handbook of Combinatorial Optimization.","DOI":"10.1007\/978-1-4419-7997-1_35"},{"key":"e_1_3_2_1_7_1","volume-title":"Proc. USENIX NSDI.","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw , Xin Wang , Guilio Zhou , Michael J Franklin , Joseph E Gonzalez , and Ion Stoica . 2017 . Clipper: A low-latency online prediction serving system . In Proc. USENIX NSDI. Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J Franklin, Joseph E Gonzalez, and Ion Stoica. 2017. Clipper: A low-latency online prediction serving system. In Proc. USENIX NSDI."},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2018 . Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805 Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Samuel Eilon and IG Chowdhury. 1977. Minimising waiting time variance in the single machine problem.  Samuel Eilon and IG Chowdhury. 1977. Minimising waiting time variance in the single machine problem.","DOI":"10.1287\/mnsc.23.6.567"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.5555\/646379.689529"},{"key":"e_1_3_2_1_11_1","unstructured":"Priya Goyal Piotr Doll\u00e1r Ross Girshick Pieter Noordhuis Lukasz Wesolowski Aapo Kyrola Andrew Tulloch Yangqing Jia and Kaiming He. 2017. Accurate large minibatch sgd: Training imagenet in 1 hour. arXiv:1706.02677  Priya Goyal Piotr Doll\u00e1r Ross Girshick Pieter Noordhuis Lukasz Wesolowski Aapo Kyrola Andrew Tulloch Yangqing Jia and Kaiming He. 2017. Accurate large minibatch sgd: Training imagenet in 1 hour. arXiv:1706.02677"},{"key":"e_1_3_2_1_12_1","volume-title":"Proc. USENIX NSDI.","author":"Grandl Robert","year":"2016","unstructured":"Robert Grandl , Srikanth Kandula , Sriram Rao , Aditya Akella , and Janardhan Kulkarni . 2016 . GRAPHENE: Packing and dependency-aware scheduling for data-parallel clusters . In Proc. USENIX NSDI. Robert Grandl, Srikanth Kandula, Sriram Rao, Aditya Akella, and Janardhan Kulkarni. 2016. GRAPHENE: Packing and dependency-aware scheduling for data-parallel clusters. In Proc. USENIX NSDI."},{"key":"e_1_3_2_1_13_1","volume-title":"Proc. USENIX NSDI.","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu , Mosharaf Chowdhury , Kang G. Shin , Yibo Zhu , Myeongjae Jeon , Junjie Qian , Hongqiang Liu , and Chuanxiong Guo . 2019 . Tiresias: A GPU Cluster Manager for Distributed Deep Learning . In Proc. USENIX NSDI. Juncheng Gu, Mosharaf Chowdhury, Kang G. Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In Proc. USENIX NSDI."},{"key":"e_1_3_2_1_14_1","volume-title":"Proc. USENIX OSDI.","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati , Reza Karimi , Safya Alzayat , Wei Hao , Antoine Kaufmann , Ymir Vigfusson , and Jonathan Mace . 2020 . Serving DNNs like Clockwork: Performance Predictability from the Bottom Up . In Proc. USENIX OSDI. Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","unstructured":"Horovod. 2021. Elastic Horovod. https:\/\/horovod.readthedocs.io\/en\/latest\/elastic_include.html.  Horovod. 2021. Elastic Horovod. https:\/\/horovod.readthedocs.io\/en\/latest\/elastic_include.html."},{"key":"e_1_3_2_1_17_1","volume-title":"Proc. MLSys.","author":"Hu Hanpeng","year":"2022","unstructured":"Hanpeng Hu , Chenyu Jiang , Yuchen Zhong , Yanghua Peng , Chuan Wu , Yibo Zhu , Haibin Lin , and Chuanxiong Guo . 2022 . dPRO: A Generic Performance Diagnosis and Optimization Toolkit for Expediting Distributed DNN Training . In Proc. MLSys. Hanpeng Hu, Chenyu Jiang, Yuchen Zhong, Yanghua Peng, Chuan Wu, Yibo Zhu, Haibin Lin, and Chuanxiong Guo. 2022. dPRO: A Generic Performance Diagnosis and Optimization Toolkit for Expediting Distributed DNN Training. In Proc. MLSys."},{"key":"e_1_3_2_1_18_1","volume-title":"Proc. USENIX NSDI.","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang , Taehyun Kim , Sunghyun Kim , Jinwoo Shin , and KyoungSoo Park . 2021 . Elastic Resource Sharing for Distributed Deep Learning . In Proc. USENIX NSDI. Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. 2021. Elastic Resource Sharing for Distributed Deep Learning. In Proc. USENIX NSDI."},{"key":"e_1_3_2_1_19_1","unstructured":"Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu etal 2018. Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv:1807.11205  Xianyan Jia Shutao Song Wei He Yangzihao Wang Haidong Rong Feihu Zhou Liqiang Xie Zhenyu Guo Yuanzhou Yang Liwei Yu et al. 2018. Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv:1807.11205"},{"key":"e_1_3_2_1_20_1","volume-title":"Proc. USENIX OSDI.","author":"Jiang Yimin","year":"2020","unstructured":"Yimin Jiang , Yibo Zhu , Chang Lan , Bairen Yi , Yong Cui , and Chuanxiong Guo . 2020 . A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU\/CPU Clusters . In Proc. USENIX OSDI. Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. 2020. A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU\/CPU Clusters. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_21_1","unstructured":"Tyler B Johnson Pulkit Agrawal Haijie Gu and Carlos Guestrin. 2019. AdaScale SGD: A Scale-Invariant Algorithm for Distributed Training.  Tyler B Johnson Pulkit Agrawal Haijie Gu and Carlos Guestrin. 2019. AdaScale SGD: A Scale-Invariant Algorithm for Distributed Training."},{"key":"e_1_3_2_1_22_1","unstructured":"Kubernetes. 2021. ElasticDL: A Kubernetes-native Deep Learning Framework. https:\/\/github.com\/sql-machine-learning\/elasticdl.  Kubernetes. 2021. ElasticDL: A Kubernetes-native Deep Learning Framework. https:\/\/github.com\/sql-machine-learning\/elasticdl."},{"key":"e_1_3_2_1_23_1","unstructured":"Kubernetes. 2021. Kubernetes. https:\/\/kubernetes.io\/.  Kubernetes. 2021. Kubernetes. https:\/\/kubernetes.io\/."},{"key":"e_1_3_2_1_24_1","unstructured":"Kubernetes. 2021. Kubernetes Horizontal Pod Autoscaler. https:\/\/kubernetes.io\/docs\/tasks\/run-application\/horizontal-pod-autoscale\/.  Kubernetes. 2021. Kubernetes Horizontal Pod Autoscaler. https:\/\/kubernetes.io\/docs\/tasks\/run-application\/horizontal-pod-autoscale\/."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Yann LeCun Yoshua Bengio and Geoffrey Hinton. 2015. Deep Learning.  Yann LeCun Yoshua Bengio and Geoffrey Hinton. 2015. Deep Learning.","DOI":"10.1038\/nature14539"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_27_1","volume-title":"Proc. IEEE INFOCOM.","author":"Lin M.","unstructured":"M. Lin , A. Wierman , L. L. H. Andrew , and E. Thereska . 2011. Dynamic right-sizing for power-proportional data centers . In Proc. IEEE INFOCOM. M. Lin, A. Wierman, L. L. H. Andrew, and E. Thereska. 2011. Dynamic right-sizing for power-proportional data centers. In Proc. IEEE INFOCOM."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749475"},{"key":"e_1_3_2_1_29_1","unstructured":"Peter Mattson Christine Cheng Cody Coleman Greg Diamos Paulius Micikevicius David Patterson Hanlin Tang Gu-Yeon Wei Peter Bailis Victor Bittorf etal 2019. Mlperf training benchmark. arXiv:1910.01500  Peter Mattson Christine Cheng Cody Coleman Greg Diamos Paulius Micikevicius David Patterson Hanlin Tang Gu-Yeon Wei Peter Bailis Victor Bittorf et al. 2019. Mlperf training benchmark. arXiv:1910.01500"},{"key":"e_1_3_2_1_30_1","volume-title":"SGD: ImageNet\/ResNet-50 training in a flash. arXiv:1811.05233","author":"Mikami Hiroaki","year":"2018","unstructured":"Hiroaki Mikami , Hisahiro Suganuma , Yoshiki Tanaka , Yuichi Kageyama , 2018 . Massively distributed SGD: ImageNet\/ResNet-50 training in a flash. arXiv:1811.05233 Hiroaki Mikami, Hisahiro Suganuma, Yoshiki Tanaka, Yuichi Kageyama, et al. 2018. Massively distributed SGD: ImageNet\/ResNet-50 training in a flash. arXiv:1811.05233"},{"key":"e_1_3_2_1_31_1","volume-title":"Proc. USENIX FAST.","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan , Amar Phanishayee , and Vijay Chidambaram . 2021 . CheckFreq: Frequent, Fine-Grained DNN Checkpointing . In Proc. USENIX FAST. Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In Proc. USENIX FAST."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-63004-5_12"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_34_1","volume-title":"Proc. USENIX OSDI.","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan , Keshav Santhanam , Fiodar Kazhamiaka , Amar Phanishayee , and Matei Zaharia . 2020 . Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads . In Proc. USENIX OSDI. Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. 2020. Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_35_1","volume-title":"NeurIPS Workshop on Systems for Machine Learning.","author":"Narayanan Deepak","year":"2018","unstructured":"Deepak Narayanan , Keshav Santhanam , Amar Phanishayee , and Matei Zaharia . 2018 . Accelerating deep learning workloads through efficient multi-model execution . In NeurIPS Workshop on Systems for Machine Learning. Deepak Narayanan, Keshav Santhanam, Amar Phanishayee, and Matei Zaharia. 2018. Accelerating deep learning workloads through efficient multi-model execution. In NeurIPS Workshop on Systems for Machine Learning."},{"key":"e_1_3_2_1_36_1","volume-title":"Proc. MLSys.","author":"Or Andrew","year":"2020","unstructured":"Andrew Or , Haoyu Zhang , and Michael Freedman . 2020 . Resource elasticity in distributed deep learning . In Proc. MLSys. Andrew Or, Haoyu Zhang, and Michael Freedman. 2020. Resource elasticity in distributed deep learning. In Proc. MLSys."},{"key":"e_1_3_2_1_38_1","unstructured":"Jongsoo Park Maxim Naumov Protonu Basu Summer Deng Aravind Kalaiah Daya Khudia James Law Parth Malani Andrey Malevich Satish Nadathur etal 2018. Deep learning inference in facebook data centers: Characterization performance optimizations and hardware implications. arXiv:1811.09886  Jongsoo Park Maxim Naumov Protonu Basu Summer Deng Aravind Kalaiah Daya Khudia James Law Parth Malani Andrey Malevich Satish Nadathur et al. 2018. Deep learning inference in facebook data centers: Characterization performance optimizations and hardware implications. arXiv:1811.09886"},{"key":"e_1_3_2_1_39_1","volume-title":"Proc. USENIX ATC.","author":"Park Jay H","year":"2020","unstructured":"Jay H Park , Gyeongchan Yun , M Yi Chang , Nguyen T Nguyen , Seungmin Lee , Jaesik Choi , Sam H Noh , and Young-ri Choi. 2020 . HetPipe: Enabling Large DNN Training on (Whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism . In Proc. USENIX ATC. Jay H Park, Gyeongchan Yun, M Yi Chang, Nguyen T Nguyen, Seungmin Lee, Jaesik Choi, Sam H Noh, and Young-ri Choi. 2020. HetPipe: Enabling Large DNN Training on (Whimpy) Heterogeneous GPU Clusters through Integration of Pipelined Model Parallelism and Data Parallelism. In Proc. USENIX ATC."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_42_1","unstructured":"PyTorch. 2021. PyTorch Elastic. https:\/\/pytorch.org\/elastic\/0.2.0rc1\/distributed.html#module-torchelastic.distributed.launch.  PyTorch. 2021. PyTorch Elastic. https:\/\/pytorch.org\/elastic\/0.2.0rc1\/distributed.html#module-torchelastic.distributed.launch."},{"key":"e_1_3_2_1_43_1","volume-title":"Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing.","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao , Sang Keun Choe , Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021 . Pollux : Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In Proc. USENIX OSDI. Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_45_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi , Mostofa Patwary , Raul Puri , Patrick LeGresley , Jared Casper , and Bryan Catanzaro . 2019 . Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv:1909.08053 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv:1909.08053"},{"key":"e_1_3_2_1_46_1","volume-title":"Singularity: Planet-Scale, Preemptible, Elastic Scheduling of AI Workloads. arXiv:1403.1349","author":"Shukla Dharma","year":"2022","unstructured":"Dharma Shukla , Muthian Sivathanu , Srinidhi Viswanatha , Bhargav Gulavani , Rimma Nehme , Amey Agrawal , Chen Chen , Nipun Kwatra , Ramachandran Ramjee , Pankaj Sharma , 2022 . Singularity: Planet-Scale, Preemptible, Elastic Scheduling of AI Workloads. arXiv:1403.1349 Dharma Shukla, Muthian Sivathanu, Srinidhi Viswanatha, Bhargav Gulavani, Rimma Nehme, Amey Agrawal, Chen Chen, Nipun Kwatra, Ramachandran Ramjee, Pankaj Sharma, et al. 2022. Singularity: Planet-Scale, Preemptible, Elastic Scheduling of AI Workloads. arXiv:1403.1349"},{"key":"e_1_3_2_1_47_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556  Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"e_1_3_2_1_49_1","volume-title":"Proc. USENIX OSDI.","author":"Tang Chunqiang","year":"2020","unstructured":"Chunqiang Tang , Kenny Yu , Kaushik Veeraraghavan , Jonathan Kaldor , Scott Michelson , Thawan Kooburat , Aravind Anbudurai , Matthew Clark , Kabir Gogia , Long Cheng , Ben Christensen , Alex Gartrell , Maxim Khutornenko , Sachin Kulkarni , Marcin Pawlowski , Tuomas Pelkonen , Andre Rodrigues , Rounak Tibrewal , Vaishnavi Venkatesan , and Peter Zhang . 2020 . Twine: A Unified Cluster Management System for Shared Infrastructure . In Proc. USENIX OSDI. Chunqiang Tang, Kenny Yu, Kaushik Veeraraghavan, Jonathan Kaldor, Scott Michelson, Thawan Kooburat, Aravind Anbudurai, Matthew Clark, Kabir Gogia, Long Cheng, Ben Christensen, Alex Gartrell, Maxim Khutornenko, Sachin Kulkarni, Marcin Pawlowski, Tuomas Pelkonen, Andre Rodrigues, Rounak Tibrewal, Vaishnavi Venkatesan, and Peter Zhang. 2020. Twine: A Unified Cluster Management System for Shared Infrastructure. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901355"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2523616.2523633"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/2741948.2741964"},{"key":"e_1_3_2_1_53_1","volume-title":"Proc. USENIX NSDI.","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng , Wencong Xiao , Yinghao Yu , Wei Wang , Cheng Wang , Jian He , Yong Li , Liping Zhang , Wei Lin , and Yu Ding . 2022 . MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters . In Proc. USENIX NSDI. Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In Proc. USENIX NSDI."},{"key":"e_1_3_2_1_54_1","unstructured":"Yonghui Wu Mike Schuster Zhifeng Chen Quoc V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey etal 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv:1609.08144  Yonghui Wu Mike Schuster Zhifeng Chen Quoc V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv:1609.08144"},{"key":"e_1_3_2_1_55_1","volume-title":"Proc. USENIX OSDI.","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao , Romil Bhardwaj , Ramachandran Ramjee , Muthian Sivathanu , Nipun Kwatra , Zhenhua Han , Pratyush Patel , Xuan Peng , Hanyu Zhao , Quanlu Zhang , 2018 . Gandiva: Introspective cluster scheduling for deep learning . In Proc. USENIX OSDI. Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et al. 2018. Gandiva: Introspective cluster scheduling for deep learning. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_56_1","volume-title":"Proc. USENIX OSDI.","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao , Shiru Ren , Yong Li , Yang Zhang , Pengyang Hou , Zhi Li , Yihui Feng , Wei Lin , and Yangqing Jia . 2020 . AntMan: Dynamic Scaling on GPU Clusters for Deep Learning . In Proc. USENIX OSDI. Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In Proc. USENIX OSDI."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783270"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386367.3432728"},{"key":"e_1_3_2_1_59_1","unstructured":"Yang You Jing Li Sashank Reddi Jonathan Hseu Sanjiv Kumar Srinadh Bhojanapalli Xiaodan Song James Demmel Kurt Keutzer and Cho-Jui Hsieh. 2019. Large batch optimization for deep learning: Training bert in 76 minutes. arXiv:1904.00962  Yang You Jing Li Sashank Reddi Jonathan Hseu Sanjiv Kumar Srinadh Bhojanapalli Xiaodan Song James Demmel Kurt Keutzer and Cho-Jui Hsieh. 2019. Large batch optimization for deep learning: Training bert in 76 minutes. arXiv:1904.00962"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.14778\/2733004.2733012"},{"key":"e_1_3_2_1_61_1","volume-title":"Proc. USENIX ATC.","author":"Zhu Hongyu","year":"2020","unstructured":"Hongyu Zhu , Amar Phanishayee , and Gennady Pekhimenko . 2020 . Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training . In Proc. USENIX ATC. Hongyu Zhu, Amar Phanishayee, and Gennady Pekhimenko. 2020. Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In Proc. USENIX ATC."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2670979.2671008"}],"event":{"name":"EuroSys '23: Eighteenth European Conference on Computer Systems","location":"Rome Italy","acronym":"EuroSys '23","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Eighteenth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3552326.3587445","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3552326.3587445","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:40Z","timestamp":1750178860000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3552326.3587445"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,8]]},"references-count":60,"alternative-id":["10.1145\/3552326.3587445","10.1145\/3552326"],"URL":"https:\/\/doi.org\/10.1145\/3552326.3587445","relation":{},"subject":[],"published":{"date-parts":[[2023,5,8]]},"assertion":[{"value":"2023-05-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}