{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T19:31:30Z","timestamp":1773775890371,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,30]],"date-time":"2023-10-30T00:00:00Z","timestamp":1698624000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000028","name":"Semiconductor Research Corporation","doi-asserted-by":"publisher","award":["JUMP 2.0"],"award-info":[{"award-number":["JUMP 2.0"]}],"id":[{"id":"10.13039\/100000028","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,30]]},"DOI":"10.1145\/3620678.3624669","type":"proceedings-article","created":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T13:58:07Z","timestamp":1698760687000},"page":"410-426","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Anticipatory Resource Allocation for ML Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1507-6708","authenticated-orcid":false,"given":"Tapan","family":"Chugh","sequence":"first","affiliation":[{"name":"Microsoft Research, University of Washington"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9494-6435","authenticated-orcid":false,"given":"Srikanth","family":"Kandula","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9505-9528","authenticated-orcid":false,"given":"Arvind","family":"Krishnamurthy","sequence":"additional","affiliation":[{"name":"University of Washington"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8005-6948","authenticated-orcid":false,"given":"Ratul","family":"Mahajan","sequence":"additional","affiliation":[{"name":"University of Washington"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2540-236X","authenticated-orcid":false,"given":"Ishai","family":"Menache","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]}],"member":"320","published-online":{"date-parts":[[2023,10,31]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Osdi","volume":"16","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. 2016. Tensorflow: a system for large-scale machine learning.. In Osdi, Vol. 16. Savannah, GA, USA, 265--283."},{"key":"e_1_3_2_1_2_1","volume-title":"Re-optimizing Data Parallel Computing. In Symposium on Networked Systems Design and Implementation.","author":"Agarwal Sameer","year":"2012","unstructured":"Sameer Agarwal, Srikanth Kandula, Nicolas Bruno, Ming Wu, Ion Stoica, and Jingren Zhou. 2012. Re-optimizing Data Parallel Computing. In Symposium on Networked Systems Design and Implementation."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330701"},{"key":"e_1_3_2_1_4_1","unstructured":"Amazon. 2023. Amazon Sagemaker. https:\/\/aws.amazon.com\/sagemaker\/features."},{"key":"e_1_3_2_1_5_1","volume-title":"Providing SLOs for Resource-Harvesting VMs in Cloud Platforms. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Ambati Pradeep","year":"2020","unstructured":"Pradeep Ambati, I\u00f1igo Goiri, Felipe Vieira Frujeri, Alper Gun, Ke Wang, Brian Dolan, Brian Corell, Sekhar Pasupuleti, Thomas Moscibroda, Sameh Elnikety, Marcus Fontoura, and Ricardo Bianchini. 2020. Providing SLOs for Resource-Harvesting VMs in Cloud Platforms. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4-6, 2020. 735--751."},{"key":"e_1_3_2_1_6_1","volume-title":"David Dion, Thomas Moscibroda, and Ishai Menache.","author":"Barbalho Hugo","year":"2023","unstructured":"Hugo Barbalho, Patricia Kovaleski, Beibin Li, Luke Marshall, Marco Molinaro, Abhisek Pan, Eli Cortez, Matheus Leao, Harsh Patwari, Zuzu Tang, Tamires Santos, Larissa Rozales Gon\u00e7alves, David Dion, Thomas Moscibroda, and Ishai Menache. 2023. Virtual Machine Allocation with Lifetime Predictions. In MLSys."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410220.3456278"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626315"},{"key":"e_1_3_2_1_11_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Chung Andrew","unstructured":"Andrew Chung, Subru Krishnan, Konstantinos Karanasos, Carlo Curino, and Gregory R. Ganger. 2020. Unearthing inter-job dependencies for better cluster scheduling. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 1205--1223."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132772"},{"key":"e_1_3_2_1_13_1","unstructured":"CRIU. 2023. CRIU. https:\/\/criu.org\/. https:\/\/criu.org\/"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/75247.75248"},{"key":"e_1_3_2_1_15_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: a Checkpointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 929--943."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507725"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2382553.2382556"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 8th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2011","author":"Ghodsi Ali","year":"2011","unstructured":"Ali Ghodsi, Matei Zaharia, Benjamin Hindman, Andy Konwinski, Scott Shenker, and Ion Stoica. 2011. Dominant Resource Fairness: Fair Allocation of Multiple Resource Types. In Proceedings of the 8th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2011, Boston, MA, USA, March 30 - April 1, 2011."},{"key":"e_1_3_2_1_19_1","unstructured":"Github. 2023. Helios Data. https:\/\/github.com\/S-Lab-System-Group\/HeliosData. https:\/\/github.com\/S-Lab-System-Group\/HeliosData"},{"key":"e_1_3_2_1_20_1","unstructured":"Google. 2020. Web Post. https:\/\/groups.google.com\/g\/gce-discussion\/c\/8vCwUKaGs2o."},{"key":"e_1_3_2_1_21_1","unstructured":"Google. 2020. Web Post. https:\/\/groups.google.com\/g\/gce-discussion\/c\/34zBBmTV8Tg."},{"key":"e_1_3_2_1_22_1","unstructured":"Google. 2021. Running a training Job | AI Platform. https:\/\/cloud.google.com\/ai-platform\/training\/docs\/training-jobs."},{"key":"e_1_3_2_1_23_1","unstructured":"Google. 2023. Google Cloud AI. https:\/\/cloud.google.com\/products\/ai."},{"key":"e_1_3_2_1_24_1","unstructured":"Google. 2023. Live Migration on Google Cloud. https:\/\/cloud.google.com\/compute\/docs\/instances\/live-migration#gpusmaintenance. https:\/\/cloud.google.com\/compute\/docs\/instances\/live-migration#gpusmaintenance"},{"key":"e_1_3_2_1_25_1","unstructured":"Google. 2023. Reserve VM Capacity. https:\/\/cloud.google.com\/compute\/docs\/instances\/reservations-overview."},{"key":"e_1_3_2_1_26_1","unstructured":"Google. 2023. Runtime version list | AI Platform. https:\/\/cloud.google.com\/ai-platform\/training\/docs\/runtime-version-list."},{"key":"e_1_3_2_1_27_1","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Grandl Robert","year":"2016","unstructured":"Robert Grandl, Mosharaf Chowdhury, Aditya Akella, and Ganesh Ananthanarayanan. 2016. Altruistic Scheduling in {Multi-Resource} Clusters. In 12th USENIX symposium on operating systems design and implementation (OSDI 16). 65--80."},{"key":"e_1_3_2_1_28_1","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Grandl Robert","year":"2016","unstructured":"Robert Grandl, Srikanth Kandula, Sriram Rao, Aditya Akella, and Janardhan Kulkarni. 2016. {GRAPHENE}: Packing and {Dependency-Aware} Scheduling for {Data-Parallel} Clusters. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 81--97."},{"key":"e_1_3_2_1_29_1","volume-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2019","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G. Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Harry Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2019, Boston, MA, February 26-28, 2019. 485--500."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 845--861","author":"Hadary Ori","year":"2020","unstructured":"Ori Hadary, Luke Marshall, Ishai Menache, Abhisek Pan, Esaias E Greeff, David Dion, Star Dorminey, Shailesh Joshi, Yang Chen, Mark Russinovich, et al. 2020. Protean: VM allocation service at scale. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 845--861."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2775054.2694384"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476223"},{"key":"e_1_3_2_1_33_1","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. 2021. Elastic resource sharing for distributed deep learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 721--739."},{"key":"e_1_3_2_1_34_1","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. 2021. Elastic resource sharing for distributed deep learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 721--739."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/502034.502046"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCC.2022.3222649"},{"key":"e_1_3_2_1_37_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference, USENIX ATC 2019","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference, USENIX ATC 2019, Renton, WA, USA, July 10-12, 2019. 947--960."},{"key":"e_1_3_2_1_38_1","volume-title":"Morpheus: Towards Automated SLOs for Enterprise Clusters. In 12th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2016","author":"Jyothi Sangeetha Abdu","year":"2016","unstructured":"Sangeetha Abdu Jyothi, Carlo Curino, Ishai Menache, Shravan Matthur Narayanamurthy, Alexey Tumanov, Jonathan Yaniv, Ruslan Mavlyutov, I\u00f1igo Goiri, Subru Krishnan, Janardhan Kulkarni, and Sriram Rao. 2016. Morpheus: Towards Automated SLOs for Enterprise Clusters. In 12th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2016, Savannah, GA, USA, November 2-4, 2016. 117--134."},{"key":"e_1_3_2_1_39_1","volume-title":"Prediction-Based Power Oversubscription in Cloud Platforms. In 2021 USENIX Annual Technical Conference, USENIX ATC 2021","author":"Kumbhare Alok Gautam","year":"2021","unstructured":"Alok Gautam Kumbhare, Reza Azimi, Ioannis Manousakis, Anand Bonde, Felipe Vieira Frujeri, Nithish Mahalingam, Pulkit A. Misra, Seyyed Ahmad Javadi, Bianca Schroeder, Marcus Fontoura, and Ricardo Bianchini. 2021. Prediction-Based Power Oversubscription in Cloud Platforms. In 2021 USENIX Annual Technical Conference, USENIX ATC 2021, July 14-16, 2021. 473--487."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/3122009.3242042"},{"key":"e_1_3_2_1_41_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Lim Gangmuk","year":"2021","unstructured":"Gangmuk Lim, Jeongseob Ahn, Wencong Xiao, Youngjin Kwon, and Myeongjae Jeon. 2021. Zico: Efficient {GPU} Memory Sharing for Concurrent {DNN} Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 161--175."},{"key":"e_1_3_2_1_42_1","volume-title":"Themis: Fair and Efficient GPU Cluster Scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2020","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. 2020. Themis: Fair and Efficient GPU Cluster Scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2020, Santa Clara, CA, USA, February 25-27, 2020. 289--304."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3005745.3005750"},{"key":"e_1_3_2_1_44_1","unstructured":"Microsoft. 2021. Guarantee capacity access with on-demand capacity reservations. https:\/\/bit.ly\/3JUVdde."},{"key":"e_1_3_2_1_45_1","unstructured":"Microsoft. 2021. How to Access Data - Azure Machine Learning. https:\/\/docs.microsoft.com\/en-us\/azure\/machine-learning\/how-to-access-data. https:\/\/docs.microsoft.com\/en-us\/azure\/machine-learning\/how-to-access-data"},{"key":"e_1_3_2_1_46_1","unstructured":"Microsoft. 2023. Azure Machine Learning. https:\/\/azure.microsoft.com\/en-us\/services\/machine-learning\/."},{"key":"e_1_3_2_1_47_1","unstructured":"Microsoft. 2023. On-demand Capacity Reservation. https:\/\/docs.microsoft.com\/en-us\/azure\/virtual-machines\/capacity-reservation-overview."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.4230\/LIPIcs.ITCS.2020.14"},{"key":"e_1_3_2_1_49_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies, FAST 2021","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies, FAST 2021, February 23-25, 2021. 203--216."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190543"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cor.2006.03.020"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.14778\/2752939.2752942"},{"key":"e_1_3_2_1_53_1","volume-title":"Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. 2020. Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 481--498."},{"key":"e_1_3_2_1_54_1","unstructured":"NVIDIA. 2023. NVIDIA Multi-Instance GPU. https:\/\/www.nvidia.com\/en-us\/technologies\/multi-instance-gpu\/. https:\/\/www.nvidia.com\/en-us\/technologies\/multi-instance-gpu\/"},{"key":"e_1_3_2_1_55_1","unstructured":"OpenAI. 2021. OpenAI: Scaling Kubernetes to 7500 nodes. https:\/\/openai.com\/research\/scaling-kubernetes-to-7500-nodes."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_57_1","unstructured":"Reddit. 2020. Reddit Comment. https:\/\/www.reddit.com\/r\/googlecloud\/comments\/glh1v4\/gpu_shortage_in_all_regions."},{"key":"e_1_3_2_1_58_1","unstructured":"Reddit. 2020. Reddit Comment. https:\/\/www.reddit.com\/r\/googlecloud\/comments\/kmlwn4\/gpu_shortage."},{"key":"e_1_3_2_1_59_1","volume-title":"2020 USENIX Annual Technical Conference, USENIX ATC 2020","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, I\u00f1igo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. 2020. Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider. In 2020 USENIX Annual Technical Conference, USENIX ATC 2020, July 15-17, 2020. 205--218."},{"key":"e_1_3_2_1_60_1","volume-title":"Serving DNN models with multi-instance gpus: A case of the reconfigurable machine scheduling problem. arXiv preprint arXiv:2109.11067","author":"Tan Cheng","year":"2021","unstructured":"Cheng Tan, Zhichao Li, Jian Zhang, Yu Cao, Sikai Qi, Zherui Liu, Yibo Zhu, and Chuanxiong Guo. 2021. Serving DNN models with multi-instance gpus: A case of the reconfigurable machine scheduling problem. arXiv preprint arXiv:2109.11067 (2021)."},{"key":"e_1_3_2_1_61_1","unstructured":"Twitter. 2017. Twitter Post. https:\/\/twitter.com\/Reza_Zadeh\/status\/867176425903710210."},{"key":"e_1_3_2_1_62_1","volume-title":"Karma: Resource Allocation for Dynamic Demands. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Vuppalapati Midhul","year":"2023","unstructured":"Midhul Vuppalapati, Giannis Fikioris, Rachit Agarwal, Asaf Cidon, Anurag Khandelwal, and Eva Tardos. 2023. Karma: Resource Allocation for Dynamic Demands. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)."},{"key":"e_1_3_2_1_63_1","volume-title":"Building An Elastic Query Engine on Disaggregated Storage. In 17th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2020","author":"Vuppalapati Midhul","year":"2020","unstructured":"Midhul Vuppalapati, Justin Miron, Rachit Agarwal, Dan Truong, Ashish Motivala, and Thierry Cruanes. 2020. Building An Elastic Query Engine on Disaggregated Storage. In 17th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2020, Santa Clara, CA, USA, February 25-27, 2020. 449--462."},{"key":"e_1_3_2_1_64_1","first-page":"696","article-title":"Wavelet: Efficient DNN Training with Tick-Tock Scheduling","volume":"3","author":"Wang Guanhua","year":"2021","unstructured":"Guanhua Wang, Kehan Wang, Kenan Jiang, Xiangjun Li, and Ion Stoica. 2021. Wavelet: Efficient DNN Training with Tick-Tock Scheduling. Proceedings of Machine Learning and Systems 3 (2021), 696--710.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456225"},{"key":"e_1_3_2_1_66_1","unstructured":"Weights and Biases. 2023. Weights and Biases. https:\/\/wandb.ai."},{"key":"e_1_3_2_1_67_1","volume-title":"Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In NSDI.","author":"Qizhen Weng","year":"2022","unstructured":"Qizhen Weng et al. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In NSDI."},{"key":"e_1_3_2_1_68_1","unstructured":"Wikipedia. 2023. Bayes error rate. https:\/\/en.wikipedia.org\/wiki\/Bayes_error_rate."},{"key":"e_1_3_2_1_69_1","unstructured":"Wikipedia. 2023. Buddy Memory Allocation. https:\/\/en.wikipedia.org\/wiki\/Buddy_memory_allocation."},{"key":"e_1_3_2_1_70_1","unstructured":"Wikipedia. 2023. Mixed Model. https:\/\/en.wikipedia.org\/wiki\/Mixed_model."},{"key":"e_1_3_2_1_71_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8-10, 2018. 595--610."},{"key":"e_1_3_2_1_72_1","volume-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4-6, 2020. 533--548."},{"key":"e_1_3_2_1_73_1","first-page":"98","article-title":"Fine-grained GPU sharing primitives for deep learning applications","volume":"2","author":"Yu Peifeng","year":"2020","unstructured":"Peifeng Yu and Mosharaf Chowdhury. 2020. Fine-grained GPU sharing primitives for deep learning applications. Proceedings of Machine Learning and Systems 2 (2020), 98--111.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_74_1","volume-title":"Khaled Elmeleegy, Scott Shenker, and Ion Stoica.","author":"Zaharia Matei","year":"2010","unstructured":"Matei Zaharia, Dhruba Borthakur, Joydeep Sen Sarma, Khaled Elmeleegy, Scott Shenker, and Ion Stoica. 2010. Delay scheduling: a simple technique for achieving locality and fairness in cluster scheduling. In EuroSys. 265--278."},{"key":"e_1_3_2_1_75_1","volume-title":"Khaled Elmeleegy, Scott Shenker, and Ion Stoica.","author":"Zaharia Matei","year":"2010","unstructured":"Matei Zaharia, Dhruba Borthakur, Joydeep Sen Sarma, Khaled Elmeleegy, Scott Shenker, and Ion Stoica. 2010. Delay Scheduling: A Simple Technique for Achieving Locality and Fairness in Cluster Scheduling. In EuroSYS."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3127490"},{"key":"e_1_3_2_1_77_1","unstructured":"Lixia Zhang. 1989. A new architecture for packet switching network protocols. Technical Report. MASSACHUSETTS INST OF TECH CAMBRIDGE LAB FOR COMPUTER SCIENCE."},{"key":"e_1_3_2_1_78_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis C.M. Lau, Yuqi Wang, Yifan Xiong, and Bin Wang. 2020. HiveD: Sharing a GPU Cluster for Deep Learning with Guarantees. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 515--532."}],"event":{"name":"SoCC '23: ACM Symposium on Cloud Computing","location":"Santa Cruz CA USA","acronym":"SoCC '23","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 2023 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620678.3624669","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620678.3624669","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:54:15Z","timestamp":1755878055000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620678.3624669"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,30]]},"references-count":78,"alternative-id":["10.1145\/3620678.3624669","10.1145\/3620678"],"URL":"https:\/\/doi.org\/10.1145\/3620678.3624669","relation":{},"subject":[],"published":{"date-parts":[[2023,10,30]]},"assertion":[{"value":"2023-10-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}