{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:40:03Z","timestamp":1755877203822,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656598","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"473-484","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["AutoSched: An Adaptive Self-configured Framework for Scheduling Deep Learning Training Workloads"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7048-1722","authenticated-orcid":false,"given":"Wei","family":"Gao","sequence":"first","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5608-6955","authenticated-orcid":false,"given":"Xu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chongqing University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8491-4873","authenticated-orcid":false,"given":"Shan","family":"Huang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6443-5308","authenticated-orcid":false,"given":"Shangwei","family":"Guo","sequence":"additional","affiliation":[{"name":"Chongqing University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8456-0491","authenticated-orcid":false,"given":"Peng","family":"Sun","sequence":"additional","affiliation":[{"name":"SenseTime, China and Shanghai AI Lab, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2751-5114","authenticated-orcid":false,"given":"Yonggang","family":"Wen","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6595-6650","authenticated-orcid":false,"given":"Tianwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. KNative Issues. https:\/\/github.com\/knative\/serving\/issues\/8682."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. Kubeflow Issues. https:\/\/github.com\/kubeflow\/kubeflow\/issues\/1219."},{"key":"e_1_3_2_1_3_1","volume-title":"CaRE: Finding Root Causes of Configuration Issues in Highly-Configurable Robots. arXiv e-prints","author":"Abir\u00a0Hossen Md","year":"2023","unstructured":"Md Abir\u00a0Hossen, Sonam Kharade, Bradley Schmerl, Javier C\u00e1mara, Jason\u00a0M O\u2019Kane, Ellen\u00a0C Czaplinski, Katherine\u00a0A Dzurilla, David Garlan, and Pooyan Jamshidi. 2023. CaRE: Finding Root Causes of Configuration Issues in Highly-Configurable Robots. arXiv e-prints (2023), arXiv\u20132301."},{"key":"e_1_3_2_1_4_1","volume-title":"Blox: A Modular Toolkit for Deep Learning Schedulers. arXiv preprint arXiv:2312.12621","author":"Agarwal Saurabh","year":"2023","unstructured":"Saurabh Agarwal, Amar Phanishayee, and Shivaram Venkataraman. 2023. Blox: A Modular Toolkit for Deep Learning Schedulers. arXiv preprint arXiv:2312.12621 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197978"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3309205"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2462902.2462931"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Shane Bergsma Timothy Zeyl Arik Senderovich and J.\u00a0Christopher Beck. 2021. Generating Complex Realistic Cloud Workloads using Recurrent Neural Networks. In SOSP.","DOI":"10.1145\/3477132.3483590"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3480859"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"volume-title":"Efficient compiler autotuning via bayesian optimization","author":"Chen Junjie","key":"e_1_3_2_1_11_1","unstructured":"Junjie Chen, Ningxin Xu, Peiqi Chen, and Hongyu Zhang. 2021. Efficient compiler autotuning via bayesian optimization. In ICSE. IEEE, 1198\u20131209."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624669"},{"key":"e_1_3_2_1_13_1","volume-title":"A tutorial on Bayesian optimization. arXiv preprint arXiv:1807.02811","author":"Frazier I","year":"2018","unstructured":"Peter\u00a0I Frazier. 2018. A tutorial on Bayesian optimization. arXiv preprint arXiv:1807.02811 (2018)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"e_1_3_2_1_15_1","unstructured":"gRPC. 2023. gRPC: A High-Performance Open Source Universal RPC Framework. https:\/\/grpc.io."},{"key":"e_1_3_2_1_16_1","volume-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In NSDI.","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang\u00a0G. Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In NSDI."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3381027"},{"key":"e_1_3_2_1_18_1","unstructured":"Qinghao Hu Peng Sun Shengen Yan Yonggang Wen and Tianwei Zhang. 2021. Characterization and Prediction of Deep Learning Workloads in Large-Scale GPU Datacenters. In SC."},{"key":"e_1_3_2_1_19_1","volume-title":"Lucid: A Non-intrusive, Scalable and Interpretable Scheduler for Deep Learning Training Jobs. In ASPLOS. 457\u2013472.","author":"Hu Qinghao","year":"2023","unstructured":"Qinghao Hu, Meng Zhang, Peng Sun, Yonggang Wen, and Tianwei Zhang. 2023. Lucid: A Non-intrusive, Scalable and Interpretable Scheduler for Deep Learning Training Jobs. In ASPLOS. 457\u2013472."},{"key":"e_1_3_2_1_20_1","unstructured":"Jez Humble and David Farley. 2010. Continuous delivery: reliable software releases through build test and deployment automation. Pearson Education."},{"key":"e_1_3_2_1_21_1","volume-title":"Elastic Resource Sharing for Distributed Deep Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation(NSDI \u201921)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. 2021. Elastic Resource Sharing for Distributed Deep Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation(NSDI \u201921)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519575"},{"key":"e_1_3_2_1_23_1","unstructured":"Myeongjae Jeon Shivaram Venkataraman Amar Phanishayee Junjie Qian Wencong Xiao and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In USENIX ATC."},{"key":"e_1_3_2_1_24_1","unstructured":"Ajaykrishna Karthikeyan Nagarajan Natarajan Gagan Somashekar Lei Zhao Ranjita Bhagwan Rodrigo Fonseca Tatiana Racheva and Yogesh Bansal. 2023. { SelfTune} : Tuning Cluster Managers. In NSDI. 1097\u20131114."},{"key":"e_1_3_2_1_25_1","unstructured":"kubeflow. 2021. kubeflow: https:\/\/www.kubeflow.org\/."},{"key":"e_1_3_2_1_26_1","volume-title":"USENIX Symposium on Networked Systems Design and Implementation (NSDI).","author":"Lai Fan","year":"2023","unstructured":"Fan Lai, Yinwei Dai, Harsha\u00a0V. Madhyastha, and Mosharaf Chowdhury. 2023. ModelKeeper: Accelerating DNN Training via Automated Training Warmup. In USENIX Symposium on Networked Systems Design and Implementation (NSDI)."},{"key":"e_1_3_2_1_27_1","volume-title":"Aryl: An Elastic Cluster Scheduler for Deep Learning. CoRR","author":"Li Jiamin","year":"2022","unstructured":"Jiamin Li, Hong Xu, Yibo Zhu, Zherui Liu, Chuanxiong Guo, and Cong Wang. 2022. Aryl: An Elastic Cluster Scheduler for Deep Learning. CoRR (2022)."},{"key":"e_1_3_2_1_28_1","volume-title":"Themis: Fair and Efficient GPU Cluster Scheduling. In NSDI.","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. 2020. Themis: Fair and Efficient GPU Cluster Scheduling. In NSDI."},{"volume-title":"Localizing failure root causes in a microservice through causality inference","author":"Meng Yuan","key":"e_1_3_2_1_29_1","unstructured":"Yuan Meng, Shenglin Zhang, Yongqian Sun, Ruru Zhang, Zhilong Hu, Yiyin Zhang, Chenyang Jia, Zhaogang Wang, and Dan Pei. 2020. Localizing failure root causes in a microservice through causality inference. In IWQoS. IEEE, 1\u201310."},{"key":"e_1_3_2_1_30_1","unstructured":"Deepak Narayanan Keshav Santhanam Fiodar Kazhamiaka Amar Phanishayee and Matei Zaharia. 2020. Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In OSDI."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_32_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation(OSDI \u201921)","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang\u00a0Keun Choe, Suhas\u00a0Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory\u00a0R. Ganger, and Eric\u00a0P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation(OSDI \u201921)."},{"key":"e_1_3_2_1_33_1","volume-title":"Towards Causal Deep Learning for Vulnerability Detection. arXiv preprint arXiv:2310.07958","author":"Rahman Md\u00a0Mahbubur","year":"2023","unstructured":"Md\u00a0Mahbubur Rahman, Ira Ceka, Chengzhi Mao, Saikat Chakraborty, Baishakhi Ray, and Wei Le. 2023. Towards Causal Deep Learning for Vulnerability Detection. arXiv preprint arXiv:2310.07958 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"CAMEO: A Causal Transfer Learning Approach for Performance Optimization of Configurable Computer Systems. arXiv e-prints","author":"Shahriar\u00a0Iqbal Md","year":"2023","unstructured":"Md Shahriar\u00a0Iqbal, Ziyuan Zhong, Iftakhar Ahmad, Baishakhi Ray, and Pooyan Jamshidi. 2023. CAMEO: A Causal Transfer Learning Approach for Performance Optimization of Configurable Computer Systems. arXiv e-prints (2023), arXiv\u20132306."},{"key":"e_1_3_2_1_35_1","unstructured":"Gagan Somashekar Karan Tandon Anush Kini Chieh-Chun Chang Petr Husak Ranjita Bhagwan Mayukh Das Anshul Gandhi and Nagarajan Natarajan. [n.d.]. OPPerTune: Post-Deployment Configuration Tuning of Services Made Easy. ([n. d.])."},{"key":"e_1_3_2_1_36_1","volume-title":"International Workshop on Artificial Intelligence and Statistics. PMLR, 278\u2013285","author":"Spirtes Peter","year":"2001","unstructured":"Peter Spirtes. 2001. An anytime algorithm for causal inference. In International Workshop on Artificial Intelligence and Statistics. PMLR, 278\u2013285."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Chunqiang Tang Thawan Kooburat Pradeep Venkatachalam Akshay Chander Zhe Wen Aravind Narayanan Patrick Dowell and Robert Karl. 2015. Holistic configuration management at facebook. In SOSP. 328\u2013343.","DOI":"10.1145\/2815400.2815401"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2771783.2784770"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Dana Van\u00a0Aken Andrew Pavlo Geoffrey\u00a0J. Gordon and Bohan Zhang. 2017. Automatic Database Management System Tuning Through Large-Scale Machine Learning. In SIGMOD.","DOI":"10.1145\/3035918.3064029"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3035918.3064029"},{"key":"e_1_3_2_1_41_1","volume-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation(NSDI \u201922)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation(NSDI \u201922)."},{"key":"e_1_3_2_1_42_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In OSDI.","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In OSDI."},{"key":"e_1_3_2_1_43_1","volume-title":"ASTRAEA: A Fair Deep Learning Scheduler for Multi-tenant GPU Clusters","author":"Ye Zhisheng","year":"2021","unstructured":"Zhisheng Ye, Peng Sun, Wei Gao, Tianwei Zhang, Xiaolin Wang, Shengen Yan, and Yingwei Luo. 2021. ASTRAEA: A Fair Deep Learning Scheduler for Multi-tenant GPU Clusters. IEEE Transactions on Parallel and Distributed Systems (2021)."},{"key":"e_1_3_2_1_44_1","unstructured":"Hanyu Zhao Zhenhua Han Zhi Yang Quanlu Zhang Fan Yang Lidong Zhou Mao Yang Francis\u00a0C.M. Lau Yuqi Wang Yifan Xiong and Bin Wang. 2020. HiveD: Sharing a GPU Cluster for Deep Learning with Guarantees. In OSDI."},{"key":"e_1_3_2_1_45_1","volume-title":"Automatic Database Knob Tuning: A Survey","author":"Zhao Xinyang","year":"2023","unstructured":"Xinyang Zhao, Xuanhe Zhou, and Guoliang Li. 2023. Automatic Database Knob Tuning: A Survey. IEEE Transactions on Knowledge and Data Engineering (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"Shockwave: Fair and Efficient Cluster Scheduling for Dynamic Adaptation in Machine Learning. In NSDI.","author":"Zheng Pengfei","year":"2023","unstructured":"Pengfei Zheng, Rui Pan, Tarannum Khan, Shivaram Venkataraman, and Aditya Akella. 2023. Shockwave: Fair and Efficient Cluster Scheduling for Dynamic Adaptation in Machine Learning. In NSDI."}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Kyoto Japan","acronym":"ICS '24"},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656598","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656598","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:25:30Z","timestamp":1755876330000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656598"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":46,"alternative-id":["10.1145\/3650200.3656598","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656598","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}