{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T15:07:52Z","timestamp":1764688072429,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,8,7]],"date-time":"2023-08-07T00:00:00Z","timestamp":1691366400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"FHWA","award":["693JJ31950016"],"award-info":[{"award-number":["693JJ31950016"]}]},{"name":"Microsoft Research Faculty Fellowship","award":["8300751"],"award-info":[{"award-number":["8300751"]}]},{"name":"CCF","award":["1822965"],"award-info":[{"award-number":["1822965"]}]},{"name":"Commonwealth Cyber Initiative (CCI)"},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2206522, 1827674"],"award-info":[{"award-number":["2206522, 1827674"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,8,7]]},"DOI":"10.1145\/3605573.3605583","type":"proceedings-article","created":{"date-parts":[[2023,9,13]],"date-time":"2023-09-13T16:21:16Z","timestamp":1694622076000},"page":"423-432","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Embracing Uncertainty for Equity in Resource Allocation in ML Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1946-5235","authenticated-orcid":false,"given":"Suraiya","family":"Tairin","sequence":"first","affiliation":[{"name":"University of Virginia, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7548-6223","authenticated-orcid":false,"given":"Haiying","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Virginia, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7853-6854","authenticated-orcid":false,"given":"Zeyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Virginia, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,9,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023. Alibaba trace.https:\/\/github.com\/alibaba\/clusterdata\/blob\/master\/cluster-trace-gpu-v2020\/README.md."},{"key":"e_1_3_2_1_2_1","unstructured":"2023. Amazon EC2. https:\/\/aws.amazon.com\/en\/blogs\/machine-learning\/traindeep-learning-models-on-gpus-using-amazon-ec2-spot-instances\/. ."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. Cifar-10 dataset. https:\/\/www.cs.toronto.edu\/kriz\/cifar.html."},{"key":"e_1_3_2_1_4_1","unstructured":"2023. cpu-load-generator. https:\/\/pypi.org\/project\/cpu-load-generator\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2023. Microsoft trace. https:\/\/github.com\/msr-fiddle\/philly-traces."},{"key":"e_1_3_2_1_6_1","unstructured":"2023. Psutil.https:\/\/pypi.org\/project\/psutil\/."},{"key":"e_1_3_2_1_7_1","unstructured":"2023. Straggler existence.https:\/\/github.com\/pcl-projects\/Alibaba-PAI-Data.git."},{"key":"e_1_3_2_1_8_1","volume-title":"Proc. of OSDI. 265\u2013283","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi 2016. Tensorflow: A system for large-scale machine learning. In Proc. of OSDI. 265\u2013283."},{"key":"e_1_3_2_1_9_1","volume-title":"Principal component analysis","author":"Herv\u00e9 Abdi","year":"2010","unstructured":"Herv\u00e9 Abdi 2010. Principal component analysis. Wiley interdisciplinary reviews: computational statistics 2, 4 (2010)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737460"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2018.8486422"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737587"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"e_1_3_2_1_14_1","volume-title":"An adaption scheduling based on dynamic weighted random forests for load demand forecasting. The Journal of Supercomputing","author":"M. Chen","year":"2020","unstructured":"M. Chen 2020. An adaption scheduling based on dynamic weighted random forests for load demand forecasting. The Journal of Supercomputing (2020)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421307"},{"key":"e_1_3_2_1_16_1","volume-title":"Task failure prediction in cloud data centers using deep learning","author":"Jiechao Gao","year":"2020","unstructured":"Jiechao Gao 2020. Task failure prediction in cloud data centers using deep learning. IEEE Transactions on Services Computing (2020)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343180.3343192"},{"key":"e_1_3_2_1_18_1","volume-title":"Tails in the cloud: a survey and taxonomy of straggler management within large-scale cloud data centres. The Journal of Supercomputing","author":"S. Gill","year":"2020","unstructured":"S.\u00a0S. Gill 2020. Tails in the cloud: a survey and taxonomy of straggler management within large-scale cloud data centres. The Journal of Supercomputing (2020)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"e_1_3_2_1_20_1","volume-title":"Proc. of NSDI. 485\u2013500","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu 2019. Tiresias: A GPU cluster manager for distributed deep learning. In Proc. of NSDI. 485\u2013500."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2987550.2987554"},{"key":"e_1_3_2_1_22_1","first-page":"418","article-title":"Tictac: Accelerating distributed deep learning with communication scheduling","volume":"1","author":"Hashemi Sayed\u00a0Hadi","year":"2019","unstructured":"Sayed\u00a0Hadi Hashemi 2019. Tictac: Accelerating distributed deep learning with communication scheduling. Proc. of MLSys 1 (2019), 418\u2013430.","journal-title":"Proc. of MLSys"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipl.2010.02.001"},{"key":"e_1_3_2_1_25_1","unstructured":"[25] Kubernetes. 2018. https:\/\/kubernetes.io."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00057"},{"key":"e_1_3_2_1_27_1","volume-title":"Proc. of ICML. PMLR.","author":"Lian Xiangru","year":"2018","unstructured":"Xiangru Lian 2018. Asynchronous decentralized parallel stochastic gradient descent. In Proc. of ICML. PMLR."},{"key":"e_1_3_2_1_28_1","volume-title":"Proc. of NSDI.","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan 2020. Themis: Fair and Efficient GPU Cluster Scheduling. In Proc. of NSDI."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342080"},{"key":"e_1_3_2_1_30_1","unstructured":"James\u00a0A McHugh. 1990. Algorithmic graph theory. Vol.\u00a068056. Citeseer."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_32_1","volume-title":"Proc. of OSDI.","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao 2021. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. In Proc. of OSDI."},{"key":"e_1_3_2_1_33_1","volume-title":"Density-based clustering in spatial databases: The algorithm gdbscan and its applications. Data mining and knowledge discovery","author":"J\u00f6rg Sander","year":"1998","unstructured":"J\u00f6rg Sander 1998. Density-based clustering in spatial databases: The algorithm gdbscan and its applications. Data mining and knowledge discovery (1998)."},{"key":"e_1_3_2_1_34_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799","author":"Alexander Sergeev","year":"2018","unstructured":"Alexander Sergeev 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_35_1","volume-title":"Reinforcement learning: An Introduction Cambridge","author":"S Sutton","year":"1998","unstructured":"Richard\u00a0S Sutton 1998. Reinforcement learning: An Introduction Cambridge. MA: MIT Press.[Google Scholar] (1998)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386367.3432588"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"S. Wasserman 1994. Social network analysis: Methods and applications. (1994).","DOI":"10.1017\/CBO9780511815478"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.2737\/FPL-GTR-290"},{"key":"e_1_3_2_1_39_1","volume-title":"Proc. of OSDI.","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao 2018. Gandiva: Introspective cluster scheduling for deep learning. In Proc. of OSDI."},{"key":"e_1_3_2_1_40_1","volume-title":"Proc. of OSDI. 533\u2013548","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In Proc. of OSDI. 533\u2013548."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488815"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3127490"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544224"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDMW.2017.19"},{"key":"e_1_3_2_1_45_1","volume-title":"Machine learning-based prediction of COVID-19 diagnosis based on symptoms. npj Digital Medicine 4, 1","author":"Yazeed Zoabi","year":"2021","unstructured":"Yazeed Zoabi 2021. Machine learning-based prediction of COVID-19 diagnosis based on symptoms. npj Digital Medicine 4, 1 (2021), 1\u20135."}],"event":{"name":"ICPP 2023: 52nd International Conference on Parallel Processing","acronym":"ICPP 2023","location":"Salt Lake City UT USA"},"container-title":["Proceedings of the 52nd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605573.3605583","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3605573.3605583","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3605573.3605583","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:04Z","timestamp":1750182544000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605573.3605583"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,7]]},"references-count":45,"alternative-id":["10.1145\/3605573.3605583","10.1145\/3605573"],"URL":"https:\/\/doi.org\/10.1145\/3605573.3605583","relation":{},"subject":[],"published":{"date-parts":[[2023,8,7]]},"assertion":[{"value":"2023-09-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}