{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:34:45Z","timestamp":1771698885967,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","funder":[{"name":"Science and Technology Development Fund of Macau","award":["0071\/2023\/ITP2 and 0024\/2022\/A1"],"award-info":[{"award-number":["0071\/2023\/ITP2 and 0024\/2022\/A1"]}]},{"name":"Multi-Year Research Grant of University of Macau","award":["MYRG-GRG2024-00255-FST-UMDF and MYRG-GRG2023-00019-FST-UMDF"],"award-info":[{"award-number":["MYRG-GRG2024-00255-FST-UMDF and MYRG-GRG2023-00019-FST-UMDF"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3728488","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"324-338","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Fast and Fair Training for Deep Learning in Heterogeneous GPU Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3590-4400","authenticated-orcid":false,"given":"Zizhao","family":"Mo","sequence":"first","affiliation":[{"name":"University of Macau, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6657-1154","authenticated-orcid":false,"given":"Huanle","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Macau, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1179-7855","authenticated-orcid":false,"given":"Wing Cheong","family":"Lau","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"[n. d.]. https:\/\/www.alibabacloud.com\/product\/heterogeneous_computing."},{"key":"e_1_3_3_2_3_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_2_4_2","volume-title":"Proceedings of SODA","year":"2012","unstructured":"Anand et\u00a0al. 2012. Resource augmentation for weighted flow-time explained by dual fitting. In Proceedings of SODA. SIAM."},{"key":"e_1_3_3_2_5_2","volume-title":"Convex Analysis and Optimization","author":"Bertsekas Dimitri\u00a0P.","year":"2003","unstructured":"Dimitri\u00a0P. Bertsekas et\u00a0al. 2003. Convex Analysis and Optimization. Athena Scientific."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2001.923223"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Henri Casanova Rafael Ferreira\u00a0da Silva Ryan Tanaka Suraj Pandey Gautam Jethwani William Koch Spencer Albrecht James Oeth and Fr\u00e9d\u00e9ric Suter. 2020. Developing Accurate and Scalable Simulators of Production Workflow Management Systems with WRENCH. Future Generation Computer Systems 112 (2020) 162\u2013175. https:\/\/doi.org\/10.1016\/j.future.2020.05.030","DOI":"10.1016\/j.future.2020.05.030"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Chen et\u00a0al. 2017. An online convex optimization approach to proactive network resource allocation. IEEE Transactions on Signal Processing 65 24 (2017) 6350\u20136364.","DOI":"10.1109\/TSP.2017.2750109"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/75246.75248"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3210377.3210384"},{"key":"e_1_3_3_2_13_2","first-page":"929","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran\u00a0Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. { Check-N-Run} : A checkpointing system for training deep learning recommendation models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 929\u2013943."},{"key":"e_1_3_3_2_14_2","volume-title":"Proceedings of NSDI","author":"Ghodsi Ali","year":"2011","unstructured":"Ali Ghodsi et\u00a0al. 2011. Dominant Resource Fairness: Fair Allocation of Multiple Resource Types. In Proceedings of NSDI."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626334"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575721"},{"key":"e_1_3_3_2_17_2","volume-title":"Proceedings of NSDI","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang\u00a0G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU cluster manager for distributed deep learning. In Proceedings of NSDI."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"key":"e_1_3_3_2_19_2","volume-title":"Proceedings of CVPR","year":"2017","unstructured":"He et\u00a0al. 2017. Mask r-cnn. In Proceedings of CVPR."},{"key":"e_1_3_3_2_20_2","volume-title":"Proceedings of NSDI","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang et\u00a0al. 2021. Elastic Resource Sharing for Distributed Deep Learning. In Proceedings of NSDI."},{"key":"e_1_3_3_2_21_2","volume-title":"Proceedings of ACM SPAA","year":"2015","unstructured":"Im et\u00a0al. 2015. Temporal fairness of round robin: Competitive analysis for lk-norms of flow time. In Proceedings of ACM SPAA."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00032"},{"key":"e_1_3_3_2_23_2","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_2_24_2","volume-title":"Proceedings of Eurosys","year":"2020","unstructured":"Le et\u00a0al. 2020. AlloX: compute allocation in hybrid clusters. In Proceedings of Eurosys."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS60910.2024.00015"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3210377.3210402"},{"key":"e_1_3_3_2_28_2","volume-title":"Proceedings of NSDI","year":"2020","unstructured":"Mahajan et\u00a0al. 2020. Themis: Fair and efficient GPU cluster scheduling. In Proceedings of NSDI."},{"key":"e_1_3_3_2_29_2","unstructured":"Mahdavi et\u00a0al. 2012. Trading regret for efficiency: online convex optimization with long term constraints. 13 1 (2012) 2503\u20132528."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652892.3654792"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640375"},{"key":"e_1_3_3_2_32_2","first-page":"203","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. { CheckFreq} : Frequent,{ Fine-Grained}{ DNN} Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). 203\u2013216."},{"key":"e_1_3_3_2_33_2","volume-title":"Proceedings of ATC","year":"2019","unstructured":"Myeongjae et\u00a0al. 2019. Analysis of large-scale multi-tenant GPU clusters for DNN training workloads. In Proceedings of ATC."},{"key":"e_1_3_3_2_34_2","first-page":"481","volume-title":"Proceedings of OSDI","year":"2020","unstructured":"Narayanan et\u00a0al. 2020. Heterogeneity-aware cluster scheduling policies for deep learning workloads. In Proceedings of OSDI. 481\u2013498."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_2_36_2","unstructured":"Michael\u00a0J. Neely et\u00a0al. 2017. Online convex optimization with time-varying constraints. arXiv preprint:1701.03974."},{"key":"e_1_3_3_2_37_2","unstructured":"Paszke et\u00a0al. 2019. Pytorch: An imperative style high-performance deep learning library. Advances in neural information processing systems (2019)."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4614-2361-4"},{"key":"e_1_3_3_2_40_2","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation OSDI 21","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao et\u00a0al. 2021. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. In 15th USENIX Symposium on Operating Systems Design and Implementation OSDI 21."},{"key":"e_1_3_3_2_41_2","volume-title":"Proceedings of NSDI","year":"2022","unstructured":"Qizhen et\u00a0al. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In Proceedings of NSDI."},{"key":"e_1_3_3_2_42_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_2_43_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research 21 1 (2020) 5485\u20135551."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Shafiee et\u00a0al. 2017. Fast YOLO: A fast you only look once system for real-time embedded object detection in video. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1709.05943 (2017).","DOI":"10.15353\/vsnl.v3i1.171"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Ren\u00e9 Sitters. 2017. Approximability of average completion time scheduling on unrelated machines. Math. Program. 161 (2017) 135\u2013158.","DOI":"10.1007\/s10107-016-1004-8"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Martin Skutella et\u00a0al. 2016. Unrelated Machine Scheduling with Stochastic Processing Times. Mathematics of Operation Research 41 3 (2016).","DOI":"10.1287\/moor.2015.0757"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613175"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00066"},{"key":"e_1_3_3_2_49_2","unstructured":"Hugo Touvron et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_2_50_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_2_51_2","volume-title":"Advances in neural information processing systems","year":"2017","unstructured":"Vaswani et\u00a0al. 2017. Attention is all you need. In Advances in neural information processing systems."},{"key":"e_1_3_3_2_52_2","first-page":"739","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang et\u00a0al. 2023. TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 739\u2013767."},{"key":"e_1_3_3_2_53_2","unstructured":"Jingfeng Wu Minxian Xu Yiyuan He Kejiang Ye and Chengzhong Xu. 2024. Cloudnativesim: A Toolkit for Modeling and Simulation of Cloud-Native Applications. Software: Practice and Experience (2024)."},{"key":"e_1_3_3_2_54_2","volume-title":"Proceedings of OSDI","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et\u00a0al. 2018. Gandiva: Introspective cluster scheduling for deep learning. In Proceedings of OSDI."},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/SmartCity.2015.143"},{"key":"e_1_3_3_2_56_2","unstructured":"Wutong Yang Minxian Xu Guozhong Li and Wenhong Tian. 2015. CloudSimNFV: modeling and simulation of energy-efficient NFV in cloud data centers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1509.05875 (2015)."},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503264"},{"key":"e_1_3_3_2_58_2","unstructured":"Hao Yu et\u00a0al. 2020. A Low Complexity Algorithm with \\(O(\\sqrt {T})\\) Regret and O(1) Constraint Violations for Online Convex Optimization with Long Term Constraints. The Journal of Machine Learning Research (2020)."},{"key":"e_1_3_3_2_59_2","first-page":"515","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao et\u00a0al. 2020. { HiveD} : Sharing a { GPU} cluster for deep learning with guarantees. In 14th USENIX symposium on operating systems design and implementation (OSDI 20). 515\u2013532."},{"key":"e_1_3_3_2_60_2","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng et\u00a0al. 2022. Alpa: Automating inter-and { Intra-Operator} parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559\u2013578."},{"key":"e_1_3_3_2_61_2","first-page":"703","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zheng Pengfei","year":"2023","unstructured":"Pengfei Zheng et\u00a0al. 2023. Shockwave: Fair and Efficient Cluster Scheduling for Dynamic Adaptation in Machine Learning. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 703\u2013723."}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3728488","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:02:41Z","timestamp":1755867761000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3728488"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":60,"alternative-id":["10.1145\/3721145.3728488","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3728488","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}