{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,19]],"date-time":"2026-04-19T16:56:05Z","timestamp":1776617765296,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Science and Technology Development Fund of Macau","award":["0024\/2022\/A1"],"award-info":[{"award-number":["0024\/2022\/A1"]}]},{"name":"the Science and Technology Development Fund of Macau","award":["0071\/2023\/ITP2"],"award-info":[{"award-number":["0071\/2023\/ITP2"]}]},{"name":"the Science and Technology Development Fund of Macau","award":["0081\/2022\/A2"],"award-info":[{"award-number":["0081\/2022\/A2"]}]},{"name":"the Science and Technology Development Fund of Macau","award":["0123\/2022\/AFJ"],"award-info":[{"award-number":["0123\/2022\/AFJ"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640375","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"499-513","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["Heet: Accelerating Elastic Training in Heterogeneous Deep Learning Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3590-4400","authenticated-orcid":false,"given":"Zizhao","family":"Mo","sequence":"first","affiliation":[{"name":"University of Macau, Macau SAR, China, Macau, 
Macao"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6657-1154","authenticated-orcid":false,"given":"Huanle","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Macau, Macau SAR, China, Macau, Macao"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9480-0356","authenticated-orcid":false,"given":"Chengzhong","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Macau, Macau SAR, China, Macau, Macao"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/www.alibabacloud.com\/product\/gpu","author":"Alibaba","year":"2023","unstructured":"Alibaba cloud. https:\/\/www.alibabacloud.com\/product\/gpu, 2023."},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/sifter.org\/~simon\/journal\/20061211.html","author":"Funk","year":"2023","unstructured":"Funk svd. https:\/\/sifter.org\/~simon\/journal\/20061211.html, 2023."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/cloud.google.com\/gpu","author":"Google","year":"2023","unstructured":"Google cloud. https:\/\/cloud.google.com\/gpu, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"accelerated multi-gpu collective communication. https:\/\/images.nvidia.com\/events\/sc15\/pdfs\/NCCL-Woolley.pdf","author":"Nccl","year":"2023","unstructured":"Nccl: accelerated multi-gpu collective communication. https:\/\/images.nvidia.com\/events\/sc15\/pdfs\/NCCL-Woolley.pdf, 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/www.dropbox.com\/scl\/fi\/kpzudtyls285lp3zhhzgv\/Tech.pdf?rlkey=glhq3rpaipaizaa1k04bl67t7&dl=0","author":"Report Technical","year":"2023","unstructured":"Technical Report. https:\/\/www.dropbox.com\/scl\/fi\/kpzudtyls285lp3zhhzgv\/Tech.pdf?rlkey=glhq3rpaipaizaa1k04bl67t7&dl=0, 2023."},{"key":"e_1_3_2_1_6_1","volume-title":"The bellkor 2008 solution to the netflix prize. 
Statistics Research Department at AT&T Research, 1(1)","author":"Bell Robert M","year":"2008","unstructured":"Robert M Bell, Yehuda Koren, and Chris Volinsky. The bellkor 2008 solution to the netflix prize. Statistics Research Department at AT&T Research, 1(1), 2008."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499368.2451125"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2644865.2541941"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3093337.3037703"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626334"},{"key":"e_1_3_2_1_16_1","volume-title":"Elasticflow: An elastic serverless training platform for distributed deep learning","author":"Gu Diandian","year":"2023","unstructured":"Diandian Gu, Yihao Zhao, Yinmin Zhong, Yifan Xiong, Zhenhua Han, Peng Cheng, Fan Yang, Gang Huang, Xin Jin, and Xuanzhe Liu. Elasticflow: An elastic serverless training platform for distributed deep learning. 2023."},{"key":"e_1_3_2_1_17_1","first-page":"485","volume-title":"NSDI","volume":"19","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Harry Liu, and Chuanxiong Guo. Tiresias: A gpu cluster manager for distributed deep learning. In NSDI, volume 19, pages 485--500, 2019."},{"key":"e_1_3_2_1_18_1","volume-title":"Los Alamos National Lab.(LANL)","author":"Hagberg Aric","year":"2008","unstructured":"Aric Hagberg, Pieter Swart, and Daniel S Chult. 
Exploring network structure, dynamics, and function using networkx. Technical report, Los Alamos National Lab.(LANL), Los Alamos, NM (United States), 2008."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/358916.358995"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476223"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575705"},{"key":"e_1_3_2_1_22_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.02174"},{"key":"e_1_3_2_1_24_1","first-page":"721","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. Elastic resource sharing for distributed deep learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21), pages 721--739, 2021."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of ATC","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. Analysis of large-scale multi-tenant GPU clusters for DNN training workloads. 
In Proceedings of ATC, 2019."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1002\/net.3230100205"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387547"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378499"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304009"},{"key":"e_1_3_2_1_32_1","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. Themis: Fair and efficient gpu cluster scheduling. In 17th USENIX Symposium on Networked Systems Design and Implementation, 2020."},{"key":"e_1_3_2_1_33_1","first-page":"937","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","author":"Mai Luo","year":"2020","unstructured":"Luo Mai, Guo Li, Marcel Wagenl\u00e4nder, Konstantinos Fertakis, Andrei-Octavian Brabete, and Peter Pietzuch. Kungfu: Making training in distributed machine learning adaptive. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation, pages 937--954, 2020."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of OSDI","author":"Mohan Jayashree","year":"2022","unstructured":"Jayashree Mohan, Amar Phanishayee, Janardhan Kulkarni, and Vijay Chidambaram. Looking beyond gpus for dnn scheduling on multi-tenant clusters. 
In Proceedings of OSDI, July 2022."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488793"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_38_1","first-page":"400","article-title":"Resource elasticity in distributed deep learning","volume":"2","author":"Or Andrew","year":"2020","unstructured":"Andrew Or, Haoyu Zhang, and Michael Freedman. Resource elasticity in distributed deep learning. Proceedings of Machine Learning and Systems, 2:400--411, 2020.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_39_1","first-page":"126","article-title":"Decoupling deep learning models from the underlying hardware","volume":"4","author":"Or Andrew","year":"2022","unstructured":"Andrew Or, Haoyu Zhang, and Michael Freedman. Virtualflow: Decoupling deep learning models from the underlying hardware. Proceedings of Machine Learning and Systems, 4:126--140, 2022.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_40_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_41_1","first-page":"631","volume-title":"Litz: Elastic framework for high-performance distributed machine learning. 
In 2018 { USENIX} Annual Technical Conference ({USENIX}{ATC } 18)","author":"Qiao Aurick","year":"2018","unstructured":"Aurick Qiao, Abutalib Aghayev, Weiren Yu, Haoyang Chen, Qirong Ho, Garth A Gibson, and Eric P Xing. Litz: Elastic framework for high-performance distributed machine learning. In 2018 { USENIX} Annual Technical Conference ({USENIX}{ATC } 18), pages 631--644, 2018."},{"key":"e_1_3_2_1_42_1","first-page":"1","volume-title":"OSDI","volume":"21","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R Ganger, and Eric P Xing. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. In OSDI, volume 21, pages 1--18, 2021."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"issue":"1","key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","first-page":"26","DOI":"10.2307\/2308012","article-title":"A remark on stirling's formula","volume":"10","author":"Robbins Herbert","year":"1955","unstructured":"Herbert Robbins. A remark on stirling's formula. The American Mathematical Monthly, 10(1):26--29, 1955.","journal-title":"The American Mathematical Monthly"},{"key":"e_1_3_2_1_45_1","volume-title":"Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. Horovod: fast and easy distributed deep learning in tensorflow. arXiv preprint arXiv:1802.05799, 2018."},{"key":"e_1_3_2_1_46_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multi-billion parameter language models using model parallelism. 
arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_47_1","volume-title":"A survey of collaborative filtering techniques. Advances in artificial intelligence","author":"Su Xiaoyuan","year":"2009","unstructured":"Xiaoyuan Su and Taghi M Khoshgoftaar. A survey of collaborative filtering techniques. Advances in artificial intelligence, 2009, 2009."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"e_1_3_2_1_49_1","first-page":"945","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. Mlaas in the wild: Workload analysis and scheduling in large-scale heterogeneous gpu clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22), pages 945--960. USENIX Association, 2022."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/507338.507355"},{"key":"e_1_3_2_1_51_1","first-page":"595","volume-title":"13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et al. Gandiva: Introspective cluster scheduling for deep learning. 
In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18), pages 595--610, 2018."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS47774.2020.00018"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386367.3432728"},{"key":"e_1_3_2_1_54_1","first-page":"119","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"You Jie","year":"2023","unstructured":"Jie You, Jae-Won Chung, and Mosharaf Chowdhury. Zeus: Understanding and optimizing {GPU} energy consumption of {DNN} training. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 119--139, 2023."},{"key":"e_1_3_2_1_55_1","first-page":"515","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, Zhi Yang, Quanlu Zhang, Fan Yang, Lidong Zhou, Mao Yang, Francis CM Lau, Yuqi Wang, Yifan Xiong, et al. Hived: Sharing a gpu cluster for deep learning with guarantees. 
In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation, pages 515--532, 2020."}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640375","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640375","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:41Z","timestamp":1750291421000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640375"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":55,"alternative-id":["10.1145\/3620665.3640375","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640375","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}