{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:12:54Z","timestamp":1765465974331,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3760250.3762231","type":"proceedings-article","created":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:06:36Z","timestamp":1765465596000},"page":"117-131","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["GFS: A Preemption-aware Scheduling Framework for GPU Clusters with Predictive Spot Instance Management"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0629-4865","authenticated-orcid":false,"given":"Jiaang","family":"Duan","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3055-7227","authenticated-orcid":false,"given":"Shenglin","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7775-1740","authenticated-orcid":false,"given":"Shiyou","family":"Qian","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8156-3926","authenticated-orcid":false,"given":"Dingyu","family":"Yang","sequence":"additional","affiliation":[{"name":"The State Key Laboratory of Blockchain and Data Security, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5607-1607","authenticated-orcid":false,"given":"Kangjin","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5645-4167","authenticated-orcid":false,"given":"Chenzhi","family":"Liao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2744-845X","authenticated-orcid":false,"given":"Yinghao","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2846-6083","authenticated-orcid":false,"given":"Qin","family":"Hua","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6439-5169","authenticated-orcid":false,"given":"Hanwen","family":"Hu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2384-8807","authenticated-orcid":false,"given":"Qi","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0814-0919","authenticated-orcid":false,"given":"Wenchao","family":"Wu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4873-8736","authenticated-orcid":false,"given":"Dongqing","family":"Bao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0360-8912","authenticated-orcid":false,"given":"Tianyu","family":"Lu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0036-9436","authenticated-orcid":false,"given":"Jian","family":"Cao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1617-3593","authenticated-orcid":false,"given":"Guangtao","family":"Xue","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, 
Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1908-071X","authenticated-orcid":false,"given":"Guodong","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2334-3471","authenticated-orcid":false,"given":"Liping","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7483-0045","authenticated-orcid":false,"given":"Gang","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2009. Amazon EC2 Spot Instances. https:\/\/aws.amazon.com\/ec2\/spot\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2015. Google Compute Engine. https:\/\/cloud.google.com\/compute\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2018. Use Azure Spot Virtual Machines. https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/spot-vms."},{"key":"e_1_3_2_1_4_1","unstructured":"2025. Alibaba cluster trace program. https:\/\/github.com\/alibaba\/clusterdata."},{"key":"e_1_3_2_1_5_1","unstructured":"2025. Aliyun Pricing. https:\/\/www.aliyun.com\/price."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/UKSim.2014.67"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_8_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Bai Zhihao","year":"2020","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. PipeSwitch: fast pipelined context switching for deep learning applications. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4--6, 2020 (OSDI'20). Article 28, 16 pages. 
https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/bai"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference, USENIX ATC 2020, July 15--17, 2020 (USENIX ATC'20). Article 31","author":"Boucher Sol","year":"2020","unstructured":"Sol Boucher, Anuj Kalia, David G. Andersen, and Michael Kaminsky. 2020. Lightweight preemptible functions. In Proceedings of the 2020 USENIX Annual Technical Conference, USENIX ATC 2020, July 15--17, 2020 (USENIX ATC'20). Article 31, 13 pages. https:\/\/www.usenix.org\/conference\/atc20\/presentation\/boucher"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jspi.2008.04.020"},{"key":"e_1_3_2_1_11_1","volume-title":"Comparing the Pearson and Spearman correlation coefficients across distributions and sample sizes: A tutorial using simulations and empirical data. Psychological methods 21, 3","author":"De Winter Joost CF","year":"2016","unstructured":"Joost CF De Winter, Samuel D Gosling, and Jeff Potter. 2016. Comparing the Pearson and Spearman correlation coefficients across distributions and sample sizes: A tutorial using simulations and empirical data. 
Psychological methods 21, 3 (2016), 273."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aur\u00e9lien Rodriguez Austen Gregerson Ava Spataru Baptiste Rozi\u00e8re Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric Michael Smith Filip Radenovic Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia Lewis Anderson Graeme Nail Gr\u00e9goire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol Arrieta Ibarra Isabel M. Kloumann Ishan Misra Ivan Evtimov Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan Vasuden Alwala Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone and et al. 2024. The Llama 3 Herd of Models. CoRR abs\/2407.21783 (2024). 
arXiv:2407.21783 doi:10.48550\/ARXIV.2407.21783","DOI":"10.48550\/ARXIV.2407.21783"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10479-005-3446-x"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3430063"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671969"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392671"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGC.2012.61"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD.2017.21"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00093"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587445"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_2_1_24_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies, FAST 2021","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies, FAST 2021, February 23--25, 2021. 203--216. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/mohan"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0022-2496(02)00028-7"},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. CoRR abs\/2303.08774 (2023). arXiv:2303.08774 doi:10.48550\/ARXIV.2303.08774"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","unstructured":"David Salinas Valentin Flunkert Jan Gasthaus and Tim Januschowski. 2020. 
DeepAR: Probabilistic forecasting with autoregressive recurrent networks. International journal of forecasting 36 3 (2020) 1181--1191. doi:10.1016\/j.ijforecast.2019.07.001","DOI":"10.1016\/j.ijforecast.2019.07.001"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-12597-3_8"},{"key":"e_1_3_2_1_29_1","unstructured":"Jaroslaw Sobieszczanski-Sobieski. 1982. A linear decomposition method for large optimization problems. Blueprint for development. Technical Report. https:\/\/ntrs.nasa.gov\/citations\/19820014371"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.24963\/IJCAI.2020\/458"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1080\/00031305.2017.1380080"},{"key":"e_1_3_2_1_32_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023, Boston, MA, April 17--19, 2023. 497--513. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/thorpe"},{"key":"e_1_3_2_1_33_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4--9, 2017, Long Beach, CA, USA. 
https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2523616.2523633"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 2023 USENIX Annual Technical Conference, USENIX ATC 2023","author":"Weng Qizhen","year":"2023","unstructured":"Qizhen Weng, Lingyun Yang, Yinghao Yu, Wei Wang, Xiaochuan Tang, Guodong Yang, and Liping Zhang. 2023. Beware of Fragmentation: Scheduling GPU-Sharing Workloads with Fragmentation Gradient Descent. In Proceedings of the 2023 USENIX Annual Technical Conference, USENIX ATC 2023, Boston, MA, USA, July 10--12, 2023. USENIX Association, 995--1008. https:\/\/www.usenix.org\/conference\/atc23\/presentation\/weng"},{"key":"e_1_3_2_1_36_1","volume-title":"Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Wu Haixu","year":"2021","unstructured":"Haixu Wu, Jiehui Xu, Jianmin Wang, and Mingsheng Long. 2021. Autoformer: Decomposition transformers with auto-correlation for long-term series forecasting. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6--14, 2021, virtual. 22419--22430. https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/bcc0d400288793e8bdcd7c19a8ac0c2b-Abstract.html"},{"key":"e_1_3_2_1_37_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: introspective cluster scheduling for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8--10, 2018. 595--610. 
https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao"},{"key":"e_1_3_2_1_38_1","volume-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. AntMan: Dynamic Scaling on GPU Clusters for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4--6, 2020. 533--548. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/xiao"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3487553.3524229"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582028"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V37I9.26317"},{"key":"e_1_3_2_1_42_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023, Boston, MA, April 17--19, 2023. 787--808. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhang-hong"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V35I12.17325"},{"key":"e_1_3_2_1_44_1","volume-title":"International Conference on Machine Learning, ICML 2022","author":"Zhou Tian","year":"2022","unstructured":"Tian Zhou, Ziqing Ma, Qingsong Wen, Xue Wang, Liang Sun, and Rong Jin. 2022. Fedformer: Frequency enhanced decomposed transformer for long-term series forecasting. In International Conference on Machine Learning, ICML 2022, 17--23 July 2022, Baltimore, Maryland, USA. 27268--27286. 
https:\/\/proceedings.mlr.press\/v162\/zhou22g.html"}],"event":{"name":"ASPLOS '26:31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3760250.3762231","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:07:57Z","timestamp":1765465677000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3760250.3762231"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,11]]},"references-count":44,"alternative-id":["10.1145\/3760250.3762231","10.1145\/3760250"],"URL":"https:\/\/doi.org\/10.1145\/3760250.3762231","relation":{},"subject":[],"published":{"date-parts":[[2025,12,11]]},"assertion":[{"value":"2025-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}