{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T11:11:24Z","timestamp":1777461084981,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2443515"],"award-info":[{"award-number":["2443515"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Science Foundation","award":["https:\/\/doi.org\/10.13039\/100000001"],"award-info":[{"award-number":["https:\/\/doi.org\/10.13039\/100000001"]}]},{"name":"National Science Foundation","award":["2402328"],"award-info":[{"award-number":["2402328"]}]},{"name":"National Science Foundation","award":["2323100"],"award-info":[{"award-number":["2323100"]}]},{"DOI":"10.13039\/100000001","name":"Institute for Information and communications Technology Promotion","doi-asserted-by":"publisher","award":["RS-2024-00428758"],"award-info":[{"award-number":["RS-2024-00428758"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3805621.3807617","type":"proceedings-article","created":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:08:45Z","timestamp":1777381725000},"page":"31-40","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Cost-Efficient Training and Checkpointing for Large Models on Preemptible Cloud VMs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5678-7434","authenticated-orcid":false,"given":"Omkar","family":"Desai","sequence":"first","affiliation":[{"name":"Syracuse University, Syracuse, New York, 
USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8649-4253","authenticated-orcid":false,"given":"Shuyi","family":"Pei","sequence":"additional","affiliation":[{"name":"Samsung Semiconductor, Inc., San Jose, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4421-9923","authenticated-orcid":false,"given":"Janki","family":"Bhimani","sequence":"additional","affiliation":[{"name":"Florida International University, Miami, FL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3689-3985","authenticated-orcid":false,"given":"Bryan S.","family":"Kim","sequence":"additional","affiliation":[{"name":"Syracuse University, Syracuse, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. Gpt-4 technical report. arXiv preprint arXiv:2303.08774","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. Gpt-4 technical report. arXiv preprint arXiv:2303.08774, 2023."},{"key":"e_1_3_2_1_2_1","first-page":"487","volume-title":"Proceedings of the Seventeenth European Conference on Computer Systems","author":"Athlur Sanjith","year":"2022","unstructured":"Sanjith Athlur, Nitika Saran, Muthian Sivathanu, Ramachandran Ramjee, and Nipun Kwatra. Varuna: scalable, low-cost training of massive deep learning models. In Proceedings of the Seventeenth European Conference on Computer Systems, pages 472\u2013487, 2022."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/aws.amazon.com\/ec2\/spot\/pricing\/","author":"AWS.","year":"2025","unstructured":"AWS. Amazon ec2 spot instances pricing. 
https:\/\/aws.amazon.com\/ec2\/spot\/pricing\/, 2025."},{"key":"e_1_3_2_1_4_1","volume-title":"Amazon EC2 P3 instances. accelerate machine learning and high performance computing applications with powerful GPUs. https:\/\/aws.amazon.com\/ec2\/instance-types\/p3\/","author":"Amazon Web","year":"2023","unstructured":"Amazon Web Services (AWS). Amazon EC2 P3 instances. accelerate machine learning and high performance computing applications with powerful GPUs. https:\/\/aws.amazon.com\/ec2\/instance-types\/p3\/, 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"Amazon ec2 spot instances. https:\/\/aws.amazon.com\/ec2\/spot\/instance-advisor\/","author":"Amazon Web","year":"2025","unstructured":"Amazon Web Services (AWS). Amazon ec2 spot instances. https:\/\/aws.amazon.com\/ec2\/spot\/instance-advisor\/, 2025."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/sizes\/gpu-accelerated\/nca100v4-series?tabs=sizebasic","year":"2024","unstructured":"Azure. Nc_a100_v4 sizes series. https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/sizes\/gpu-accelerated\/nca100v4-series?tabs=sizebasic, 2024."},{"key":"e_1_3_2_1_7_1","volume-title":"Azure spot virtual machines pricings. https:\/\/azure.microsoft.com\/en-us\/pricing\/spot-advisor\/","year":"2025","unstructured":"Azure. Azure spot virtual machines pricings. https:\/\/azure.microsoft.com\/en-us\/pricing\/spot-advisor\/, 2025."},{"key":"e_1_3_2_1_8_1","volume-title":"Checkmate: Zero-overhead model checkpointing via network gradient replication. arXiv preprint arXiv:2507.13522","author":"Bhardwaj Ankit","year":"2025","unstructured":"Ankit Bhardwaj, Weiyang Wang, Jeremy Carin, Adam Belay, and Manya Ghobadi. Checkmate: Zero-overhead model checkpointing via network gradient replication. arXiv preprint arXiv:2507.13522, 2025."},{"key":"e_1_3_2_1_9_1","volume-title":"Language models are few-shot learners. 
Advances in neural information processing systems, 33:1877\u20131901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877\u20131901, 2020."},{"key":"e_1_3_2_1_10_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. Journal of machine learning research, 24(240):1\u2013113","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. Journal of machine learning research, 24(240):1\u2013113, 2023."},{"key":"e_1_3_2_1_11_1","volume-title":"A higher order estimate of the optimum checkpoint interval for restart dumps. Future generation computer systems, 22(3):303\u2013312","author":"Daly John T","year":"2006","unstructured":"John T Daly. A higher order estimate of the optimum checkpoint interval for restart dumps. Future generation computer systems, 22(3):303\u2013312, 2006."},{"key":"e_1_3_2_1_12_1","first-page":"238","volume-title":"24th USENIX Conference on File and Storage Technologies (FAST 26)","author":"Desai Omkar","year":"2026","unstructured":"Omkar Desai, Ziyang Jiao, Shuyi Pei, Janki Bhimani, and Bryan S Kim. Preparation meets opportunity: Enhancing data preprocessing for ML training with seneca. In 24th USENIX Conference on File and Storage Technologies (FAST 26), pages 221\u2013238, 2026."},{"key":"e_1_3_2_1_13_1","volume-title":"https:\/\/www.amd.com\/en\/products\/processors\/server\/epyc\/7003-series\/amd-epyc-7413.html","author":"Devices Advanced Micro","year":"2024","unstructured":"Advanced Micro Devices. AMD EPYC 7413. 
https:\/\/www.amd.com\/en\/products\/processors\/server\/epyc\/7003-series\/amd-epyc-7413.html, 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers), pages 4171\u20134186, 2019."},{"key":"e_1_3_2_1_15_1","volume-title":"Springer","author":"Dongarra Jack","year":"2015","unstructured":"Jack Dongarra, Thomas Herault, and Yves Robert. Fault tolerance techniques for high-performance computing. In Fault-tolerance techniques for high-performance computing, pages 3\u201385. Springer, 2015."},{"key":"e_1_3_2_1_16_1","first-page":"1139","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Duan Jiangfei","year":"2024","unstructured":"Jiangfei Duan, Ziang Song, Xupeng Miao, Xiaoli Xi, Dahua Lin, Harry Xu, Minjia Zhang, and Zhihao Jia. Parcae: Proactive, Liveput-Optimized DNN training on preemptible instances. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 1121\u20131139, 2024."},{"key":"e_1_3_2_1_17_1","first-page":"14","volume-title":"USENIX Annual Technical Conference (ATC)","author":"Duplyakin Dmitry","year":"2019","unstructured":"Dmitry Duplyakin, Robert Ricci, Aleksander Maricq, Gary Wong, Jonathon Duerig, Eric Eide, Leigh Stoller, Mike Hibler, David Johnson, Kirk Webb, et al. The design and operation of CloudLab. In USENIX Annual Technical Conference (ATC), pages 1\u201314, 2019. 
https:\/\/www.usenix.org\/conference\/atc19\/presentation\/duplyakin."},{"issue":"120","key":"e_1_3_2_1_18_1","first-page":"1","article-title":"Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, 23(120):1\u201339, 2022.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_19_1","first-page":"1125","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems","author":"Gupta Tanmaey","year":"2024","unstructured":"Tanmaey Gupta, Sanjeev Krishnan, Rituraj Kumar, Abhishek Vijeev, Bhargav Gulavani, Nipun Kwatra, Ramachandran Ramjee, and Muthian Sivathanu. Just-in-time checkpointing: Low cost error recovery from deep learning training failures. In Proceedings of the Nineteenth European Conference on Computer Systems, pages 1110\u20131125, 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, and zhifeng Chen. GPipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_21_1","first-page":"395","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Jang Insu","year":"2023","unstructured":"Insu Jang, Zhenning Yang, Zhen Zhang, Xin Jin, and Mosharaf Chowdhury. Oobleck: Resilient distributed training of large models using pipeline templates. 
In Proceedings of the 29th Symposium on Operating Systems Principles, pages 382\u2013395, 2023."},{"key":"e_1_3_2_1_22_1","first-page":"960","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. Analysis of Large-Scale Multi-Tenant GPU clusters for DNN training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 947\u2013960, 2019."},{"key":"e_1_3_2_1_23_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. Mixtral of experts. arXiv preprint arXiv:2401.04088","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. Mixtral of experts. arXiv preprint arXiv:2401.04088, 2024."},{"key":"e_1_3_2_1_24_1","first-page":"760","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, et al. MegaScale: Scaling large language model training to more than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 745\u2013760, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. Scaling laws for neural language models. 
arXiv preprint arXiv:2001.08361, 2020."},{"key":"e_1_3_2_1_26_1","volume-title":"ExCP: Extreme LLM checkpoint compression via weight-momentum joint shrinking. arXiv preprint arXiv:2406.11257","author":"Li Wenshuo","year":"2024","unstructured":"Wenshuo Li, Xinghao Chen, Han Shu, Yehui Tang, and Yunhe Wang. ExCP: Extreme LLM checkpoint compression via weight-momentum joint shrinking. arXiv preprint arXiv:2406.11257, 2024."},{"key":"e_1_3_2_1_27_1","unstructured":"Xinyu Lian Sam Ade Jacobs Lev Kurilenko Masahiro Tanaka Stas Bekman Olatunji Ruwase and Minjia Zhang. Universal checkpointing: Efficient and flexible checkpointing for large scale distributed training. arXiv preprint arXiv:2406.18820 2024."},{"key":"e_1_3_2_1_28_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems, 36:34892\u201334916","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. Visual instruction tuning. Advances in neural information processing systems, 36:34892\u201334916, 2023."},{"key":"e_1_3_2_1_29_1","volume-title":"24th USENIX Conference on File and Storage Technologies (FAST 26)","author":"Liu Weijie","year":"2026","unstructured":"Weijie Liu, Shengwei Li, Zhiquan Lai, Keshi Ge, Qiaoling Chen, Peng Sun, Dongsheng Li, and Kai Lu. AdaCheck: Adaptive checkpointing for large language model training. In 24th USENIX Conference on File and Storage Technologies (FAST 26), 2026."},{"key":"e_1_3_2_1_30_1","first-page":"954","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Mai Luo","year":"2020","unstructured":"Luo Mai, Guo Li, Marcel Wagenl\u00e4nder, Konstantinos Fertakis, Andrei-Octavian Brabete, and Peter Pietzuch. KungFu: Making training in distributed machine learning adaptive. 
In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20), pages 937\u2013954, 2020."},{"key":"e_1_3_2_1_31_1","first-page":"239","volume-title":"Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing","author":"Maurya Avinash","year":"2024","unstructured":"Avinash Maurya, Robert Underwood, M Mustafa Rafique, Franck Cappello, and Bogdan Nicolae. Datastates-LLM: Lazy asynchronous checkpointing for large language models. In Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing, pages 227\u2013239, 2024."},{"key":"e_1_3_2_1_32_1","first-page":"216","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. CheckFreq: Frequent, Fine-Grained DNN checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21), pages 203\u2013216, 2021."},{"key":"e_1_3_2_1_33_1","volume-title":"https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-nvidia-us-2188504-web.pdf","author":"NVIDIA. NVIDIA","year":"2024","unstructured":"NVIDIA. NVIDIA A100. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-nvidia-us-2188504-web.pdf, 2024."},{"key":"e_1_3_2_1_34_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_35_1","volume-title":"Spot vms pricing. 
https:\/\/cloud.google.com\/spot-vms\/pricing?hl=en","author":"Platform Google Compute","year":"2025","unstructured":"Google Compute Platform. Spot vms pricing. https:\/\/cloud.google.com\/spot-vms\/pricing?hl=en, 2025."},{"key":"e_1_3_2_1_36_1","volume-title":"torch.distributed.elastic. https:\/\/docs.pytorch.org\/docs\/stable\/distributed.elastic.html","year":"2024","unstructured":"PyTorch. torch.distributed.elastic. https:\/\/docs.pytorch.org\/docs\/stable\/distributed.elastic.html, 2024."},{"issue":"8","key":"e_1_3_2_1_37_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019.","journal-title":"OpenAI blog"},{"key":"e_1_3_2_1_38_1","first-page":"16","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimizations toward training trillion parameter models. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201316. IEEE, 2020."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_40_1","first-page":"564","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. Zero-offload: Democratizing billion-scale model training. 
In 2021 USENIX Annual Technical Conference (USENIX ATC 21), pages 551\u2013564, 2021."},{"key":"e_1_3_2_1_41_1","first-page":"116","volume-title":"Proceedings of the 4th Workshop on Machine Learning and Systems","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Paul Elvinger, Tolga Kerimoglu, and Ana Klimovic. ML training with cloud GPU shortages: Is cross-region the answer? In Proceedings of the 4th Workshop on Machine Learning and Systems, pages 107\u2013116, 2024."},{"key":"e_1_3_2_1_42_1","first-page":"827","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Strati Foteini","year":"2025","unstructured":"Foteini Strati, Michal Friedman, and Ana Klimovic. Pccheck: Persistent concurrent checkpointing for ML. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1, pages 811\u2013827, 2025."},{"key":"e_1_3_2_1_43_1","first-page":"220","volume-title":"Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles","author":"Strati Foteini","year":"2025","unstructured":"Foteini Strati, Zhendong Zhang, George Manos, Ixeia S\u00e1nchez P\u00e9riz, Qinghao Hu, Tiancheng Chen, Berk Buzcu, Song Han, Pamela Delgado, and Ana Klimovic. Sailor: Automating distributed training over dynamic, heterogeneous, and geo-distributed clusters. In Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles, pages 204\u2013220, 2025."},{"key":"e_1_3_2_1_44_1","first-page":"513","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. Bamboo: Making preemptible instances resilient for affordable training of large DNNs. 
In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 497\u2013513, 2023."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_2_1_46_1","volume-title":"12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)","author":"Wagenl\u00e4nder Marcel","year":"2020","unstructured":"Marcel Wagenl\u00e4nder, Luo Mai, Guo Li, and Peter Pietzuch. Spotnik: Designing distributed machine learning for transient cloud resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20), 2020."},{"key":"e_1_3_2_1_47_1","volume-title":"Fast-persist: Accelerating model checkpointing in deep learning. arXiv preprint arXiv:2406.13768","author":"Wang Guanhua","year":"2024","unstructured":"Guanhua Wang, Olatunji Ruwase, Bing Xie, and Yuxiong He. Fast-persist: Accelerating model checkpointing in deep learning. arXiv preprint arXiv:2406.13768, 2024."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_49_1","unstructured":"BigScience Workshop Teven Le Scao Angela Fan Christopher Akiki Ellie Pavlick Suzana Ili\u0107 Daniel Hesslow Roman Castagn\u00e9 Alexandra Sasha Luccioni Fran\u00e7ois Yvon et al. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100 2022."},{"key":"e_1_3_2_1_50_1","first-page":"203","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Wu Zhanghao","year":"2024","unstructured":"Zhanghao Wu, Wei-Lin Chiang, Ziming Mao, Zongheng Yang, Eric Friedman, Scott Shenker, and Ion Stoica. Can't be late: optimizing spot instance savings under deadlines. 
In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pages 185\u2013203, 2024."},{"key":"e_1_3_2_1_51_1","first-page":"455","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Yang Zongheng","year":"2023","unstructured":"Zongheng Yang, Zhanghao Wu, Michael Luo, Wei-Lin Chiang, Romil Bhardwaj, Woosuk Kwon, Siyuan Zhuang, Frank Sifei Luan, Gautam Mittal, Scott Shenker, et al. SkyPilot: An intercloud broker for sky computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 437\u2013455, 2023."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/361147.361115"},{"key":"e_1_3_2_1_53_1","volume-title":"Are larger pretrained language models uniformly better? comparing performance at the instance level. arXiv preprint arXiv:2105.06020","author":"Zhong Ruiqi","year":"2021","unstructured":"Ruiqi Zhong, Dhruba Ghosh, Dan Klein, and Jacob Steinhardt. Are larger pretrained language models uniformly better? comparing performance at the instance level. 
arXiv preprint arXiv:2105.06020, 2021."}],"event":{"name":"EuroSys '26: 21st European Conference on Computer Systems","location":"Edinburgh Scotland Uk","acronym":"EuroMLSys '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Sixth European Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3805621.3807617","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:13:42Z","timestamp":1777382022000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805621.3807617"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,27]]},"references-count":53,"alternative-id":["10.1145\/3805621.3807617","10.1145\/3805621"],"URL":"https:\/\/doi.org\/10.1145\/3805621.3807617","relation":{},"subject":[],"published":{"date-parts":[[2026,4,27]]},"assertion":[{"value":"2026-04-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}