{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T08:45:34Z","timestamp":1773305134756,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707255","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"811-827","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["PCcheck: Persistent Concurrent Checkpointing for ML"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3364-2109","authenticated-orcid":false,"given":"Foteini","family":"Strati","sequence":"first","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5588-8617","authenticated-orcid":false,"given":"Michal","family":"Friedman","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8559-0529","authenticated-orcid":false,"given":"Ana","family":"Klimovic","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. A2 machine series Google Cloud. https:\/\/cloud.google.com\/compute\/docs\/gpus#a100-gpus. Accessed: 2024-10-24."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. GCP Network Bandwidth. https:\/\/cloud.google.com\/compute\/docs\/network-bandwidth. Accessed: 2023-07-23."},{"key":"e_1_3_2_1_3_1","unstructured":"[n. d.]. Google Cloud Persistent Disk Types. https:\/\/cloud.google.com\/compute\/docs\/disks#disk-types. Accessed: 2024-10-24."},{"key":"e_1_3_2_1_4_1","unstructured":"[n. d.]. GPUDirect Storage: A Direct Path Between Storage and GPU Memory. https:\/\/developer.nvidia.com\/blog\/gpudirect-storage\/. Accessed: 2023-08-05."},{"key":"e_1_3_2_1_5_1","unstructured":"[n. d.]. How to Overlap Data Transfers in CUDA C\/C. https:\/\/developer.nvidia.com\/blog\/how-overlap-data-transfers-cuda-cc\/. Accessed: 2024-06-20."},{"key":"e_1_3_2_1_6_1","unstructured":"[n. d.]. Hugging Face. https:\/\/huggingface.co\/. Accessed: 2023-07-23."},{"key":"e_1_3_2_1_7_1","unstructured":"[n. d.]. NCads H100 v5-series. https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/ncads-h100-v5. Accessed: 2024-06-20."},{"key":"e_1_3_2_1_8_1","unstructured":"[n. d.]. NVIDIA Deep Learning Examples. https:\/\/github.com\/NVIDIA\/DeepLearningExamples. Accessed: 2023-07-23."},{"key":"e_1_3_2_1_9_1","unstructured":"[n. d.]. The Stanford Question Answering Dataset. https:\/\/rajpurkar.github.io\/SQuAD-explorer\/. Accessed: 2024-10-24."},{"key":"e_1_3_2_1_10_1","unstructured":"[n. d.]. Torchvision. https:\/\/github.com\/pytorch\/vision. Accessed: 2023-07-23."},{"key":"e_1_3_2_1_11_1","unstructured":"[n. d.]. Unified Memory for CUDA Beginners. https:\/\/developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/. Accessed: 2023-08-06."},{"key":"e_1_3_2_1_12_1","unstructured":"[n. d.]. Wikitext. https:\/\/developer.ibm.com\/exchanges\/data\/all\/wikitext-103\/. Accessed: 2024-10-24."},{"key":"e_1_3_2_1_13_1","unstructured":"2022. Discontinuation of Intel Optane. Accessed: 2023-08-10."},{"key":"e_1_3_2_1_14_1","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Martin","year":"2016","unstructured":"Martin Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (Savannah, GA, USA). USENIX Association, USA, 265--283."},{"key":"e_1_3_2_1_15_1","unstructured":"Arize AI. 2022. The ML Observability Platform for Practitioners. https:\/\/arize.com\/. Accessed: 2023-08-10."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3565010.3569067"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_18_1","unstructured":"AWS. [n. d.]. AWS Spot Instance interruption notices. https:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/spotinstance- termination-notices.html. Accessed: 2024-06-16."},{"key":"e_1_3_2_1_19_1","volume-title":"FlashNeuron: SSDEnabled Large-Batch Training of Very Deep Neural Networks. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Bae Jonghyun","unstructured":"Jonghyun Bae, Jongsung Lee, Yunho Jin, Sam Son, Shine Kim, Hakbeom Jang, Tae Jun Ham, and Jae W. Lee. 2021. FlashNeuron: SSDEnabled Large-Batch Training of Very Deep Neural Networks. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 387--401. https:\/\/www.usenix.org\/conference\/ fast21\/presentation\/bae"},{"key":"e_1_3_2_1_20_1","volume-title":"Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar van der Wal.","author":"Biderman Stella","year":"2023","unstructured":"Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, USVSN Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar van der Wal. 2023. Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling. arXiv:2304.01373 [cs.CL]"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/214451.214456"},{"key":"e_1_3_2_1_22_1","volume-title":"MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems. CoRR abs\/1512.01274","author":"Chen Tianqi","year":"2015","unstructured":"Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. 2015. MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems. CoRR abs\/1512.01274 (2015). arXiv:1512.01274 http:\/\/arxiv.org\/abs\/1512.01274"},{"key":"e_1_3_2_1_23_1","unstructured":"CXL. 2022. Compute Express Link."},{"key":"e_1_3_2_1_24_1","unstructured":"Zihang Dai Zhilin Yang Yiming Yang Jaime Carbonell Quoc V. Le and Ruslan Salakhutdinov. 2019. Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context. arXiv:1901.02860 [cs.LG]"},{"key":"e_1_3_2_1_25_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL]","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1970386.1970387"},{"key":"e_1_3_2_1_27_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang and Angela Fan et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_28_1","volume-title":"Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Murali Annavaram, Krishnakumar Nair, and Misha Smelyanskiy.","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Murali Annavaram, Krishnakumar Nair, and Misha Smelyanskiy. 2022. Check-N-Run: A Checkpointing System for Training Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 929--943. https:\/\/www.usenix.org\/ conference\/nsdi22\/presentation\/eisenman"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/568522.568525"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751212"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"e_1_3_2_1_32_1","unstructured":"Google. [n. d.]. Google Cloud GPUs. https:\/\/cloud.google.com\/ compute\/docs\/gpus. Accessed: 2023-08-06."},{"key":"e_1_3_2_1_33_1","unstructured":"Google. [n. d.]. Google Cloud Spot VMs. https:\/\/cloud.google.com\/spotvms. Accessed: 2023-08-10."},{"key":"e_1_3_2_1_34_1","volume-title":"Hinton","author":"Graves Alex","year":"2013","unstructured":"Alex Graves, Abdel-rahman Mohamed, and Geoffrey E. Hinton. 2013. Speech Recognition with Deep Recurrent Neural Networks. CoRR abs\/1303.5778 (2013). arXiv:1303.5778 http:\/\/arxiv.org\/abs\/1303.5778"},{"key":"e_1_3_2_1_35_1","volume-title":"Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G. Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU Cluster Manager for Distributed Deep Learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). USENIX Association, Boston, MA, 485--500. https:\/\/www.usenix.org\/conference\/nsdi19\/presentation\/gu"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650085"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589105"},{"key":"e_1_3_2_1_39_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 947--960. https:\/\/www.usenix.org\/conference\/ atc19\/presentation\/jeon"},{"key":"e_1_3_2_1_40_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang Ziheng","year":"2024","unstructured":"Ziheng Jiang, Haibin Lin, Yinmin Zhong, Qi Huang, Yangrui Chen, Zhi Zhang, Yanghua Peng, Xiang Li, Cong Xie, Shibiao Nong, Yulu Jia, Sun He, Hongmin Chen, Zhihao Bai, Qi Hou, Shipeng Yan, Ding Zhou, Yiyao Sheng, Zhuo Jiang, Haohan Xu, Haoran Wei, Zhang Zhang, Pengfei Nie, Leqi Zou, Sida Zhao, Liang Xiang, Zherui Liu, Zhe Li, Xiaoying Jia, Jianxi Ye, Xin Jin, and Xin Liu. 2024. MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA, 745--760. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/jiang-ziheng"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2014.76"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.1987.232562"},{"key":"e_1_3_2_1_43_1","volume-title":"Deep Learning. Nature 521, 7553","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep Learning. Nature 521, 7553 (2015), 436--444. https:\/\/doi.org\/10.1038\/ nature14539"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551828"},{"key":"e_1_3_2_1_45_1","volume-title":"CPR: Understanding and Improving Failure Tolerant Training for Deep Learning Recommendation with Partial Recovery. arXiv:2011.02999 [cs.LG]","author":"Maeng Kiwan","year":"2020","unstructured":"Kiwan Maeng, Shivam Bharuka, Isabel Gao, Mark Jeffrey, Vikram Saraph, Bor-Yiing Su, Caroline Trippel, Jiyan Yang, Mike Rabbat, Brandon Lucia, and Carole-Jean Wu. 2020. CPR: Understanding and Improving Failure Tolerant Training for Deep Learning Recommendation with Partial Recovery. arXiv:2011.02999 [cs.LG]"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2014.6816685"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00035"},{"key":"e_1_3_2_1_48_1","unstructured":"Meta. 2022. Democratizing access to large-scale language models with OPT-175B. https:\/\/ai.facebook.com\/blog\/democratizing-accessto- large-scale-language-models-with-opt-175b\/."},{"key":"e_1_3_2_1_49_1","unstructured":"Microsoft. [n. d.]. Azure Spot Virtual Machines. https:\/\/learn.microsoft. com\/en-us\/azure\/virtual-machines\/spot-vms. Accessed: 2023-08-10."},{"key":"e_1_3_2_1_50_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies, FAST 2021","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies, FAST 2021, February 23-25, 2021. USENIX Association, 203--216. https:\/\/www.usenix. org\/conference\/fast21\/presentation\/mohan"},{"key":"e_1_3_2_1_51_1","volume-title":"Looking Beyond GPUs for DNN Scheduling on Multi-Tenant Clusters. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Mohan Jayashree","year":"2022","unstructured":"Jayashree Mohan, Amar Phanishayee, Janardhan Kulkarni, and Vijay Chidambaram. 2022. Looking Beyond GPUs for DNN Scheduling on Multi-Tenant Clusters. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 579--596. https:\/\/www.usenix.org\/conference\/osdi22\/ presentation\/mohan"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/2442516.2442527"},{"key":"e_1_3_2_1_53_1","volume-title":"Analysis and Exploitation of Dynamic Pricing in the Public Cloud for ML Training. In Workshop on Distributed Infrastructure, Systems, Programming, and AI.","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. 2020. Analysis and Exploitation of Dynamic Pricing in the Public Cloud for ML Training. In Workshop on Distributed Infrastructure, Systems, Programming, and AI."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507758"},{"key":"e_1_3_2_1_56_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_57_1","unstructured":"Aurick Qiao Bryon Aragam Bingjing Zhang and Eric Xing. 2018. Fault Tolerance in Iterative-Convergent Machine Learning. CoRR abs\/1810.07354. arXiv:1810.07354 http:\/\/arxiv.org\/abs\/1810.07354"},{"key":"e_1_3_2_1_58_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput- Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao Aurick","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput- Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, 1--18. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/qiao"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. ArXiv. https:\/\/www.microsoft.com\/enus\/ research\/publication\/zero-memory-optimizations-towardtraining- trillion-parameter-models\/","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.)","volume":"3","author":"Rauschmayr Nathalie","year":"2021","unstructured":"Nathalie Rauschmayr, Vikas Kumar, Rahul Huilgol, Andrea Olgiati, Satadal Bhattacharjee, Nihal Harish, Vandana Kannan, Amol Lele, Anirudh Acharya, Jared Nielsen, Lakshmi Ramakrishnan, Ishan Bhatt, Kohen Chia, Neelesh Dodda, Zhihan Li, Jiacheng Gu, Miyoung Choi, Balajee Nagarajan, Jeffrey Geevarghese, Denis Davydenko, Sifei Li, Lu Huang, Edward Kim, Tyler Hill, and Krishnaram Kenthapadi. 2021. Amazon SageMaker Debugger: A System for Real-Time Insights into Machine Learning Model Training. In Proceedings of Machine Learning and Systems, A. Smola, A. Dimakis, and I. Stoica (Eds.), Vol. 3. 770--782. https:\/\/proceedings.mlsys.org\/paper\/2021\/ file\/d1f491a404d6854880943e5c3cd9ca25-Paper.pdf"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2915966"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2102.06604"},{"key":"e_1_3_2_1_64_1","volume-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition. In International Conference on Learning Representations.","author":"Simonyan Karen","year":"2015","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3959.3962"},{"key":"e_1_3_2_1_66_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, 497--513. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/thorpe"},{"key":"e_1_3_2_1_67_1","volume-title":"Spotnik: Designing Distributed Machine Learning for Transient Cloud Resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)","author":"Wagenl\u00e4nder Marcel","year":"2020","unstructured":"Marcel Wagenl\u00e4nder, Luo Mai, Guo Li, and Peter Pietzuch. 2020. Spotnik: Designing Distributed Machine Learning for Transient Cloud Resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20). USENIX Association. https:\/\/www.usenix.org\/ conference\/hotcloud20\/presentation\/wagenl{\u00e4}nder"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567505"},{"key":"e_1_3_2_1_70_1","unstructured":"Weights and Biases. 2022. The AI developer platform. https:\/\/wandb. ai\/site. Accessed: 2023-08-10."},{"key":"e_1_3_2_1_71_1","volume-title":"Workshop,:, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, and Jonathan Tow et al.","year":"2023","unstructured":"BigScience Workshop,:, Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, and Jonathan Tow et al. 2023. BLOOM: A 176B-Parameter Open-Access Multilingual Language Model. arXiv:2211.05100 [cs.CL] https:\/\/arxiv.org\/abs\/2211. 05100"},{"key":"e_1_3_2_1_72_1","unstructured":"Carole-Jean Wu Ramya Raghavendra Udit Gupta Bilge Acun Newsha Ardalani Kiwan Maeng Gloria Chang Fiona Aga Behram James Huang Charles Bai Michael Gschwind Anurag Gupta Myle Ott Anastasia Melnikov Salvatore Candido David Brooks Geeta Chauhan Benjamin Lee Hsien-Hsin S. Lee Bugra Akyildiz Maximilian Balandat Joe Spisak Ravi Jain Mike Rabbat and Kim Hazelwood. 2021. Sustainable AI: Environmental Implications Challenges and Opportunities. (2021). arXiv:2111.00364 [cs.LG] https:\/\/arxiv.org\/abs\/2111.00364"},{"key":"e_1_3_2_1_73_1","volume-title":"Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. CoRR","author":"Wu Yonghui","year":"2016","unstructured":"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, and Jeffrey Dean. 2016. Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. CoRR (2016). arXiv:1609.08144 http:\/\/arxiv.org\/ abs\/1609.08144"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.5555\/3386691.3386708"},{"key":"e_1_3_2_1_75_1","volume-title":"Tianyi Zhang, Tri Dao, Beidi Chen, Percy Liang, Christopher Re, and Ce Zhang.","author":"Yuan Binhang","year":"2023","unstructured":"Binhang Yuan, Yongjun He, Jared Quincy Davis, Tianyi Zhang, Tri Dao, Beidi Chen, Percy Liang, Christopher Re, and Ce Zhang. 2023. Decentralized Training of Foundation Models in Heterogeneous Environments. arXiv:2206.01288 [cs.DC]"},{"key":"e_1_3_2_1_76_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL]"},{"key":"e_1_3_2_1_77_1","volume-title":"Fast Databases with Fast Durability and Recovery Through Multicore Parallelism. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Zheng Wenting","year":"2014","unstructured":"Wenting Zheng, Stephen Tu, Eddie Kohler, and Barbara Liskov. 2014. Fast Databases with Fast Durability and Recovery Through Multicore Parallelism. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). USENIX Association, Broomfield, CO, 465--477. https:\/\/www.usenix.org\/conference\/osdi14\/technicalsessions\/presentation\/zheng_wenting"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707255","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707255","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:52:08Z","timestamp":1755787928000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707255"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":77,"alternative-id":["10.1145\/3669940.3707255","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707255","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}