{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,11]],"date-time":"2026-07-11T15:41:54Z","timestamp":1783784514727,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3689031.3717459","type":"proceedings-article","created":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T06:25:20Z","timestamp":1742970320000},"page":"159-175","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["SkyServe: Serving AI Models across Regions and Clouds with Spot Instances"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7738-2498","authenticated-orcid":false,"given":"Ziming","family":"Mao","sequence":"first","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0895-4585","authenticated-orcid":false,"given":"Tian","family":"Xia","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2352-4002","authenticated-orcid":false,"given":"Zhanghao","family":"Wu","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0105-723X","authenticated-orcid":false,"given":"Wei-Lin","family":"Chiang","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4655-0229","authenticated-orcid":false,"given":"Tyler","family":"Griggs","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7314-1643","authenticated-orcid":false,"given":"Romil","family":"Bhardwaj","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8716-8743","authenticated-orcid":false,"given":"Zongheng","family":"Yang","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1357-7533","authenticated-orcid":false,"given":"Scott","family":"Shenker","sequence":"additional","affiliation":[{"name":"UC Berkeley and ICSI"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5373-0088","authenticated-orcid":false,"given":"Ion","family":"Stoica","sequence":"additional","affiliation":[{"name":"UC Berkeley"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Amazon EC2 Auto Scaling with EC2 Spot Instances. https:\/\/aws.amazon.com\/tutorials\/ec2-auto-scaling-spot-instances\/. Accessed","year":"2024","unstructured":"2024. Amazon EC2 Auto Scaling with EC2 Spot Instances. https:\/\/aws.amazon.com\/tutorials\/ec2-auto-scaling-spot-instances\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_2_1","volume-title":"Amazon EC2 Burstable Performance Instances. https:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/burstable-performance-instances.html. Accessed","year":"2024","unstructured":"2024. Amazon EC2 Burstable Performance Instances. https:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/burstable-performance-instances.html. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/aws.amazon.com\/sagemaker\/. Accessed","author":"SageMaker Amazon","year":"2024","unstructured":"2024. Amazon SageMaker. https:\/\/aws.amazon.com\/sagemaker\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"Accessed","year":"2024","unstructured":"2024. AWS Autoscaling Group. https:\/\/docs.aws.amazon.com\/autoscaling\/ec2\/userguide\/auto-scaling-groups.html. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/aws.amazon.com\/pm\/lambda\/. Accessed","author":"Lambda AWS","year":"2024","unstructured":"2024. AWS Lambda. https:\/\/aws.amazon.com\/pm\/lambda\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/aws.amazon.com\/pricing. Accessed","author":"Pricing AWS","year":"2024","unstructured":"2024. AWS Pricing. https:\/\/aws.amazon.com\/pricing. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_7_1","volume-title":"Accessed","year":"2024","unstructured":"2024. AWS Spot Instance Advisor. https:\/\/aws.amazon.com\/ec2\/spot\/instance-advisor\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/azure.microsoft.com\/en-us\/pricing. Accessed","author":"Pricing Azure","year":"2024","unstructured":"2024. Azure Pricing. https:\/\/azure.microsoft.com\/en-us\/pricing. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Bard: A conversational AI tool by Google. https:\/\/bard.google.com. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"Accessed","year":"2024","unstructured":"2024. CloudOps for Spot Instances. https:\/\/spot.io\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/openai.com\/dall-e-3. Accessed","year":"2024","unstructured":"2024. DALL\u00b7E3. https:\/\/openai.com\/dall-e-3. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"Accessed","year":"2024","unstructured":"2024. The Desperate Hunt for the A.I. Boom's Most Indispensable Prize. https:\/\/www.nytimes.com\/2023\/08\/16\/technology\/ai-gpuchips-shortage.html. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Focus: For tech giants, AI like Bing and Bard poses billion-dollar search problem. https:\/\/www.reuters.com\/technology\/tech-giants-ai-like-bing-bard-poses-billion-dollar-search-problem-2023-02-22\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"https:\/\/cloud.google.com\/pricing. Accessed","author":"Pricing GCP","year":"2024","unstructured":"2024. GCP Pricing. https:\/\/cloud.google.com\/pricing. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"Accessed","author":"Copilot Github","year":"2024","unstructured":"2024. Github Copilot: The world's most widely adopted AI developer tool. https:\/\/github.com\/features\/copilot. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Google Kubernetes Engine (GKE). https:\/\/cloud.google.com\/kubernetes-engine?hl=en. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Grammarly: AI Writing Assistance. https:\/\/www.grammarly.com\/ai. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_18_1","volume-title":"Accessed","year":"2024","unstructured":"2024. How To Reduce Cold Start Times For LLM Inference. https:\/\/scale.com\/blog\/reduce-cold-start-time-llm-inference. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"Accessed","year":"2024","unstructured":"2024. HuggingFace Chatbot Arena Conversation. https:\/\/huggingface.co\/datasets\/lmsys\/chatbot_arena_conversations. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"https:\/\/openai.com\/index\/chatgpt\/. Accessed","author":"Introducing","year":"2024","unstructured":"2024. Introducing ChatGPT. https:\/\/openai.com\/index\/chatgpt\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_21_1","volume-title":"Loading Llama-2 70b 20x faster with Anyscale Endpoints. https:\/\/www.anyscale.com\/blog\/loading-llama-2-70b-20x-faster-with-anyscale-endpoints. Accessed","year":"2024","unstructured":"2024. Loading Llama-2 70b 20x faster with Anyscale Endpoints. https:\/\/www.anyscale.com\/blog\/loading-llama-2-70b-20x-faster-with-anyscale-endpoints. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_22_1","volume-title":"https:\/\/www.midjourney.com\/home. Accessed","year":"2024","unstructured":"2024. Midjourney. https:\/\/www.midjourney.com\/home. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_23_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Navigating the High Cost of AI Compute. https:\/\/a16z.com\/navigating-the-high-cost-of-ai-compute\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_24_1","volume-title":"Accessed","year":"2024","unstructured":"2024. NVIDIA Triton Inference Server. https:\/\/developer.nvidia.com\/triton-inference-server\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"https:\/\/openai.com\/blog\/openai-api. Accessed","author":"AI","year":"2024","unstructured":"2024. OpenAI API. https:\/\/openai.com\/blog\/openai-api. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_26_1","volume-title":"Accessed","author":"Serve Ray","year":"2024","unstructured":"2024. Ray Serve: Scalable and Programmable Serving. https:\/\/docs.ray.io\/en\/latest\/serve\/index.html. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_27_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Running a GKE application on spot nodes with on-demand nodes as fallback. https:\/\/cloud.google.com\/blog\/topics\/developers-practitioners\/running-gke-application-spot-nodes-demand-nodes-fallback. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_28_1","volume-title":"Accessed","year":"2024","unstructured":"2024. SpotServe: A Cost-Effective Spot Instance Serving Framework. https:\/\/github.com\/Hsword\/SpotServe. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_29_1","volume-title":"Accessed","year":"2024","unstructured":"2024. Text Generation Inference (TGI). https:\/\/github.com\/huggingface\/text-generation-inference. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_30_1","volume-title":"Unofficial OpenAI Status. https:\/\/openai-status.llm-utils.org.Accessed","year":"2024","unstructured":"2024. Unofficial OpenAI Status. https:\/\/openai-status.llm-utils.org.Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"https:\/\/cloud.google.com\/vertex-ai. Accessed","author":"Vertex","year":"2024","unstructured":"2024. Vertex AI. https:\/\/cloud.google.com\/vertex-ai. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_32_1","volume-title":"Accessed","year":"2025","unstructured":"2025. Anthropic: Claude 3.5 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet. Accessed: Feb. 13, 2025."},{"key":"e_1_3_2_1_33_1","volume-title":"Accessed","author":"Gemini Google","year":"2025","unstructured":"2025. Google Gemini: Supercharge your creativity and productivity. https:\/\/gemini.google.com. Accessed: Feb. 13, 2025."},{"key":"e_1_3_2_1_34_1","volume-title":"Accessed","year":"2025","unstructured":"2025. Serverless Endpoints for leading open-source models. https:\/\/www.together.ai\/products_inference\/. Accessed: Feb. 13, 2025."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341617.3326143"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E.2019.00-10"},{"key":"e_1_3_2_1_38_1","volume-title":"Boris Hanin, Peter Bailis, Ion Stoica, Matei Zaharia, and James Zou.","author":"Chen Lingjiao","year":"2024","unstructured":"Lingjiao Chen, Jared Quincy Davis, Boris Hanin, Peter Bailis, Ion Stoica, Matei Zaharia, and James Zou. 2024. Are more LLM calls all you need? Towards scaling laws of compound inference systems. arXiv preprint arXiv:2403.02419 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Accessed","author":"Cloud Google","year":"2023","unstructured":"Google Cloud. 2023. Spot Virtual Machine Instances Documentation. https:\/\/cloud.google.com\/compute\/docs\/instances\/spot. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2018.02.003"},{"key":"e_1_3_2_1_41_1","volume-title":"Bidipta Sarkar, Rohan Taori, Yusuke Noda, Demetri Terzopoulos, Yejin Choi, Katsushi Ikeuchi, Hoi Vo, Li Fei-Fei, and Jianfeng Gao.","author":"Durante Zane","year":"2024","unstructured":"Zane Durante, Qiuyuan Huang, Naoki Wake, Ran Gong, Jae Sung Park, Bidipta Sarkar, Rohan Taori, Yusuke Noda, Demetri Terzopoulos, Yejin Choi, Katsushi Ikeuchi, Hoi Vo, Li Fei-Fei, and Jianfeng Gao. 2024. Agent AI: Surveying the Horizons of Multimodal Interaction. arXiv:2401.03568 [cs.AI] https:\/\/arxiv.org\/abs\/2401.03568"},{"key":"e_1_3_2_1_42_1","volume-title":"ServerlessLLM: Locality-Enhanced Serverless Inference for Large Language Models. arXiv preprint arXiv:2401.14351","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: Locality-Enhanced Serverless Inference for Large Language Models. arXiv preprint arXiv:2401.14351 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Cocktail: A Multidimensional Optimization for Model Serving in Cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Gunasekaran Jashwant Raj","year":"2022","unstructured":"Jashwant Raj Gunasekaran, Cyan Subhra Mishra, Prashanth Thinakaran, Bikash Sharma, Mahmut Taylan Kandemir, and Chita R Das. 2022. Cocktail: A Multidimensional Optimization for Model Serving in Cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 1041--1057."},{"key":"e_1_3_2_1_44_1","volume-title":"2018 USENIX Annual Technical Conference (USENIX ATC 18)","author":"Harlap Aaron","year":"2018","unstructured":"Aaron Harlap, Andrew Chung, Alexey Tumanov, Gregory R Ganger, and Phillip B Gibbons. 2018. Tributary: spot-dancing for elastic services with latency SLOs. In 2018 USENIX Annual Technical Conference (USENIX ATC 18). 1--14."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3064176.3064182"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E.2018.00052"},{"key":"e_1_3_2_1_47_1","volume-title":"Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:2310.03714","author":"Khattab Omar","year":"2023","unstructured":"Omar Khattab, Arnav Singhvi, Paridhi Maheshwari, Zhiyuan Zhang, Keshav Santhanam, Sri Vardhamanan, Saiful Haq, Ashutosh Sharma, Thomas T Joshi, Hanna Moazam, et al. 2023. Dspy: Compiling declarative language model calls into self-improving pipelines. arXiv preprint arXiv:2310.03714 (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_49_1","volume-title":"DeepSpotCloud: Leveraging Cross-Region GPU Spot Instances for Deep Learning. In 2017 IEEE 10th International Conference on Cloud Computing (CLOUD). IEEE, 98--105","author":"Lee Kyungyong","year":"2017","unstructured":"Kyungyong Lee and Myungjun Son. 2017. DeepSpotCloud: Leveraging Cross-Region GPU Spot Instances for Deep Learning. In 2017 IEEE 10th International Conference on Cloud Computing (CLOUD). IEEE, 98--105."},{"key":"e_1_3_2_1_50_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs.CL] https:\/\/arxiv.org\/abs\/2005.11401"},{"key":"e_1_3_2_1_51_1","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663--679."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC.2011.46"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Xupeng Miao Chunan Shi Jiangfei Duan Xiaoli Xi Dahua Lin Bin Cui and Zhihao Jia. 2023. SpotServe: Serving Generative Large Language Models on Preemptible Instances. arXiv:2311.15566 [cs.DC]","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_2_1_54_1","volume-title":"Accessed","year":"2024","unstructured":"Microsoft. 2024. Use Azure Spot Virtual Machines. https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/spot-vms. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_55_1","volume-title":"Accessed","author":"Morikawa Evan","year":"2023","unstructured":"Evan Morikawa. 2023. Behind the Scenes Scaling ChatGPT. https:\/\/youtu.be\/PeKMEXUrlq4?t=833. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_56_1","volume-title":"Accessed","author":"AI.","year":"2024","unstructured":"OpenAI. 2024. AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_57_1","volume-title":"The Inference Cost of Search Disruption-Large Language Model Cost Analysis. Verf\u00fcgbar unter https:\/\/www. semianalysis. com\/p\/theinference-cost-of-search-disruption","author":"Patel Dylan","year":"2023","unstructured":"Dylan Patel and Afzal Ahmad. 2023. The Inference Cost of Search Disruption-Large Language Model Cost Analysis. Verf\u00fcgbar unter https:\/\/www. semianalysis. com\/p\/theinference-cost-of-search-disruption (2023)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jnca.2016.03.001"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. arXiv:2112.10752 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_60_1","unstructured":"Mark Seery. 2024. LLM Routing: Bottleneck is Compute Not the WAN. https:\/\/www.linkedin.com\/pulse\/llm-routing-bottleneck-compute-wan-mark-seery-4xeac\/. LinkedIn."},{"key":"e_1_3_2_1_61_1","volume-title":"Accessed","author":"Services Amazon Web","year":"2015","unstructured":"Amazon Web Services. 2015. Announcing Amazon EC2 Spot Instance Termination Notices. https:\/\/aws.amazon.com\/about-aws\/whats-new\/2015\/01\/05\/announcing-amazon-ec2-spot-instance-termination-notices\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_62_1","unstructured":"Mohammad Shahrad Rodrigo Fonseca Inigo Goiri Gohar Chaudhry Paul Batum Jason Cooke Eduardo Laureano Colby Tresness Mark Russinovich and Ricardo Bianchini. 2020. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20). 205--218."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQoS57198.2023.10188717"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2012.6195567"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24669-2_11"},{"key":"e_1_3_2_1_66_1","volume-title":"Accessed","author":"Taylor David","year":"2024","unstructured":"David Taylor. 2024. Best Practices for Running Your Database on AWS Spot Instances. https:\/\/www.cockroachlabs.com\/blog\/database-spot-instances\/. Accessed: Oct. 23, 2024."},{"key":"e_1_3_2_1_67_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 497--513."},{"key":"e_1_3_2_1_68_1","unstructured":"H. Touvron L. Martin K. Stone and et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_69_1","volume-title":"Spot-nik: Designing Distributed Machine Learning for Transient Cloud Resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)","author":"Wagenl\u00e4nder Marcel","year":"2020","unstructured":"Marcel Wagenl\u00e4nder, Luo Mai, Guo Li, and Peter Pietzuch. 2020. Spot-nik: Designing Distributed Machine Learning for Transient Cloud Resources. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126953"},{"key":"e_1_3_2_1_71_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Wu Zhanghao","year":"2024","unstructured":"Zhanghao Wu, Wei-Lin Chiang, Ziming Mao, Zongheng Yang, Eric Friedman, Scott Shenker, and Ion Stoica. 2024. Can't Be Late: Optimizing Spot Instance Savings under Deadlines. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 185--203."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2016.7524348"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582028"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492323.3495594"},{"key":"e_1_3_2_1_75_1","volume-title":"SkyPilot: An Intercloud Broker for Sky Computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Yang Zongheng","year":"2023","unstructured":"Zongheng Yang, Zhanghao Wu, Michael Luo, Wei-Lin Chiang, Romil Bhardwaj, Woosuk Kwon, Siyuan Zhuang, Frank Sifei Luan, Gautam Mittal, Scott Shenker, et al. 2023. SkyPilot: An Intercloud Broker for Sky Computing. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 437--455."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSC.2011.44"},{"key":"e_1_3_2_1_77_1","volume-title":"SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. MArk: Exploiting Cloud Services for Cost-Effective, SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 1049--1062."},{"key":"e_1_3_2_1_78_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al.","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. 2023. Efficiently Programming Large Language Models using SGLang. arXiv preprint arXiv:2312.07104 (2023)."}],"event":{"name":"EuroSys '25: Twentieth European Conference on Computer Systems","location":"Rotterdam Netherlands","acronym":"EuroSys '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Twentieth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3717459","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689031.3717459","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:19:06Z","timestamp":1755775146000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689031.3717459"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":78,"alternative-id":["10.1145\/3689031.3717459","10.1145\/3689031"],"URL":"https:\/\/doi.org\/10.1145\/3689031.3717459","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}