{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T08:28:50Z","timestamp":1768033730545,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767354","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:13:44Z","timestamp":1762532024000},"page":"114-125","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Engine-Agnostic Model Hot-Swapping for Cost-Effective LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9688-2615","authenticated-orcid":false,"given":"Radostin","family":"Stoyanov","sequence":"first","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5204-1478","authenticated-orcid":false,"given":"Vikt\u00f3ria","family":"Spi\u0161akov\u00e1","sequence":"additional","affiliation":[{"name":"Masaryk University, Brno, Czech Republic"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4959-6561","authenticated-orcid":false,"given":"Adrian","family":"Reber","sequence":"additional","affiliation":[{"name":"Red Hat, Stuttgart, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1756-3064","authenticated-orcid":false,"given":"Wesley","family":"Armour","sequence":"additional","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7606-5519","authenticated-orcid":false,"given":"Marcin","family":"Copik","sequence":"additional","affiliation":[{"name":"ETH Zurich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1578-5149","authenticated-orcid":false,"given":"Rodrigo","family":"Bruno","sequence":"additional","affiliation":[{"name":"INESC-ID, Instituto Superior T\u00e9cnico, University of Lisbon, Lisbon, Portugal"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00073"},{"key":"e_1_3_3_2_3_2","volume-title":"Alibaba Cloud Function Compute","author":"Cloud Alibaba","year":"2025","unstructured":"Alibaba Cloud. 2025. Alibaba Cloud Function Compute. https:\/\/www.alibabacloud.com\/product\/function-compute Accessed: 15-09-2025."},{"key":"e_1_3_3_2_4_2","unstructured":"Amazon Web Services. 2025. Amazon SageMaker. https:\/\/aws.amazon.com\/sagemaker Accessed: 15-09-2025."},{"key":"e_1_3_3_2_5_2","unstructured":"Anthropic. 2025. Claude. https:\/\/claude.ai."},{"key":"e_1_3_3_2_6_2","unstructured":"Microsoft Azure. 2025. Azure AI Foundry. https:\/\/ai.azure.com Accessed: 15-09-2025."},{"key":"e_1_3_3_2_7_2","unstructured":"Xiao Bi et\u00a0al. 2024. DeepSeek LLM: Scaling Open-Source Language Models with Longtermism. arxiv:https:\/\/arXiv.org\/abs\/2401.02954\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2401.02954"},{"key":"e_1_3_3_2_8_2","first-page":"199","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 
USENIX Association, Carlsbad, CA, 199\u2013216. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/choi-seungbeom"},{"key":"e_1_3_3_2_9_2","unstructured":"Cloud Native Computing Foundation. 2025. KServe: Standardized Serverless ML Inference Platform on Kubernetes. https:\/\/github.com\/kserve\/kserve. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_10_2","unstructured":"DeepSeek-AI Aixin Liu Bei Feng et\u00a0al. 2025. DeepSeek-V3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2412.19437\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378512"},{"key":"e_1_3_3_2_12_2","unstructured":"Hugging Face. 2025. Deploy with your own container. https:\/\/huggingface.co\/docs\/inference-endpoints\/en\/engines\/custom_container. Accessed: 15\u201109\u20112025."},{"key":"e_1_3_3_2_13_2","unstructured":"Hugging Face. 2025. Hugging Face Inference. https:\/\/huggingface.co\/docs\/inference-providers\/providers\/hf-inference. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_14_2","unstructured":"Linux Kernel Documentation. 2025. Freezer Subsystem. https:\/\/www.kernel.org\/doc\/Documentation\/cgroup-v1\/freezer-subsystem.txt. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_15_2","first-page":"135","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: Low-Latency Serverless Inference for Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 135\u2013153. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/fu"},{"key":"e_1_3_3_2_16_2","unstructured":"Georgi Gerganov. 2025. llama.cpp. https:\/\/github.com\/ggerganov\/llama.cpp."},{"key":"e_1_3_3_2_17_2","unstructured":"Google Cloud. 2025. Autoscaling Machine Learning Inference Workloads on Google Kubernetes Engine. https:\/\/cloud.google.com\/kubernetes-engine\/docs\/best-practices\/machine-learning\/inference\/autoscaling Accessed: 15-09-2025."},{"key":"e_1_3_3_2_18_2","unstructured":"Google DeepMind. 2025. Gemini. https:\/\/gemini.google.com. Google\u2019s multimodal chatbot, integrated across Workspace and Search, with hundreds of millions of users."},{"key":"e_1_3_3_2_19_2","unstructured":"Hugging Face. 2025. Simple safe zero\u2011copy tensor storage. https:\/\/github.com\/huggingface\/safetensors"},{"key":"e_1_3_3_2_20_2","unstructured":"Shashwat Jaiswal Kunal Jain Yogesh Simmhan Anjaly Parayil Ankur Mallick Rujia Wang Renee\u00a0St. Amant Chetan Bansal Victor R\u00fchle Anoop Kulkarni Steve Kofsky and Saravan Rajmohan. 2025. Serving Models Fast and Slow: Optimizing Heterogeneous LLM Inferencing Workloads at Scale. arxiv:https:\/\/arXiv.org\/abs\/2502.14617\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2502.14617"},{"key":"e_1_3_3_2_21_2","unstructured":"Youhe Jiang Fangcheng Fu Xiaozhe Yao Guoliang He Xupeng Miao Ana Klimovic Bin Cui Binhang Yuan and Eiko Yoneki. 2025. Demystifying Cost-Efficiency in LLM Serving over Heterogeneous GPUs. 
arxiv:https:\/\/arXiv.org\/abs\/2502.00722\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2502.00722"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642773"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629556"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_2_25_2","series-title":"Proceedings of Machine Learning Research","first-page":"19274","volume-title":"Proceedings of the 40th International Conference on Machine Learning","volume":"202","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast Inference from Transformers via Speculative Decoding. In Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19274\u201319286. https:\/\/proceedings.mlr.press\/v202\/leviathan23a.html"},{"key":"e_1_3_3_2_26_2","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Li Jie","year":"2022","unstructured":"Jie Li, Laiping Zhao, Yanan Yang, Kunlin Zhan, and Keqiu Li. 2022. Tetris: Memory-efficient Serverless Inference through Tensor Sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/li-jie"},{"key":"e_1_3_3_2_27_2","unstructured":"Bo-Yi\u00a0Wu Manu Martinez-Almeida Javier\u00a0Provecho et\u00a0al. 2025. Gin Web Framework. https:\/\/github.com\/gin-gonic\/gin"},{"key":"e_1_3_3_2_28_2","unstructured":"Meta AI. 2025. LLaMA 4: Large Language Model. https:\/\/www.llama.com\/models\/llama-4. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_29_2","unstructured":"Microsoft. 2024. How to Customize an LLM: A Deep Dive to Tailoring an LLM for Your Business. https:\/\/techcommunity.microsoft.com\/blog\/azure-ai-foundry-blog\/how-to-customize-an-llm-a-deep-dive-to-tailoring-an-llm-for-your-business\/4110204 Accessed: 15-09-2025."},{"key":"e_1_3_3_2_30_2","unstructured":"Microsoft. 2025. Azure LLM Inference Traces. https:\/\/github.com\/Azure\/AzurePublicDataset."},{"key":"e_1_3_3_2_31_2","unstructured":"Microsoft. 2025. Copilot. https:\/\/www.bing.com\/chat."},{"key":"e_1_3_3_2_32_2","unstructured":"Jeffrey Morgan and Michael Chiang. 2025. Ollama. https:\/\/ollama.ai\/."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Deepak Narayanan Mohammad Shoeybi Jared Casper Patrick LeGresley Mostofa Patwary Vijay\u00a0Anand Korthikanti Dmitri Vainbrand Prethvi Kashinkunti Julie Bernauer Bryan Catanzaro Amar Phanishayee and Matei Zaharia. 2021. Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. arxiv:https:\/\/arXiv.org\/abs\/2104.04473\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2104.04473","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_2_34_2","unstructured":"Chanh Nguyen. 2025. CUDA Graph Capture Support in vLLM. https:\/\/github.com\/vllm-project\/vllm\/pull\/16072"},{"key":"e_1_3_3_2_35_2","unstructured":"Vinh Nguyen Michael Carilli Sukru\u00a0Burc Eryilmaz Vartika Singh Michelle Lin Natalia Gimelshein Alban Desmaison and Edward Yang. 2021. Accelerating PyTorch with CUDA Graphs. https:\/\/pytorch.org\/blog\/accelerating-pytorch-with-cuda-graphs\/. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_36_2","unstructured":"NVIDIA. 2025. 
TensorRT-LLM: Accelerated Large Language Model Inference. https:\/\/github.com\/NVIDIA\/TensorRT-LLM."},{"key":"e_1_3_3_2_37_2","unstructured":"NVIDIA. 2025. TensorRT-LLM Build Workflow. https:\/\/nvidia.github.io\/TensorRT-LLM. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_38_2","unstructured":"OpenAI. 2025. ChatGPT. https:\/\/chat.openai.com."},{"key":"e_1_3_3_2_39_2","volume-title":"OpenAI Pricing","year":"2025","unstructured":"OpenAI. 2025. OpenAI Pricing. https:\/\/openai.com\/api\/pricing Accessed: 15-09-2025."},{"key":"e_1_3_3_2_40_2","unstructured":"OpenAI. 2025. Specification for the OpenAI API. https:\/\/platform.openai.com\/docs\/api-reference Accessed: 15-09-2025."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3656019.3676949"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624664"},{"key":"e_1_3_3_2_44_2","unstructured":"Podman Documentation. 2025. Pause one or more containers. https:\/\/docs.podman.io\/en\/latest\/markdown\/podman-pause.1.html. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_45_2","unstructured":"Red Hat Inc. 2025. Podman. https:\/\/podman.io\/ Accessed: 15-09-2025."},{"key":"e_1_3_3_2_46_2","first-page":"397","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja\u00a0J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397\u2013411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584037"},{"key":"e_1_3_3_2_48_2","unstructured":"Amazon\u00a0Web Services. 2025. AWS Lambda. https:\/\/aws.amazon.com\/lambda Accessed: 15-09-2025."},{"key":"e_1_3_3_2_49_2","first-page":"205","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, Inigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. 2020. Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). USENIX Association, Santa Clara, CA, 205\u2013218. https:\/\/www.usenix.org\/conference\/atc20\/presentation\/shahrad"},{"key":"e_1_3_3_2_50_2","unstructured":"Haihao Shen Hanwen Chang Bo Dong Yu Luo and Hengyu Meng. 2023. Efficient LLM Inference on CPUs. arxiv:https:\/\/arXiv.org\/abs\/2311.00502\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2311.00502"},{"key":"e_1_3_3_2_51_2","unstructured":"Snowflake. 2024. Snowflake LLM Inference: Model Hotswapping. https:\/\/www.snowflake.com\/engineering-blog\/llm-interference-model-hotswapping\/ Accessed: 15-09-2025."},{"key":"e_1_3_3_2_52_2","volume-title":"Job Scheduling Strategies for Parallel Processing","author":"Spi\u0161akov\u00e1 Vikt\u00f3ria","year":"2025","unstructured":"Vikt\u00f3ria Spi\u0161akov\u00e1, Radostin Stoyanov, Luk\u00e1\u0161 Hejtm\u00e1nek, Dalibor Klus\u00e1\u010dek, Adrian Reber, and Rodrigo Bruno. 2025. Kubernetes Scheduling with Checkpoint\/Restore: Challenges and Open Problems. In Job Scheduling Strategies for Parallel Processing. 
Springer Nature Switzerland."},{"key":"e_1_3_3_2_53_2","unstructured":"Steven Gurfinkel. 2025. CUDA Checkpoint and Restore Utility. https:\/\/github.com\/NVIDIA\/cuda-checkpoint."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_3_2_55_2","volume-title":"High Performance Container Workshop","author":"Stoyanov Radostin","year":"2025","unstructured":"Radostin Stoyanov. 2025. Transparent Hot-Swapping of Containerized AI\/ML Workloads. In High Performance Container Workshop."},{"key":"e_1_3_3_2_56_2","volume-title":"KubeCon + CloudNativeCon Europe 2025","author":"Stoyanov Radostin","year":"2025","unstructured":"Radostin Stoyanov, Adrian Reber, and Vikt\u00f3ria Spi\u0161akov\u00e1. 2025. Efficient Transparent Checkpointing of AI\/ML Workloads in Kubernetes. In KubeCon + CloudNativeCon Europe 2025. https:\/\/kccnceu2025.sched.com\/event\/1tx7i"},{"key":"e_1_3_3_2_57_2","unstructured":"Radostin Stoyanov Vikt\u00f3ria Spi\u0161akov\u00e1 Jesus Ramos Steven Gurfinkel Andrei Vagin Adrian Reber Wesley Armour and Rodrigo Bruno. 2025. CRIUgpu: Transparent Checkpointing of GPU-Accelerated Workloads. arxiv:https:\/\/arXiv.org\/abs\/2502.16631\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2502.16631"},{"key":"e_1_3_3_2_58_2","first-page":"173","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 173\u2013191. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sun-biao"},{"key":"e_1_3_3_2_59_2","unstructured":"Gemini Team. 2024. Gemini: A Family of Highly Capable Multimodal Models. arxiv:https:\/\/arXiv.org\/abs\/2312.11805\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2312.11805"},{"key":"e_1_3_3_2_60_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_2_61_2","unstructured":"Harshith Umesh. 2025. Ollama vs. vLLM: A deep dive into performance benchmarking. Red Hat Developers (8 Aug. 2025). https:\/\/developers.redhat.com\/articles\/2025\/08\/08\/ollama-vs-vllm-deep-dive-performance-benchmarking Accessed: 2025\u201108\u201114."},{"key":"e_1_3_3_2_62_2","unstructured":"vLLM. 2025. Sleep Mode. https:\/\/docs.vllm.ai\/en\/latest\/features\/sleep_mode.html. Accessed: 15-09-2025."},{"key":"e_1_3_3_2_63_2","first-page":"1657","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Wang Jiali","year":"2025","unstructured":"Jiali Wang, Yankui Wang, Mingcong Han, and Rong Chen. 2025. Colocating ML Inference and Training with Fast GPU Memory Handover. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 1657\u20131675."},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_3_2_65_2","unstructured":"Yuxin Wang Yuhan Chen Zeyu Li Xueze Kang Yuchu Fang Yeju Zhou Yang Zheng Zhenheng Tang Xin He Rui Guo Xin Wang Qiang Wang Amelie\u00a0Chi Zhou and Xiaowen Chu. 2025. 
BurstGPT: A Real-world Workload Dataset to Optimize LLM Serving Systems. arxiv:https:\/\/arXiv.org\/abs\/2401.17644\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2401.17644"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3661370"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3698038.3698510"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_3_2_69_2","unstructured":"Xiaozhe Yao Qinghao Hu and Ana Klimovic. 2024. DeltaZip: Efficient Serving of Multiple Full-Model-Tuned LLMs. arxiv:https:\/\/arXiv.org\/abs\/2312.05215\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2312.05215"},{"key":"e_1_3_3_2_70_2","unstructured":"Minchen Yu Ao Wang Dong Chen Haoxuan Yu Xiaonan Luo Zhuohao Li Wei Wang Ruichuan Chen Dapeng Nie and Haoran Yang. 2024. FaaSwap: SLO-Aware GPU-Efficient Serverless Inference via Model Swapping. arxiv:https:\/\/arXiv.org\/abs\/2306.03622\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2306.03622"},{"key":"e_1_3_3_2_71_2","unstructured":"Minchen Yu Ao Wang Dong Chen Haoxuan Yu Xiaonan Luo Zhuohao Li Wei Wang Ruichuan Chen Dapeng Nie Haoran Yang and Yu Ding. 2025. Torpor: GPU-Enabled Serverless Computing for Low-Latency Resource-Efficient Inference. arxiv:https:\/\/arXiv.org\/abs\/2306.03622\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2306.03622"},{"key":"e_1_3_3_2_72_2","first-page":"1049","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. MArk: Exploiting Cloud Services for Cost-Effective, SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 1049\u20131062. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/zhang-chengliang"},{"key":"e_1_3_3_2_73_2","first-page":"787","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 787\u2013808. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhang-hong"},{"key":"e_1_3_3_2_74_2","volume-title":"12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)","author":"Zhang Jeff","year":"2020","unstructured":"Jeff Zhang, Sameh Elnikety, Shuayb Zarar, Atul Gupta, and Siddharth Garg. 2020. Model-Switching: Dealing with Fluctuating Workloads in Machine-Learning-as-a-Service Systems. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20). USENIX Association. https:\/\/www.usenix.org\/conference\/hotcloud20\/presentation\/zhang"},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483580"},{"key":"e_1_3_3_2_76_2","unstructured":"Lianmin Zheng Liangsheng Yin Zhiqiang Xie Chuyue Sun Jeff Huang Cody\u00a0Hao Yu Shiyi Cao Christos Kozyrakis Ion Stoica Joseph\u00a0E. Gonzalez Clark Barrett and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. 
arxiv:https:\/\/arXiv.org\/abs\/2312.07104\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2312.07104"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767354","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:27:48Z","timestamp":1767986868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767354"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":75,"alternative-id":["10.1145\/3731599.3767354","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767354","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
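The record above is the standard single-work envelope returned by the Crossref REST API ({"status":"ok","message-type":"work","message":{...}}). For working with it programmatically rather than reading the raw JSON, here is a minimal sketch that re-fetches this DOI and prints its headline metadata. It assumes only the public api.crossref.org endpoint and the Python standard library; all field names are taken directly from the record itself.

# Minimal sketch: fetch and summarize this Crossref work record.
# Assumes the public Crossref REST API (api.crossref.org) and the
# Python standard library only.
import json
import urllib.request

DOI = "10.1145/3731599.3767354"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    # Unwrap the envelope: {"status": "ok", "message-type": "work", "message": {...}}
    work = json.load(resp)["message"]

print(work["title"][0])                                # paper title
print(", ".join(f"{a['given']} {a['family']}"          # author list
                for a in work["author"]))
print(work["container-title"][0])                      # proceedings name
print(work["DOI"], "pp.", work["page"])                # DOI and page range
print(work["reference-count"], "references")

Note that "title" and "container-title" are arrays (hence the [0] indexing), and each entry of "author" carries "given"/"family" fields plus optional ORCID and affiliation data, exactly as seen in the record above.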