{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T21:24:40Z","timestamp":1769549080952,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"NSF","award":["2444537"],"award-info":[{"award-number":["2444537"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3715983","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"48-62","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A\n            <scp>qua<\/scp>\n            : Network-Accelerated Memory Offloading for LLMs in Scale-Up GPU Domains"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6599-3996","authenticated-orcid":false,"given":"Abhishek","family":"Vijaya Kumar","sequence":"first","affiliation":[{"name":"Cornell University, Ithaca, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6063-4975","authenticated-orcid":false,"given":"Gianni","family":"Antichi","sequence":"additional","affiliation":[{"name":"Politecnico di Milano, Milan, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8118-3026","authenticated-orcid":false,"given":"Rachee","family":"Singh","sequence":"additional","affiliation":[{"name":"Cornell University, Ithaca, New York, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. PyTorch: An open source machine learning framework that accelerates the path from research prototyping to production deployment. https:\/\/pytorch.org. Accessed: 2024-05-03."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604878"},{"key":"e_1_3_2_1_3_1","volume-title":"Vidur: A Large-Scale Simulation Framework For LLM Inference. arXiv:2405.05465 [cs.LG] https:\/\/arxiv.org\/abs\/2405.05465","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Jayashree Mohan, Ashish Panwar, Nipun Kwatra, Bhargav Gulavani, Ramachandran Ramjee, and Alexey Tumanov. 2024. Vidur: A Large-Scale Simulation Framework For LLM Inference. arXiv:2405.05465 [cs.LG] https:\/\/arxiv.org\/abs\/2405.05465"},{"key":"e_1_3_2_1_4_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 117--134. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_2_1_5_1","unstructured":"Mistral AI. 2023. Mistral. https:\/\/huggingface.co\/docs\/transformers\/main\/model_doc\/mistral."},{"key":"e_1_3_2_1_6_1","unstructured":"Mistral AI. 2023. Mixtral. https:\/\/huggingface.co\/docs\/transformers\/model_doc\/mixtral."},{"key":"e_1_3_2_1_7_1","unstructured":"Meta AI. 2024. CodeLLaMa. https:\/\/github.com\/Meta-Llama\/codellama."},{"key":"e_1_3_2_1_8_1","unstructured":"Meta AI. 2024. LLaMA 3.2. https:\/\/www.llama.com\/."},{"key":"e_1_3_2_1_9_1","volume-title":"Together: The fastest cloud platform for building and running generative AI. https:\/\/www.together.ai\/. Accessed: 2024-03-16.","author":"Together","year":"2024","unstructured":"Together AI. 2024. Together: The fastest cloud platform for building and running generative AI. https:\/\/www.together.ai\/. Accessed: 2024-03-16."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387522"},{"key":"e_1_3_2_1_11_1","unstructured":"Amazon Web Services. 2024. EC2 Auto Scaling Warm Pools. Accessed: 2024-10-12."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_13_1","unstructured":"Anthropic. [n. d.]. Prompt Caching. https:\/\/docs.anthropic.com\/en\/docs\/build-with-claude\/prompt-caching. Accessed: 2024-09-11."},{"key":"e_1_3_2_1_14_1","unstructured":"arXiv. 2023. Arxiv summarization dataset. https:\/\/huggingface.co\/datasets\/ccdv\/arxiv-summarization. Accessed: 2024-10-12."},{"key":"e_1_3_2_1_15_1","unstructured":"Amazon Web Services (AWS). 2024. Generative AI-Use Cases and Resources. https:\/\/aws.amazon.com\/generative-ai\/use-cases\/. Accessed on March 16 2024."},{"key":"e_1_3_2_1_16_1","unstructured":"Microsoft Azure. [n. d.]. Azure AI Studio. Accessed on March 16 2024. Available at: https:\/\/azure.microsoft.com\/en-us\/products\/ai-studio\/."},{"key":"e_1_3_2_1_17_1","volume-title":"Azure LLM Inference Dataset","author":"Azure Microsoft","year":"2024","unstructured":"Microsoft Azure. 2024. Azure LLM Inference Dataset 2024. https:\/\/github.com\/Azure\/AzurePublicDataset\/blob\/master\/analysis\/AzureLLMInferenceDataset2024.ipynb. Accessed: 2025-02-04."},{"key":"e_1_3_2_1_18_1","volume-title":"Fine-grained GPU Sharing for ML Applications. In EuroSys","author":"Bai Zhihao","year":"2024","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. [n. d.]. Orion: Interference-aware, Fine-grained GPU Sharing for ML Applications. In EuroSys 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Bai Zhihao","year":"2020","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 499--514. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/bai"},{"key":"e_1_3_2_1_20_1","volume-title":"Advances in Neural Information Processing Systems","author":"Bengio Yoshua","year":"2000","unstructured":"Yoshua Bengio, R\u00e9jean Ducharme, and Pascal Vincent. 2000. A Neural Probabilistic Language Model. In Advances in Neural Information Processing Systems, T. Leen, T. Dietterich, and V. Tresp (Eds.), Vol. 13. MIT Press. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2000\/file\/728f206c2a01bf572b5940d7d9a8fa4c-Paper.pdf"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2014.2375213"},{"key":"e_1_3_2_1_22_1","unstructured":"Cerebras 2021. The future of AI is Wafer-Scale. https:\/\/www.cerebras.net\/product-chip\/."},{"key":"e_1_3_2_1_23_1","volume-title":"Memory Harvesting in Multi-GPU Systems with Hierarchical Unified Virtual Memory. In 2022 USENIX Annual Technical Conference (USENIX ATC. USENIX Association","author":"Choi Sangjin","year":"2022","unstructured":"Sangjin Choi, Taeksoo Kim, Jinwoo Jeong, Rachata Ausavarungnirun, Myeongjae Jeon, Youngjin Kwon, and Jeongseob Ahn. 2022. Memory Harvesting in Multi-GPU Systems with Hierarchical Unified Virtual Memory. In 2022 USENIX Annual Technical Conference (USENIX ATC. USENIX Association, Carlsbad, CA, 625--638. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/choi-sangjin"},{"key":"e_1_3_2_1_24_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL]","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_25_1","unstructured":"Google AI for Developers. [n. d.]. Gemini API Pricing. https:\/\/ai.google.dev\/pricing. Accessed: 2024-09-11."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2024.3373763"},{"key":"e_1_3_2_1_27_1","unstructured":"Google. 2024. Google user abandonment stats. Accessed: 2024-10-12."},{"key":"e_1_3_2_1_28_1","unstructured":"Google. 2024. Parti-prompts. https:\/\/huggingface.co\/datasets\/nateraw\/parti-prompts. Accessed: 2024-05-02."},{"key":"e_1_3_2_1_29_1","volume-title":"Efficient Memory Disaggregation with Infiniswap. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Gu Juncheng","unstructured":"Juncheng Gu, Youngmoon Lee, Yiwen Zhang, Mosharaf Chowdhury, and Kang G. Shin. 2017. Efficient Memory Disaggregation with Infiniswap. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 649--667. https:\/\/www.usenix.org\/conference\/nsdi17\/technicalsessions\/presentation\/gu"},{"key":"e_1_3_2_1_30_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 443--462. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/gujarati"},{"key":"e_1_3_2_1_31_1","unstructured":"Gurobi. 2024. Gurobi. https:\/\/www.gurobi.com\/. Accessed: 2024-05-03."},{"key":"e_1_3_2_1_32_1","unstructured":"Intel Gaudi AI accelerator 2021. Intel Gaudi AI accelerator. https:\/\/habana.ai\/products\/gaudi\/."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Norman P. Jouppi George Kurian Sheng Li Peter Ma Rahul Nagarajan Lifeng Nai Nishant Patil Suvinay Subramanian Andy Swing Brian Towles Cliff Young Xiang Zhou Zongwei Zhou and David Patterson. 2023. TPU v4: An Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings. arXiv:2304.01433 [cs.AR]","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_34_1","unstructured":"Felix Kreuk Gabriel Synnaeve Adam Polyak Uriel Singer Alexandre D\u00e9fossez Jade Copet Devi Parikh Yaniv Taigman and Yossi Adi. 2023. AudioGen: Textually Guided Audio Generation. arXiv:2209.15352 [cs.SD]"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_36_1","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. [n. d.]. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"e_1_3_2_1_38_1","volume-title":"AI Is Transforming Businesses. https: \/\/azure.microsoft.com\/en-us\/blog\/azure-openai-service-10-waysgenerative-ai-is-transforming-businesses\/,accessed on","year":"2024","unstructured":"Microsoft. 2024. AI Is Transforming Businesses. https: \/\/azure.microsoft.com\/en-us\/blog\/azure-openai-service-10-waysgenerative-ai-is-transforming-businesses\/,accessed on March 16, 2024."},{"key":"e_1_3_2_1_39_1","unstructured":"Tomas Mikolov Kai Chen Greg Corrado and Jeffrey Dean. 2013. Efficient Estimation of Word Representations in Vector Space. arXiv:1301.3781 [cs.CL]"},{"key":"e_1_3_2_1_40_1","unstructured":"MosaicML. 2023. MPT. https:\/\/huggingface.co\/docs\/transformers\/main\/model_doc\/mpt."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA. 2024. NVIDIA Blackwell Architecture. https:\/\/www.nvidia.com\/en-us\/data-center\/technologies\/blackwell-architecture\/ Accessed: 2024-10-12."},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2024. NVLink & NVSwitch: Fastest HPC Data Center Platform. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/. Accessed: 2024-03-16."},{"key":"e_1_3_2_1_43_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA B200. https:\/\/www.nvidia.com\/enus\/data-center\/dgx-b200\/. Accessed: 2024-04-28."},{"key":"e_1_3_2_1_44_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA DGX A100 Datasheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-dgx-a100-datasheet.pdf. Accessed: 2024-04-21."},{"key":"e_1_3_2_1_45_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA H100 Tensor Core GPU Datasheet. https:\/\/resources.nvidia.com\/en-us-tensor-core\/nvidiatensor-core-gpu-datasheet. Accessed: 2024-04-21."},{"key":"e_1_3_2_1_46_1","unstructured":"Nvidia DGX Systems 2021. Nvidia DGX Systems. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-systems\/."},{"key":"e_1_3_2_1_47_1","volume-title":"Accessed on","author":"AI.","year":"2024","unstructured":"OpenAI. 2024. OpenAI API. Accessed on March 16, 2024. Available at: https:\/\/openai.com\/blog\/openai-api."},{"key":"e_1_3_2_1_48_1","unstructured":"OpenAI. 2024. OpenAI batch API. https:\/\/platform.openai.com\/docs\/guides\/batch. Accessed: 2024-09-11."},{"key":"e_1_3_2_1_49_1","unstructured":"OpenAI. 2024. OpenAI Customer Stories. https:\/\/openai.com\/customer-stories. Accessed on March 16 2024."},{"key":"e_1_3_2_1_50_1","volume-title":"Splitwise: Efficient generative LLM inference using phase splitting. arXiv:2311.18677 [cs.AR] https:\/\/arxiv.org\/abs\/2311.18677","author":"Patel Pratyush","year":"2024","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, Aashaka Shah, \u00cd\u00f1igo Goiri, Saeed Maleki, and Ricardo Bianchini. 2024. Splitwise: Efficient generative LLM inference using phase splitting. arXiv:2311.18677 [cs.AR] https:\/\/arxiv.org\/abs\/2311.18677"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arXiv:2112.10752 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_52_1","unstructured":"Amazon Web Services. [n. d.]. Amazon SageMaker. Accessed on March 16 2024. Available at: https:\/\/aws.amazon.com\/sagemaker\/."},{"key":"e_1_3_2_1_53_1","unstructured":"Amazon Web Services. 2024. Mewtant Case Study. urlhttps:\/\/aws.amazon.com\/solutions\/case-studies\/mewtant-casestudy\/ accessed on 2024-03-16."},{"key":"e_1_3_2_1_54_1","volume-title":"Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E. Gonzalez, and Ion Stoica. 2024. Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 965--988. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sheng"},{"key":"e_1_3_2_1_55_1","unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin Daniel Y. Fu Zhiqiang Xie Beidi Chen Clark Barrett Joseph E. Gonzalez Percy Liang Christopher R\u00e9 Ion Stoica and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. arXiv:2303.06865 [cs.LG]"},{"key":"e_1_3_2_1_56_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv:1909.08053 [cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_57_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Shubha Sudipta Saha","unstructured":"Sudipta Saha Shubha, Haiying Shen, and Anand Iyer. [n. d.]. USHER: Holistic Interference Avoidance for Resource Optimized ML Inference. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/shubha. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787508"},{"key":"e_1_3_2_1_59_1","volume-title":"Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 173--191. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sun-biao"},{"key":"e_1_3_2_1_60_1","unstructured":"The Linux Kernel Developers. [n. d.]. CFS Scheduler. https:\/\/docs. kernel.org\/scheduler\/sched-design-CFS.html. Accessed: 2024-03-17."},{"key":"e_1_3_2_1_61_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696348.3696856"},{"key":"e_1_3_2_1_63_1","unstructured":"vLLM. 2023. Distributed LLM serving. https:\/\/docs.vllm.ai\/en\/latest\/ serving\/distributed_serving.html. Accessed: 2024-10-12."},{"key":"e_1_3_2_1_64_1","unstructured":"VLLM. 2024. LLama31: Enhancing LLM Serving Efficiency. https: \/\/blog.vllm.ai\/2024\/07\/23\/llama31.html. Accessed on September 11 2024."},{"key":"e_1_3_2_1_65_1","unstructured":"Bingyang Wu Yinmin Zhong Zili Zhang Shengyu Liu Fangyue Liu Yuanhang Sun Gang Huang Xuanzhe Liu and Xin Jin. 2024. Fast Distributed Inference Serving for Large Language Models. arXiv:2305.05920 [cs.LG] https:\/\/arxiv.org\/abs\/2305.05920"},{"key":"e_1_3_2_1_66_1","volume-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, Fan Yang, and Lidong Zhou. 2018. Gandiva: Introspective Cluster Scheduling for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 595--610. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/xiao"},{"key":"e_1_3_2_1_67_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521--538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_68_1","volume-title":"Salus: Fine- Grained GPU Sharing Primitives for Deep Learning Applications. arXiv:1902.04610 [cs.DC]","author":"Yu Peifeng","year":"2019","unstructured":"Peifeng Yu and Mosharaf Chowdhury. 2019. Salus: Fine- Grained GPU Sharing Primitives for Deep Learning Applications. arXiv:1902.04610 [cs.DC]"},{"key":"e_1_3_2_1_69_1","unstructured":"Xuanlei Zhao Bin Jia Haotian Zhou Ziming Liu Shenggan Cheng and Yang You. 2024. HeteGen: Heterogeneous Parallel Inference for Large Language Models on Resource-Constrained Devices. https:\/\/arxiv.org\/abs\/2403.01164. Accessed on September 11 2024."},{"key":"e_1_3_2_1_70_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 193--210. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/zhongyinmin"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3715983","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3715983","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:13:12Z","timestamp":1755774792000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3715983"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":70,"alternative-id":["10.1145\/3676641.3715983","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3715983","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}