{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:33:08Z","timestamp":1773318788150,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":115,"publisher":"ACM","funder":[{"name":"Cisco Research","award":[""],"award-info":[{"award-number":[""]}]},{"name":"Amazon Research Award","award":[""],"award-info":[{"award-number":[""]}]},{"name":"IFML","award":[""],"award-info":[{"award-number":[""]}]},{"name":"UT Austin Junior Faculty Startup","award":[""],"award-info":[{"award-number":[""]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759867","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1676-1696","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MaverIQ: Fingerprint-Guided Extrapolation and Fragmentation-Aware Layering for Intent-Based LLM Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9444-2407","authenticated-orcid":false,"given":"Dimitrios","family":"Liakopoulos","sequence":"first","affiliation":[{"name":"The University of Texas at Austin, Austin, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5538-8829","authenticated-orcid":false,"given":"Prasoon","family":"Sinha","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0246-7826","authenticated-orcid":false,"given":"Tianrui","family":"Hu","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2360-7019","authenticated-orcid":false,"given":"Myungjin","family":"Lee","sequence":"additional","affiliation":[{"name":"Cisco Systems, Bellevue, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7556-3069","authenticated-orcid":false,"given":"Neeraja J.","family":"Yadwadkar","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Amey Agrawal Nitin Kedia Jayashree Mohan Ashish Panwar Nipun Kwatra Bhargav Gulavani Ramachandran Ramjee and Alexey Tumanov. 2024. Vidur: A Large-Scale Simulation Framework For LLM Inference. Proceedings of Machine Learning and Systems 6 (29 May 2024) 351\u2013366."},{"key":"e_1_3_3_2_3_2","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 117\u2013134. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3617232.3624849"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Reza\u00a0Yazdani Aminabadi Samyam Rajbhandari Minjia Zhang Ammar\u00a0Ahmad Awan Cheng Li Du Li Elton Zheng Jeff Rasley Shaden Smith Olatunji Ruwase and Yuxiong He. 2022. DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2207.00032 (2022).","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00046"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476138"},{"key":"e_1_3_3_2_10_2","unstructured":"Shaoyuan Chen Yutong Lin Mingxing Zhang and Yongwei Wu. 2024. Efficient and Economic Large Language Model Inference with Attention Offloading. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.01814 (2024)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00080"},{"key":"e_1_3_3_2_12_2","series-title":"(NIPS \u201922)","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit matrix multiplication for transformers at scale. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201922). Curran Associates Inc., Red Hook, NY, USA, Article 2198, 15\u00a0pages."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0441"},{"key":"e_1_3_3_2_14_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers and Luke Zettlemoyer. 2023. The case for 4-bit precision: k-bit inference scaling laws. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML\u201923). JMLR.org, Article 307, 25\u00a0pages."},{"key":"e_1_3_3_2_15_2","unstructured":"Shichen Dong Wen Cheng Jiayu Qin and Wei Wang. 2024. QAQ: Quality Adaptive Quantization for LLM KV Cache. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.04643 (2024)."},{"key":"e_1_3_3_2_16_2","unstructured":"Jiaang Duan Shiyou Qian Dingyu Yang Hanwen Hu Jian Cao and Guangtao Xue. 2024. MOPAR: A Model Partitioning Framework for Deep Learning Inference Services on Serverless Platforms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.02445 (2024)."},{"key":"e_1_3_3_2_17_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Faiz Ahmad","year":"2024","unstructured":"Ahmad Faiz, Sotaro Kaneda, Ruhan Wang, Rita\u00a0Chukwunyere Osi, Prateek Sharma, Fan Chen, and Lei Jiang. 2024. LLMCarbon: Modeling the End-to-End Carbon Footprint of Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=aIok3ZD9to"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717481"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_3_2_20_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML\u201923). JMLR.org, Article 414, 15\u00a0pages."},{"key":"e_1_3_3_2_21_2","volume-title":"International Conference on Learning Representations","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_22_2","unstructured":"Elias Frantar Roberto\u00a0L Castro Jiale Chen Torsten Hoefler and Dan Alistarh. 2024. MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.11743 (2024)."},{"key":"e_1_3_3_2_23_2","series-title":"(OSDI\u201924)","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: low-latency serverless inference for large language models. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation (Santa Clara, CA, USA) (OSDI\u201924). USENIX Association, USA, Article 8, 19\u00a0pages."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.5555\/3086952"},{"key":"e_1_3_3_2_25_2","unstructured":"Sylvain Gugger Lysandre Debut Thomas Wolf Philipp Schmid Zachary Mueller Sourab Mangrulkar Marc Sun and Benjamin Bossan. 2022. Accelerate: Training and inference at scale made simple efficient and adaptable. https:\/\/github.com\/huggingface\/accelerate. Accessed: 2024-08-17."},{"key":"e_1_3_3_2_26_2","series-title":"(OSDI\u201920)","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like clockwork: performance predictability from the bottom up. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation(OSDI\u201920). USENIX Association, USA, Article 25, 20\u00a0pages."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_3_2_28_2","unstructured":"Geoffrey\u00a0E. Hinton Oriol Vinyals and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. ArXiv abs\/1503.02531 (2015). https:\/\/api.semanticscholar.org\/CorpusID:7200347"},{"key":"e_1_3_3_2_29_2","unstructured":"Connor Holmes Masahiro Tanaka Michael Wyatt Ammar\u00a0Ahmad Awan Jeff Rasley Samyam Rajbhandari Reza\u00a0Yazdani Aminabadi Heyang Qin Arash Bakhtiari Lev Kurilenko and Yuxiong He. 2024. DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.08671 (2024). arxiv:https:\/\/arXiv.org\/abs\/2401.08671\u00a0[cs.PF] https:\/\/arxiv.org\/abs\/2401.08671"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00064"},{"key":"e_1_3_3_2_31_2","volume-title":"International Conference on Learning Representations","author":"Hu Edward\u00a0J","year":"2022","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9"},{"key":"e_1_3_3_2_32_2","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia\u00a0Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Proceedings of the 33rd International Conference on Neural Information Processing Systems. Curran Associates Inc., Red Hook, NY, USA, Article 10, 10\u00a0pages."},{"key":"e_1_3_3_2_33_2","unstructured":"HuggingFace. 2022. Safetensors. https:\/\/github.com\/huggingface\/safetensors. Accessed: 2024-10-08."},{"key":"e_1_3_3_2_34_2","unstructured":"HuggingFace. 2024. Hugging Face. https:\/\/huggingface.co\/. Accessed: 2024-10-20."},{"key":"e_1_3_3_2_35_2","unstructured":"IBM Developer. 2020. Socket programming in C. https:\/\/developer.ibm.com\/tutorials\/l-sock\/ Accessed: 2025-04-13."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716245"},{"key":"e_1_3_3_2_37_2","unstructured":"Albert\u00a0Q. Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Emma\u00a0Bou Hanna Florian Bressand Gianna Lengyel Guillaume Bour Guillaume Lample L\u00e9lio\u00a0Renard Lavaud Lucile Saulnier Marie-Anne Lachaux Pierre Stock Sandeep Subramanian Sophia Yang Szymon Antoniak Teven\u00a0Le Scao Th\u00e9ophile Gervet Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El\u00a0Sayed. 2024. Mixtral of Experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.04088 (2024)."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.726"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563468"},{"key":"e_1_3_3_2_40_2","series-title":"(SC \u201920)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Kung H.\u00a0T.","year":"2020","unstructured":"H.\u00a0T. Kung, Bradley McDanel, and Sai\u00a0Qian Zhang. 2020. Term quantization: furthering quantization at run time. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC \u201920). IEEE Press, Article 96, 14\u00a0pages."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_2_42_2","unstructured":"Lambda Labs. 2025. GPU Cloud Pricing. https:\/\/lambda.ai\/service\/gpu-cloud\/pricing. Accessed: 2025-04-13."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00022"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707265"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_3_2_46_2","unstructured":"Zhuohan Li Lianmin Zheng Yinmin Zhong Vincent Liu Ying Sheng Xin Jin Yanping Huang Zhifeng Chen Hao Zhang Joseph\u00a0E. Gonzalez and Ion Stoica. 2023. AlpaServe [Source Code]: Function in model_parallelism.py Line 323-353. https:\/\/github.com\/alpa-projects\/mms\/blob\/dba47b18e95f037aadc8aff336e2e7e337010495\/alpa_serve\/placement_policy\/model_parallelism.py#L323."},{"key":"e_1_3_3_2_47_2","first-page":"663","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 663\u2013679. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan"},{"key":"e_1_3_3_2_48_2","first-page":"87","volume-title":"Proceedings of Machine Learning and Systems","volume":"6","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, and Song Han. 2024. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. In Proceedings of Machine Learning and Systems , P.\u00a0Gibbons, G.\u00a0Pekhimenko, and C.\u00a0De Sa (Eds.), Vol.\u00a06. 87\u2013100. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/42a452cbafa9dd64e9ba4aa95cc1ef21-Paper-Conference.pdf"},{"key":"e_1_3_3_2_49_2","unstructured":"Zhiqi Lin Youshan Miao Guodong Liu Xiaoxiang Shi Quanlu Zhang Fan Yang Saeed Maleki Yi Zhu Xu Cao Cheng Li Mao Yang Lintao Zhang and Lidong Zhou. 2023. SuperScaler: Supporting Flexible DNN Parallelization via a Unified Abstraction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.08984 (2023)."},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER51413.2022.00042"},{"key":"e_1_3_3_2_51_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher R\u00e9, and Beidi Chen. 2023. Deja Vu: contextual sparsity for efficient LLMs at inference time. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML\u201923). JMLR.org, Article 919, 40\u00a0pages."},{"key":"e_1_3_3_2_52_2","unstructured":"LMDeploy. 2024. INT8 KV Cache. https:\/\/lmdeploy.readthedocs.io\/en\/v0.4.0\/quantization\/kv_quant.html. Accessed: 2024-10-20."},{"key":"e_1_3_3_2_53_2","unstructured":"Cheng Luo Tianle Zhong and Geoffrey Fox. 2023. RTP: Rethinking Tensor Parallelism with Memory Deduplication. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.01635 (2023)."},{"key":"e_1_3_3_2_54_2","unstructured":"Shuming Ma Hongyu Wang Lingxiao Ma Lei Wang Wenhui Wang Shaohan Huang Li Dong Ruiping Wang Jilong Xue and Furu Wei. 2024. The era of 1-bit llms: All large language models are in 1.58 bits. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.17764 (2024)."},{"key":"e_1_3_3_2_55_2","series-title":"(NIPS \u201923)","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Ma Xinyin","year":"2023","unstructured":"Xinyin Ma, Gongfan Fang, and Xinchao Wang. 2023. LLM-Pruner: On the Structural Pruning of Large Language Models. In Proceedings of the 37th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS \u201923). Article 950, 19\u00a0pages."},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707215"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_2_60_2","unstructured":"NetApp. 2024. Fractional GPU Allocation for Less Demanding or Interactive Workloads. https:\/\/docs.netapp.com\/us-en\/netapp-solutions\/ai\/osrunai_fractional_gpu_allocation_for_less_demanding_or_interactive_workloads.html. Accessed: 2024-10-18."},{"key":"e_1_3_3_2_61_2","unstructured":"NVIDIA. 2020. Optimizing NVIDIA TensorRT Conversion for Real-time Inference on Autonomous Vehicles. https:\/\/developer.nvidia.com\/blog\/optimizing-nvidia-tensorrt-conversion-for-real-time-inference-on-autonomous-vehicles\/. Accessed: 2024-10-15."},{"key":"e_1_3_3_2_62_2","unstructured":"NVIDIA. 2023. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM. Accessed: 2024-10-20."},{"key":"e_1_3_3_2_63_2","unstructured":"NVIDIA. 2024. Amazon Accelerates Customer Satisfaction With NVIDIA Triton Inference Server and NVIDIA TensorRT. https:\/\/resources.nvidia.com\/en-us-inference-customer-story\/nvidia-amazon-custom. Accessed: 2024-10-17."},{"key":"e_1_3_3_2_64_2","unstructured":"NVIDIA. 2024. American Express Prevents Fraud and Foils Cybercrime With NVIDIA AI Solutions. https:\/\/resources.nvidia.com\/en-us-inference-customer-story\/american-express-prevents-fraud. Accessed: 2024-10-17."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640383"},{"key":"e_1_3_3_2_66_2","unstructured":"OpenNMT. 2019. CTranslate2. https:\/\/opennmt.net\/CTranslate2\/. Accessed: 2024-10-06."},{"key":"e_1_3_3_2_67_2","unstructured":"Xiurui Pan Endian Li Qiao Li Shengwen Liang Yizhou Shan Ke Zhou Yingwei Luo Xiaolin Wang and Jie Zhang. 2024. InstInfer: In-Storage Attention Offloading for Cost-Effective Long-Context LLM Inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.04992 (2024)."},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.936"},{"key":"e_1_3_3_2_70_2","unstructured":"Jeff Pool Abhishek Sawarkar and Jay Rodge. 2021. Accelerating Inference with Sparsity Using the NVIDIA Ampere Architecture and NVIDIA TensorRT. https:\/\/developer.nvidia.com\/blog\/accelerating-inference-with-sparsity-using-ampere-and-tensorrt\/. Accessed: 2024-10-20."},{"key":"e_1_3_3_2_71_2","unstructured":"Ramya Prabhu Ajay Nayak Jayashree Mohan Ramachandran Ramjee and Ashish Panwar. 2024. vAttention: Dynamic Memory Management for Serving LLMs without PagedAttention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.04437 (2024)."},{"key":"e_1_3_3_2_72_2","first-page":"75","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Qiu Haoran","year":"2024","unstructured":"Haoran Qiu, Weichao Mao, Archit Patke, Shengkun Cui, Saurabh Jha, Chen Wang, Hubertus Franke, Zbigniew Kalbarczyk, Tamer Ba\u015far, and Ravishankar\u00a0K. Iyer. 2024. Power-aware Deep Learning Model Serving with \u03bc -Serve. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 75\u201393. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/qiu"},{"key":"e_1_3_3_2_73_2","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et\u00a0al. 2018. Improving language understanding by generative pre-training. https:\/\/cdn.openai.com\/research-covers\/language-unsupervised\/language_understanding_paper.pdf."},{"key":"e_1_3_3_2_74_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. J. Mach. Learn. Res. 21 1 Article 140 67\u00a0pages."},{"key":"e_1_3_3_2_75_2","series-title":"(SC \u201920)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC \u201920). IEEE Press, Article 20, 16\u00a0pages."},{"key":"e_1_3_3_2_76_2","first-page":"397","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja\u00a0J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397\u2013411. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/romero"},{"key":"e_1_3_3_2_77_2","unstructured":"Sebastian Ruder. 2016. An overview of gradient descent optimization algorithms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.04747 (2016)."},{"key":"e_1_3_3_2_78_2","unstructured":"Christopher\u00a0J. Shallue Jaehoon Lee Joseph Antognini Jascha Sohl-Dickstein Roy Frostig and George\u00a0E. Dahl. 2019. Measuring the Effects of Data Parallelism on Neural Network Training. Journal of Machine Learning Research 20 (2019) 1\u201349."},{"key":"e_1_3_3_2_79_2","unstructured":"ShareGPT. 2023. ShareGPT. https:\/\/sharegpt.com\/. Accessed: 2024-10-16."},{"key":"e_1_3_3_2_80_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: high-throughput generative inference of large language models with a single GPU. In Proceedings of the 40th International Conference on Machine Learning (, Honolulu, Hawaii, USA,) (ICML\u201923). JMLR.org, Article 1288, 23\u00a0pages."},{"key":"e_1_3_3_2_81_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.08053 (2020). arxiv:https:\/\/arXiv.org\/abs\/1909.08053\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_3_2_82_2","series-title":"(OSDI\u201924)","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Shubha Sudipta\u00a0Saha","year":"2024","unstructured":"Sudipta\u00a0Saha Shubha, Haiying Shen, and Anand Iyer. 2024. USHER: holistic interference avoidance for resource optimized ML inference. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation (Santa Clara, CA, USA) (OSDI\u201924). USENIX Association, USA, Article 51, 18\u00a0pages."},{"key":"e_1_3_3_2_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"e_1_3_3_2_85_2","unstructured":"Jovan Stojkovic Chaojie Zhang \u00cd\u00f1igo Goiri Josep Torrellas and Esha Choukse. 2024. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00741 (2024). https:\/\/arxiv.org\/abs\/2408.00741"},{"key":"e_1_3_3_2_86_2","doi-asserted-by":"publisher","unstructured":"Jianlin Su Murtadha Ahmed Yu Lu Shengfeng Pan Wen Bo and Yunfeng Liu. 2024. RoFormer: Enhanced transformer with Rotary Position Embedding. Neurocomputing 568 (2024) 127063. 10.1016\/j.neucom.2023.127063","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_3_2_87_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Sun Mingjie","year":"2024","unstructured":"Mingjie Sun, Zhuang Liu, Anna Bair, and J.\u00a0Zico Kolter. 2024. A Simple and Effective Pruning Approach for Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"e_1_3_3_2_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3717469"},{"key":"e_1_3_3_2_90_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-demo.14"},{"key":"e_1_3_3_2_91_2","volume-title":"International Conference on Learning Representations","author":"Tay Yi","year":"2023","unstructured":"Yi Tay, Mostafa Dehghani, Vinh\u00a0Q. Tran, Xavier Garcia, Jason Wei, Xuezhi Wang, Hyung\u00a0Won Chung, Siamak Shakeri, Dara Bahri, Tal Schuster, Huaixiu\u00a0Steven Zheng, Denny Zhou, Neil Houlsby, and Donald Metzler. 2023. UL2: Unifying Language Learning Paradigms. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_92_2","doi-asserted-by":"crossref","unstructured":"Inar Timiryasov and Jean-Loup Tastet. 2023. Baby Llama: knowledge distillation from an ensemble of teachers trained on a small dataset with no performance penalty. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.02019 (2023).","DOI":"10.18653\/v1\/2023.conll-babylm.24"},{"key":"e_1_3_3_2_93_2","unstructured":"TrueFoundry. 2023. Using Fractional GPUs. https:\/\/docs.truefoundry.com\/docs\/using-fractional-gpus. Accessed: 2024-10-18."},{"key":"e_1_3_3_2_94_2","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_3_2_95_2","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695975"},{"key":"e_1_3_3_2_96_2","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545087"},{"key":"e_1_3_3_2_97_2","unstructured":"Hongyu Wang Shuming Ma Li Dong Shaohan Huang Huaijie Wang Lingxiao Ma Fan Yang Ruiping Wang Yi Wu and Furu Wei. 2023. BitNet: Scaling 1-bit Transformers for Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.11453 (2023). arXiv:arXiv:2310.11453"},{"key":"e_1_3_3_2_98_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00081"},{"key":"e_1_3_3_2_99_2","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_3_2_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695948"},{"key":"e_1_3_3_2_101_2","doi-asserted-by":"publisher","unstructured":"Hao Wu Shiyi Wang Youhui Bai Cheng Li Quan Zhou Jun Yi Feng Yan Ruichuan Chen and Yinlong Xu. 2023. A Generic High-Performance Compression-Aware Framework for Data Parallel DNN Training. IEEE Transactions on Parallel and Distributed Systems (2023) 1\u201320. 10.1109\/TPDS.2023.3266246","DOI":"10.1109\/TPDS.2023.3266246"},{"key":"e_1_3_3_2_102_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Wu Xiaoxia","year":"2023","unstructured":"Xiaoxia Wu, Cheng Li, Reza\u00a0Yazdani Aminabadi, Zhewei Yao, and Yuxiong He. 2023. Understanding INT4 quantization for language models: latency speedup, composability, and failure cases. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML\u201923). JMLR.org, Article 1562, 16\u00a0pages."},{"key":"e_1_3_3_2_103_2","first-page":"38087","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087\u201338099."},{"key":"e_1_3_3_2_104_2","unstructured":"Leyang Xue Yao Fu Zhan Lu Luo Mai and Mahesh Marina. 2024. Moe-infinity: Activation-aware expert offloading for efficient moe serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.14361 (2024)."},{"key":"e_1_3_3_2_105_2","doi-asserted-by":"publisher","DOI":"10.1109\/SOCC62300.2024.10737825"},{"key":"e_1_3_3_2_106_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"e_1_3_3_2_107_2","unstructured":"Zhewei Yao Xiaoxia Wu Cheng Li Stephen Youn and Yuxiong He. 2023. ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08302 (2023). arxiv:https:\/\/arXiv.org\/abs\/2303.08302\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2303.08302"},{"key":"e_1_3_3_2_108_2","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521\u2013538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_3_2_109_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651368"},{"key":"e_1_3_3_2_110_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696086"},{"key":"e_1_3_3_2_111_2","first-page":"1049","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. MArk: Exploiting Cloud Services for Cost-Effective, SLO-Aware Machine Learning Inference Serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 1049\u20131062. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/zhang-chengliang"},{"key":"e_1_3_3_2_112_2","series-title":"(ICML\u201920)","volume-title":"Proceedings of the 37th International Conference on Machine Learning","author":"Zhang Jingqing","year":"2020","unstructured":"Jingqing Zhang, Yao Zhao, Mohammad Saleh, and Peter\u00a0J. Liu. 2020. PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization. In Proceedings of the 37th International Conference on Machine Learning(ICML\u201920). JMLR.org, Article 1051, 12\u00a0pages."},{"key":"e_1_3_3_2_113_2","doi-asserted-by":"publisher","unstructured":"Shixiong Zhao Fanxin Li Xusheng Chen Xiuxian Guan Jianyu Jiang Dong Huang Yuhao Qing Sen Wang Peng Wang Gong Zhang Cheng Li Ping Luo and Heming Cui. 2022. vPipe: A Virtualized Acceleration System for Achieving Efficient and Scalable Pipeline Parallel DNN Training. IEEE Transactions on Parallel and Distributed Systems 33 3 (2022) 489\u2013506. 10.1109\/TPDS.2021.3094364","DOI":"10.1109\/TPDS.2021.3094364"},{"key":"e_1_3_3_2_114_2","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_3_2_115_2","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric\u00a0P. Xing, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 559\u2013578. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zheng-lianmin"},{"key":"e_1_3_3_2_116_2","series-title":"(OSDI\u201924)","volume-title":"Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation","author":"Zhuang Donglin","year":"2024","unstructured":"Donglin Zhuang, Zhen Zheng, Haojun Xia, Xiafei Qiu, Junjie Bai, Wei Lin, and Shuaiwen\u00a0Leon Song. 2024. MonoNN: enabling a new monolithic optimization space for neural network inference tasks on modern GPU-centric architectures. In Proceedings of the 18th USENIX Conference on Operating Systems Design and Implementation (Santa Clara, CA, USA) (OSDI\u201924). USENIX Association, USA, Article 53, 17\u00a0pages."}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759867","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:51:39Z","timestamp":1773255099000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759867"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":115,"alternative-id":["10.1145\/3712285.3759867","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759867","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}