{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T10:17:51Z","timestamp":1777457871996,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,4]]},"DOI":"10.1145\/3777911.3800633","type":"proceedings-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T13:00:11Z","timestamp":1776949211000},"page":"277-286","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OASIS: Optimal Allocation Strategy for Inference Services in Cloud Environments"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6238-6709","authenticated-orcid":false,"given":"Viyom","family":"Mittal","sequence":"first","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6981-581X","authenticated-orcid":false,"given":"Mohammed","family":"Baydoun","sequence":"additional","affiliation":[{"name":"American University of Beirut, Beirut, Lebanon"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3140-1088","authenticated-orcid":false,"given":"Alok","family":"Mishra","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5752-5778","authenticated-orcid":false,"given":"Pavana","family":"Prakash","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0373-1867","authenticated-orcid":false,"given":"Gourav","family":"Rattihalli","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8297-8525","authenticated-orcid":false,"given":"Aditya","family":"Dhakal","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-1829","authenticated-orcid":false,"given":"Eitan","family":"Frachtenberg","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3356-6898","authenticated-orcid":false,"given":"Izzat","family":"El Hajj","sequence":"additional","affiliation":[{"name":"American University of Beirut, Beirut, Lebanon"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3882-9987","authenticated-orcid":false,"given":"Michails","family":"Faloutsos","sequence":"additional","affiliation":[{"name":"University of California, Riverside, Riverside, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9830-8588","authenticated-orcid":false,"given":"Dejan","family":"Milojicic","sequence":"additional","affiliation":[{"name":"Hewlett Packard Labs, Milpitas, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,5,3]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1","article-title":"Punica: Multi-tenant lora serving","volume":"6","author":"Chen Lequn","year":"2024","unstructured":"Lequn Chen, Zihao Ye, Yongji Wu, Danyang Zhuo, Luis Ceze, and Arvind Krishnamurthy. 2024. Punica: Multi-tenant lora serving. Proceedings of Machine Learning and Systems 6 (2024), 1-13.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_2_1","first-page":"1362","article-title":"Llm-inference-bench: Inference benchmarking of large language models on ai accelerators. In SC24-W: Workshops of the International Conference for High Performance Computing","author":"Chitty-Venkata Krishna Teja","year":"2024","unstructured":"Krishna Teja Chitty-Venkata, Siddhisanket Raskar, Bharat Kale, Farah Ferdaus, Aditya Tanikanti, Ken Raffenetti, Valerie Taylor, Murali Emani, and Venkatram Vishwanath. 2024. Llm-inference-bench: Inference benchmarking of large language models on ai accelerators. In SC24-W: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1362-1379.","journal-title":"Networking, Storage and Analysis. IEEE"},{"key":"e_1_3_2_1_3_1","first-page":"200","article-title":"LLM Serving With Efficient KV-Cache Management Using Triggered Operations","volume":"100","author":"Dhakal Aditya","year":"2024","unstructured":"Aditya Dhakal, Pedro Bruel, Gourav Rattihalli, Sai Rahul Chalamalasetti, and Dejan Milojicic. 2024. LLM Serving With Efficient KV-Cache Management Using Triggered Operations. Memory 100 (2024), 200.","journal-title":"Memory"},{"key":"e_1_3_2_1_4_1","volume-title":"Language models for code optimization: Survey, challenges and future directions. arXiv preprint arXiv:2501.01277","author":"Gong Jingzhi","year":"2025","unstructured":"Jingzhi Gong, Vardan Voskanyan, Paul Brookes, Fan Wu, Wei Jie, Jie Xu, Rafail Giavrimis, Mike Basios, Leslie Kanthan, and ZhengWang. 2025. Language models for code optimization: Survey, challenges and future directions. arXiv preprint arXiv:2501.01277 (2025)."},{"key":"e_1_3_2_1_5_1","volume-title":"The economic trade-offs of large language models: A case study. arXiv preprint arXiv:2306.07402","author":"Howell Kristen","year":"2023","unstructured":"Kristen Howell, Gwen Christian, Pavel Fomitchov, Gitit Kehat, Julianne Marzulla, Leanne Rolston, Jadin Tredup, Ilana Zimmerman, Ethan Selfridge, and Joseph Bradley. 2023. The economic trade-offs of large language models: A case study. arXiv preprint arXiv:2306.07402 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Blockllm: Multi-tenant finer-grained serving for large language models. arXiv preprint arXiv:2404.18322","author":"Hu Bodun","year":"2024","unstructured":"Bodun Hu, Jiamin Li, Le Xu, Myungjin Lee, Akshay Jajoo, Geon-Woo Kim, Hong Xu, and Aditya Akella. 2024. Blockllm: Multi-tenant finer-grained serving for large language models. arXiv preprint arXiv:2404.18322 (2024)."},{"key":"e_1_3_2_1_7_1","first-page":"711","article-title":"Data movement is all you need: A case study on optimizing transformers","volume":"3","author":"Ivanov Andrei","year":"2021","unstructured":"Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. 2021. Data movement is all you need: A case study on optimizing transformers. Proceedings of Machine Learning and Systems 3 (2021), 711-732.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"Demystifying costefficiency in llm serving over heterogeneous gpus. arXiv preprint arXiv:2502.00722","author":"Jiang Youhe","year":"2025","unstructured":"Youhe Jiang, Fangcheng Fu, Xiaozhe Yao, Guoliang He, Xupeng Miao, Ana Klimovic, Bin Cui, Binhang Yuan, and Eiko Yoneki. 2025. Demystifying costefficiency in llm serving over heterogeneous gpus. arXiv preprint arXiv:2502.00722 (2025)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.3390\/app15020586"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3542929.3563510"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/3014904.3014977"},{"key":"e_1_3_2_1_14_1","unstructured":"Jinhao Li Jiaming Xu Shan Huang Yonghua Chen Wen Li Jun Liu Yaoxiu Lian Jiayi Pan Li Ding Hao Zhou et al. 2024. Large language model inference acceleration: A comprehensive hardware perspective. arXiv preprint arXiv:2410.04466 (2024)."},{"key":"e_1_3_2_1_15_1","first-page":"663","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. {AlpaServe}: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663-679."},{"key":"e_1_3_2_1_16_1","unstructured":"Qunyou Liu Darong Huang Marina Zapater and David Atienza. 2025. Green-LLM: SLO-Aware Dynamic Frequency Scaling for Energy-Efficient LLM Serving. arXiv:2508.16449"},{"key":"e_1_3_2_1_17_1","first-page":"1025","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Liu Yizhi","year":"2019","unstructured":"Yizhi Liu, Yao Wang, Ruofei Yu, Mu Li, Vin Sharma, and Yida Wang. 2019. Optimizing {CNN} model inference on {CPUs}. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 1025-1040."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3625549.3658654"},{"key":"e_1_3_2_1_19_1","volume-title":"A Cost-Benefit Analysis of On-Premise Large Language Model Deployment: Breaking Even with Commercial LLM Services. arXiv preprint arXiv:2509.18101","author":"Pan Guanzhong","year":"2025","unstructured":"Guanzhong Pan, Vishal Chodnekar, Abinas Roy, and Haibo Wang. 2025. A Cost-Benefit Analysis of On-Premise Large Language Model Deployment: Breaking Even with Commercial LLM Services. arXiv preprint arXiv:2509.18101 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3698038.3698523"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD60044.2023.00014"},{"key":"e_1_3_2_1_22_1","volume-title":"Achieving peak performance for large language models: A systematic review","author":"Rostam Zhyar Rzgar K","year":"2024","unstructured":"Zhyar Rzgar K Rostam, S\u00e1ndor Sz\u00e9n\u00e1si, and G\u00e1bor Kert\u00e9sz. 2024. Achieving peak performance for large language models: A systematic review. IEEE access (2024)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3291606"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00102"},{"key":"e_1_3_2_1_26_1","volume-title":"In-the-loop hyperparameter optimization for llm-based automated design of heuristics. ACM Transactions on Evolutionary Learning","author":"van Stein Niki","year":"2024","unstructured":"Niki van Stein, Diederick Vermetten, and Thomas B\u00e4ck. 2024. In-the-loop hyperparameter optimization for llm-based automated design of heuristics. ACM Transactions on Evolutionary Learning (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337839"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3731599.3767396"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737413"},{"key":"e_1_3_2_1_30_1","volume-title":"Layerkv: Optimizing large language model serving with layer-wise kv cache management. arXiv preprint arXiv:2410.00428","author":"Xiong Yi","year":"2024","unstructured":"Yi Xiong, Hao Wu, Changxu Shao, Ziqing Wang, Rui Zhang, Yuhong Guo, Junping Zhao, Ke Zhang, and Zhenxuan Pan. 2024. Layerkv: Optimizing large language model serving with layer-wise kv cache management. arXiv preprint arXiv:2410.00428 (2024)."},{"key":"e_1_3_2_1_31_1","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for Transformer-Based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521-538."},{"key":"e_1_3_2_1_32_1","volume-title":"A hardware evaluation framework for large language model inference. arXiv preprint arXiv:2312.03134","author":"Zhang Hengrui","year":"2023","unstructured":"Hengrui Zhang, August Ning, Rohan Prabhakar, and David Wentzlaff. 2023. A hardware evaluation framework for large language model inference. arXiv preprint arXiv:2312.03134 (2023)."},{"key":"e_1_3_2_1_33_1","first-page":"787","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the wild. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 787-808."},{"key":"e_1_3_2_1_34_1","unstructured":"Longteng Zhang Xiang Liu Zeyu Li Xinglin Pan Peijie Dong Ruibo Fan Rui Guo Xin Wang Qiong Luo Shaohuai Shi et al. 2023. Dissecting the runtime performance of the training fine-tuning and inference of large language models. arXiv preprint arXiv:2311.03687 (2023)."}],"event":{"name":"ICPE '26: 17th ACM\/SPEC International Conference on Performance Engineering","location":"Florence Italy","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SPEC"]},"container-title":["Companion of the 17th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:05:44Z","timestamp":1777381544000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777911.3800633"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,3]]},"references-count":34,"alternative-id":["10.1145\/3777911.3800633","10.1145\/3777911"],"URL":"https:\/\/doi.org\/10.1145\/3777911.3800633","relation":{},"subject":[],"published":{"date-parts":[[2026,5,3]]},"assertion":[{"value":"2026-05-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}