{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T07:14:23Z","timestamp":1772694863261,"version":"3.50.1"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62232011,62302302"],"award-info":[{"award-number":["62232011,62302302"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408548","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-18","source":"Crossref","is-referenced-by-count":0,"title":["Towards Resource-Efficient Serverless LLM Inference with SLINFER"],"prefix":"10.1109","author":[{"given":"Chuhao","family":"Xu","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zijun","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Han","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xueyan","family":"Tang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"meta-llama\/llama-2\u20137b-hf \u2022 hugging face","year":"2023"},{"key":"ref2","volume-title":"Phi-2: The surprising power of small language models - microsoft research","year":"2023"},{"key":"ref3","volume-title":"Sharegpt \u2022 datasets at hugging face","year":"2023"},{"key":"ref4","volume-title":"mistralai\/codestral-22b-v0.1 \u2022 hugging face","year":"2024"},{"key":"ref5","volume-title":"Open source ai year in review 2024 - a hugging face space by huggingface","year":"2024"},{"key":"ref6","volume-title":"Amazon sagemaker","year":"2025"},{"key":"ref7","volume-title":"Chatgpt \u2014 openai","year":"2025"},{"key":"ref8","volume-title":"Deploy models as serverless apis - azure machine learning \u2014 microsoft learn","year":"2025"},{"key":"ref9","volume-title":"Host your llms on cloud run - google cloud blog","year":"2025"},{"key":"ref10","volume-title":"Inference api (serverless) - hugging face","year":"2025"},{"key":"ref11","volume-title":"Inte1\u00ae distribution of openvinoTM toolkit","year":"2025"},{"key":"ref12","volume-title":"Intel\u00ae xeon\u00ae 6966p-c processor","year":"2025"},{"key":"ref13","volume-title":"Meet claude anthropic","year":"2025"},{"key":"ref14","volume-title":"Tensorrt-llm","year":"2025"},{"key":"ref15","volume-title":"What is intel\u00ae advanced matrix extensions (intel\u00ae amx)? - intel","year":"2025"},{"key":"ref16","first-page":"117","article-title":"Taming throughput-latency tradeoff in LLM inference with sarathi-serve","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024","author":"Agrawal","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/sc41405.2020.00073"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.172"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2018.2794343"},{"key":"ref20","volume-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref21","first-page":"199","article-title":"Serving heterogeneous machine learning models on multi-gpu servers with spatio-temporal sharing","volume-title":"Proceedings of the 2022 USENIX Annual Technical Conference, USENIX ATC 2022","author":"Choi","year":"2022"},{"key":"ref22","first-page":"613","article-title":"Clipper: A low-latency online prediction serving system","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2017","author":"Crankshaw","year":"2017"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507732"},{"key":"ref24","article-title":"Muxserve: Flexible spatial-temporal multiplexing for multiple LLM serving","volume-title":"Forty-first International Conference on Machine Learning, ICML 2024","author":"Duan","year":"2024"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"ref26","first-page":"135","article-title":"Serverlessllm: Low-latency serverless inference for large language models","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024","author":"Fu","year":"2024"},{"key":"ref27","first-page":"443","article-title":"Serving dnns like clockwork: Performance predictability from the bottom up","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event","author":"Gujarati","year":"2020"},{"key":"ref28","first-page":"1041","article-title":"Cocktail: A multidimensional optimization for model serving in cloud","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2022","author":"Gunasekaran","year":"2022"},{"key":"ref29","article-title":"Fastdecode: High-throughput gpu-efficient LLM serving using heterogeneous pipelines","volume":"abs\/2403.11421","author":"He","year":"2024","journal-title":"CoRR"},{"key":"ref30","first-page":"57","article-title":"DEEPSERVE: serverless large language model serving at scale","volume-title":"Proceedings of the 2025 USENIX Annual Technical Conference, USENIX ATC 2025","author":"Hu","year":"2025"},{"key":"ref31","first-page":"947","article-title":"Analysis of large-scale multi-tenant GPU clusters for DNN training workloads","volume-title":"Proceedings of the 2019 USENIX Annual Technical Conference, USENIX ATC 2019","author":"Jeon","year":"2019"},{"key":"ref32","article-title":"NEO: Saving GPU memory crisis with CPU offloading for online LLM inference","volume-title":"Eighth Conference on Machine Learning and Systems","author":"Jiang","year":"2025"},{"key":"ref33","first-page":"463","article-title":"A unified architecture for accelerating distributed DNN training in heterogeneous GPU\/CPU clusters","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event","author":"Jiang","year":"2020"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624783"},{"key":"ref35","article-title":"Pod-attention: Unlocking full prefill-decode overlap for faster LLM inference","volume":"abs\/2410.18038","author":"Kamath","year":"2024","journal-title":"CoRR"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731092"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref38","first-page":"611","article-title":"PRETZEL: opening the black box of machine learning prediction serving systems","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Lee","year":"2018"},{"key":"ref39","article-title":"Caraserve: Cpu-assisted and rank-aware lora serving for generative LLM inference","volume":"abs\/2401.11240","author":"Li","year":"2024","journal-title":"CoRR"},{"key":"ref40","first-page":"663","article-title":"Alpaserve: Statistical multiplexing with model parallelism for deep learning serving","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2023","author":"Li","year":"2023"},{"key":"ref41","article-title":"AWQ: activation-aware weight quantization for on-device LLM compression and acceleration","volume-title":"Proceedings of the Seventh Annual Conference on Machine Learning and Systems, MLSys 2024","author":"Lin","year":"2024"},{"key":"ref42","volume-title":"Towards swift serverless 11 m cold starts with paraserve","author":"Lou","year":"2025"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707251"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640413"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"ref47","article-title":"Flexinfer: Flexible LLM inference with CPU computations","volume-title":"Eighth Conference on Machine Learning and Systems","author":"Na","year":"2025"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/iiswc63097.2024.00024"},{"key":"ref49","first-page":"481","article-title":"Heterogeneity-aware cluster scheduling policies for deep learning workloads","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event","author":"Narayanan","year":"2020"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/isscc42614.2022.9731107"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640383"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3656019.3676949"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651329"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/isca59077.2024.00019"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3590140.3629115"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707256"},{"key":"ref57","article-title":"Gemma 2: Improving open language models at a practical size","volume":"abs\/2408.00118","author":"Rivi\u00e8re","year":"2024","journal-title":"CoRR"},{"key":"ref58","first-page":"397","article-title":"Infaas: Automated model-less inference serving","volume-title":"Proceedings of the 2021 USENIX Annual Technical Conference, USENIX ATC 2021","author":"Romero","year":"2021"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486972"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507750"},{"key":"ref61","first-page":"205","article-title":"Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference, USENIX ATC 2020, July 15\u201317, 2020","author":"Shahrad","year":"2020"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"ref63","first-page":"173","article-title":"Llumnix: Dynamic scheduling for large language model serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024","author":"Sun","year":"2024"},{"key":"ref64","volume-title":"Gemma: Open models based on gemini research and technology","author":"Team","year":"2024"},{"key":"ref65","volume-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737413"},{"key":"ref67","first-page":"945","article-title":"Mlaas in the wild: Workload analysis and scheduling in large-scale heterogeneous GPU clusters","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2022","author":"Weng","year":"2022"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695948"},{"key":"ref69","article-title":"Pie: Pooling CPU memory for LLM inference","volume":"abs\/2411.09317","author":"Xu","year":"2024","journal-title":"CoRR"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"ref71","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Yu","year":"2022"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707285"},{"key":"ref73","first-page":"787","article-title":"SHEPHERD: serving dnns in the wild","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023","author":"Zhang","year":"2023"},{"key":"ref74","article-title":"Lmsys-chat-1m: A large-scale real-world LLM conversation dataset","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Zheng"},{"key":"ref75","first-page":"193","article-title":"Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024","author":"Zhong","year":"2024"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408548.pdf?arnumber=11408548","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:36:01Z","timestamp":1772692561000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408548\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408548","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}