{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T14:40:53Z","timestamp":1765291253460,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3774899.3775017","type":"proceedings-article","created":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T14:35:38Z","timestamp":1765290938000},"page":"51-57","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Illuminating the Hidden Challenges of Serverless LLM Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8854-048X","authenticated-orcid":false,"given":"Amit","family":"Samanta","sequence":"first","affiliation":[{"name":"University of Utah, Salt Lake City, UT, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4578-9925","authenticated-orcid":false,"given":"Tri Gia","family":"Nguyen","sequence":"additional","affiliation":[{"name":"FPT University, Da Nang, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Mart\u00edn Abadi Paul Barham Jianmin Chen Zhifeng Chen et al. 2016. TensorFlow: A System for Large-Scale Machine Learning. In USENIX OSDI. 265\u2013283."},{"key":"e_1_3_2_1_2_1","unstructured":"Istemi Ekin Akkus Ruichuan Chen Ivica Rimac Manuel Stein Klaus Satzke Andre Beck Paarijaat Aditya and Volker Hilt. 2018. {SAND}: towards {High-Performance} serverless computing. In USENIX ATC. 923\u2013935."},{"key":"e_1_3_2_1_3_1","volume-title":"Batch: Machine learning inference serving on serverless platforms with adaptive batching","author":"Ali Ahsan","year":"2020","unstructured":"Ahsan Ali, Riccardo Pinciroli, Feng Yan, and Evgenia Smirni. 2020. Batch: Machine learning inference serving on serverless platforms with adaptive batching. In ACM\/IEEE SC. 1\u201315."},{"key":"e_1_3_2_1_4_1","unstructured":"Anthropic API. 2024. Anthropic API Documentation. https:\/\/docs.anthropic.com\/. Accessed: 2024-10-18. Used Model: Claude 3.5 Sonnet."},{"key":"e_1_3_2_1_5_1","unstructured":"AWS Lambda. 2024. AWS Lambda. https:\/\/aws.amazon.com\/lambda\/. Accessed: 2024-10-18."},{"key":"e_1_3_2_1_6_1","unstructured":"Azure Functions. 2024. Azure Functions. https:\/\/azure.microsoft.com\/en-us\/products\/functions\/. Accessed: 2024-10-18."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368454"},{"key":"e_1_3_2_1_8_1","unstructured":"Varun Chandrasekaran Kamalika Chaudhuri Irene Giacomelli Somesh Jha and Songbai Yan. 2020. Exploring connections between active learning and model extraction. In USENIX Security. 1309\u20131326."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507732"},{"key":"e_1_3_2_1_10_1","unstructured":"Yao Fu Leyang Xue Yeqi Huang Andrei-Octavian Brabete et al. 2024. {ServerlessLLM}: {Low-Latency} serverless inference for large language models. In USENIX OSDI. 135\u2013153."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCC.2017.4250939"},{"key":"e_1_3_2_1_12_1","unstructured":"Google Cloud Functions. 2024. Google Cloud Functions. https:\/\/cloud.google.com\/functions. Accessed: 2024-10-18."},{"key":"e_1_3_2_1_13_1","volume-title":"Optimus: Warming serverless ml inference via inter-function model transformation. In ACM EuroSys. 1039\u20131053.","author":"Hong Zicong","year":"2024","unstructured":"Zicong Hong, Jian Lin, Song Guo, Sifu Luo, Wuhui Chen, Roger Wattenhofer, and Yue Yu. 2024. Optimus: Warming serverless ml inference via inter-function model transformation. In ACM EuroSys. 1039\u20131053."},{"key":"e_1_3_2_1_14_1","volume-title":"DEEPSERVE: Serverless Large Language Model Serving at Scale. arXiv preprint arXiv:2501.14417","author":"Hu Junhao","year":"2025","unstructured":"Junhao Hu, Jiang Xu, Zhixia Liu, Yulong He, Yuetao Chen, Hao Xu, Jiang Liu, Jie Meng, Baoquan Zhang, Shining Wan, et al. 2025. DEEPSERVE: Serverless Large Language Model Serving at Scale. arXiv preprint arXiv:2501.14417 (2025)."},{"key":"e_1_3_2_1_15_1","volume-title":"ENOVA: Autoscaling towards Cost-effective and Stable Serverless LLM Serving. arXiv preprint arXiv:2407.09486","author":"Huang Tao","year":"2024","unstructured":"Tao Huang, Pengfei Chen, Kyoka Gong, Jocky Hawk, Zachary Bright, Wenxin Xie, Kecheng Huang, and Zhi Ji. 2024. ENOVA: Autoscaling towards Cost-effective and Stable Serverless LLM Serving. arXiv preprint arXiv:2407.09486 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Pronghorn: Effective checkpoint orchestration for serverless hot-starts. In ACM EuroSys. 298\u2013316.","author":"Kohli Sumer","year":"2024","unstructured":"Sumer Kohli, Shreyas Kharbanda, Rodrigo Bruno, Joao Carreira, and Pedro Fonseca. 2024. Pronghorn: Effective checkpoint orchestration for serverless hot-starts. In ACM EuroSys. 298\u2013316."},{"key":"e_1_3_2_1_17_1","volume-title":"Joseph Gonzalez, Hao Zhang, and Ion Stoica.","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient memory management for large language model serving with pagedattention. In ACM SOSP. 611\u2013626."},{"key":"e_1_3_2_1_18_1","unstructured":"Woosuk Kwon Zhuohan Michael Eric Hwang Xingyu Dong Simon Tumanov Patrick Wu et al. 2024. vLLM: A High-Throughput and Memory-Efficient Inferencing Engine for Large Language Models. https:\/\/github.com\/vllm-project\/vllm. GitHub repository; Accessed: 2024-10-18."},{"key":"e_1_3_2_1_19_1","volume-title":"Llm inference serving: Survey of recent advances and opportunities","author":"Li Baolin","unstructured":"Baolin Li, Yankai Jiang, Vijay Gadepally, and Devesh Tiwari. 2024. Llm inference serving: Survey of recent advances and opportunities. In IEEE HPEC. 1\u20138."},{"key":"e_1_3_2_1_20_1","unstructured":"Chongpeng Liu Xiaojian Liao Hancheng Liu et al. 2025. PipeBoost: Resilient Pipelined Architecture for Fast Serverless LLM Scaling. arXiv preprint arXiv:2503.17707 (2025)."},{"key":"e_1_3_2_1_21_1","unstructured":"Yupei Liu Yuqi Jia Runpeng Geng Jinyuan Jia and Neil Zhenqiang Gong. 2024. Formalizing and benchmarking prompt injection attacks and defenses. In USENIX Security. 1831\u20131847."},{"key":"e_1_3_2_1_22_1","unstructured":"Chiheng Lou Sheng Qi Chao Jin Dapeng Nie Haoran Yang et al. 2025. Towards Swift Serverless LLM Cold Starts with ParaServe. arXiv preprint arXiv:2502.15524 (2025)."},{"key":"e_1_3_2_1_23_1","volume-title":"The Llama 3 Herd of Models. AI at Meta Research","author":"Meta AI.","year":"2024","unstructured":"Meta AI. 2024. The Llama 3 Herd of Models. AI at Meta Research (2024). https:\/\/ai.meta.com\/research\/publications\/the-llama-3-herd-of-models\/ Accessed: 2024-10-18."},{"key":"e_1_3_2_1_24_1","volume-title":"Triton Inference Server. https:\/\/github.com\/triton-inference-server\/server. GitHub repository","author":"Triton Inference Server NVIDIA","year":"2024","unstructured":"NVIDIA Triton Inference Server. 2024. Triton Inference Server. https:\/\/github.com\/triton-inference-server\/server. GitHub repository; Accessed: 2024-10-18."},{"key":"e_1_3_2_1_25_1","unstructured":"OpenAI. 2024. OpenAI API Documentation. https:\/\/platform.openai.com\/docs\/api-reference. Accessed: 2024-10-18. Used Model: GPT-4o."},{"key":"e_1_3_2_1_26_1","volume-title":"Snapstore: A snapshot storage system for serverless systems. In ACM\/IFIP MIDDLEWARE. 261\u2013274.","author":"Panda Abhisek","year":"2023","unstructured":"Abhisek Panda and Smruti R Sarangi. 2023. Snapstore: A snapshot storage system for serverless systems. In ACM\/IFIP MIDDLEWARE. 261\u2013274."},{"key":"e_1_3_2_1_27_1","volume-title":"Asyfunc: A high-performance and resource-efficient serverless inference system via asymmetric functions. In ACM SoCC. 324\u2013340.","author":"Pei Qiangyu","year":"2023","unstructured":"Qiangyu Pei, Yongjie Yuan, Haichuan Hu, Qiong Chen, and Fangming Liu. 2023. Asyfunc: A high-performance and resource-efficient serverless inference system via asymmetric functions. In ACM SoCC. 324\u2013340."},{"key":"e_1_3_2_1_28_1","volume-title":"https:\/\/github.com\/pytorch\/serve. GitHub repository","author":"TorchServe PyTorch","year":"2024","unstructured":"PyTorch Serving team. 2024. TorchServe. https:\/\/github.com\/pytorch\/serve. GitHub repository; Accessed: 2024-10-18."},{"key":"e_1_3_2_1_29_1","volume-title":"Daydream: Executing dynamic scientific workflows on serverless platforms with hot starts","author":"Roy Rohan Basu","year":"2022","unstructured":"Rohan Basu Roy, Tirthak Patel, and Devesh Tiwari. 2022. Daydream: Executing dynamic scientific workflows on serverless platforms with hot starts. In ACM\/IEEE SC. 1\u201318."},{"key":"e_1_3_2_1_30_1","volume-title":"Persistent memory-aware scheduling for serverless workloads","author":"Samanta Amit","unstructured":"Amit Samanta, Faraz Ahmed, Lianjie Cao, Ryan Stutsman, and Puneet Sharma. 2023. Persistent memory-aware scheduling for serverless workloads. In IEEE IPDPSW. 615\u2013621."},{"key":"e_1_3_2_1_31_1","volume-title":"Efficient Multi-Resource Scheduling for Stateless Serverless Functions with Anubis. In 2024 IEEE 24th International Symposium on Cluster, Cloud and Internet Computing (CCGrid). 106\u2013112","author":"Samanta Amit","year":"2024","unstructured":"Amit Samanta and Ryan Stutsman. 2024. Fair, Efficient Multi-Resource Scheduling for Stateless Serverless Functions with Anubis. In 2024 IEEE 24th International Symposium on Cluster, Cloud and Internet Computing (CCGrid). 106\u2013112."},{"key":"e_1_3_2_1_32_1","volume-title":"IEEE PerCom Workshops. 225\u2013230","author":"Juan Justin San","year":"2023","unstructured":"Justin San Juan and Bernard Wong. 2023. Reducing the cost of GPU cold starts in serverless deep learning inference serving. In IEEE PerCom Workshops. 225\u2013230."},{"key":"e_1_3_2_1_33_1","unstructured":"Mohammad Shahrad Rodrigo Fonseca Inigo Goiri Gohar Chaudhry Paul Batum Jason Cooke Eduardo Laureano et al. 2020. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In USENIX ATC. 205\u2013218."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Paulo Silva Daniel Fireman and Thiago Emmanuel Pereira. 2020. Prebaking functions to warm the serverless cold start. In ACM\/IFIP MIDDLEWARE. 1\u201313.","DOI":"10.1145\/3423211.3425682"},{"key":"e_1_3_2_1_35_1","unstructured":"Yifan Sui Hao Wang Hanfei Yu Yitao Hu et al. 2025. ServerlessLoRA: Minimizing Latency and Cost in Serverless Inference for LoRA-Based LLMs. arXiv preprint arXiv:2505.14468 (2025)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Ariel Szekely Adam Belay Robert Morris and M Frans Kaashoek. 2024. Unifying serverless and microservice workloads with SigmaOS. In ACM SOSP. 385\u2013402.","DOI":"10.1145\/3694715.3695947"},{"key":"e_1_3_2_1_37_1","unstructured":"TensorFlow Serving. 2024. TensorFlow Serving Documentation. https:\/\/www.tensorflow.org\/tfx\/guide\/serving. Accessed: 2024-10-18."},{"key":"e_1_3_2_1_38_1","volume-title":"Text Generation Inference (TGI). https:\/\/github.com\/huggingface\/text-generation-inference. GitHub repository","author":"Text Generation","year":"2024","unstructured":"Text Generation Inference (TGI). 2024. Text Generation Inference (TGI). https:\/\/github.com\/huggingface\/text-generation-inference. GitHub repository; Accessed: 2024-10-18."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Li Wang Yankai Jiang and Ningfang Mi. 2024. Advancing serverless computing for scalable ai model inference: Challenges and opportunities. In ACM WoSC. 1\u20136.","DOI":"10.1145\/3702634.3702950"},{"key":"e_1_3_2_1_40_1","unstructured":"Liang Wang Mengyuan Li Yinqian Zhang Thomas Ristenpart and Michael Swift. 2018. Peeking behind the curtains of serverless platforms. In USENIX ATC. 133\u2013146."},{"key":"e_1_3_2_1_41_1","volume-title":"Transformers: State-of-the-Art Natural Language Processing. EMNLP","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, et al. 2020. Transformers: State-of-the-Art Natural Language Processing. EMNLP (2020), 38\u201345."},{"key":"e_1_3_2_1_42_1","volume-title":"LLM-Mesh: Enabling Elastic Sharing for Serverless LLM Inference. arXiv preprint arXiv:2507.00507","author":"Xu Chuhao","year":"2025","unstructured":"Chuhao Xu, Zijun Li, Quan Chen, Han Zhao, and Minyi Guo. 2025. LLM-Mesh: Enabling Elastic Sharing for Serverless LLM Inference. arXiv preprint arXiv:2507.00507 (2025)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Yanan Yang Laiping Zhao Yiming Li Huanyu Zhang Jie Li Mingyang Zhao Xingzhen Chen and Keqiu Li. 2022. Infless: a native serverless system for low-latency high-throughput inference. In ACM ASPLOS. 768\u2013781.","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_2_1_44_1","volume-title":"Scale: Enabling Fast Scaling for Serverless Large Language Model Inference. arXiv preprint arXiv:2502.09922","author":"Yu Minchen","year":"2025","unstructured":"Minchen Yu, Rui Yang, Chaobo Jia, Zhaoyuan Su, Sheng Yao, Tingfeng Lan, Yuchen Yang, Yue Cheng, Wei Wang, Ao Wang, et al. 2025. {\\lambda} Scale: Enabling Fast Scaling for Serverless Large Language Model Inference. arXiv preprint arXiv:2502.09922 (2025)."},{"key":"e_1_3_2_1_45_1","volume-title":"Medusa: Accelerating serverless LLM inference with materialization. In ACM ASPLOS. 653\u2013668.","author":"Zeng Shaoxun","year":"2025","unstructured":"Shaoxun Zeng, Minimi Xie, Shiwei Gao, Youmin Chen, and Youyou Lu. 2025. Medusa: Accelerating serverless LLM inference with materialization. In ACM ASPLOS. 653\u2013668."},{"key":"e_1_3_2_1_46_1","unstructured":"Chengliang Zhang Minchen Yu Wei Wang and Feng Yan 2019. {MArk}: Exploiting cloud services for {Cost-Effective} {SLO-Aware} machine learning inference serving. In USENIX ATC. 1049\u20131062."}],"event":{"name":"WoSC11 '25: 11th International Workshop on Serverless Computing","location":"Vanderbilt University Nashville TN USA","acronym":"WoSC11 '25","sponsor":["IFIP","Usenix"]},"container-title":["Proceedings of the 11th International Workshop on Serverless Computing"],"original-title":[],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T14:36:08Z","timestamp":1765290968000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774899.3775017"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":46,"alternative-id":["10.1145\/3774899.3775017","10.1145\/3774899"],"URL":"https:\/\/doi.org\/10.1145\/3774899.3775017","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}