{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T16:20:26Z","timestamp":1772727626090,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1145\/3702634.3702950","type":"proceedings-article","created":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T16:14:07Z","timestamp":1732724047000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Advancing Serverless Computing for Scalable AI Model Inference: Challenges and Opportunities"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4515-477X","authenticated-orcid":false,"given":"Li","family":"Wang","sequence":"first","affiliation":[{"name":"Northeastern University, Boston, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9968-7560","authenticated-orcid":false,"given":"Yankai","family":"Jiang","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0934-1287","authenticated-orcid":false,"given":"Ningfang","family":"Mi","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"BATCH: Machine Learning Inference Serving on Serverless Platforms with Adaptive Batching. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. 1--15","author":"Ali Ahsan","year":"2020","unstructured":"Ahsan Ali, Riccardo Pinciroli, Feng Yan, and Evgenia Smirni. 2020. BATCH: Machine Learning Inference Serving on Serverless Platforms with Adaptive Batching. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. 1--15."},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon. 2024. AWS Bedrock. https:\/\/docs.aws.amazon.com\/bedrock\/"},{"key":"e_1_3_2_1_3_1","volume-title":"Serverless Computing Approach for Deploying Machine Learning Applications in Edge Layer. In 2022 International Conference on Information Networking (ICOIN). 396--401","author":"Bac Ta Phuong","year":"2022","unstructured":"Ta Phuong Bac, Minh Ngoc Tran, and Young Han Kim. 2022. Serverless Computing Approach for Deploying Machine Learning Applications in Edge Layer. In 2022 International Conference on Information Networking (ICOIN). 396--401."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3206366"},{"key":"e_1_3_2_1_5_1","volume-title":"BARISTA: Efficient and Scalable Serverless Serving System for Deep Learning Prediction Services. In 2019 IEEE International Conference on Cloud Engineering (IC2E). 23--33","author":"Bhattacharjee Anirban","year":"2019","unstructured":"Anirban Bhattacharjee, Ajay Dev Chhokra, Zhuangwei Kang, Hongyang Sun, Aniruddha Gokhale, and Gabor Karsai. 2019. BARISTA: Efficient and Scalable Serverless Serving System for Deep Learning Prediction Services. In 2019 IEEE International Conference on Cloud Engineering (IC2E). 23--33."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3609510.3609816"},{"key":"e_1_3_2_1_7_1","first-page":"3","article-title":"SMSS: Stateful Model Serving in Metaverse With Serverless Computing and GPU Sharing","volume":"42","author":"Cai Zinuo","year":"2023","unstructured":"Zinuo Cai, Zebin Chen, Ruhui Ma, and Haibing Guan. 2023. SMSS: Stateful Model Serving in Metaverse With Serverless Computing and GPU Sharing. IEEE J.Sel. A. Commun. 42, 3 (dec 2023), 799--811.","journal-title":"IEEE J.Sel. A. Commun."},{"key":"e_1_3_2_1_8_1","volume-title":"MOPAR: A Model Partitioning Framework for Deep Learning Inference Services on Serverless Platforms. ArXiv abs\/2404.02445","author":"Duan Jiaang","year":"2024","unstructured":"Jiaang Duan, Shiyou Qian, Dingyu Yang, Hanwen Hu, Jian Cao, and Guangtao Xue. 2024. MOPAR: A Model Partitioning Framework for Deep Learning Inference Services on Serverless Platforms. ArXiv abs\/2404.02445 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"ServerlessLLM: Low-Latency Serverless Inference for Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. ServerlessLLM: Low-Latency Serverless Inference for Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 135--153."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE\/ACM 16th International Conference on Utility and Cloud Computing (Taormina (Messina), Italy) (UCC '23)","author":"Gallego Adrien","year":"2024","unstructured":"Adrien Gallego, Uraz Odyurt, Yi Cheng, Yuandou Wang, and Zhiming Zhao. 2024. Machine Learning Inference on Serverless Platforms Using Model Decomposition. In Proceedings of the IEEE\/ACM 16th International Conference on Utility and Cloud Computing (Taormina (Messina), Italy) (UCC '23). Article 33, 6 pages."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605638"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 21st International Middleware Conference","author":"Gunasekaran Jashwant Raj","unstructured":"Jashwant Raj Gunasekaran, Prashanth Thinakaran, Nachiappan C. Nachiappan, Mahmut Taylan Kandemir, and Chita R. Das. 2020. Fifer: Tackling Resource Underutilization in the Serverless Era. In Proceedings of the 21st International Middleware Conference (Delft, Netherlands) (Middleware '20). 280--295."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629567"},{"key":"e_1_3_2_1_14_1","volume-title":"ENOVA: Autoscaling towards Cost-effective and Stable Serverless LLM Serving. arXiv preprint arXiv:2407.09486","author":"Huang Tao","year":"2024","unstructured":"Tao Huang, Pengfei Chen, Kyoka Gong, Jocky Hawk, Zachary Bright, Wenxin Xie, Kecheng Huang, and Zhi Ji. 2024. ENOVA: Autoscaling towards Cost-effective and Stable Serverless LLM Serving. arXiv preprint arXiv:2407.09486 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472501"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Justin San Juan and Bernard Wong. 2023. Reducing the Cost of GPU Cold Starts in Serverless Deep Learning Inference Serving. In 2023 IEEE International Conference on Pervasive Computing and Communications Workshops and other Affiliated Events (PerCom Workshops). 225--230.","DOI":"10.1109\/PerComWorkshops56833.2023.10150381"},{"key":"e_1_3_2_1_17_1","volume-title":"Trusted LLM Inference on the Edge with Smart Contracts. In 2024 IEEE International Conference on Blockchain and Cryptocurrency (ICBC). 1--7.","author":"Karanjai Rabimba","year":"2024","unstructured":"Rabimba Karanjai and Weidong Shi. 2024. Trusted LLM Inference on the Edge with Smart Contracts. In 2024 IEEE International Conference on Blockchain and Cryptocurrency (ICBC). 1--7."},{"key":"e_1_3_2_1_18_1","volume-title":"A Survey of Serverless Machine Learning Model Inference. arXiv preprint arXiv:2311.13587","author":"Kojs Kamil","year":"2023","unstructured":"Kamil Kojs. 2023. A Survey of Serverless Machine Learning Model Inference. arXiv preprint arXiv:2311.13587 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Li Jie","year":"2022","unstructured":"Jie Li, Laiping Zhao, Yanan Yang, Kunlin Zhan, and Keqiu Li. 2022. Tetris: Memory-efficient serverless inference through tensor sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD55607.2022.00029"},{"key":"e_1_3_2_1_21_1","unstructured":"Microsoft. 2024. Microsoft Azure AI Studio. https:\/\/learn.microsoft.com\/en-us\/azure\/ai-studio"},{"key":"e_1_3_2_1_22_1","volume-title":"FSD-Inference: Fully Serverless Distributed Inference with Scalable Cloud Communication. In 2024 IEEE 40th International Conference on Data Engineering (ICDE). 2109--2122","author":"Oakley Joe","year":"2024","unstructured":"Joe Oakley and Hakan Ferhatosmanoglu. 2024. FSD-Inference: Fully Serverless Distributed Inference with Scalable Cloud Communication. In 2024 IEEE 40th International Conference on Data Engineering (ICDE). 2109--2122."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNSM.2023.3239672"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3565382.3565878"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624664"},{"key":"e_1_3_2_1_26_1","unstructured":"Predibase. 2024. LoRAX: Multi-LoRA inference server that scales to 1000s of fine-tuned LLMs. https:\/\/github.com\/predibase\/lorax."},{"key":"e_1_3_2_1_27_1","volume-title":"INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J. Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated Model-less Inference Serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 397--411."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_2_1_29_1","volume-title":"Article 146 (jan","author":"Ye Zhisheng","year":"2024","unstructured":"Zhisheng Ye, Wei Gao, Qinghao Hu, Peng Sun, Xiaolin Wang, Yingwei Luo, Tianwei Zhang, and Yonggang Wen. 2024. Deep Learning Workload Scheduling in GPU Datacenters: A Survey. ACM Comput. Surv. 56, 6, Article 146 (jan 2024), 38 pages."},{"key":"e_1_3_2_1_30_1","volume-title":"Gillis: Serving Large Neural Networks in Serverless Functions with Automatic Model Partitioning. In 2021 IEEE 41st International Conference on Distributed Computing Systems (ICDCS). 138--148","author":"Yu Minchen","year":"2021","unstructured":"Minchen Yu, Zhifeng Jiang, Hok Chun Ng, Wei Wang, Ruichuan Chen, and Bo Li. 2021. Gillis: Serving Large Neural Networks in Serverless Functions with Automatic Model Partitioning. In 2021 IEEE 41st International Conference on Distributed Computing Systems (ICDCS). 138--148."},{"key":"e_1_3_2_1_31_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. {MArk}: Exploiting cloud services for {Cost-Effective},{SLO-Aware} machine learning inference serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 1049--1062."}],"event":{"name":"WoSC10 '24: 10th International Workshop on Serverless Computing","location":"Hong Kong Hong Kong","acronym":"WoSC10 '24","sponsor":["IFIP","Usenix"]},"container-title":["Proceedings of the 10th International Workshop on Serverless Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3702634.3702950","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:09Z","timestamp":1750295889000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3702634.3702950"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":31,"alternative-id":["10.1145\/3702634.3702950","10.1145\/3702634"],"URL":"https:\/\/doi.org\/10.1145\/3702634.3702950","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2024-12-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}