{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:37:37Z","timestamp":1782833857116,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Institute of Computing Technology, Chinese Academy of Sciences","doi-asserted-by":"publisher","award":["E361060, E461040"],"award-info":[{"award-number":["E361060, E461040"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707251","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"311-325","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Dilu: Enabling GPU Resourcing-on-Demand for Serverless DL Serving via Introspective Elasticity"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-7089-0315","authenticated-orcid":false,"given":"Cunchi","family":"Lv","sequence":"first","affiliation":[{"name":"ICT, CAS, Beijing, China and UCAS, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7105-8355","authenticated-orcid":false,"given":"Xiao","family":"Shi","sequence":"additional","affiliation":[{"name":"ICT, CAS, Beijing, China and Nanjing Institute of InforSuperBahn, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2170-6560","authenticated-orcid":false,"given":"Zhengyu","family":"Lei","sequence":"additional","affiliation":[{"name":"ICT, CAS, Beijing, China and UCAS, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9015-777X","authenticated-orcid":false,"given":"Jinyue","family":"Huang","sequence":"additional","affiliation":[{"name":"ICT, CAS, Beijng, China and UCAS, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2710-9660","authenticated-orcid":false,"given":"Wenting","family":"Tan","sequence":"additional","affiliation":[{"name":"ICT, CAS, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5500-9332","authenticated-orcid":false,"given":"Xiaohui","family":"Zheng","sequence":"additional","affiliation":[{"name":"ICT, CAS, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1680-9969","authenticated-orcid":false,"given":"Xiaofang","family":"Zhao","sequence":"additional","affiliation":[{"name":"ICT, CAS, Beijing, China and IICT, Suzhou, CAS, Suzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/kubernetes.io\/docs\/concepts\/ policy\/resource-quotas\/","author":"Kubernetes","year":"2024","unstructured":"Kubernetes resource quotas. https:\/\/kubernetes.io\/docs\/concepts\/ policy\/resource-quotas\/, 2024."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00073"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.14778\/3547305.3547313"},{"key":"e_1_3_2_1_4_1","volume-title":"Alibaba pai. https:\/\/www.aliyun.com\/product\/bigdata\/learn","year":"2024","unstructured":"Alibaba. Alibaba pai. https:\/\/www.aliyun.com\/product\/bigdata\/learn, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"Amazon sagemaker. https:\/\/aws.amazon.com\/pm\/ sagemaker\/","year":"2024","unstructured":"Amazon. Amazon sagemaker. https:\/\/aws.amazon.com\/pm\/ sagemaker\/, 2024."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357223.3362711"},{"key":"e_1_3_2_1_7_1","first-page":"20","article-title":"Sla-driven ml inference framework for clouds with heterogeneous accelerators","volume":"4","author":"Cho Junguk","year":"2022","unstructured":"Junguk Cho, Diman Zad Tootaghaj, Lianjie Cao, and Puneet Sharma. Sla-driven ml inference framework for clouds with heterogeneous accelerators. Proceedings of Machine Learning and Systems, 4:20--32, 2022.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_8_1","first-page":"199","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC'22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. Serving heterogeneous machine learning models on multi-gpu servers with spatio-temporal sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC'22), pages 199--216, 2022."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of deep bidirectional transformers for language understanding. In Jill Burstein, Christy Doran, and Thamar Solorio, editors, Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pages 4171--4186, Minneapolis, Minnesota, June 2019. Association for Computational Linguistics."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421284"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507732"},{"key":"e_1_3_2_1_12_1","volume-title":"Thudm chatglm3. https:\/\/github.com\/THUDM\/ChatGLM3","author":"Tsinghua University Knowledge Engineering and Data Mining Group.","year":"2024","unstructured":"Tsinghua University Knowledge Engineering and Data Mining Group. Thudm chatglm3. https:\/\/github.com\/THUDM\/ChatGLM3, 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"Huggingface accelerate. https:\/\/pypi.org\/project\/ accelerate\/","author":"Face Hugging","year":"2024","unstructured":"Hugging Face. Huggingface accelerate. https:\/\/pypi.org\/project\/ accelerate\/, 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"Apache mxnet. https:\/\/github.com\/ apache\/mxnet","author":"Foundation Apache Software","year":"2024","unstructured":"Apache Software Foundation. Apache mxnet. https:\/\/github.com\/ apache\/mxnet, 2024."},{"key":"e_1_3_2_1_15_1","first-page":"135","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. {ServerlessLLM}:{Low-Latency} serverless inference for large language models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 135--153, 2024."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"e_1_3_2_1_17_1","volume-title":"https:\/\/www.tensorflow.org\/","year":"2024","unstructured":"Google. Tensorflow. https:\/\/www.tensorflow.org\/, 2024."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575721"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605638"},{"key":"e_1_3_2_1_20_1","first-page":"469","volume-title":"Ubiquitous Computing & Communications, Big Data & Cloud Computing, Social Computing & Networking, Sustainable Computing & Communications (ISPA\/IUCC\/BDCloud\/SocialCom\/SustainCom)","author":"Gu Jing","year":"2018","unstructured":"Jing Gu, Shengbo Song, Ying Li, and Hanmei Luo. Gaiagpu: Sharing gpus in container clouds. In 2018 IEEE Intl Conf on Parallel & Distributed Processing with Applications, Ubiquitous Computing & Communications, Big Data & Cloud Computing, Social Computing & Networking, Sustainable Computing & Communications (ISPA\/IUCC\/BDCloud\/SocialCom\/SustainCom), pages 469--476. IEEE, 2018."},{"key":"e_1_3_2_1_21_1","first-page":"485","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI'19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. Tiresias: A gpu cluster manager for distributed deep learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI'19), pages 485--500, 2019."},{"key":"e_1_3_2_1_22_1","first-page":"779","article-title":"Dynamic hybrid-parallel dnn training on serverless containers","volume":"4","author":"Guo Runsheng","year":"2022","unstructured":"Runsheng Guo, Victor Guo, Antonio Kim, Josh Hildred, and Khuzaima Daudjee. Hydrozoa: Dynamic hybrid-parallel dnn training on serverless containers. Proceedings of Machine Learning and Systems, 4:779--794, 2022.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3430063"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472501"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3459240"},{"key":"e_1_3_2_1_27_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC'22)","author":"Li Jie","year":"2022","unstructured":"Jie Li, Laiping Zhao, Yanan Yang, Kunlin Zhan, and Keqiu Li. Tetris: Memory-efficient serverless inference through tensor sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC'22), 2022."},{"key":"e_1_3_2_1_28_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692, 2019."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570607"},{"key":"e_1_3_2_1_30_1","volume-title":"Meta llama2. https:\/\/llama.meta.com\/llama2\/","year":"2024","unstructured":"Meta. Meta llama2. https:\/\/llama.meta.com\/llama2\/, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"https:\/\/pytorch.org\/","year":"2024","unstructured":"Meta. Pytorch. https:\/\/pytorch.org\/, 2024."},{"key":"e_1_3_2_1_32_1","volume-title":"Pytorch ddp. https:\/\/pytorch.org\/tutorials\/intermediate\/ddp_tutorial.html","year":"2024","unstructured":"Meta. Pytorch ddp. https:\/\/pytorch.org\/tutorials\/intermediate\/ddp_tutorial.html, 2024."},{"key":"e_1_3_2_1_33_1","volume-title":"Microsoft aci. https:\/\/azure.microsoft.com\/zh-tw\/products\/container-instances","year":"2024","unstructured":"Microsoft. Microsoft aci. https:\/\/azure.microsoft.com\/zh-tw\/products\/container-instances, 2024."},{"key":"e_1_3_2_1_34_1","volume-title":"Microsoft deepspeed. https:\/\/github.com\/microsoft\/DeepSpeed","year":"2024","unstructured":"Microsoft. Microsoft deepspeed. https:\/\/github.com\/microsoft\/DeepSpeed, 2024."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2020.01.004"},{"key":"e_1_3_2_1_36_1","volume-title":"Nvidia collective communications library. https:\/\/developer. nvidia.com\/nccl","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Nvidia collective communications library. https:\/\/developer. nvidia.com\/nccl, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"Nvidia mig. https:\/\/www.nvidia.com\/en-us\/technologies\/ multi-instance-gpu\/","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Nvidia mig. https:\/\/www.nvidia.com\/en-us\/technologies\/ multi-instance-gpu\/, 2024."},{"key":"e_1_3_2_1_38_1","volume-title":"Nvidia mps. https:\/\/docs.nvidia.com\/deploy\/mps\/","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Nvidia mps. https:\/\/docs.nvidia.com\/deploy\/mps\/, 2024."},{"key":"e_1_3_2_1_39_1","volume-title":"Nvidia nsight systems. https:\/\/developer.nvidia.com\/nsightsystems","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. Nvidia nsight systems. https:\/\/developer.nvidia.com\/nsightsystems, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"Openai gpt2-large. https:\/\/huggingface.co\/openaicommunity\/ gpt2-large","author":"AI.","year":"2024","unstructured":"OpenAI. Openai gpt2-large. https:\/\/huggingface.co\/openaicommunity\/ gpt2-large, 2024."},{"key":"e_1_3_2_1_41_1","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation OSDI'21)","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R Ganger, and Eric P Xing. Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning. In 15th USENIX Symposium on Operating Systems Design and Implementation OSDI'21), 2021."},{"key":"e_1_3_2_1_42_1","first-page":"205","volume-title":"Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC'20)","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, Inigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC'20), pages 205--218, 2020."},{"key":"e_1_3_2_1_43_1","volume-title":"3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings","author":"Simonyan Karen","year":"2015","unstructured":"Karen Simonyan and Andrew Zisserman. Very deep convolutional networks for large-scale image recognition. In Yoshua Bengio and Yann LeCun, editors, 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings, 2015."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629578"},{"key":"e_1_3_2_1_45_1","first-page":"495","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI'21)","author":"Thorpe John","year":"2021","unstructured":"John Thorpe, Yifan Qiao, Jonathan Eyolfson, Shen Teng, Guanzhou Hu, Zhihao Jia, Jinliang Wei, Keval Vora, Ravi Netravali, Miryung Kim, et al. Dorylus: Affordable, scalable, and accurate gnn training with distributed cpu servers and serverless threads. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI'21), pages 495--514, 2021."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737391"},{"key":"e_1_3_2_1_47_1","first-page":"69","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI'23)","author":"Zhang Zili","year":"2023","unstructured":"BingyangWu, Zili Zhang, Zhihao Bai, Xuanzhe Liu, and Xin Jin. Transparent gpu sharing in container clouds for deep learning workloads. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI'23), pages 69--85, 2023."},{"key":"e_1_3_2_1_48_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920, 2023."},{"key":"e_1_3_2_1_49_1","first-page":"533","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. Antman: Dynamic scaling on gpu clusters for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20), pages 533--548, 2020."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3054656"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507709"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3369583.3392679"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00022"},{"key":"e_1_3_2_1_54_1","volume-title":"Slo-aware, gpu-efficient serverless inference via model swapping. arXiv preprint arXiv:2306.03622","author":"Yu Minchen","year":"2023","unstructured":"Minchen Yu, Ao Wang, Dong Chen, Haoxuan Yu, Xiaonan Luo, Zhuohao Li, Wei Wang, Ruichuan Chen, Dapeng Nie, and Haoran Yang. Faaswap: Slo-aware, gpu-efficient serverless inference via model swapping. arXiv preprint arXiv:2306.03622, 2023."},{"key":"e_1_3_2_1_55_1","first-page":"1049","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC'19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. Mark: Exploiting cloud services for cost-effective, slo-aware machine learning inference serving. In 2019 USENIX Annual Technical Conference (USENIX ATC'19), pages 1049--1062, 2019."},{"key":"e_1_3_2_1_56_1","first-page":"193","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. {DistServe}: Disaggregating prefill and decoding for goodput-optimized large language model serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 193--210, 2024."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707251","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707251","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:51:30Z","timestamp":1755787890000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707251"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":56,"alternative-id":["10.1145\/3669940.3707251","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707251","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}