{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T00:55:47Z","timestamp":1773276947881,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3716011","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"798-813","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Past-Future Scheduler for LLM Serving under SLA Guarantees"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6024-7086","authenticated-orcid":false,"given":"Ruihao","family":"Gong","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6885-8423","authenticated-orcid":false,"given":"Shihao","family":"Bai","sequence":"additional","affiliation":[{"name":"SenseTime, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4295-9983","authenticated-orcid":false,"given":"Siyu","family":"Wu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8238-6595","authenticated-orcid":false,"given":"Yunqian","family":"Fan","sequence":"additional","affiliation":[{"name":"SenseTime, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4565-3913","authenticated-orcid":false,"given":"Zaijun","family":"Wang","sequence":"additional","affiliation":[{"name":"SenseTime, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4896-121X","authenticated-orcid":false,"given":"Xiuhong","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1101-7927","authenticated-orcid":false,"given":"Hailong","family":"Yang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7618-3275","authenticated-orcid":false,"given":"Xianglong","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Anthropic. 2023. Anthropic Claude. https:\/\/claude.ai\/"},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang et al. 2023a. Qwen Technical Report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023b. Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Baichuan 2: Open Large-scale Language Models. arXiv preprint arXiv:2309.10305","year":"2023","unstructured":"Baichuan. 2023. Baichuan 2: Open Large-scale Language Models. arXiv preprint arXiv:2309.10305 (2023). https:\/\/arxiv.org\/abs\/2309.10305"},{"key":"e_1_3_2_1_5_1","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. (2023)."},{"key":"e_1_3_2_1_6_1","unstructured":"Tri Dao Daniel Y. Fu Stefano Ermon Atri Rudra and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"e_1_3_2_1_8_1","volume-title":"GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_9_1","unstructured":"Github. 2022. Github Copilot. https:\/\/github.com\/features\/copilot"},{"key":"e_1_3_2_1_10_1","unstructured":"Google. 2023. Google Bard. https:\/\/bard.google.com\/"},{"key":"e_1_3_2_1_11_1","volume-title":"FlashDecoding: Faster Large Language Model Inference on GPUs. arXiv preprint arXiv:2311.01282","author":"Hong Ke","year":"2023","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, Kangdi Chen, Hanyu Dong, and Yu Wang. 2023. FlashDecoding: Faster Large Language Model Inference on GPUs. arXiv preprint arXiv:2311.01282 (2023)."},{"key":"e_1_3_2_1_12_1","unstructured":"HuggingFace. 2023. Text Generation Inference. https:\/\/github.com\/huggingface\/text-generation-inference."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_14_1","volume-title":"Yangtian Zi, Niklas Muennighoff, et al.","author":"Li Raymond","year":"2023","unstructured":"Raymond Li, Loubna Ben Allal, Yangtian Zi, Niklas Muennighoff, et al. 2023. StarCoder: may the source be with you!arxiv: 2305.06161 [cs.CL]"},{"key":"e_1_3_2_1_15_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023a. Improved Baselines with Visual Instruction Tuning. arxiv: 2310.03744 [cs.CV]"},{"key":"e_1_3_2_1_16_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023b. Visual Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_1_17_1","unstructured":"Microsoft. 2023. DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference. https:\/\/github.com\/microsoft\/DeepSpeed\/tree\/master\/blogs\/deepspeed-fastgen."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K16--1028"},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA. 2021. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer\/"},{"key":"e_1_3_2_1_20_1","unstructured":"NVIDIA. 2023. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM\/."},{"key":"e_1_3_2_1_21_1","unstructured":"OpenAI. 2022. OpenAI ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_22_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arxiv: 2303.08774 [cs.CL]"},{"key":"e_1_3_2_1_23_1","unstructured":"Adam Paszke Sam Gross Soumith Chintala Gregory Chanan Edward Yang Zachary DeVito Zeming Lin Alban Desmaison Luca Antiga and Adam Lerer. 2017. Automatic differentiation in PyTorch. (2017)."},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HkAClQgA-","author":"Paulus Romain","year":"2018","unstructured":"Romain Paulus, Caiming Xiong, and Richard Socher. 2018. A Deep Reinforced Model for Abstractive Summarization. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HkAClQgA-"},{"key":"e_1_3_2_1_25_1","volume-title":"Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving.","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng,, and Xinran Xu. 2024. Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving. (2024). https:\/\/arxiv.org\/abs\/2407.00079"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17--1099"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 31094--31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher Re, Ion Stoica, and Ce Zhang. 2023. FlexGen: high-throughput generative inference of large language models with a single GPU. In International Conference on Machine Learning. PMLR, 31094--31116."},{"key":"e_1_3_2_1_28_1","volume-title":"Meet Shah, Yu Jiang, Xinlei Chen, Devi Parikh, and Marcus Rohrbach. 2019. Towards VQA Models That Can Read. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 8317--8326","author":"Singh Amanpreet","unstructured":"Amanpreet Singh, Vivek Natarjan, Meet Shah, Yu Jiang, Xinlei Chen, Devi Parikh, and Marcus Rohrbach. 2019. Towards VQA Models That Can Read. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 8317--8326."},{"key":"e_1_3_2_1_29_1","unstructured":"01-AI Team. 2023a. Yi LLM. https:\/\/github.com\/01-ai\/Yi."},{"key":"e_1_3_2_1_30_1","unstructured":"InternLM Team. 2023b. InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities. https:\/\/github.com\/InternLM\/InternLM-techreport."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_32_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023a. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_33_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi et al. 2023b. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv: 2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_34_1","volume-title":"Amelie Chi Zhou, and Xiaowen Chu","author":"Wang Yuxin","year":"2024","unstructured":"Yuxin Wang, Yuhan Chen, Zeyu Li, Zhenheng Tang, Rui Guo, Xin Wang, Qiang Wang, Amelie Chi Zhou, and Xiaowen Chu. 2024. Towards Efficient and Reliable LLM Serving: A Real-World Workload Study. arxiv: 2401.17644 [cs.DC]"},{"key":"e_1_3_2_1_35_1","volume-title":"Sebastian Borgeaud, Dani Yogatama, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Yi Tay, Rishi Bommasani, Barret Zoph Colin Raffel, Sebastian Borgeaud, Dani Yogatama, et al. 2022a. Emergent Abilities of Large Language Models. Transactions on Machine Learning Research (2022). https:\/\/openreview.net\/forum?id=yzkSU5zdwD Survey Certification."},{"key":"e_1_3_2_1_36_1","volume-title":"Fei Xia, Ed H. Chi, Quoc V Le, and Denny Zhou.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, brian ichter, Fei Xia, Ed H. Chi, Quoc V Le, and Denny Zhou. 2022b. Chain of Thought Prompting Elicits Reasoning in Large Language Models. In Advances in Neural Information Processing Systems, Alice H. Oh, Alekh Agarwal, Danielle Belgrave, and Kyunghyun Cho (Eds.). https:\/\/openreview.net\/forum?id=_VjQlMeSB_J"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.102"},{"key":"e_1_3_2_1_38_1","volume-title":"Service-level agreement. (2023). https:\/\/en.wikipedia.org\/wiki\/Service-level_agreement [Online","year":"2023","unstructured":"Wikipedia. 2023. Service-level agreement. (2023). https:\/\/en.wikipedia.org\/wiki\/Service-level_agreement [Online; accessed 24-October-2023]."},{"key":"e_1_3_2_1_39_1","volume-title":"Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, et al.","year":"2023","unstructured":"BigScience Workshop, Teven Le Scao, Angela Fan, Christopher Akiki, et al. 2023. BLOOM: A 176B-Parameter Open-Access Multilingual Language Model. arxiv: 2211.05100 [cs.CL]"},{"key":"e_1_3_2_1_40_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_41_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716011","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3716011","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:10:02Z","timestamp":1755774602000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":41,"alternative-id":["10.1145\/3676641.3716011","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3716011","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}