{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:09:41Z","timestamp":1765465781316,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","funder":[{"name":"National Science and Technology Major Project","award":["2022ZD0115200"],"award-info":[{"award-number":["2022ZD0115200"]}]},{"name":"Northern IC Technology Innovation Center &#x28;Beijing&#x29; Co., Ltd","award":["QYJS20232801B"],"award-info":[{"award-number":["QYJS20232801B"]}]},{"name":"Natural Science Foundation of China","award":["62125403&#x3b; 92464302&#x3b; 92164301&#x3b; U24B20164"],"award-info":[{"award-number":["62125403&#x3b; 92464302&#x3b; 92164301&#x3b; U24B20164"]}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["None"],"award-info":[{"award-number":["None"]}]},{"name":"Natural Science Foundation of Jiangsu Province Basic Research Program","award":["BK20243042"],"award-info":[{"award-number":["BK20243042"]}]},{"name":"Beijing National Research Center for Information Science and Technology","award":["None"],"award-info":[{"award-number":["None"]}]},{"name":"Beijing Advanced Innovation Center for Integrated Circuits","award":["None"],"award-info":[{"award-number":["None"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3760250.3762228","type":"proceedings-article","created":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:06:36Z","timestamp":1765465596000},"page":"314-329","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["XY-Serve: End-to-End Versatile Production Serving for Dynamic LLM Workloads"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5289-685X","authenticated-orcid":false,"given":"Mingcong","family":"Song","sequence":"first","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6038-3709","authenticated-orcid":false,"given":"Xinru","family":"Tang","sequence":"additional","affiliation":[{"name":"Tsinghua University, BNRist, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6937-9057","authenticated-orcid":false,"given":"Fengfan","family":"Hou","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3806-6499","authenticated-orcid":false,"given":"Jing","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1991-2253","authenticated-orcid":false,"given":"Wei","family":"Wei","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3826-2180","authenticated-orcid":false,"given":"Yipeng","family":"Ma","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3550-5636","authenticated-orcid":false,"given":"Runqiu","family":"Xiao","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7511-6624","authenticated-orcid":false,"given":"Hongjie","family":"Si","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4379-694X","authenticated-orcid":false,"given":"Dingcheng","family":"Jiang","sequence":"additional","affiliation":[{"name":"Tsinghua University, BNRist, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2309-572X","authenticated-orcid":false,"given":"Shouyi","family":"Yin","sequence":"additional","affiliation":[{"name":"Tsinghua University, BNRist, Beijing, China and Shanghai AI Laboratory, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6942-4395","authenticated-orcid":false,"given":"Yang","family":"Hu","sequence":"additional","affiliation":[{"name":"Tsinghua University, BNRist, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3176-7572","authenticated-orcid":false,"given":"Guoping","family":"Long","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n.d.]. Accelerating Llama3 FP8 Inference with Triton Kernels. https:\/\/pytorch.org\/blog\/accelerating-llama3\/?hss_channel=lcp-78618366\/."},{"key":"e_1_3_2_1_2_1","unstructured":"[n.d.]. Deep Dive on CUTLASS Ping-Pong GEMM Kernel | PyTorch. https:\/\/pytorch.org\/blog\/cutlass-ping-pong-gemm-kernel\/."},{"key":"e_1_3_2_1_3_1","unstructured":"[n.d.]. Faster Text Generation with Self-Speculative Decoding. https:\/\/huggingface.co\/blog\/layerskip."},{"key":"e_1_3_2_1_4_1","unstructured":"[n.d.]. [Feature] integrate FlashMLA \u00b7 Issue 4384 \u00b7 sgl-project\/sglang \u2014 github.com. https:\/\/github.com\/sgl-project\/sglang\/issues\/4384. [Accessed 08-07--2025]."},{"key":"e_1_3_2_1_5_1","unstructured":"[n.d.]. Flash-Decoding for Long-Context Inference. https:\/\/pytorch.org\/blog\/flash-decoding\/."},{"key":"e_1_3_2_1_6_1","unstructured":"[n.d.]. Fused Attention \u2014 Triton documentation \u2014 tritonlang.org. https:\/\/triton-lang.org\/main\/getting-started\/tutorials\/06-fused-attention.html. [Accessed 14-07--2025]."},{"key":"e_1_3_2_1_7_1","unstructured":"[n.d.]. GitHub - Ascend\/Pytorch: Ascend PyTorch Adapter (Torch_npu). Mirror of https:\/\/gitee.com\/ascend\/pytorch. https:\/\/github.com\/Ascend\/pytorch."},{"key":"e_1_3_2_1_8_1","unstructured":"[n.d.]. GitHub - Azure\/AzurePublicDataset: Microsoft Azure Traces \u2014 github.com. https:\/\/github.com\/Azure\/AzurePublicDataset. [Accessed 12-03--2025]."},{"key":"e_1_3_2_1_9_1","unstructured":"[n.d.]. GitHub - HPMLL\/BurstGPT: A ChatGPT(GPT-3.5) & GPT-4 Workload Trace to Optimize LLM Serving Systems \u2014 github.com. https:\/\/github.com\/HPMLL\/BurstGPT. [Accessed 12-03--2025]."},{"key":"e_1_3_2_1_10_1","unstructured":"[n.d.]. GitHub - HPMLL\/BurstGPT: A ChatGPT(GPT-3.5) & GPT-4 Workload Trace to Optimize LLM Serving Systems \u2014 github.com. https:\/\/github.com\/HPMLL\/BurstGPT. [Accessed 27-07--2025]."},{"key":"e_1_3_2_1_11_1","unstructured":"[n.d.]. GitHub - kvcache-ai\/Mooncake: Mooncake is the serving platform for Kimi a leading LLM service provided by Moonshot AI. \u2014 github.com. https:\/\/github.com\/kvcache-ai\/Mooncake. [Accessed 26-07--2025]."},{"key":"e_1_3_2_1_12_1","unstructured":"[n.d.]. GitHub - Pybind\/Pybind11: Seamless Operability between C11 and Python. https:\/\/github.com\/pybind\/pybind11."},{"key":"e_1_3_2_1_13_1","unstructured":"[n.d.]. Introducing Tile-Based Programming in Warp 1.5.0 | NVIDIA Technical Blog \u2014 developer.nvidia.com. https:\/\/developer.nvidia.com\/blog\/introducing-tile-based-programming-in-warp-1--5-0\/. [Accessed 08-07--2025]."},{"key":"e_1_3_2_1_14_1","unstructured":"[n.d.]. Introduction \u2014 vLLM. https:\/\/docs.vllm.ai\/en\/latest\/automatic_prefix_caching\/apc.html."},{"key":"e_1_3_2_1_15_1","unstructured":"[n.d.]. Matrix Multiplication \u2014 Triton documentation \u2014 tritonlang.org. https:\/\/triton-lang.org\/main\/getting-started\/tutorials\/03-matrix-multiplication.html. [Accessed 14-07--2025]."},{"key":"e_1_3_2_1_16_1","unstructured":"[n.d.]. Support Page Size > 1 for FA3 by hebiao064 \u00b7 Pull Request 4832\u00b7 sgl-project\/sglang \u2014 github.com. https:\/\/github.com\/sgl-project\/sglang\/pull\/4832. [Accessed 08-07--2025]."},{"key":"e_1_3_2_1_17_1","unstructured":"[n.d.]. Torch.Nn-Native PyTorch APIs-PyTorch2.1-API List-PyTorch Network Model Porting and Training Guide-Model Development (PyTorch)-7.0.0-CANN Commercial Edition-Ascend Documentation-Ascend Community. https:\/\/www.hiascend.com\/document\/detail\/en\/canncommercial\/700\/modeldevpt\/ptmigr\/ptaoplist_000006.html."},{"key":"e_1_3_2_1_18_1","unstructured":"[n.d.]. Torch_npu.Npu_incre_flash_attention. https:\/\/www.hiascend.com\/doc_center\/source\/zh\/Pytorch\/60RC2\/apiref\/apilist\/ptaoplist_000788.html."},{"key":"e_1_3_2_1_19_1","unstructured":"[n.d.]. Torch_npu.Npu_prompt_flash_attention. https:\/\/www.hiascend.com\/doc_center\/source\/zh\/CANNCommunityEdition\/80RC1alpha001\/apiref\/fmkadptapi\/ptaoplist_000142.html."},{"key":"e_1_3_2_1_20_1","unstructured":"[n.d.]. vLLM Nightly-Benchmarks. https:\/\/github.com\/vllm-project\/vllm\/tree\/main\/.buildkite\/nightly-benchmarks."},{"key":"e_1_3_2_1_21_1","unstructured":"[n.d.]. vLLM support for Ascend NPU. https:\/\/github.com\/vllmproject\/vllm\/pull\/8054."},{"key":"e_1_3_2_1_22_1","unstructured":"2020. Optimizing Compute Shaders for L2 Locality Using Thread- Group ID Swizzling. https:\/\/developer.nvidia.com\/blog\/optimizingcompute- shaders-for-l2-locality-using-thread-group-id-swizzling\/."},{"key":"e_1_3_2_1_23_1","volume-title":"Mnemosyne: Parallelization strategies for efficiently serving multi-million context length llm inference requests without approximations. arXiv preprint arXiv:2409.17264","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Junda Chen, \u00cd\u00f1igo Goiri, Ramachandran Ramjee, Chaojie Zhang, Alexey Tumanov, and Esha Choukse. 2024. Mnemosyne: Parallelization strategies for efficiently serving multi-million context length llm inference requests without approximations. arXiv preprint arXiv:2409.17264 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 117--134."},{"key":"e_1_3_2_1_25_1","volume-title":"Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, and Ramachandran Ramjee. 2023. Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495883"},{"key":"e_1_3_2_1_27_1","volume-title":"Medusa: Simple llm inference acceleration framework with multiple decoding heads","author":"Cai Tianle","year":"2024","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, Jason D Lee, Deming Chen, and Tri Dao. [n.d.]. Medusa: Simple llm inference acceleration framework with multiple decoding heads, 2024. URL https:\/\/arxiv.org\/abs\/2401.10774 ([n.d.])."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_29_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning (2023). arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_31_1","first-page":"16344","article-title":"2022. Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"e_1_3_2_1_34_1","volume-title":"Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, et al.","author":"Holmes Connor","year":"2024","unstructured":"Connor Holmes, Masahiro Tanaka, Michael Wyatt, Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, et al. 2024. Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeed-inference. arXiv preprint arXiv:2401.08671 (2024)."},{"key":"e_1_3_2_1_35_1","unstructured":"Cunchen Hu Heyang Huang Liangliang Xu Xusheng Chen Jiang Xu Shuang Chen Hao Feng Chenxi Wang Sa Wang Yungang Bao et al. 2024. Inference without interference: Disaggregate llm inference for mixed downstream workloads. arXiv preprint arXiv:2401.11181 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Specserve: Efficient and slo-aware large language model serving with adaptive speculative decoding. arXiv preprint arXiv:2503.05096","author":"Huang Kaiyu","year":"2025","unstructured":"Kaiyu Huang, Hao Wu, Zhubo Shi, Han Zou, Minchen Yu, and Qingjiang Shi. 2025. Specserve: Efficient and slo-aware large language model serving with adaptive speculative decoding. arXiv preprint arXiv:2503.05096 (2025)."},{"key":"e_1_3_2_1_37_1","unstructured":"Kevin Skadron Jiayuan Meng. [n.d.]. Dynamic Warp Subdivision for Non-Speculative Runahead SIMT Gather. https:\/\/www.nvidia.com\/content\/GTC\/posters\/03_Meng_Dynamic_Warp_Subdivision.pdf."},{"key":"e_1_3_2_1_38_1","unstructured":"Yibo Jin Tao Wang Huimin Lin Mingyang Song Peiyang Li Yipeng Ma Yicheng Shan Zhengfan Yuan Cailong Li Yajing Sun et al. 2024. P\/d-serve: Serving disaggregated large language model at scale. arXiv preprint arXiv:2408.08147 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"POD-Attention: Unlocking Full Prefill-Decode Overlap for Faster LLM Inference. arXiv preprint arXiv:2410.18038","author":"Kamath Aditya K","year":"2024","unstructured":"Aditya K Kamath, Ramya Prabhu, Jayashree Mohan, Simon Peter, Ramachandran Ramjee, and Ashish Panwar. 2024. POD-Attention: Unlocking Full Prefill-Decode Overlap for Faster LLM Inference. arXiv preprint arXiv:2410.18038 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning. PMLR, 19274--19286."},{"key":"e_1_3_2_1_43_1","volume-title":"Eagle: Speculative sampling requires rethinking feature uncertainty. arXiv preprint arXiv:2401.15077","author":"Li Yuhui","year":"2024","unstructured":"Yuhui Li, Fangyun Wei, Chao Zhang, and Hongyang Zhang. 2024. Eagle: Speculative sampling requires rethinking feature uncertainty. arXiv preprint arXiv:2401.15077 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00071"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875654"},{"key":"e_1_3_2_1_46_1","unstructured":"Haoran Lin Xianzhi Yu Kang Zhao Lu Hou Zongyuan Zhan Stanislav Kamenev Han Bao Ting Hu Mingkai Wang Qixin Chang et al. 2024. FastAttention: Extend FlashAttention2 to NPUs and Low-resource GPUs. arXiv preprint arXiv:2410.16663 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv preprint arXiv:2406.14066","author":"Liu Xiaoxuan","year":"2024","unstructured":"Xiaoxuan Liu, Cade Daniel, Langxiang Hu, Woosuk Kwon, Zhuohan Li, Xiangxi Mo, Alvin Cheung, Zhijie Deng, Ion Stoica, and Hao Zhang. 2024. Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv preprint arXiv:2406.14066 (2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"Xiaoxuan Liu Cade Daniel Langxiang Hu Woosuk Kwon Zhuohan Li Xiangxi Mo Alvin Cheung Zhijie Deng Ion Stoica and Hao Zhang. 2024. Optimizing Speculative Decoding for Serving Large Language Models Using Goodput. arXiv:2406.14066 [cs.AI] https:\/\/arxiv.org\/abs\/2406.14066"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_50_1","volume-title":"Mooncake: A kvcache-centric disaggregated architecture for llm serving. arXiv preprint arXiv:2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2024. Mooncake: A kvcache-centric disaggregated architecture for llm serving. arXiv preprint arXiv:2407.00079 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Flashattention-3: Fast and accurate attention with asynchrony and low-precision. arXiv preprint arXiv:2407.08608","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. Flashattention-3: Fast and accurate attention with asynchrony and low-precision. arXiv preprint arXiv:2407.08608 (2024)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_53_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_54_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et al. 2024. Qwen2 technical report. arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.623"},{"key":"e_1_3_2_1_56_1","volume-title":"FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving. In Eighth Conference on Machine Learning and Systems. https:\/\/openreview.net\/forum?id=RXPofAsL8F","author":"Ye Zihao","year":"2025","unstructured":"Zihao Ye, Lequn Chen, Ruihang Lai, Wuwei Lin, Yineng Zhang, Stephanie Wang, Tianqi Chen, Baris Kasikci, Vinod Grover, Arvind Krishnamurthy, and Luis Ceze. 2025. FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving. In Eighth Conference on Machine Learning and Systems. https:\/\/openreview.net\/forum?id=RXPofAsL8F"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582047"},{"key":"e_1_3_2_1_58_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for Transformer-Based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_59_1","volume-title":"Lossless large language model acceleration via self-speculative decoding. arXiv preprint arXiv:2309.08168","author":"Zhang Jun","year":"2023","unstructured":"Jun Zhang, Jue Wang, Huan Li, Lidan Shou, Ke Chen, Gang Chen, and Sharad Mehrotra. 2023. Draft & verify: Lossless large language model acceleration via self-speculative decoding. arXiv preprint arXiv:2309.08168 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 6344--6355","author":"Zhao Yao","year":"2024","unstructured":"Yao Zhao, Zhitian Xie, Chen Liang, Chenyi Zhuang, and Jinjie Gu. 2024. Lookahead: An inference acceleration framework for large language model with lossless generation accuracy. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 6344--6355."},{"key":"e_1_3_2_1_61_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. [n.d.]. Sglang: Efficient execution of structured language model programs","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. [n.d.]. Sglang: Efficient execution of structured language model programs, 2024. URL https:\/\/arxiv.org\/abs\/2312.07104 ([n.d.])."},{"key":"e_1_3_2_1_62_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193--210."}],"event":{"name":"ASPLOS '26:31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3760250.3762228","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:06:47Z","timestamp":1765465607000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3760250.3762228"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,11]]},"references-count":62,"alternative-id":["10.1145\/3760250.3762228","10.1145\/3760250"],"URL":"https:\/\/doi.org\/10.1145\/3760250.3762228","relation":{},"subject":[],"published":{"date-parts":[[2025,12,11]]},"assertion":[{"value":"2025-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}