{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T07:12:28Z","timestamp":1779174748500,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2212580"],"award-info":[{"award-number":["2212580"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3715996","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"897-912","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["POD-A\n            <scp>ttention<\/scp>\n            : Unlocking Full Prefill-Decode Overlap for Faster LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6565-3764","authenticated-orcid":false,"given":"Aditya K.","family":"Kamath","sequence":"first","affiliation":[{"name":"University of Washington, Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7266-5522","authenticated-orcid":false,"given":"Ramya","family":"Prabhu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5260-3203","authenticated-orcid":false,"given":"Jayashree","family":"Mohan","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4748-8524","authenticated-orcid":false,"given":"Simon","family":"Peter","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0007-6040","authenticated-orcid":false,"given":"Ramachandran","family":"Ramjee","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0621-4412","authenticated-orcid":false,"given":"Ashish","family":"Panwar","sequence":"additional","affiliation":[{"name":"Microsoft Research, Bengaluru, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2022. FlashAttention. https:\/\/github.com\/Dao-AILab\/flash-attention."},{"key":"e_1_3_2_1_2_1","unstructured":"2023. TensorRT-LLM: A TensorRT Toolbox for Optimized Large Language Model Inference. https:\/\/github.com\/NVIDIA\/TensorRT-LLM."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. AI Infrastructure Spending Forecast to Be Over a TrillionDollars Over the Next Five Years. https:\/\/www.delloro.com\/news\/ai-infrastructure-spending-forecast-to-be-over-a-trillion-dollars-over-the-next-five-years\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. ccdv\/arxiv-summarization. https:\/\/huggingface.co\/datasets\/ccdv\/arxiv-summarization."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. CUDA C Programming Guide - Hardware Implementation. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/#hardware-implementation."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Llama-2-7B. https:\/\/huggingface.co\/meta-llama\/Llama-2-7b-hf."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. Merged PR 1865: Critical bug fixes related to sampling. https:\/\/github.com\/microsoft\/sarathi-serve\/commit\/50e59c51b85b1157e001bb8ee7a1b049d551955d#diff- 450b0de5cce8a2341140afed859dc5dd3b913fa6e62d27988fccefeacc7b33ec."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. Meta-Llama-3-8B. https:\/\/huggingface.co\/meta-llama\/Meta-Llama-3-8B."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. NVIDIA Multi-Instance GPU. https:\/\/www.nvidia.com\/en-us\/technologies\/multi-instance-gpu\/."},{"key":"e_1_3_2_1_10_1","unstructured":"2024. NVIDIA Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. NVIDIA\/cutlass: CUDA Templates for Linear Algebra Subroutines. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_1_12_1","unstructured":"2024. Parallel Thread Execution ISA Version 8.5 - Cooperative Thread Arrays. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/#cooperative-thread-arrays."},{"key":"e_1_3_2_1_13_1","unstructured":"2024. Parallel Thread Execution ISA Version 8.5 - Special Registers:%smid. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index. html#special-registers-smid."},{"key":"e_1_3_2_1_14_1","unstructured":"2024. Performance and Tuning. https:\/\/docs.vllm.ai\/en\/v0.6.0\/models\/performance.html."},{"key":"e_1_3_2_1_15_1","unstructured":"2024. Sarathi-Serve. https:\/\/github.com\/microsoft\/sarathi-serve."},{"key":"e_1_3_2_1_16_1","volume-title":"The State of AI Infrastructure at Scale","year":"2024","unstructured":"2024. The State of AI Infrastructure at Scale 2024. https:\/\/ai-infrastructure.org\/wp-content\/uploads\/2024\/03\/The-State-of-AI-Infrastructure-at-Scale-2024.pdf."},{"key":"e_1_3_2_1_17_1","unstructured":"2024. Unify the kernel used in flash attention backend. https:\/\/github.com\/vllm-project\/vllm\/pull\/6052."},{"key":"e_1_3_2_1_18_1","unstructured":"2024. Upstream Chunked Prefill. https:\/\/github.com\/vllm-project\/vllm\/issues\/3130."},{"key":"e_1_3_2_1_19_1","unstructured":"2024. vLLM: Easy fast and cheap LLM serving for everyone. https:\/\/github.com\/vllm-project\/vllm."},{"key":"e_1_3_2_1_20_1","unstructured":"2024. Yi-6B-200K. https:\/\/huggingface.co\/01-ai\/Yi-6B-200K."},{"key":"e_1_3_2_1_21_1","volume-title":"Mnemosyne: Parallelization Strategies for Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arxiv: 2409.17264 [cs.LG] https:\/\/arxiv.org\/abs\/2409.17264","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Junda Chen, \u00cd\u00f1igo Goiri, Ramachandran Ramjee, Chaojie Zhang, Alexey Tumanov, and Esha Choukse. 2024a. Mnemosyne: Parallelization Strategies for Efficiently Serving Multi-Million Context Length LLM Inference Requests Without Approximations. arxiv: 2409.17264 [cs.LG] https:\/\/arxiv.org\/abs\/2409.17264"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of The Seventh Annual Conference on Machine Learning and Systems, 2024","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Jayashree Mohan, Ashish Panwar, Nipun Kwatra, Bhargav S Gulavani, Ramachandran Ramjee, and Alexey Tumanov. 2024b. Vidur: A Large-Scale Simulation Framework For LLM Inference. Proceedings of The Seventh Annual Conference on Machine Learning and Systems, 2024, Santa Clara (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024c. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 117--134. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_2_1_24_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arxiv: 2308.16369 [cs.LG] https:\/\/arxiv.org\/abs\/2308.16369","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S. Gulavani, and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arxiv: 2308.16369 [cs.LG] https:\/\/arxiv.org\/abs\/2308.16369"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_2_1_26_1","unstructured":"Jeremy Appleyard and Scott Yokim. 2017. Programming Tensor Cores in CUDA 9. https:\/\/developer.nvidia.com\/blog\/programming-tensor-cores-cuda-9\/."},{"key":"e_1_3_2_1_27_1","unstructured":"Shiyi Cao Shu Liu Tyler Griggs Peter Schafhalter Xiaoxuan Liu Ying Sheng Joseph E. Gonzalez Matei Zaharia and Ion Stoica. 2024. MoE-Lightning: High-Throughput MoE Inference on Memory-constrained GPUs. arxiv: 2411.11217 [cs.DC] https:\/\/arxiv.org\/abs\/2411.11217"},{"key":"e_1_3_2_1_28_1","volume-title":"FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arxiv: 2406.06858 [cs.LG] https:\/\/arxiv.org\/abs\/2406.06858","author":"Chang Li-Wen","year":"2024","unstructured":"Li-Wen Chang, Wenlei Bao, Qi Hou, Chengquan Jiang, Ningxin Zheng, Yinmin Zhong, Xuanrun Zhang, Zuquan Song, Ziheng Jiang, Haibin Lin, Xin Jin, and Xin Liu. 2024. FLUX: Fast Software-based Communication Overlap On GPUs Through Kernel Fusion. arxiv: 2406.06858 [cs.LG] https:\/\/arxiv.org\/abs\/2406.06858"},{"key":"e_1_3_2_1_29_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R'e. 2022. FLASHATTENTION: fast and memory-efficient exact attention with IO-awareness. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 1189, 16 pages."},{"key":"e_1_3_2_1_31_1","unstructured":"Tri Dao Daniel Haziza Francisco Massa and Grigory Sizov. 2023. Flash-Decoding for long-context inference. https:\/\/crfm.stanford.edu\/2023\/10\/12\/flashdecoding.html."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339596"},{"key":"e_1_3_2_1_33_1","volume-title":"Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, and Yuxiong He.","author":"Holmes Connor","year":"2024","unstructured":"Connor Holmes, Masahiro Tanaka, Michael Wyatt, Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, and Yuxiong He. 2024. DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference. arxiv: 2401.08671 [cs.PF] https:\/\/arxiv.org\/abs\/2401.08671"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.)","volume":"6","author":"Hong Ke","year":"2024","unstructured":"Ke Hong, Guohao Dai, Jiaming Xu, Qiuli Mao, Xiuhong Li, Jun Liu, kangdi chen, Yuhan Dong, and Yu Wang. 2024. FlashDecoding: Faster Large Language Model Inference with Asynchronization, Flat GEMM Optimization, and Heuristics. In Proceedings of Machine Learning and Systems, P. Gibbons, G. Pekhimenko, and C. De Sa (Eds.), Vol. 6. 148--161. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2024\/file\/5321b1dabcd2be188d796c21b733e8c7-Paper-Conference.pdf"},{"key":"e_1_3_2_1_35_1","unstructured":"Cunchen Hu Heyang Huang Liangliang Xu Xusheng Chen Jiang Xu Shuang Chen Hao Feng Chenxi Wang Sa Wang Yungang Bao Ninghui Sun and Yizhou Shan. 2024. Inference without Interference: Disaggregate LLM Inference for Mixed Downstream Workloads. arxiv: 2401.11181 [cs.DC] https:\/\/arxiv.org\/abs\/2401.11181"},{"key":"e_1_3_2_1_36_1","volume-title":"Toward Efficient Inference for Mixture of Experts. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=stXtBqyTWX","author":"Huang Haiyang","year":"2024","unstructured":"Haiyang Huang, Newsha Ardalani, Anna Sun, Liu Ke, Shruti Bhosale, Hsien-Hsin S. Lee, Carole-Jean Wu, and Benjamin Lee. 2024. Toward Efficient Inference for Mixture of Experts. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=stXtBqyTWX"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO57630.2024.10444873"},{"key":"e_1_3_2_1_39_1","unstructured":"Hao Kang Srikant Bharadwaj James Hensman Tushar Krishna Victor Ruhle and Saravan Rajmohan. 2024. TurboAttention: Efficient Attention Approximation For High Throughputs LLMs. arxiv: 2412.08585 [cs.LG] https:\/\/arxiv.org\/abs\/2412.08585"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600212.2600228"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO53902.2022.9741270"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2014.2313342"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577479"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451160"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707256"},{"key":"e_1_3_2_1_48_1","volume-title":"Lean Attention: Hardware-Aware Scalable Attention Mechanism for the Decode-Phase of Transformers. arxiv: 2405.10480 [cs.AR] https:\/\/arxiv.org\/abs\/2405.10480","author":"Sanovar Rya","year":"2024","unstructured":"Rya Sanovar, Srikant Bharadwaj, Renee St. Amant, Victor R\u00fchle, and Saravan Rajmohan. 2024. Lean Attention: Hardware-Aware Scalable Attention Mechanism for the Decode-Phase of Transformers. arxiv: 2405.10480 [cs.AR] https:\/\/arxiv.org\/abs\/2405.10480"},{"key":"e_1_3_2_1_49_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=tVConYid20","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=tVConYid20"},{"key":"e_1_3_2_1_50_1","volume-title":"Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E. Gonzalez, and Ion Stoica. 2024. Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 965--988. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sheng"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695964"},{"key":"e_1_3_2_1_52_1","unstructured":"Jovan Stojkovic Chaojie Zhang \u00cd\u00f1igo Goiri Josep Torrellas and Esha Choukse. 2024. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. arxiv: 2408.00741 [cs.AI] https:\/\/arxiv.org\/abs\/2408.00741"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.21"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751213"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695948"},{"key":"e_1_3_2_1_58_1","unstructured":"Bingyang Wu Yinmin Zhong Zili Zhang Gang Huang Xuanzhe Liu and Xin Jin. 2023. Fast Distributed Inference Serving for Large Language Models. arxiv: 2305.05920 [cs.LG] https:\/\/arxiv.org\/abs\/2305.05920"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.19"},{"key":"e_1_3_2_1_60_1","unstructured":"Zihao Ye Lequn Chen Ruihang Lai Wuwei Lin Yineng Zhang Stephanie Wang Tianqi Chen Baris Kasikci Vinod Grover Arvind Krishnamurthy and Luis Ceze. 2025. FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving. arxiv: 2501.01005 [cs.DC] https:\/\/arxiv.org\/abs\/2501.01005"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018754"},{"key":"e_1_3_2_1_62_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521--538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2022.3214088"},{"key":"e_1_3_2_1_64_1","volume-title":"SGLang: Efficient Execution of Structured Language Model Programs. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=VqkAKQibpq","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=VqkAKQibpq"},{"key":"e_1_3_2_1_65_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). USENIX Association, Santa Clara, CA, 193--210. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/zhong-yinmin"},{"key":"e_1_3_2_1_66_1","unstructured":"Kan Zhu Yilong Zhao Liangyu Zhao Gefei Zuo Yile Gu Dedong Xie Yufei Gao Qinyu Xu Tian Tang Zihao Ye Keisuke Kamahori Chien-Yu Lin Stephanie Wang Arvind Krishnamurthy and Baris Kasikci. 2024. NanoFlow: Towards Optimal Large Language Model Serving Throughput. arxiv: 2408.12757 [cs.DC] https:\/\/arxiv.org\/abs\/2408.12757"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3715996","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3715996","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:06:12Z","timestamp":1755774372000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3715996"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":66,"alternative-id":["10.1145\/3676641.3715996","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3715996","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}