{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:30:59Z","timestamp":1773588659802,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790219","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"1749-1763","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Shift Parallelism: Low-Latency, High-Throughput LLM Inference for Dynamic Workloads"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9276-5075","authenticated-orcid":false,"given":"Mert","family":"Hidayetoglu","sequence":"first","affiliation":[{"name":"Snowflake, Menlo Park, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9119-8696","authenticated-orcid":false,"given":"Aurick","family":"Qiao","sequence":"additional","affiliation":[{"name":"Snowflake, Bellevue, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2860-6859","authenticated-orcid":false,"given":"Michael","family":"Wyatt","sequence":"additional","affiliation":[{"name":"Snowflake, Menlo Park, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2153-0409","authenticated-orcid":false,"given":"Jeff","family":"Rasley","sequence":"additional","affiliation":[{"name":"Snowflake, Bellevue, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0478-8854","authenticated-orcid":false,"given":"Yuxiong","family":"He","sequence":"additional","affiliation":[{"name":"Snowflake, Bellevue, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0386-8759","authenticated-orcid":false,"given":"Samyam","family":"Rajbhandari","sequence":"additional","affiliation":[{"name":"Snowflake, Bellevue, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Amey Agrawal Nitin Kedia Ashish Panwar Jayashree Mohan Nipun Kwatra Bhargav S. Gulavani Alexey Tumanov and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. arXiv:2403.02310 [cs.LG] https:\/\/arxiv.org\/abs\/2403.02310"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_2_1_3_1","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Ponde de Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe Petroski Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William Hebgen Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew N. Carr Jan Leike Josh Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating Large Language Models Trained on Code. arXiv:2107.03374 [cs.LG] https:\/\/arxiv.org\/abs\/2107.03374"},{"key":"e_1_3_2_1_4_1","volume-title":"Generating Long Sequences with Sparse Transformers. ArXiv abs\/1904.10509","author":"Child Rewon","year":"2019","unstructured":"Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. 2019. Generating Long Sequences with Sparse Transformers. ArXiv abs\/1904.10509 (2019). https:\/\/arxiv.org\/abs\/1904.10509"},{"key":"e_1_3_2_1_5_1","unstructured":"Yunfan Gao Yun Xiong Xinyu Gao Kangxiang Jia Jinliu Pan Yuxi Bi Yi Dai Jiawei Sun Meng Wang and Haofen Wang. 2024. Retrieval-Augmented Generation for Large Language Models: A Survey. arXiv:2312.10997 [cs.CL] https:\/\/arxiv.org\/abs\/2312.10997"},{"key":"e_1_3_2_1_6_1","volume-title":"Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, and Yuxiong He.","author":"Holmes Connor","year":"2024","unstructured":"Connor Holmes, Masahiro Tanaka, Michael Wyatt, Ammar Ahmad Awan, Jeff Rasley, Samyam Rajbhandari, Reza Yazdani Aminabadi, Heyang Qin, Arash Bakhtiari, Lev Kurilenko, and Yuxiong He. 2024. DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference. arXiv:2401.08671 [cs.PF] https:\/\/arxiv.org\/abs\/2401.08671"},{"key":"e_1_3_2_1_7_1","volume-title":"Samyam Rajbhandari, and Yuxiong He.","author":"Jacobs Sam Ade","year":"2023","unstructured":"Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Shuaiwen Leon Song, Samyam Rajbhandari, and Yuxiong He. 2023. DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models. arXiv:2309.14509 [cs.LG] https:\/\/arxiv.org\/abs\/2309.14509"},{"key":"e_1_3_2_1_8_1","unstructured":"Carlos E. Jimenez John Yang AlexanderWettig Shunyu Yao Kexin Pei Ofir Press and Karthik Narasimhan. 2024. SWE-bench: Can Language Models Resolve Real-World GitHub Issues? arXiv:2310.06770 [cs.CL] https:\/\/arxiv.org\/abs\/2310.06770"},{"key":"e_1_3_2_1_9_1","volume-title":"Joseph E. Gonzalez, Hao Zhang, and Ion Stoica.","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. arXiv:2309.06180 [cs.LG] https:\/\/arxiv.org\/abs\/2309.06180"},{"key":"e_1_3_2_1_10_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs.CL] https:\/\/arxiv.org\/abs\/2005.11401"},{"key":"e_1_3_2_1_11_1","volume-title":"State-of-the-Art Software Development Agent. All Hands AI Blog (1","author":"Neubig Graham","year":"2024","unstructured":"Graham Neubig and Xingyao Wang. 2024. OpenHands CodeAct 2.1: An Open, State-of-the-Art Software Development Agent. All Hands AI Blog (1 November 2024). https:\/\/www.all-hands.dev\/blog\/openhandscodeact-21-an-open-state-of-the-art-software-development-agent"},{"key":"e_1_3_2_1_12_1","unstructured":"Junichiro Niimi. 2024. Dynamic Sentiment Analysis with Local Large Language Models using Majority Voting: A Study on Factors Affecting Restaurant Evaluation. arXiv:2407.13069 [cs.CL] https:\/\/arxiv.org\/abs\/2407.13069"},{"key":"e_1_3_2_1_13_1","volume-title":"Accessed","author":"Developer NVIDIA","year":"2023","unstructured":"NVIDIA Developer. 2023. NVIDIA TensorRT-LLM: An Open-Source Library for Accelerating LLM Inference. https:\/\/developer.nvidia.com\/blog\/optimizing-inference-on-llmswith-tensorrt-llm-now-publicly-available\/. Accessed: August 19, 2025."},{"key":"e_1_3_2_1_14_1","unstructured":"Gabriele Oliaro Zhihao Jia Daniel Campos and Aurick Qiao. 2025. SuffixDecoding: Extreme Speculative Decoding for Emerging AI Applications. arXiv:2411.04975 [cs.CL] https:\/\/arxiv.org\/abs\/2411.04975"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Aurick Qiao Zhewei Yao Samyam Rajbhandari and Yuxiong He. 2025. SwiftKV: Fast Prefill-Optimized Inference with Knowledge-Preserving Model Transformation. arXiv:2410.03960 [cs.LG] https:\/\/arxiv.org\/abs\/2410.03960","DOI":"10.18653\/v1\/2025.emnlp-main.1306"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation \u2014 A KVCache-centric Architecture for Serving LLM Chatbot. In Proceedings of the 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, USA. https:\/\/github.com\/kvcacheai\/Mooncake Dataset file: https:\/\/github.com\/kvcache-ai\/Mooncake\/blob\/main\/FAST25-release\/traces\/conversation_trace.jsonl."},{"key":"e_1_3_2_1_18_1","volume-title":"Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving. arXiv:2407.00079 [cs.DC] https:\/\/arxiv.org\/abs\/2407.00079","author":"Qin Ruoyu","year":"2024","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2024. Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving. arXiv:2407.00079 [cs.DC] https:\/\/arxiv.org\/abs\/2407.00079"},{"key":"e_1_3_2_1_19_1","unstructured":"Samyam Rajbhandari Mert Hidayetoglu Aurick Qiao Ye Wang Juncheng Yang Jeff Rasley Michael Wyatt and Yuxiong He. 2025. Arctic Inference with Shift Parallelism: Fast and Efficient Open Source Inference System for Enterprise AI. arXiv:2507.11830 [cs.DC] https:\/\/arxiv.org\/abs\/2507.11830"},{"key":"e_1_3_2_1_20_1","unstructured":"Snowflake AI Research. 2025. ArcticInference: A vLLM plugin for low-latency high-throughput LLM inference. https:\/\/github.com\/snowflakedb\/ArcticInference."},{"key":"e_1_3_2_1_21_1","unstructured":"YeWang Gabriele Oliaro Jaeseong Lee Yuxiong He Aurick Qiao and Rajbhandari Samyam. 2025. Fastest Speculative Decoding in vLLM with Arctic Inference and Arctic Training. https:\/\/www.snowflake.com\/en\/engineering-blog\/fast-speculative-decoding-vllm-arctic."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Kechi Zhang Jia Li Ge Li Xianjie Shi and Zhi Jin. 2024. CodeAgent: Enhancing Code Generation with Tool-Integrated Agent Systems for Real-World Repo-level Coding Challenges. arXiv:2401.07339 [cs.SE] https:\/\/arxiv.org\/abs\/2401.07339","DOI":"10.18653\/v1\/2024.acl-long.737"},{"key":"e_1_3_2_1_23_1","volume-title":"Sinno Jialin Pan, and Lidong Bing","author":"Zhang Wenxuan","year":"2023","unstructured":"Wenxuan Zhang, Yue Deng, Bing Liu, Sinno Jialin Pan, and Lidong Bing. 2023. Sentiment Analysis in the Era of Large Language Models: A Reality Check. arXiv:2305.15005 [cs.CL] https:\/\/arxiv.org\/abs\/2305.15005"},{"key":"e_1_3_2_1_24_1","volume-title":"Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng.","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E. Gonzalez, Clark Barrett, and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. arXiv:2312.07104 [cs.AI] https:\/\/arxiv.org\/abs\/2312.07104"},{"key":"e_1_3_2_1_25_1","unstructured":"Yinmin Zhong Shengyu Liu Junda Chen Jianbo Hu Yibo Zhu Xuanzhe Liu Xin Jin and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv:2401.09670 [cs.DC] https:\/\/arxiv.org\/abs\/2401.09670"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:02:56Z","timestamp":1773583376000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790219"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":25,"alternative-id":["10.1145\/3779212.3790219","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790219","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}