{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:51:59Z","timestamp":1772905919386,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2400014"],"award-info":[{"award-number":["2400014"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["1907765"],"award-info":[{"award-number":["1907765"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676659","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T12:53:56Z","timestamp":1744203236000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["ALISE: Accelerating Large Language Model Serving with Speculative Scheduling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9437-9788","authenticated-orcid":false,"given":"Youpeng","family":"Zhao","sequence":"first","affiliation":[{"name":"University of Central Florida, Orlando, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0926-4761","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Central Florida, Orlando, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Meta AI. 2024. Introducing Llama 3.1: Our most capable models to date. https:\/\/ai.meta.com\/blog\/meta-llama-3-1\/"},{"key":"e_1_3_2_1_2_1","volume-title":"Shivanshu Purohit, Usvsn Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar van der Wal.","author":"Biderman Stella Rose","year":"2023","unstructured":"Stella Rose Biderman, Hailey Schoelkopf, Quentin G. Anthony, Herbie Bradley, Kyle O'Brien, Eric Hallahan, Mohammad Aflah Khan, Shivanshu Purohit, Usvsn Sai Prashanth, Edward Raff, Aviya Skowron, Lintang Sutawika, and Oskar van der Wal. 2023. Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling."},{"key":"e_1_3_2_1_3_1","volume-title":"Language Models are Few-Shot Learners. ArXiv abs\/2005.14165","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, T. J. Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeff Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. ArXiv abs\/2005.14165 (2020)."},{"key":"e_1_3_2_1_4_1","volume-title":"Generating Long Sequences with Sparse Transformers. ArXiv abs\/1904.10509","author":"Child Rewon","year":"2019","unstructured":"Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. 2019. Generating Long Sequences with Sparse Transformers. ArXiv abs\/1904.10509 (2019)."},{"key":"e_1_3_2_1_5_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J Franklin, Joseph E Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). 613--627."},{"key":"e_1_3_2_1_6_1","volume-title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. ArXiv abs\/2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. ArXiv abs\/2307.08691 (2023)."},{"key":"e_1_3_2_1_7_1","first-page":"16344","article-title":"2022. Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv abs\/1810.04805 (2019)."},{"key":"e_1_3_2_1_9_1","volume-title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. International Conference on Learning Representations (ICLR)","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2023. GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. International Conference on Learning Representations (ICLR) (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"USENIX Symposium on Operating Systems Design and Implementation.","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In USENIX Symposium on Operating Systems Design and Implementation."},{"key":"e_1_3_2_1_11_1","volume-title":"Priority queues. The Art of Multiprocessor Programming","author":"Herlihy Maurice","year":"2019","unstructured":"Maurice Herlihy, Nir Shavit, Victor Luchangco, and Michael Spear.2019. Priority queues. The Art of Multiprocessor Programming (2019)."},{"key":"e_1_3_2_1_12_1","volume-title":"S3: Increasing GPU Utilization during Generative Inference for Higher Throughput. Advances in Neural Information Processing Systems 36","author":"Jin Yunho","year":"2023","unstructured":"Yunho Jin, Chun-Feng Wu, David Brooks, and Gu-Yeon Wei. 2023. S3: Increasing GPU Utilization during Generative Inference for Higher Throughput. Advances in Neural Information Processing Systems 36 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_14_1","volume-title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. ArXiv abs\/2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. 2023. AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. ArXiv abs\/2306.00978 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Efficient Estimation of Word Representations in Vector Space. In International Conference on Learning Representations.","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Kai Chen, Gregory S. Corrado, and Jeffrey Dean. 2013. Efficient Estimation of Word Representations in Vector Space. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","unstructured":"NVIDIA. 2019. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer"},{"key":"e_1_3_2_1_17_1","unstructured":"OpenAI. 2022. Introducting ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Myle Ott Sergey Edunov Alexei Baevski Angela Fan Sam Gross Nathan Ng David Grangier and Michael Auli. 2019. fairseq: A Fast Extensible Toolkit for Sequence Modeling. In North American Chapter of the Association for Computational Linguistics. 6151--6162.","DOI":"10.18653\/v1\/N19-4009"},{"key":"e_1_3_2_1_19_1","volume-title":"Jan Leike, and Ryan J. Lowe.","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke E. Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Francis Christiano, Jan Leike, and Ryan J. Lowe. 2022. Training language models to follow instructions with human feedback. ArXiv abs\/2203.02155 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Anselm Levskaya, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction. ArXiv abs\/2404.08509","author":"Qiu Haoran","year":"2024","unstructured":"Haoran Qiu, Weichao Mao, Archit Patke, Shengkun Cui, Saurabh Jha, Chen Wang, Hubertus Franke, Zbigniew T Kalbarczyk, Tamer Ba\u015far, and Ravishankar K Iyer. 2024. Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction. ArXiv abs\/2404.08509 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. 2021. INFaaS: Automated model-less inference serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). 397--411."},{"key":"e_1_3_2_1_24_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. ArXiv abs\/1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. ArXiv abs\/1910.01108 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 31094--31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Daniel Y. Fu, Zhiqiang Xie, Beidi Chen, Clark W. Barrett, Joseph Gonzalez, Percy Liang, Christopher R\u00e9, Ioan Cristian Stoica, and Ce Zhang. 2023. High-throughput Generative Inference of Large Language Models with a Single GPU. In International Conference on Machine Learning. PMLR, 31094--31116."},{"key":"e_1_3_2_1_26_1","unstructured":"Abraham Silberschatz Greg Gagne and Peter Galvin. 2004. Operating System Principles."},{"key":"e_1_3_2_1_27_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_28_1","unstructured":"ShareGPT Team. 2023. ShareGPT. https:\/\/sharegpt.com\/"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_30_1","unstructured":"Hugo Touvron Louis Martin Kevin R. Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Daniel M. Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony S. Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel M. Kloumann A. V. Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith R. Subramanian Xia Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zhengxu Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. ArXiv abs\/2307.09288 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Attention is All you Need. ArXiv abs\/1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam M. Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. ArXiv abs\/1706.03762 (2017)."},{"key":"e_1_3_2_1_32_1","volume-title":"OpenChat: Advancing Open-source Language Models with Mixed-Quality Data. ArXiv abs\/2309.11235","author":"Wang Guan","year":"2023","unstructured":"Guan Wang, Sijie Cheng, Xianyuan Zhan, Xiangang Li, Sen Song, and Yang Liu. 2023. OpenChat: Advancing Open-source Language Models with Mixed-Quality Data. ArXiv abs\/2309.11235 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"HuggingFace's Transformers: State-of-the-art Natural Language Processing. ArXiv abs\/1910.03771","author":"Wolf Thomas","year":"2019","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, and Jamie Brew. 2019. HuggingFace's Transformers: State-of-the-art Natural Language Processing. ArXiv abs\/1910.03771 (2019)."},{"key":"e_1_3_2_1_34_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In USENIX Symposium on Operating Systems Design and Implementation.","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu and Joo Seong Jeong. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In USENIX Symposium on Operating Systems Design and Implementation."},{"key":"e_1_3_2_1_36_1","volume-title":"SHEPHERD: Serving DNNs in the Wild. In Symposium on Networked Systems Design and Implementation.","author":"Zhang Hong","year":"2023","unstructured":"Hong Zhang, Yupeng Tang, Anurag Khandelwal, and Ion Stoica. 2023. SHEPHERD: Serving DNNs in the Wild. In Symposium on Networked Systems Design and Implementation."},{"key":"e_1_3_2_1_37_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. ArXiv abs\/2205.01068 (2022)."},{"key":"e_1_3_2_1_38_1","volume-title":"Real-Time Systems Symposium","author":"Zhao Wei","year":"1989","unstructured":"Wei Zhao and John A. Stankovic. 1989. Performance analysis of FCFS and improved FCFS scheduling algorithms for dynamic real-time computer systems. [1989] Proceedings. Real-Time Systems Symposium (1989), 156--165."},{"key":"e_1_3_2_1_39_1","volume-title":"ALISA: Accelerating Large Language Model Inference via Sparsity-Aware KV Caching. ArXiv abs\/2403.17312","author":"Zhao Youpeng","year":"2024","unstructured":"Youpeng Zhao, Di Wu, and Jun Wang. 2024. ALISA: Accelerating Large Language Model Inference via Sparsity-Aware KV Caching. ArXiv abs\/2403.17312 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Response Length Perception and Sequence Scheduling: An LLM-Empowered LLM Inference Pipeline. ArXiv abs\/2305.13144","author":"Zheng Zangwei","year":"2023","unstructured":"Zangwei Zheng, Xiaozhe Ren, Fuzhao Xue, Yang Luo, Xin Jiang, and Yang You. 2023. Response Length Perception and Sequence Scheduling: An LLM-Empowered LLM Inference Pipeline. ArXiv abs\/2305.13144 (2023)."}],"event":{"name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","location":"Newark Liberty International Airport Marriott New York NY USA","acronym":"ICCAD '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"]},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676659","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:43:57Z","timestamp":1750290237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":40,"alternative-id":["10.1145\/3676536.3676659","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676659","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}