{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T15:54:23Z","timestamp":1774454063490,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"NSF","award":["CNS-2147909"],"award-info":[{"award-number":["CNS-2147909"]}]},{"name":"NSF","award":["CNS-2211882"],"award-info":[{"award-number":["CNS-2211882"]}]},{"name":"NSF","award":["CNS-2239351"],"award-info":[{"award-number":["CNS-2239351"]}]},{"name":"Amazon Research Award"},{"name":"Cisco Research Award"},{"name":"Google Faculty Research Award"},{"name":"Meta Research Award"},{"name":"Oracle Research Award"},{"name":"Qualcomm Innovation Fellowship"},{"name":"Samsung GRO Research Award"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620666.3651335","type":"proceedings-article","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T12:08:21Z","timestamp":1713960501000},"page":"932-949","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":66,"title":["SpecInfer: Accelerating Large Language Model Serving with Tree-based Speculative Inference and Verification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9371-8358","authenticated-orcid":false,"given":"Xupeng","family":"Miao","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5406-0736","authenticated-orcid":false,"given":"Gabriele","family":"Oliaro","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8409-2717","authenticated-orcid":false,"given":"Zhihao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3375-497X","authenticated-orcid":false,"given":"Xinhao","family":"Cheng","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5756-2744","authenticated-orcid":false,"given":"Zeyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1578-2597","authenticated-orcid":false,"given":"Zhengxin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2677-0659","authenticated-orcid":false,"given":"Rae Ying Yee","family":"Wong","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8694-9246","authenticated-orcid":false,"given":"Alan","family":"Zhu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7909-5416","authenticated-orcid":false,"given":"Lijie","family":"Yang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6840-4691","authenticated-orcid":false,"given":"Xiaoxiang","family":"Shi","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7197-4965","authenticated-orcid":false,"given":"Chunan","family":"Shi","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7797-573X","authenticated-orcid":false,"given":"Zhuoming","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5626-4551","authenticated-orcid":false,"given":"Daiyaan","family":"Arfeen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6763-0108","authenticated-orcid":false,"given":"Reyna","family":"Abhyankar","sequence":"additional","affiliation":[{"name":"University of California San Diego, San Diego, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1270-5185","authenticated-orcid":false,"given":"Zhihao","family":"Jia","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1533","volume-title":"Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing","author":"Berant Jonathan","year":"2013","unstructured":"Jonathan Berant, Andrew Chou, Roy Frostig, and Percy Liang. Semantic parsing on Freebase from question-answer pairs. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, pages 1533--1544, Seattle, Washington, USA, October 2013. Association for Computational Linguistics."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"e_1_3_2_1_3_1","volume-title":"Language models are few-shot learners","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners, 2020."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1985.6312218"},{"key":"e_1_3_2_1_5_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318, 2023."},{"key":"e_1_3_2_1_6_1","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. TVM: An automated end-to-end optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 578--594, 2018."},{"key":"e_1_3_2_1_7_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311, 2022."},{"key":"e_1_3_2_1_8_1","first-page":"30318","article-title":"8-bit matrix multiplication for transformers at scale","volume":"35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems, 35:30318--30332, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","first-page":"5547","volume-title":"International Conference on Machine Learning","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning, pages 5547--5569. PMLR, 2022."},{"key":"e_1_3_2_1_10_1","volume-title":"Sparsegpt: Massive language models can be accurately pruned in one-shot","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. Sparsegpt: Massive language models can be accurately pruned in one-shot, 2023."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. Gptq: Accurate quantization for generative pre-trained transformers. In International Conference on Learning Representations, 2023."},{"issue":"771","key":"e_1_3_2_1_12_1","first-page":"1612","article-title":"A short introduction to boosting","volume":"14","author":"Freund Yoav","year":"1999","unstructured":"Yoav Freund, Robert Schapire, and Naoki Abe. A short introduction to boosting. Journal-Japanese Society For Artificial Intelligence, 14(771-780):1612, 1999.","journal-title":"Journal-Japanese Society For Artificial Intelligence"},{"key":"e_1_3_2_1_13_1","volume-title":"Citeseer","author":"Gabbay Freddy","year":"1996","unstructured":"Freddy Gabbay and Avi Mendelson. Speculative execution based on value prediction. Citeseer, 1996."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.105151"},{"key":"e_1_3_2_1_15_1","unstructured":"Aaron Gokaslan* Vanya Cohen* Ellie Pavlick and Stefanie Tellex. Openwebtext corpus. http:\/\/Skylion007.github.io\/OpenWebTextCorpus 2019."},{"key":"e_1_3_2_1_16_1","volume-title":"Computer architecture: a quantitative approach","author":"Hennessy John L","year":"2011","unstructured":"John L Hennessy and David A Patterson. Computer architecture: a quantitative approach. Elsevier, 2011."},{"key":"e_1_3_2_1_17_1","first-page":"21099","article-title":"Accelerated sparse neural training: A provable and efficient method to find n: m transposable masks","volume":"34","author":"Hubara Itay","year":"2021","unstructured":"Itay Hubara, Brian Chmiel, Moshe Island, Ron Banner, Joseph Naor, and Daniel Soudry. Accelerated sparse neural training: A provable and efficient method to find n: m transposable masks. Advances in Neural Information Processing Systems, 34:21099--21111, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","unstructured":"HuggingFace. Large language model text generation inference. https:\/\/github.com\/huggingface\/text-generation-inference. (Accessed on 08\/09\/2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Hugging face. https:\/\/huggingface.co","author":"Hugging Face Inc.","year":"2023","unstructured":"Hugging Face Inc. Hugging face. https:\/\/huggingface.co, 2023."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 2nd Conference on Systems and Machine Learning, SysML'19","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. Beyond data and model parallelism for deep neural networks. In Proceedings of the 2nd Conference on Systems and Machine Learning, SysML'19, 2019."},{"key":"e_1_3_2_1_22_1","volume-title":"Assisted generation: a new direction toward low-latency text generation","author":"Gante Joao","year":"2023","unstructured":"Joao Gante. Assisted generation: a new direction toward low-latency text generation, 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Big little transformer decoder","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Karttikeya Mangalam, Suhong Moon, John Canny, Jitendra Malik, Michael W. Mahoney, Amir Gholami, and Kurt Keutzer. Big little transformer decoder, 2023."},{"key":"e_1_3_2_1_24_1","volume-title":"vllm: Easy, fast, and cheap llm serving with pagedattention. See https:\/\/vllm.ai\/ (accessed","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Yu, Joseph E Gonzalez, Hao Zhang, and Ion Stoica. vllm: Easy, fast, and cheap llm serving with pagedattention. See https:\/\/vllm.ai\/ (accessed 9 August 2023), 2023."},{"key":"e_1_3_2_1_25_1","volume-title":"Fast inference from transformers via speculative decoding. arXiv preprint arXiv:2211.17192","author":"Leviathan Yaniv","year":"2022","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. Fast inference from transformers via speculative decoding. arXiv preprint arXiv:2211.17192, 2022."},{"key":"e_1_3_2_1_26_1","volume-title":"What makes good in-context examples for gpt-3? arXiv preprint arXiv:2101.06804","author":"Liu Jiachang","year":"2021","unstructured":"Jiachang Liu, Dinghan Shen, Yizhe Zhang, Bill Dolan, Lawrence Carin, and Weizhu Chen. What makes good in-context examples for gpt-3? arXiv preprint arXiv:2101.06804, 2021."},{"key":"e_1_3_2_1_27_1","volume-title":"Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv preprint arXiv:2312.15234","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Hongyi Jin, Tianqi Chen, and Zhihao Jia. Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv preprint arXiv:2312.15234, 2023."},{"key":"e_1_3_2_1_28_1","volume-title":"Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. Specinfer: Accelerating generative llm serving with speculative inference and token tree verification. arXiv preprint arXiv:2305.09781","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. Specinfer: Accelerating generative llm serving with speculative inference and token tree verification. arXiv preprint arXiv:2305.09781, 2023."},{"key":"e_1_3_2_1_29_1","volume-title":"Spotserve: Serving generative large language models on preemptible instances. arXiv preprint arXiv:2311.15566","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Chunan Shi, Jiangfei Duan, Xiaoli Xi, Dahua Lin, Bin Cui, and Zhihao Jia. Spotserve: Serving generative large language models on preemptible instances. arXiv preprint arXiv:2311.15566, 2023."},{"key":"e_1_3_2_1_30_1","volume-title":"https:\/\/huggingface.co\/datasets\/MohamedRashad\/ChatGPT-prompts","year":"2023","unstructured":"MohamedRashad. Chatgpt-prompts. https:\/\/huggingface.co\/datasets\/MohamedRashad\/ChatGPT-prompts, 2023."},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Learning Representations","author":"Nguyen Xuan-Phi","year":"2020","unstructured":"Xuan-Phi Nguyen, Shafiq Joty, Steven Hoi, and Richard Socher. Tree-structured attention with hierarchical accumulation. In International Conference on Learning Representations, 2020."},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. Fastertransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer. (Accessed on 08\/09\/2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Gpt-4 technical report","author":"AI.","year":"2023","unstructured":"OpenAI. Gpt-4 technical report, 2023."},{"key":"e_1_3_2_1_34_1","volume-title":"chatbot instruction prompts. https:\/\/huggingface.co\/datasets\/alespalla\/chatbot_instruction_prompts","author":"Palla Alessandro","year":"2023","unstructured":"Alessandro Palla. chatbot instruction prompts. https:\/\/huggingface.co\/datasets\/alespalla\/chatbot_instruction_prompts, 2023."},{"key":"e_1_3_2_1_35_1","volume-title":"Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. nuqmm: Quantized matmul for efficient inference of large-scale generative language models. arXiv preprint arXiv:2206.09557","author":"Park Gunho","year":"2022","unstructured":"Gunho Park, Baeseong Park, Se Jung Kwon, Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. nuqmm: Quantized matmul for efficient inference of large-scale generative language models. arXiv preprint arXiv:2206.09557, 2022."},{"key":"e_1_3_2_1_36_1","volume-title":"Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277","author":"Peng Baolin","year":"2023","unstructured":"Baolin Peng, Chunyuan Li, Pengcheng He, Michel Galley, and Jianfeng Gao. Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277, 2023."},{"key":"e_1_3_2_1_37_1","volume-title":"Scaling language models: Methods, analysis & insights from training gopher. arXiv preprint arXiv:2112.11446","author":"Rae Jack W","year":"2021","unstructured":"Jack W Rae, Sebastian Borgeaud, Trevor Cai, Katie Millican, Jordan Hoffmann, Francis Song, John Aslanides, Sarah Henderson, Roman Ring, Susannah Young, et al. Scaling language models: Methods, analysis & insights from training gopher. arXiv preprint arXiv:2112.11446, 2021."},{"key":"e_1_3_2_1_38_1","volume-title":"Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100","author":"Scao Teven Le","year":"2022","unstructured":"Teven Le Scao, Angela Fan, Christopher Akiki, Ellie Pavlick, Suzana Ili\u0107, Daniel Hesslow, Roman Castagn\u00e9, Alexandra Sasha Luccioni, Fran\u00e7ois Yvon, Matthias Gall\u00e9, et al. Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100, 2022."},{"key":"e_1_3_2_1_39_1","volume-title":"Flexgen: High-throughput generative inference of large language models with a single gpu","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Daniel Y. Fu, Zhiqiang Xie, Beidi Chen, Clark Barrett, Joseph E. Gonzalez, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. Flexgen: High-throughput generative inference of large language models with a single gpu, 2023."},{"key":"e_1_3_2_1_40_1","volume-title":"High-throughput generative inference of large language models with a single gpu","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Daniel Y. Fu, Zhiqiang Xie, Beidi Chen, Clark Barrett, Joseph E. Gonzalez, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. High-throughput generative inference of large language models with a single gpu, 2023."},{"key":"e_1_3_2_1_41_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi Mohammad","year":"2020","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multi-billion parameter language models using model parallelism, 2020."},{"key":"e_1_3_2_1_42_1","first-page":"202","volume-title":"A study of branch prediction strategies. In 25 years of the international symposia on Computer architecture (selected papers)","author":"Smith James E","year":"1998","unstructured":"James E Smith. A study of branch prediction strategies. In 25 years of the international symposia on Computer architecture (selected papers), pages 202--215, 1998."},{"key":"e_1_3_2_1_43_1","volume-title":"Using deepspeed and mega-tron to train megatron-turing nlg 530b, a large-scale generative language model. arXiv preprint arXiv:2201.11990","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, et al. Using deepspeed and mega-tron to train megatron-turing nlg 530b, a large-scale generative language model. arXiv preprint arXiv:2201.11990, 2022."},{"key":"e_1_3_2_1_44_1","first-page":"31","article-title":"Blockwise parallel decoding for deep autoregressive models","author":"Stern Mitchell","year":"2018","unstructured":"Mitchell Stern, Noam Shazeer, and Jakob Uszkoreit. Blockwise parallel decoding for deep autoregressive models. Advances in Neural Information Processing Systems, 31, 2018.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_45_1","volume-title":"Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca, 2023."},{"key":"e_1_3_2_1_46_1","volume-title":"et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_47_1","first-page":"267","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. Unity: Accelerating DNN training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 267--284, Carlsbad, CA, July 2022. USENIX Association."},{"key":"e_1_3_2_1_48_1","volume-title":"Attention is all you need","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need, 2017."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_1_50_1","first-page":"37","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Wang Haojie","year":"2021","unstructured":"Haojie Wang, Jidong Zhai, Mingyu Gao, Zixuan Ma, Shizhi Tang, Liyan Zheng, Yuanzhi Li, Kaiyuan Rong, Yuanyong Chen, and Zhihao Jia. PET: Optimizing tensor programs with partially equivalent transformations and automated corrections. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21), pages 37--54. USENIX Association, July 2021."},{"key":"e_1_3_2_1_51_1","unstructured":"Heming Xia Tao Ge Si-Qing Chen Furu Wei and Zhifang Sui. Speculative decoding: Lossless speedup of autoregressive translation."},{"key":"e_1_3_2_1_52_1","volume-title":"Smoothquant: Accurate and efficient post-training quantization for large language models. arXiv preprint arXiv:2211.10438","author":"Xiao Guangxuan","year":"2022","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. arXiv preprint arXiv:2211.10438, 2022."},{"key":"e_1_3_2_1_53_1","volume-title":"Inference with reference: Lossless acceleration of large language models. arXiv preprint arXiv:2304.04487","author":"Yang Nan","year":"2023","unstructured":"Nan Yang, Tao Ge, Liang Wang, Binxing Jiao, Daxin Jiang, Linjun Yang, Rangan Majumder, and Furu Wei. Inference with reference: Lossless acceleration of large language models. arXiv preprint arXiv:2304.04487, 2023."},{"key":"e_1_3_2_1_54_1","first-page":"27168","article-title":"Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers. Advances in Neural Information Processing Systems, 35:27168--27183, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_55_1","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for Transformer-Based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521--538, Carlsbad, CA, July 2022. USENIX Association."},{"key":"e_1_3_2_1_56_1","volume-title":"Pretraining-based natural language generation for text summarization. arXiv preprint arXiv:1902.09243","author":"Zhang Haoyu","year":"2019","unstructured":"Haoyu Zhang, Jianjun Xu, and Ji Wang. Pretraining-based natural language generation for text summarization. arXiv preprint arXiv:1902.09243, 2019."},{"key":"e_1_3_2_1_57_1","volume-title":"Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068, 2022."},{"key":"e_1_3_2_1_58_1","unstructured":"Lianmin Zheng Chengfan Jia Minmin Sun Zhao Wu Cody Hao Yu Ameer Haj-Ali Yida Wang Jun Yang Danyang Zhuo Koushik Sen et al. Ansor: Generating high-performance tensor programs for deep learning. In 14th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 20) pages 863--879 2020."}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620666.3651335","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:42Z","timestamp":1750291422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620666.3651335"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":58,"alternative-id":["10.1145\/3620666.3651335","10.1145\/3620666"],"URL":"https:\/\/doi.org\/10.1145\/3620666.3651335","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}