{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:34:38Z","timestamp":1772724878159,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T00:00:00Z","timestamp":1728777600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Samsung Advanced Institute of Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["2023-00228255"],"award-info":[{"award-number":["2023-00228255"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["21A20151113068"],"award-info":[{"award-number":["21A20151113068"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,14]]},"DOI":"10.1145\/3656019.3676949","type":"proceedings-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T10:34:08Z","timestamp":1728642848000},"page":"233-245","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Improving Throughput-oriented LLM Inference with CPU Computations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2312-3049","authenticated-orcid":false,"given":"Daon","family":"Park","sequence":"first","affiliation":[{"name":"Seoul National University, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6645-6161","authenticated-orcid":false,"given":"Bernhard","family":"Egger","sequence":"additional","affiliation":[{"name":"Seoul National University, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2024,10,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.3390\/s21144758"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","volume-title":"Yolov4: Optimal speed and accuracy of object detection. arXiv preprint arXiv:2004.10934","author":"Bochkovskiy Alexey","year":"2020","unstructured":"Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan\u00a0Mark Liao. 2020. Yolov4: Optimal speed and accuracy of object detection. arXiv preprint arXiv:2004.10934 (2020)."},{"key":"e_1_3_2_1_5_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems 33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020), 1877\u20131901."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243210"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508421"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_10_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Ge Suyu","year":"2023","unstructured":"Suyu Ge, Yunan Zhang, Liyuan Liu, Minjia Zhang, Jiawei Han, and Jianfeng Gao. 2023. Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","unstructured":"HuggingFace. 2022. HuggingFace Accelerate. https:\/\/huggingface.co\/docs\/accelerate\/index."},{"key":"e_1_3_2_1_12_1","unstructured":"Albert\u00a0Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio\u00a0Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven\u00a0Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William\u00a0El Sayed. 2023. Mistral 7B. arxiv:2310.06825\u00a0[cs.CL]"},{"key":"e_1_3_2_1_13_1","volume-title":"SqueezeLLM: Dense-and-Sparse Quantization. arXiv preprint arXiv:2306.07629","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Coleman Hooper, Amir Gholami, Zhen Dong, Xiuyu Li, Sheng Shen, Michael\u00a0W Mahoney, and Kurt Keutzer. 2023. SqueezeLLM: Dense-and-Sparse Quantization. arXiv preprint arXiv:2306.07629 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3390\/s21041399"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"e_1_3_2_1_17_1","volume-title":"Scissorhands: Exploiting the persistence of importance hypothesis for llm kv cache compression at test time. Advances in Neural Information Processing Systems 36","author":"Liu Zichang","year":"2024","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. 2024. Scissorhands: Exploiting the persistence of importance hypothesis for llm kv cache compression at test time. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Pointer Sentinel Mixture Models. In International Conference on Learning Representations.","author":"Merity Stephen","year":"2017","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 2017. Pointer Sentinel Mixture Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","unstructured":"Nvidia. 2023. Magnum IO GPUDirect Storage. https:\/\/developer.nvidia.com\/gpudirect-storage."},{"key":"e_1_3_2_1_20_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arxiv:2303.08774\u00a0[cs.CL]"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3609510.3609815"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1101\/2021.02.12.430858"},{"key":"e_1_3_2_1_23_1","unstructured":"Noam Shazeer. 2019. Fast Transformer Decoding: One Write-Head is All You Need. arxiv:1911.02150\u00a0[cs.NE]"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 31094\u201331116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning. PMLR, 31094\u201331116."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"e_1_3_2_1_27_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 38087\u201338099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087\u201338099."},{"key":"e_1_3_2_1_29_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for { Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_30_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems 36","author":"Zhang Zhenyu","year":"2024","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, 2024. H2o: Heavy-hitter oracle for efficient generative inference of large language models. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Efficiently Programming Large Language Models using SGLang. arXiv preprint arXiv:2312.07104","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody\u00a0Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph\u00a0E Gonzalez, 2023. Efficiently Programming Large Language Models using SGLang. arXiv preprint arXiv:2312.07104 (2023)."}],"event":{"name":"PACT '24: International Conference on Parallel Architectures and Compilation Techniques","location":"Long Beach CA USA","acronym":"PACT '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 2024 International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3656019.3676949","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3656019.3676949","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:56:32Z","timestamp":1755892592000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3656019.3676949"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,13]]},"references-count":32,"alternative-id":["10.1145\/3656019.3676949","10.1145\/3656019"],"URL":"https:\/\/doi.org\/10.1145\/3656019.3676949","relation":{},"subject":[],"published":{"date-parts":[[2024,10,13]]},"assertion":[{"value":"2024-10-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}