{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T15:12:04Z","timestamp":1778166724766,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T00:00:00Z","timestamp":1713744000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,22]]},"DOI":"10.1145\/3642970.3655832","type":"proceedings-article","created":{"date-parts":[[2024,4,19]],"date-time":"2024-04-19T10:46:57Z","timestamp":1713523617000},"page":"144-152","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Towards Pareto Optimal Throughput in Small Language Model Serving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5535-0012","authenticated-orcid":false,"given":"Pol G.","family":"Recasens","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2207-1904","authenticated-orcid":false,"given":"Yue","family":"Zhu","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0204-2362","authenticated-orcid":false,"given":"Chen","family":"Wang","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2335-4798","authenticated-orcid":false,"given":"Eun Kyung","family":"Lee","sequence":"additional","affiliation":[{"name":"IBM 
Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8377-6757","authenticated-orcid":false,"given":"Olivier","family":"Tardieu","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5346-7331","authenticated-orcid":false,"given":"Alaa","family":"Youssef","sequence":"additional","affiliation":[{"name":"IBM Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1963-7418","authenticated-orcid":false,"given":"Jordi","family":"Torres","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Universitat Polit\u00e8cnica de Catalunya"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3037-3580","authenticated-orcid":false,"given":"Josep Ll.","family":"Berral","sequence":"additional","affiliation":[{"name":"Universitat Polit\u00e8cnica de Catalunya, Barcelona Supercomputing Center"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_2_1","volume-title":"Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473","author":"Bahdanau Dzmitry","year":"2014","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)."},{"key":"e_1_3_2_1_3_1","volume-title":"Beyond Efficiency: A Systematic Survey of Resource-Efficient Large Language Models. arXiv preprint arXiv:2401.00625","author":"Bai Guangji","year":"2024","unstructured":"Guangji Bai, Zheng Chai, Chen Ling, Shiyu Wang, Jiaying Lu, Nan Zhang, Tingwei Shi, Ziyang Yu, Mengdan Zhu, Yifei Zhang, et al. 2024. 
Beyond Efficiency: A Systematic Survey of Resource-Efficient Large Language Models. arXiv preprint arXiv:2401.00625 (2024)."},{"key":"e_1_3_2_1_4_1","first-page":"240","article-title":"2023. Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research 24, 240 (2023), 1--113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_5_1","volume-title":"TinyStories: How Small Can Language Models Be and Still Speak Coherent English? arXiv preprint arXiv:2305.07759","author":"Eldan Ronen","year":"2023","unstructured":"Ronen Eldan and Yuanzhi Li. 2023. TinyStories: How Small Can Language Models Be and Still Speak Coherent English? arXiv preprint arXiv:2305.07759 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Prompt cache: Modular attention reuse for low-latency inference. arXiv preprint arXiv:2311.04934","author":"Gim In","year":"2023","unstructured":"In Gim, Guojun Chen, Seung-seob Lee, Nikhil Sarda, Anurag Khandelwal, and Lin Zhong. 2023. Prompt cache: Modular attention reuse for low-latency inference. 
arXiv preprint arXiv:2311.04934 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al.","author":"Gunasekar Suriya","year":"2023","unstructured":"Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio C\u00e9sar Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, et al. 2023. Textbooks Are All You Need. arXiv preprint arXiv:2306.11644 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al.","author":"Hoffmann Jordan","year":"2022","unstructured":"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. 2022. Training compute-optimal large language models. arXiv preprint arXiv:2203.15556 (2022)."},{"key":"e_1_3_2_1_10_1","unstructured":"HuggingFace. 2023. Text Generation Inference. https:\/\/huggingface.co\/docs\/text-generation-inference\/index."},{"key":"e_1_3_2_1_11_1","volume-title":"S3: Increasing GPU Utilization during Generative Inference for Higher Throughput. arXiv preprint arXiv:2306.06000","author":"Jin Yunho","year":"2023","unstructured":"Yunho Jin, Chun-Feng Wu, David Brooks, and Gu-Yeon Wei. 2023. S3: Increasing GPU Utilization during Generative Inference for Higher Throughput. arXiv preprint arXiv:2306.06000 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"SqueezeLLM: Dense-and-Sparse Quantization. arXiv preprint arXiv:2306.07629","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Coleman Hooper, Amir Gholami, Zhen Dong, Xiuyu Li, Sheng Shen, Michael W Mahoney, and Kurt Keutzer. 2023. SqueezeLLM: Dense-and-Sparse Quantization. 
arXiv preprint arXiv:2306.07629 (2023)."},{"key":"e_1_3_2_1_13_1","unstructured":"Sehoon Kim Coleman Hooper Thanakul Wattanawong Minwoo Kang Ruohan Yan Hasan Genc Grace Dinh Qijing Huang Kurt Keutzer Michael W Mahoney et al. 2023. Full stack optimization of transformer inference: a survey. arXiv preprint arXiv:2302.14017 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_15_1","unstructured":"Zhuohan Li Lianmin Zheng Yinmin Zhong Vincent Liu Ying Sheng Xin Jin Yanping Huang Zhifeng Chen Hao Zhang Joseph E Gonzalez et al. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. arXiv preprint arXiv:2302.11665 (2023)."},{"key":"e_1_3_2_1_16_1","unstructured":"Microsoft. 2023. DeepSpeed-FastGen. https:\/\/github.com\/microsoft\/DeepSpeed\/tree\/master\/blogs\/deepspeed-fastgen."},{"key":"e_1_3_2_1_17_1","unstructured":"NVIDIA. 2023. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer."},{"key":"e_1_3_2_1_18_1","unstructured":"NVIDIA. 2023. GPU Performance Background User's Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-gpu-background\/index.html#understand-perf."},{"key":"e_1_3_2_1_19_1","unstructured":"NVIDIA. 2024. DCGM User guide: Feature overview. https:\/\/docs.nvidia.com\/datacenter\/dcgm\/latest\/user-guide\/feature-overview.html."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"PyTorch. 2024. Accelerating PyTorch with CUDA Graphs. 
https:\/\/pytorch.org\/blog\/accelerating-pytorch-with-cuda-graphs\/."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC58863.2023.10363447"},{"key":"e_1_3_2_1_23_1","unstructured":"ShareGPT. 2023. ShareGPT. https:\/\/sharegpt.com\/."},{"key":"e_1_3_2_1_24_1","volume-title":"Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 31094--31116","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning. PMLR, 31094--31116."},{"key":"e_1_3_2_1_26_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_27_1","unstructured":"MosaicML NLP Team. 2023. Introducing MPT-7B: A New Standard for Open-Source Commercially Usable LLMs. www.mosaicml.com\/blog\/mpt-7b Accessed: 2023-05-05."},{"key":"e_1_3_2_1_28_1","volume-title":"Llama: Open and efficient foundation language models. 
arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_29_1","unstructured":"vLLM. 2024. Project update. https:\/\/docs.google.com\/presentation\/d\/12ml2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA\/mobilepresent?slide=id.g2b46085d608_1_0."},{"key":"e_1_3_2_1_30_1","unstructured":"Ben Wang and Aran Komatsuzaki. 2021. GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model. https:\/\/github.com\/kingoflolz\/mesh-transformer-jax."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning. PMLR, 38087--38099","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087--38099."},{"key":"e_1_3_2_1_34_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for Transformer-Based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 
521--538."},{"key":"e_1_3_2_1_35_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."}],"event":{"name":"EuroSys '24: Nineteenth European Conference on Computer Systems","location":"Athens Greece","acronym":"EuroSys '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 4th Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3642970.3655832","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3642970.3655832","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:15:18Z","timestamp":1755908118000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3642970.3655832"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,22]]},"references-count":35,"alternative-id":["10.1145\/3642970.3655832","10.1145\/3642970"],"URL":"https:\/\/doi.org\/10.1145\/3642970.3655832","relation":{},"subject":[],"published":{"date-parts":[[2024,4,22]]},"assertion":[{"value":"2024-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}