{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:05:31Z","timestamp":1775815531871,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Postgraduate Research & Practice Innovation Program of Jiangsu Province","award":["KYCX24_0247"],"award-info":[{"award-number":["KYCX24_0247"]}]},{"name":"Nanjing Key S&T Special Projects","award":["202309006"],"award-info":[{"award-number":["202309006"]}]},{"name":"Ant Research Program of Ant Group"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,22]]},"DOI":"10.1145\/3696410.3714930","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:52:18Z","timestamp":1745362338000},"page":"829-839","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["SCOOT: SLO-Oriented Performance Tuning for LLM Inference Engines"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0336-6916","authenticated-orcid":false,"given":"Ke","family":"Cheng","sequence":"first","affiliation":[{"name":"Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1095-9632","authenticated-orcid":false,"given":"Zhi","family":"Wang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4874-2260","authenticated-orcid":false,"given":"Wen","family":"Hu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5465-9626","authenticated-orcid":false,"given":"Tiannuo","family":"Yang","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8645-0680","authenticated-orcid":false,"given":"Jianguo","family":"Li","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6581-6399","authenticated-orcid":false,"given":"Sheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605943"},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/www.aliyun.com\/product\/bailian","author":"Alibaba","year":"2024","unstructured":"Alibaba cloud bailian platform. https:\/\/www.aliyun.com\/product\/bailian, 2024."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/aws.amazon.com\/sagemaker","author":"Aws","year":"2024","unstructured":"Aws sagemaker. https:\/\/aws.amazon.com\/sagemaker, 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/github.com\/vllm-project\/vllm","year":"2024","unstructured":"vllm. https:\/\/github.com\/vllm-project\/vllm, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"https:\/\/github.com\/NVIDIA\/TensorRT-LLM","year":"2024","unstructured":"Tensorrt-llm. https:\/\/github.com\/NVIDIA\/TensorRT-LLM, 2024."},{"key":"e_1_3_2_1_6_1","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, et al. Orca: A distributed serving system for transformer-based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521--538, 2022."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_8_1","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, et al. Taming throughput-latency tradeoff in llm inference with sarathi-serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 117--134, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"Deploying foundation model powered agent services: A survey","author":"Xu Wenchao","year":"2024","unstructured":"Wenchao Xu, Jinyu Chen, Peirong Zheng, et al. Deploying foundation model powered agent services: A survey. 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"A survey on automatic parameter tuning for big data processing systems. ACM Computing Surveys (CSUR), 53(2):1--37","author":"Herodotou Herodotos","year":"2020","unstructured":"Herodotos Herodotou, Yuxing Chen, and Jiaheng Lu. A survey on automatic parameter tuning for big data processing systems. ACM Computing Surveys (CSUR), 53(2):1--37, 2020."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sorms.2014.05.001"},{"key":"e_1_3_2_1_12_1","volume-title":"Sumit Singh Chauhan, and Vijay Kumar. A review on genetic algorithm: past, present, and future. Multimedia tools and applications, 80:8091--8126","author":"Katoch Sourabh","year":"2021","unstructured":"Sourabh Katoch, Sumit Singh Chauhan, and Vijay Kumar. A review on genetic algorithm: past, present, and future. Multimedia tools and applications, 80:8091--8126, 2021."},{"key":"e_1_3_2_1_13_1","volume-title":"A survey of reinforcement learning algorithms for dynamically varying environments. ACM Computing Surveys (CSUR), 54(6):1--25","author":"Padakandla Sindhu","year":"2021","unstructured":"Sindhu Padakandla. A survey of reinforcement learning algorithms for dynamically varying environments. ACM Computing Surveys (CSUR), 54(6):1--25, 2021."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582078"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.5555\/2955491.2955599"},{"key":"e_1_3_2_1_16_1","volume-title":"Multiobjective bayesian global optimization using expected hypervolume improvement gradient. Swarm and evolutionary computation, 44:945--956","author":"Yang Kaifeng","year":"2019","unstructured":"Kaifeng Yang, Michael Emmerich, Andr\u00e9 Deutz, and Thomas B\u00e4ck. Multiobjective bayesian global optimization using expected hypervolume improvement gradient. Swarm and evolutionary computation, 44:945--956, 2019."},{"key":"e_1_3_2_1_17_1","first-page":"9851","article-title":"Differentiable expected hypervolume improvement for parallel multi-objective bayesian optimization","volume":"33","author":"Daulton Samuel","year":"2020","unstructured":"Samuel Daulton, Maximilian Balandat, and Eytan Bakshy. Differentiable expected hypervolume improvement for parallel multi-objective bayesian optimization. Advances in Neural Information Processing Systems, 33:9851--9864, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","first-page":"19274","volume-title":"International Conference on Machine Learning","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning, pages 19274--19286. PMLR, 2023."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1017\/S0962492900002804"},{"key":"e_1_3_2_1_20_1","first-page":"1674","volume-title":"International conference on machine learning","author":"Snoek Jasper","year":"2014","unstructured":"Jasper Snoek, Kevin Swersky, Rich Zemel, and Ryan Adams. Input warping for bayesian optimization of non-stationary functions. In International conference on machine learning, pages 1674--1682. PMLR, 2014."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129065704001899"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.13643"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2021.3054811"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3104322.3104451"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623278.3624770"},{"key":"e_1_3_2_1_26_1","volume-title":"et al. Scikit-learn: Machine learning in python. the Journal of machine Learning research, 12:2825--2830","author":"Pedregosa Fabian","year":"2011","unstructured":"Fabian Pedregosa, Ga\u00ebl Varoquaux, Alexandre Gramfort, et al. Scikit-learn: Machine learning in python. the Journal of machine Learning research, 12:2825--2830, 2011."},{"key":"e_1_3_2_1_27_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, et al. Llama 2: Open foundation and fine-tuned chat models. 2023."},{"key":"e_1_3_2_1_28_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, et al. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems, 35:16344--16359, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"The Twelfth International Conference on Learning Representations","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. Flashattention-2: Faster attention with better parallelism and work partitioning. In The Twelfth International Conference on Learning Representations, 2023."},{"key":"e_1_3_2_1_30_1","unstructured":"Nikhil Jha and Kevin Wang. Improving large language model throughput with efficient long-term memory management."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.257"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"e_1_3_2_1_33_1","first-page":"36","article-title":"Speculative decoding with big little decoder","author":"Kim Sehoon","year":"2024","unstructured":"Sehoon Kim, Karttikeya Mangalam, Suhong Moon, et al. Speculative decoding with big little decoder. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","first-page":"36","article-title":"Spectr: Fast speculative decoding via optimal transport","author":"Sun Ziteng","year":"2024","unstructured":"Ziteng Sun, Ananda Theertha Suresh, Jae Hun Ro, et al. Spectr: Fast speculative decoding via optimal transport. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457291"},{"key":"e_1_3_2_1_36_1","volume-title":"et al. Gptuner: A manual-reading database tuning system via gpt-guided bayesian optimization","author":"Lao Jiale","year":"2023","unstructured":"Jiale Lao, Yibo Wang, Yufei Li, et al. Gptuner: A manual-reading database tuning system via gpt-guided bayesian optimization. 2023."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00332"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2017.2647939"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403299"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611548"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599953"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052662"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3159652.3159665"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00025"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00031"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605578"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.2514\/6.1998-4771"},{"key":"e_1_3_2_1_48_1","volume-title":"A fast and elitist multiobjective genetic algorithm: Nsga-ii","author":"Deb Kalyanmoy","year":"2002","unstructured":"Kalyanmoy Deb, Amrit Pratap, Sameer Agarwal, and TAMT Meyarivan. A fast and elitist multiobjective genetic algorithm: Nsga-ii. IEEE transactions on evolutionary computation, 6(2):182--197, 2002."},{"key":"e_1_3_2_1_49_1","volume-title":"Llm inference in c\/c. https:\/\/github.com\/ggerganov\/llama.cpp","year":"2024","unstructured":"llama.cpp: Llm inference in c\/c. https:\/\/github.com\/ggerganov\/llama.cpp, 2024."},{"key":"e_1_3_2_1_50_1","volume-title":"https:\/\/docs.vllm.ai\/en\/v0.4.2\/models\/performance.html","year":"2024","unstructured":"vllm performance and tuning. https:\/\/docs.vllm.ai\/en\/v0.4.2\/models\/performance.html, 2024."},{"key":"e_1_3_2_1_51_1","volume-title":"https:\/\/github.com\/NVIDIA\/TensorRT-LLM\/blob\/main\/docs\/source\/performance\/perf-bestpractices.md","author":"Best","year":"2024","unstructured":"Best practices for tuning the performance of tensorrt-llm. https:\/\/github.com\/NVIDIA\/TensorRT-LLM\/blob\/main\/docs\/source\/performance\/perf-bestpractices.md, 2024."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714930","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714930","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:54Z","timestamp":1750295934000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714930"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":51,"alternative-id":["10.1145\/3696410.3714930","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714930","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}