{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:57Z","timestamp":1750309557138,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Research Grant","award":["SH2024JK29"],"award-info":[{"award-number":["SH2024JK29"]}]},{"name":"the National Natural Sience Foundation of China (NSFC)","award":["U20A20179, 62372009"],"award-info":[{"award-number":["U20A20179, 62372009"]}]},{"name":"the National Key R&D Program of China","award":["2024YFB2906602"],"award-info":[{"award-number":["2024YFB2906602"]}]},{"name":"High Performance Computing Platform of Peking University"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,22]]},"DOI":"10.1145\/3696410.3714950","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:52:18Z","timestamp":1745362338000},"page":"2309-2318","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["BATON: Enhancing Batch-wise Inference Efficiency for Large Language Models via Dynamic Re-batching"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0563-745X","authenticated-orcid":false,"given":"Peizhuang","family":"Cong","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4020-6772","authenticated-orcid":false,"given":"Qizhi","family":"Chen","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8152-0977","authenticated-orcid":false,"given":"Haochen","family":"Zhao","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2402-5854","authenticated-orcid":false,"given":"Tong","family":"Yang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"GPT understands, too. AI Open","author":"Liu Xiao","year":"2023","unstructured":"Xiao Liu, Yanan Zheng, Zhengxiao Du, Ming Ding, Yujie Qian, Zhilin Yang, and Jie Tang. 2023. GPT understands, too. AI Open (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_2_1","DOI":"10.1109\/CVPRW63382.2024.00724"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1145\/3626772.3657675"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.18653\/v1\/2023.emnlp-main.600"},{"unstructured":"Openai. 2024. ChatGPT. https:\/\/openai.com Retrieved 2024 from","key":"e_1_3_2_1_5_1"},{"unstructured":"Microsoft. 2024. Copilot. https:\/\/copilot.microsoft.com Retrieved 2024 from","key":"e_1_3_2_1_6_1"},{"unstructured":"Google. 2024. Gemini. https:\/\/gemini.google.com Retrieved 2024 from","key":"e_1_3_2_1_7_1"},{"unstructured":"Baidu. 2024. ERINE. https:\/\/yiyan.baidu.com Retrieved 2024 from","key":"e_1_3_2_1_8_1"},{"unstructured":"Alibaba. 2024. Qwen. https:\/\/qianwen.aliyun.com Retrieved 2024 from","key":"e_1_3_2_1_9_1"},{"unstructured":"Moonshot. 2024. Kimi. https:\/\/kimi.moonshot.cn Retrieved 2024 from","key":"e_1_3_2_1_10_1"},{"key":"e_1_3_2_1_11_1","volume-title":"Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139","author":"Olston Christopher","year":"2017","unstructured":"Christopher Olston, Noah Fiedel, Kiril Gorovoy, Jeremiah Harmsen, Li Lao, Fangwei Li, Vinu Rajashekhar, Sukriti Ramesh, and Jordan Soyke. 2017. Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139 (2017)."},{"key":"e_1_3_2_1_12_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati Arpan","year":"2020","unstructured":"Arpan Gujarati, Reza Karimi, Safya Alzayat, Wei Hao, Antoine Kaufmann, Ymir Vigfusson, and Jonathan Mace. 2020. Serving {DNNs} like clockwork: Performance predictability from the bottom up. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 443--462."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1109\/CVPR.2016.90"},{"unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901.","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation. 117--134","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation. 117--134."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1109\/ISCA59077.2024.00019"},{"unstructured":"NVIDIA. 2024a. FasterTransformer. https:\/\/github.com\/NVIDIA\/ FasterTransformer Retrieved 2024 from","key":"e_1_3_2_1_17_1"},{"unstructured":"NVIDIA. 2024b. Triton Inference Server. https:\/\/developer.nvidia.com\/ nvidia-triton-inference-server Retrieved 2024 from","key":"e_1_3_2_1_18_1"},{"key":"e_1_3_2_1_19_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_20_1","volume-title":"Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Fast distributed inference serving for large language models. arXiv preprint arXiv:2305.05920 (2023)."},{"unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9.","key":"e_1_3_2_1_21_1"},{"key":"e_1_3_2_1_22_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023a. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023b. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023).","key":"e_1_3_2_1_23_1"},{"key":"e_1_3_2_1_24_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. {DistServe}: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193--210."},{"unstructured":"Openai. 2024. GPT-2. https:\/\/huggingface.co\/openai-community\/gpt2 Retrieved 2024 from","key":"e_1_3_2_1_25_1"},{"unstructured":"Meta. 2024. Llama. https:\/\/www.llama2.ai Retrieved 2024 from","key":"e_1_3_2_1_26_1"},{"key":"e_1_3_2_1_27_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Fault-tolerant Generative LLM Serving. In Forty-first International Conference on Machine Learning.","author":"Strati Foteini","year":"2024","unstructured":"Foteini Strati, Sara McAllister, Amar Phanishayee, Jakub Tarnawski, and Ana Klimovic. 2024. D\u00e9j\u00e0Vu: KV-cache Streaming for Fast, Fault-tolerant Generative LLM Serving. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_29_1","volume-title":"Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav S Gulavani, and Ramachandran Ramjee. 2023. Sarathi: Efficient llm inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369 (2023)."},{"key":"e_1_3_2_1_30_1","first-page":"18015","article-title":"S^3: Increasing GPU Utilization during Generative Inference for Higher Throughput","volume":"36","author":"Jin Yunho","year":"2023","unstructured":"Yunho Jin, Chun-Feng Wu, David Brooks, and Gu-Yeon Wei. 2023. S^3: Increasing GPU Utilization during Generative Inference for Higher Throughput. Advances in Neural Information Processing Systems, Vol. 36 (2023), 18015--18027.","journal-title":"Advances in Neural Information Processing Systems"},{"unstructured":"Hugging Face. 2024a. Llama-2--7b-chat-hf. https:\/\/huggingface.co\/meta-llama\/Llama-2--7b-chat-hf Retrieved 2024 from","key":"e_1_3_2_1_31_1"},{"unstructured":"Hugging Face. 2024b. Transformers. https:\/\/github.com\/huggingface\/transformers Retrieved 2024 from","key":"e_1_3_2_1_32_1"},{"key":"e_1_3_2_1_33_1","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research, Vol. 24, 240 (2023), 1--113.","journal-title":"Journal of Machine Learning Research"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_34_1","DOI":"10.5555\/3600270.3602446"},{"unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems Vol. 35 (2022) 27730--27744.","key":"e_1_3_2_1_35_1"},{"unstructured":"Google. 2024. PaLM2. https:\/\/ai.google\/discover\/palm2\/ Retrieved 2024 from","key":"e_1_3_2_1_36_1"},{"key":"e_1_3_2_1_37_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zheng Zangwei","year":"2024","unstructured":"Zangwei Zheng, Xiaozhe Ren, and et. al. 2024. Response length perception and sequence scheduling: An llm-empowered llm inference pipeline. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"unstructured":"Sehoon Kim Coleman Hooper Thanakul Wattanawong Minwoo Kang Ruohan Yan Hasan Genc Grace Dinh Qijing Huang Kurt Keutzer Michael W Mahoney et al. 2023. Full stack optimization of transformer inference: a survey. arXiv preprint arXiv:2302.14017 (2023).","key":"e_1_3_2_1_38_1"},{"key":"e_1_3_2_1_39_1","volume-title":"A survey of techniques for optimizing transformer inference. Journal of Systems Architecture","author":"Chitty-Venkata Krishna Teja","year":"2023","unstructured":"Krishna Teja Chitty-Venkata, Sparsh Mittal, Murali Emani, Venkatram Vishwanath, and Arun K Somani. 2023. A survey of techniques for optimizing transformer inference. Journal of Systems Architecture (2023), 102990."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_40_1","DOI":"10.1109\/IISWC55918.2022.00018"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1109\/LCA.2023.3323482"},{"key":"e_1_3_2_1_42_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_44_1","DOI":"10.18653\/v1\/2024.naacl-industry.1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1145\/3605573.3605613"},{"key":"e_1_3_2_1_46_1","volume-title":"International Conference on Machine Learning. PMLR, 37524--37539","author":"Wu Xiaoxia","year":"2023","unstructured":"Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, and Yuxiong He. 2023. Understanding int4 quantization for language models: latency speedup, composability, and failure cases. In International Conference on Machine Learning. PMLR, 37524--37539."},{"key":"e_1_3_2_1_47_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323 (2022)."},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track. 792--810","author":"Cheng Zhoujun","year":"2023","unstructured":"Zhoujun Cheng, Jungo Kasai, and Tao Yu. 2023. Batch Prompting: Efficient Inference with Large Language Model APIs. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track. 792--810."},{"unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin Daniel Y Fu Zhiqiang Xie Beidi Chen Clark Barrett Joseph E Gonzalez et al. 2023. High-throughput Generative Inference of Large Language Models with a Single GPU March 2023. arXiv preprint arXiv:2303.06865 (2023).","key":"e_1_3_2_1_49_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_50_1","DOI":"10.1145\/3437801.3441578"}],"event":{"sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"acronym":"WWW '25","name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia"},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714950","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714950","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:54Z","timestamp":1750295934000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714950"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":50,"alternative-id":["10.1145\/3696410.3714950","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714950","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}