{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:24:35Z","timestamp":1773318275757,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759823","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:05:39Z","timestamp":1762963539000},"page":"1725-1741","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["gLLM: Global Balanced Pipeline Parallelism Systems for Distributed LLMs Serving with Token Throttling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2979-4486","authenticated-orcid":false,"given":"Tianyu","family":"Guo","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3507-4299","authenticated-orcid":false,"given":"Xianwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4707-9492","authenticated-orcid":false,"given":"Jiangsu","family":"Du","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-5715","authenticated-orcid":false,"given":"Zhiguang","family":"Chen","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2166-977X","authenticated-orcid":false,"given":"Nong","family":"Xiao","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5315-3375","authenticated-orcid":false,"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"ShareGPT Team. 2023. ShareGPT. https:\/\/sharegpt.com\/."},{"key":"e_1_3_3_3_3_2","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024, Santa Clara, CA, USA, July 10-12, 2024","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav\u00a0S. Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024, Santa Clara, CA, USA, July 10-12, 2024, Ada Gavrilovska and Douglas\u00a0B. Terry (Eds.). USENIX Association, 117\u2013134. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","unstructured":"Ebtesam Almazrouei Hamza Alobeidli Abdulaziz Alshamsi Alessandro Cappelli Ruxandra Cojocaru M\u00e9rouane Debbah \u00c9tienne Goffinet Daniel Hesslow Julien Launay Quentin Malartic Daniele Mazzotta Badreddine Noune Baptiste Pannier and Guilherme Penedo. 2023. The Falcon Series of Open Language Models. CoRR abs\/2311.16867 (2023). 10.48550\/ARXIV.2311.16867 arXiv:https:\/\/arXiv.org\/abs\/2311.16867","DOI":"10.48550\/ARXIV.2311.16867"},{"key":"e_1_3_3_3_5_2","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023","author":"Borzunov Alexander","year":"2023","unstructured":"Alexander Borzunov, Max Ryabinin, Artem Chumachenko, Dmitry Baranchuk, Tim Dettmers, Younes Belkada, Pavel Samygin, and Colin\u00a0A. Raffel. 2023. Distributed Inference and Fine-tuning of Large Language Models Over The Internet. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023, Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/28bf1419b9a1f908c15f6195f58cb865-Abstract-Conference.html"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651379"},{"key":"e_1_3_3_3_7_2","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","unstructured":"DeepSeek-AI Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan Damai Dai Daya Guo Dejian Yang Deli Chen Dongjie Ji Erhang Li Fangyun Lin Fucong Dai Fuli Luo Guangbo Hao Guanting Chen Guowei Li H. Zhang Han Bao Hanwei Xu Haocheng Wang Haowei Zhang Honghui Ding Huajian Xin Huazuo Gao Hui Li Hui Qu J.\u00a0L. Cai Jian Liang Jianzhong Guo Jiaqi Ni Jiashi Li Jiawei Wang Jin Chen Jingchang Chen Jingyang Yuan Junjie Qiu Junlong Li Junxiao Song Kai Dong Kai Hu Kaige Gao Kang Guan Kexin Huang Kuai Yu Lean Wang Lecong Zhang Lei Xu Leyi Xia Liang Zhao Litong Wang Liyue Zhang Meng Li Miaojun Wang Mingchuan Zhang Minghua Zhang Minghui Tang Mingming Li Ning Tian Panpan Huang Peiyi Wang Peng Zhang Qiancheng Wang Qihao Zhu Qinyu Chen Qiushi Du R.\u00a0J. Chen R.\u00a0L. Jin Ruiqi Ge Ruisong Zhang Ruizhe Pan Runji Wang Runxin Xu Ruoyu Zhang Ruyi Chen S.\u00a0S. Li Shanghao Lu Shangyan Zhou Shanhuang Chen Shaoqing Wu Shengfeng Ye Shengfeng Ye Shirong Ma Shiyu Wang Shuang Zhou Shuiping Yu Shunfeng Zhou Shuting Pan T. Wang Tao Yun Tian Pei Tianyu Sun W.\u00a0L. Xiao and Wangding Zeng. 2024. DeepSeek-V3 Technical Report. CoRR abs\/2412.19437 (2024). 10.48550\/ARXIV.2412.19437 arXiv:https:\/\/arXiv.org\/abs\/2412.19437","DOI":"10.48550\/ARXIV.2412.19437"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.478"},{"key":"e_1_3_3_3_11_2","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024","author":"Didolkar Aniket","year":"2024","unstructured":"Aniket Didolkar, Anirudh Goyal, Nan\u00a0Rosemary Ke, Siyuan Guo, Michal Valko, Timothy\u00a0P. Lillicrap, Danilo\u00a0Jimenez Rezende, Yoshua Bengio, Michael\u00a0C. Mozer, and Sanjeev Arora. 2024. Metacognitive Capabilities of LLMs: An Exploration in Mathematical Problem Solving. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub\u00a0M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/2318d75a06437eaa257737a5cf3ab83c-Abstract-Conference.html"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","unstructured":"Jiangsu Du Ziming Liu Jiarui Fang Shenggui Li Yongbin Li Yutong Lu and Yang You. 2022. EnergonAI: An Inference System for 10-100 Billion Parameter Transformer Models. CoRR abs\/2209.02341 (2022). 10.48550\/ARXIV.2209.02341 arXiv:https:\/\/arXiv.org\/abs\/2209.02341","DOI":"10.48550\/ARXIV.2209.02341"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638466"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan Anirudh Goyal Anthony Hartshorn Aobo Yang Archi Mitra Archie Sravankumar Artem Korenev Arthur Hinsvark Arun Rao Aston Zhang Aur\u00e9lien Rodriguez Austen Gregerson Ava Spataru Baptiste Rozi\u00e8re Bethany Biron Binh Tang Bobbie Chern Charlotte Caucheteux Chaya Nayak Chloe Bi Chris Marra Chris McConnell Christian Keller Christophe Touret Chunyang Wu Corinne Wong Cristian\u00a0Canton Ferrer Cyrus Nikolaidis Damien Allonsius Daniel Song Danielle Pintz Danny Livshits David Esiobu Dhruv Choudhary Dhruv Mahajan Diego Garcia-Olano Diego Perino Dieuwke Hupkes Egor Lakomkin Ehab AlBadawy Elina Lobanova Emily Dinan Eric\u00a0Michael Smith Filip Radenovic Frank Zhang Gabriel Synnaeve Gabrielle Lee Georgia\u00a0Lewis Anderson Graeme Nail Gr\u00e9goire Mialon Guan Pang Guillem Cucurell Hailey Nguyen Hannah Korevaar Hu Xu Hugo Touvron Iliyan Zarov Imanol\u00a0Arrieta Ibarra Isabel\u00a0M. Kloumann Ishan Misra Ivan Evtimov Jade Copet Jaewon Lee Jan Geffert Jana Vranes Jason Park Jay Mahadeokar Jeet Shah Jelmer van\u00a0der Linde Jennifer Billock Jenny Hong Jenya Lee Jeremy Fu Jianfeng Chi Jianyu Huang Jiawen Liu Jie Wang Jiecao Yu Joanna Bitton Joe Spisak Jongsoo Park Joseph Rocca Joshua Johnstun Joshua Saxe Junteng Jia Kalyan\u00a0Vasuden Alwala Kartikeya Upasani Kate Plawiak Ke Li Kenneth Heafield Kevin Stone and et al.2024. The Llama 3 Herd of Models. CoRR abs\/2407.21783 (2024). 10.48550\/ARXIV.2407.21783 arXiv:https:\/\/arXiv.org\/abs\/2407.21783","DOI":"10.48550\/ARXIV.2407.21783"},{"key":"e_1_3_3_3_15_2","unstructured":"William Fedus Barret Zoph and Noam Shazeer. 2022. Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. J. Mach. Learn. Res. 23 (2022) 120:1\u2013120:39. https:\/\/jmlr.org\/papers\/v23\/21-0998.html"},{"key":"e_1_3_3_3_16_2","unstructured":"Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil\u00a0R. Devanur Gregory\u00a0R. Ganger and Phillip\u00a0B. Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. CoRR abs\/1806.03377 (2018). arXiv:https:\/\/arXiv.org\/abs\/1806.03377http:\/\/arxiv.org\/abs\/1806.03377"},{"key":"e_1_3_3_3_17_2","first-page":"103","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia\u00a0Xu Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, Hanna\u00a0M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d\u2019Alch\u00e9-Buc, Emily\u00a0B. Fox, and Roman Garnett (Eds.). 103\u2013112. https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/093f65e080a295f8076b1c5722a46aa2-Abstract.html"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_3_3_19_2","first-page":"673","volume-title":"Proceedings of the 2022 USENIX Annual Technical Conference, USENIX ATC 2022, Carlsbad, CA, USA, July 11-13, 2022","author":"Jia Xianyan","year":"2022","unstructured":"Xianyan Jia, Le Jiang, Ang Wang, Wencong Xiao, Ziji Shi, Jie Zhang, Xinyuan Li, Langshi Chen, Yong Li, Zhen Zheng, Xiaoyong Liu, and Wei Lin. 2022. Whale: Efficient Giant Model Training over Heterogeneous GPUs. In Proceedings of the 2022 USENIX Annual Technical Conference, USENIX ATC 2022, Carlsbad, CA, USA, July 11-13, 2022, Jiri Schindler and Noa Zilberman (Eds.). USENIX Association, 673\u2013688. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/jia-xianyan"},{"key":"e_1_3_3_3_20_2","series-title":"Proceedings of Machine Learning Research","first-page":"16639","volume-title":"International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA","volume":"202","author":"Kim Taebum","year":"2023","unstructured":"Taebum Kim, Hyoungjoo Kim, Gyeong-In Yu, and Byung-Gon Chun. 2023. BPipe: Memory-Balanced Pipeline Parallelism for Training Large Language Models. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 16639\u201316653. https:\/\/proceedings.mlr.press\/v202\/kim23l.html"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_3_22_2","first-page":"155","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024, Santa Clara, CA, USA, July 10-12, 2024","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee, Jungi Lee, Junghwan Seo, and Jaewoong Sim. 2024. InfiniGen: Efficient Generative Inference of Large Language Models with Dynamic KV Cache Management. In 18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024, Santa Clara, CA, USA, July 10-12, 2024, Ada Gavrilovska and Douglas\u00a0B. Terry (Eds.). USENIX Association, 155\u2013172. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/lee"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.730"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.163"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710869"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.FINDINGS-ACL.411"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710878"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607073"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.151"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1609\/AAAISS.V3I1.31198"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","unstructured":"Yixuan Mei Yonghao Zhuang Xupeng Miao Juncheng Yang Zhihao Jia and Rashmi Vinayak. 2024. Helix: Distributed Serving of Large Language Models via Max-Flow on Heterogeneous GPUs. CoRR abs\/2406.01566 (2024). 10.48550\/ARXIV.2406.01566 arXiv:https:\/\/arXiv.org\/abs\/2406.01566","DOI":"10.48550\/ARXIV.2406.01566"},{"key":"e_1_3_3_3_32_2","series-title":"Proceedings of Machine Learning Research","first-page":"7937","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event","volume":"139","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021. Memory-Efficient Pipeline-Parallel DNN Training. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, 7937\u20137947. http:\/\/proceedings.mlr.press\/v139\/narayanan21a.html"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_3_34_2","unstructured":"NVIDIA. 2023. Faster Transformer. https:\/\/github.com\/NVIDIA\/FasterTransformer."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","unstructured":"OpenAI. 2023. GPT-4 Technical Report. CoRR abs\/2303.08774 (2023). 10.48550\/ARXIV.2303.08774 arXiv:https:\/\/arXiv.org\/abs\/2303.08774","DOI":"10.48550\/ARXIV.2303.08774"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"e_1_3_3_3_37_2","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024","author":"Qi Penghui","year":"2024","unstructured":"Penghui Qi, Xinyi Wan, Guangxing Huang, and Min Lin. 2024. Zero Bubble (Almost) Pipeline Parallelism. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=tuzTN0eIO5"},{"key":"e_1_3_3_3_38_2","first-page":"155","volume-title":"23rd USENIX Conference on File and Storage Technologies, FAST 2025, Santa Clara, CA, February 25-27, 2025","author":"Qin Ruoyu","year":"2025","unstructured":"Ruoyu Qin, Zheming Li, Weiran He, Jialei Cui, Feng Ren, Mingxing Zhang, Yongwei Wu, Weimin Zheng, and Xinran Xu. 2025. Mooncake: Trading More Storage for Less Computation - A KVCache-centric Architecture for Serving LLM Chatbot. In 23rd USENIX Conference on File and Storage Technologies, FAST 2025, Santa Clara, CA, February 25-27, 2025, Haryadi\u00a0S. Gunawi and Vasily Tarasov (Eds.). USENIX Association, 155\u2013170. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qin"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy\u00a0P. Lillicrap Jean-Baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser Ioannis Antonoglou Rohan Anil Sebastian Borgeaud Andrew\u00a0M. Dai Katie Millican Ethan Dyer Mia Glaese Thibault Sottiaux Benjamin Lee Fabio Viola Malcolm Reynolds Yuanzhong Xu James Molloy Jilin Chen Michael Isard Paul Barham Tom Hennigan Ross McIlroy Melvin Johnson Johan Schalkwyk Eli Collins Eliza Rutherford Erica Moreira Kareem Ayoub Megha Goel Clemens Meyer Gregory Thornton Zhen Yang Henryk Michalewski Zaheer Abbas Nathan Schucher Ankesh Anand Richard Ives James Keeling Karel Lenc Salem Haykal Siamak Shakeri Pranav Shyam Aakanksha Chowdhery Roman Ring Stephen Spencer Eren Sezener and et al.2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. CoRR abs\/2403.05530 (2024). 10.48550\/ARXIV.2403.05530 arXiv:https:\/\/arXiv.org\/abs\/2403.05530","DOI":"10.48550\/ARXIV.2403.05530"},{"key":"e_1_3_3_3_40_2","series-title":"Proceedings of Machine Learning Research","first-page":"29416","volume-title":"International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA","volume":"202","author":"Ryabinin Max","year":"2023","unstructured":"Max Ryabinin, Tim Dettmers, Michael Diskin, and Alexander Borzunov. 2023. SWARM Parallelism: Training Large Models Can Be Surprisingly Communication-Efficient. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 29416\u201329440. https:\/\/proceedings.mlr.press\/v202\/ryabinin23a.html"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29858"},{"key":"e_1_3_3_3_42_2","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024","author":"Setlur Amrith","year":"2024","unstructured":"Amrith Setlur, Saurabh Garg, Xinyang Geng, Naman Garg, Virginia Smith, and Aviral Kumar. 2024. RL on Incorrect Synthetic Data Scales the Efficiency of LLM Math Reasoning by Eight-Fold. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub\u00a0M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/4b77d5b896c321a29277524a98a50215-Abstract-Conference.html"},{"key":"e_1_3_3_3_43_2","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub\u00a0M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/7ede97c3e082c6df10a8d6103a2eebd2-Abstract-Conference.html"},{"key":"e_1_3_3_3_44_2","series-title":"Proceedings of Machine Learning Research","first-page":"31094","volume-title":"International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA","volume":"202","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 31094\u201331116. https:\/\/proceedings.mlr.press\/v202\/sheng23a.html"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.37"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","unstructured":"Jovan Stojkovic Chaojie Zhang \u00cd\u00f1igo Goiri Josep Torrellas and Esha Choukse. 2024. DynamoLLM: Designing LLM Inference Clusters for Performance and Energy Efficiency. CoRR abs\/2408.00741 (2024). 10.48550\/ARXIV.2408.00741 arXiv:https:\/\/arXiv.org\/abs\/2408.00741","DOI":"10.48550\/ARXIV.2408.00741"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.531"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.379"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1609\/AAAISS.V3I1.31210"},{"key":"e_1_3_3_3_51_2","first-page":"563","volume-title":"Proceedings of the 2024 USENIX Annual Technical Conference, USENIX ATC 2024, Santa Clara, CA, USA, July 10-12, 2024","author":"Um Taegeon","year":"2024","unstructured":"Taegeon Um, Byungsoo Oh, Minyoung Kang, Woo-Yeon Lee, Goeun Kim, Dongseob Kim, Youngtaek Kim, Mohd Muzzammil, and Myeongjae Jeon. 2024. Metis: Fast Automatic Distributed Training on Heterogeneous GPUs. In Proceedings of the 2024 USENIX Annual Technical Conference, USENIX ATC 2024, Santa Clara, CA, USA, July 10-12, 2024, Saurabh Bagchi and Yiying Zhang (Eds.). USENIX Association, 563\u2013578. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/um"},{"key":"e_1_3_3_3_52_2","first-page":"5998","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna\u00a0M. Wallach, Rob Fergus, S.\u00a0V.\u00a0N. Vishwanathan, and Roman Garnett (Eds.). 5998\u20136008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"e_1_3_3_3_54_2","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024","author":"Wang Yubo","year":"2024","unstructured":"Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, Tianle Li, Max Ku, Kai Wang, Alex Zhuang, Rongqi Fan, Xiang Yue, and Wenhu Chen. 2024. MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub\u00a0M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/ad236edc564f3e3156e1b2feafb99a24-Abstract-Datasets_and_Benchmarks_Track.html"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei Huan Lin Jian Yang Jianhong Tu Jianwei Zhang Jianxin Yang Jiaxi Yang Jingren Zhou Junyang Lin Kai Dang Keming Lu Keqin Bao Kexin Yang Le Yu Mei Li Mingfeng Xue Pei Zhang Qin Zhu Rui Men Runji Lin Tianhao Li Tingyu Xia Xingzhang Ren Xuancheng Ren Yang Fan Yang Su Yichang Zhang Yu Wan Yuqiong Liu Zeyu Cui Zhenru Zhang and Zihan Qiu. 2024. Qwen2.5 Technical Report. CoRR abs\/2412.15115 (2024). 10.48550\/ARXIV.2412.15115 arXiv:https:\/\/arXiv.org\/abs\/2412.15115","DOI":"10.48550\/ARXIV.2412.15115"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.779"},{"key":"e_1_3_3_3_57_2","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022, Marcos\u00a0K. Aguilera and Hakim Weatherspoon (Eds.). USENIX Association, 521\u2013538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","unstructured":"Beibei Zhang Hongwei Zhu Feng Gao Zhihui Yang and Xiaoyang\u00a0Sean Wang. 2023. Moirai: Towards Optimal Placement for Distributed Inference on Heterogeneous Devices. CoRR abs\/2312.04025 (2023). 10.48550\/ARXIV.2312.04025 arXiv:https:\/\/arXiv.org\/abs\/2312.04025","DOI":"10.48550\/ARXIV.2312.04025"},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"publisher","unstructured":"Hongbin Zhang Taosheng Wei Zhenyi Zheng Jiangsu Du Zhiguang Chen and Yutong Lu. 2025. TD-Pipe: Temporally-Disaggregated Pipeline Parallelism Architecture for High-Throughput LLM Inference. CoRR abs\/2506.10470 (2025). 10.48550\/ARXIV.2506.10470 arXiv:https:\/\/arXiv.org\/abs\/2506.10470","DOI":"10.48550\/ARXIV.2506.10470"},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629580"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.ACL-LONG.852"},{"key":"e_1_3_3_3_62_2","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric\u00a0P. Xing, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2022. Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022, Marcos\u00a0K. Aguilera and Hakim Weatherspoon (Eds.). USENIX Association, 559\u2013578. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zheng-lianmin"},{"key":"e_1_3_3_3_63_2","volume-title":"Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Chuyue Sun, Jeff Huang, Cody\u00a0Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph\u00a0E. Gonzalez, Clark\u00a0W. Barrett, and Ying Sheng. 2024. SGLang: Efficient Execution of Structured Language Model Programs. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10 - 15, 2024, Amir Globersons, Lester Mackey, Danielle Belgrave, Angela Fan, Ulrich Paquet, Jakub\u00a0M. Tomczak, and Cheng Zhang (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2024\/hash\/724be4472168f31ba1c9ac630f15dec8-Abstract-Conference.html"},{"key":"e_1_3_3_3_64_2","first-page":"193","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024, Santa Clara, CA, USA, July 10-12, 2024","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2024, Santa Clara, CA, USA, July 10-12, 2024, Ada Gavrilovska and Douglas\u00a0B. Terry (Eds.). USENIX Association, 193\u2013210. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/zhong-yinmin"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759823","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:39:48Z","timestamp":1773254388000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759823"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":63,"alternative-id":["10.1145\/3712285.3759823","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759823","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}