{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T23:10:49Z","timestamp":1771888249563,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T00:00:00Z","timestamp":1708387200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3,2]]},"DOI":"10.1145\/3627535.3638466","type":"proceedings-article","created":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T14:22:41Z","timestamp":1708438961000},"page":"42-54","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Liger: Interleaving Intra- and Inter-Operator Parallelism for Distributed Large Model Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4707-9492","authenticated-orcid":false,"given":"Jiangsu","family":"Du","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9850-8384","authenticated-orcid":false,"given":"Jinhui","family":"Wei","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1417-3012","authenticated-orcid":false,"given":"Jiazhi","family":"Jiang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7966-2941","authenticated-orcid":false,"given":"Shenggan","family":"Cheng","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5582-1031","authenticated-orcid":false,"given":"Dan","family":"Huang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-5715","authenticated-orcid":false,"given":"Zhiguang","family":"Chen","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5315-3375","authenticated-orcid":false,"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,2,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AMD. 2023. AMD Infinity Architecture. https:\/\/www.amd.com\/en\/technologies\/infinity-architecture"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_3_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_4_1","volume-title":"ATP: Adaptive Tensor Parallelism for Foundation Models. arXiv preprint arXiv:2301.08658","author":"Cheng Shenggan","year":"2023","unstructured":"Shenggan Cheng, Ziming Liu, Jiangsu Du, and Yang You. 2023. ATP: Adaptive Tensor Parallelism for Foundation Models. arXiv preprint arXiv:2301.08658 (2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Shenggan Cheng Xuanlei Zhao Guangyang Lu Jiarui Fang Zhongming Yu Tian Zheng Ruidong Wu Xiwen Zhang Jian Peng and Yang You. 2023. FastFold: Reducing AlphaFold Training Time from 11 Days to 67 Hours. arXiv:2203.00854 [cs.LG]"},{"key":"e_1_3_2_1_6_1","unstructured":"ONNX Runtime developers. 2018. ONNX Runtime. https:\/\/github.com\/microsoft\/onnxruntime"},{"key":"e_1_3_2_1_7_1","volume-title":"EnergonAI: An Inference System for 10--100 Billion Parameter Transformer Models. arXiv preprint arXiv:2209.02341","author":"Du Jiangsu","year":"2022","unstructured":"Jiangsu Du, Ziming Liu, Jiarui Fang, Shenggui Li, Yongbin Li, Yutong Lu, and Yang You. 2022. EnergonAI: An Inference System for 10--100 Billion Parameter Transformer Models. arXiv preprint arXiv:2209.02341 (2022)."},{"key":"e_1_3_2_1_8_1","volume-title":"What do compressed large language models forget? robustness challenges in model compression. arXiv e-prints","author":"Du Mengnan","year":"2021","unstructured":"Mengnan Du, Subhabrata Mukherjee, Yu Cheng, Milad Shokouhi, Xia Hu, and Ahmed Hassan Awadallah. 2021. What do compressed large language models forget? robustness challenges in model compression. arXiv e-prints (2021), arXiv-2110."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/RTSS.2013.12"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575703"},{"key":"e_1_3_2_1_13_1","unstructured":"Github. 2023. Copilot. https:\/\/github.com\/features\/copilot"},{"key":"e_1_3_2_1_14_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. 2022. Microsecond-scale preemption for concurrent GPU-accelerated DNN inferences. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 539--558."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-demo.22"},{"key":"e_1_3_2_1_16_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, et al. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8--14, 2019, Vancouver, BC, Canada. 103--112."},{"key":"e_1_3_2_1_17_1","volume-title":"TimeGraph: GPU Scheduling for Real-Time Multi-Tasking Environments. In 2011 USENIX Annual Technical Conference (USENIX ATC 11)","author":"Kato Shinpei","year":"2011","unstructured":"Shinpei Kato, Karthik Lakshmanan, et al. 2011. TimeGraph: GPU Scheduling for Real-Time Multi-Tasking Environments. In 2011 USENIX Annual Technical Conference (USENIX ATC 11)."},{"key":"e_1_3_2_1_18_1","volume-title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations.","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, et al. 2020. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663--679."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 6543--6552","author":"Li Zhuohan","year":"2021","unstructured":"Zhuohan Li, Siyuan Zhuang, Shiyuan Guo, Danyang Zhuo, Hao Zhang, Dawn Song, and Ion Stoica. 2021. Terapipe: Token-level pipeline parallelism for training large-scale language models. In International Conference on Machine Learning. PMLR, 6543--6552."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2023. Fast Multi-GPU collectives with NCCL. https:\/\/developer.nvidia.com\/blog\/fast-multi-gpu-collectives-nccl\/"},{"key":"e_1_3_2_1_23_1","unstructured":"NVIDIA. 2023. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer"},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA. 2023. NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests\/tree\/master"},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA. 2023. NVLink and NVSwitch. https:\/\/www.nvidia.com\/en-us\/data-center\/nvlink\/"},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA. 2023. TensorRT. https:\/\/github.com\/NVIDIA\/TensorRT"},{"key":"e_1_3_2_1_27_1","unstructured":"OpenAI. 2023. Chatgpt. https:\/\/chat.openai.com\/chat"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, et al. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Rajbhandari Samyam","unstructured":"Samyam Rajbhandari, Conglong Li, and et al. 2022. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International Conference on Machine Learning. PMLR, 18332--18346."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"e_1_3_2_1_32_1","volume-title":"Mesh-tensorflow: Deep learning for supercomputers. Advances in neural information processing systems 31","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, et al. 2018. Mesh-tensorflow: Deep learning for supercomputers. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_34_1","unstructured":"Ying Sheng Lianmin Zheng Binhang Yuan Zhuohan Li Max Ryabinin Beidi Chen Percy Liang Christopher Re Ion Stoica and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. (2023)."},{"key":"e_1_3_2_1_35_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri et al. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. CoRR abs\/1909.08053 (2019). arXiv:1909.08053 http:\/\/arxiv.org\/abs\/1909.08053"},{"key":"e_1_3_2_1_36_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_37_1","volume-title":"Fast Distributed Inference Serving for Large Language Models. arXiv preprint arXiv:2305.05920","author":"Wu Bingyang","year":"2023","unstructured":"Bingyang Wu, Yinmin Zhong, Zili Zhang, Gang Huang, Xuanzhe Liu, and Xin Jin. 2023. Fast Distributed Inference Serving for Large Language Models. arXiv preprint arXiv:2305.05920 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Xiao Wencong","year":"2020","unstructured":"Wencong Xiao, Shiru Ren, Yong Li, Yang Zhang, Pengyang Hou, Zhi Li, Yihui Feng, Wei Lin, and Yangqing Jia. 2020. {AntMan}: Dynamic scaling on {GPU} clusters for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 533--548."},{"key":"e_1_3_2_1_39_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for {Transformer-Based} Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_40_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_42_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhang Wei","unstructured":"Wei Zhang, Binghao Chen, and et al. 2022. {PilotFish}: Harvesting Free Cycles of Cloud Gaming with Deep Learning Training. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 217--232."},{"key":"e_1_3_2_1_43_1","unstructured":"Zhenyu Zhang Ying Sheng Tianyi Zhou et al. 2023. H2O : Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models. arXiv preprint arXiv:2306.14048 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 556--569","author":"Zhou Quan","unstructured":"Quan Zhou, Haiquan Wang, and et al. 2023. MPress: Democratizing Billion-Scale Model Training on Multi-GPU Servers via Memory-Saving Inter-Operator Parallelism. In 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 556--569."},{"key":"e_1_3_2_1_45_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. 2022. {PetS}: A Unified Framework for {Parameter-Efficient} Transformers Serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 489--504."}],"event":{"name":"PPoPP '24: 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Edinburgh United Kingdom","acronym":"PPoPP '24","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGPLAN ACM Special Interest Group on Programming Languages"]},"container-title":["Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627535.3638466","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627535.3638466","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:26Z","timestamp":1750182566000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627535.3638466"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,20]]},"references-count":45,"alternative-id":["10.1145\/3627535.3638466","10.1145\/3627535"],"URL":"https:\/\/doi.org\/10.1145\/3627535.3638466","relation":{},"subject":[],"published":{"date-parts":[[2024,2,20]]},"assertion":[{"value":"2024-02-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}