{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T00:21:48Z","timestamp":1777422108752,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754621","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"689-698","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["TD-Pipe: Temporally-Disaggregated Pipeline Parallelism Architecture for High-Throughput LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-9145-3919","authenticated-orcid":false,"given":"Hongbin","family":"Zhang","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9788-0743","authenticated-orcid":false,"given":"Taosheng","family":"Wei","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1442-496X","authenticated-orcid":false,"given":"Zhenyi","family":"Zheng","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4707-9492","authenticated-orcid":false,"given":"Jiangsu","family":"Du","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-5715","authenticated-orcid":false,"given":"Zhiguang","family":"Chen","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5315-3375","authenticated-orcid":false,"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal et\u00a0al. 2024. Taming { Throughput-Latency} Tradeoff in { LLM} Inference with { Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 117\u2013134."},{"key":"e_1_3_3_1_3_2","volume-title":"Character AI","author":"AI Character","year":"2024","unstructured":"Character AI. 2024. Character AI. https:\/\/character.ai"},{"key":"e_1_3_3_1_4_2","volume-title":"Amazon codewhisperer","year":"2024","unstructured":"Amazon. 2024. Amazon codewhisperer. https:\/\/aws.amazon.com\/ codewhisperer\/"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_3_1_6_2","unstructured":"Anyscale. 2025. Anyscale Batch API. https:\/\/docs.anyscale.com\/examples\/batch-llm. [Accessed 05-03-2025]."},{"key":"e_1_3_3_1_7_2","unstructured":"Tom Brown et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.97"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638466"},{"key":"e_1_3_3_1_10_2","volume-title":"shareGPT v3","author":"Face Hugging","year":"2024","unstructured":"Hugging Face. 2024. shareGPT v3. https:\/\/huggingface.co\/datasets\/anon8231489123\/ShareGPT_Vicuna_unfilteredr"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575703"},{"key":"e_1_3_3_1_12_2","volume-title":"Github copilot","year":"2024","unstructured":"Github. 2024. Github copilot. https:\/\/github.com\/features\/copilot"},{"key":"e_1_3_3_1_13_2","unstructured":"Jian Hu et\u00a0al. 2024. OpenRLHF: An Easy-to-use Scalable and High-performance RLHF Framework. arxiv:https:\/\/arXiv.org\/abs\/2405.11143\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2405.11143"},{"key":"e_1_3_3_1_14_2","unstructured":"Yanping Huang et\u00a0al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_16_2","first-page":"155","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Lee Wonbeom","year":"2024","unstructured":"Wonbeom Lee et\u00a0al. 2024. { InfiniGen} : Efficient Generative Inference of Large Language Models with Dynamic { KV} Cache Management. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 155\u2013172."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_1_18_2","volume-title":"FasterTransformer","year":"2023","unstructured":"NVIDIA. 2023. FasterTransformer. https:\/\/github.com\/NVIDIA\/FasterTransformer"},{"key":"e_1_3_3_1_19_2","volume-title":"TensorRT-LLM","year":"2024","unstructured":"NVIDIA. 2024. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM"},{"key":"e_1_3_3_1_20_2","volume-title":"Chatgpt","year":"2024","unstructured":"OpenAI. 2024. Chatgpt. https:\/\/chat.openai.com"},{"key":"e_1_3_3_1_21_2","unstructured":"OpenAI. 2025. OpenAI Batch API. https:\/\/platform.openai.com\/docs\/guides\/batch. [Accessed 05-03-2025]."},{"key":"e_1_3_3_1_22_2","first-page":"75","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Qiu Haoran","year":"2024","unstructured":"Haoran Qiu et\u00a0al. 2024. Power-aware Deep Learning Model Serving with \u00b5-Serve. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). USENIX Association, Santa Clara, CA, 75\u201393. https:\/\/www.usenix.org\/conference\/atc24\/presentation\/qiu"},{"key":"e_1_3_3_1_23_2","first-page":"31094","volume-title":"International Conference on Machine Learning","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng et\u00a0al. 2023. Flexgen: High-throughput generative inference of large language models with a single gpu. In International Conference on Machine Learning. PMLR, 31094\u201331116."},{"key":"e_1_3_3_1_24_2","unstructured":"Mohammad Shoeybi et\u00a0al. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.08053 (2019)."},{"key":"e_1_3_3_1_25_2","unstructured":"Hugo Touvron et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_1_26_2","unstructured":"Hugo Touvron et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_27_2","unstructured":"An Yang et\u00a0al. 2025. Qwen2.5 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2412.15115\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2412.15115"},{"key":"e_1_3_3_1_28_2","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu et\u00a0al. 2022. Orca: A distributed serving system for { Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_3_1_29_2","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et\u00a0al. 2022. Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02414 (2022)."},{"key":"e_1_3_3_1_30_2","unstructured":"Susan Zhang et\u00a0al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.01068 (2022)."},{"key":"e_1_3_3_1_31_2","unstructured":"Yilong Zhao et\u00a0al. 2024. BlendServe: Optimizing Offline Inference for Auto-regressive Large Models with Resource-aware Batching. arxiv:https:\/\/arXiv.org\/abs\/2411.16102\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2411.16102"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Lianmin Zheng et\u00a0al. 2024. Sglang: Efficient execution of structured language model programs. Advances in Neural Information Processing Systems 37 (2024) 62557\u201362583.","DOI":"10.52202\/079017-2000"},{"key":"e_1_3_3_1_33_2","unstructured":"Zhen Zheng et\u00a0al. 2025. BatchLLM: Optimizing Large Batched LLM Inference with Global Prefix Sharing and Throughput-oriented Token Batching. arxiv:https:\/\/arXiv.org\/abs\/2412.03594\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2412.03594"},{"key":"e_1_3_3_1_34_2","first-page":"193","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong et\u00a0al. 2024. { DistServe} : Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 193\u2013210."}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754621","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:39:57Z","timestamp":1766219997000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754621"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":33,"alternative-id":["10.1145\/3754598.3754621","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754621","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}