{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T21:44:46Z","timestamp":1781991886338,"version":"3.54.5"},"reference-count":11,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Micro"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1109\/mm.2025.3575361","type":"journal-article","created":{"date-parts":[[2025,6,5]],"date-time":"2025-06-05T13:36:50Z","timestamp":1749130610000},"page":"54-59","source":"Crossref","is-referenced-by-count":1,"title":["Splitwise: Efficient Generative LLM Inference Using Phase Splitting"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0371-5522","authenticated-orcid":false,"given":"Esha","family":"Choukse","sequence":"first","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3611-5160","authenticated-orcid":false,"given":"Pratyush","family":"Patel","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8334-1291","authenticated-orcid":false,"given":"Chaojie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0628-4515","authenticated-orcid":false,"given":"Aashaka","family":"Shah","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2591-4012","authenticated-orcid":false,"given":"\u00cd\u00f1igo","family":"Goiri","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7998-3681","authenticated-orcid":false,"given":"Saeed","family":"Maleki","sequence":"additional","affiliation":[{"name":"xAI, Palo Alto, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9662-2661","authenticated-orcid":false,"given":"Rodrigo","family":"Fonseca","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5971-5084","authenticated-orcid":false,"given":"Ricardo","family":"Bianchini","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Add Splitwise implementation to vLLM","year":"2024"},{"key":"ref2","volume-title":"Azure public dataset: Azure LLM inference trace","year":"2024"},{"key":"ref3","volume-title":"Specialized cloud provider","year":"2023"},{"key":"ref4","volume-title":"Day 6: One more thing, DeepSeek-V3\/R1 inference system overview","year":"2025"},{"key":"ref5","volume-title":"How data centers and the energy sector can sate AIs hunger for power","year":"2024"},{"key":"ref6","article-title":"SARATHI: Efficient LLM inference by piggybacking decodes with chunked prefills","author":"Agrawal","year":"2023"},{"key":"ref7","article-title":"Introducing NVIDIA dynamo, a low-latency distributed inference framework for scaling reasoning AI models","volume-title":"NVIDIA","author":"Elmeleegy","year":"2025"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref9","article-title":"Mooncake: A KVCache-centric disaggregated architecture for LLM serving","author":"Qin","year":"2024"},{"key":"ref10","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref11","article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","author":"Zhong","year":"2024"}],"container-title":["IEEE Micro"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/40\/11176860\/11024200.pdf?arnumber=11024200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T13:17:25Z","timestamp":1759238245000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11024200\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":11,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/mm.2025.3575361","relation":{},"ISSN":["0272-1732","1937-4143"],"issn-type":[{"value":"0272-1732","type":"print"},{"value":"1937-4143","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7]]}}}