{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:13:33Z","timestamp":1759331613964,"version":"build-2065373602"},"reference-count":11,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Micro"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1109\/mm.2025.3575361","type":"journal-article","created":{"date-parts":[[2025,6,5]],"date-time":"2025-06-05T13:36:50Z","timestamp":1749130610000},"page":"54-59","source":"Crossref","is-referenced-by-count":0,"title":["Splitwise: Efficient Generative LLM Inference Using Phase Splitting"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0371-5522","authenticated-orcid":false,"given":"Esha","family":"Choukse","sequence":"first","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3611-5160","authenticated-orcid":false,"given":"Pratyush","family":"Patel","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8334-1291","authenticated-orcid":false,"given":"Chaojie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0628-4515","authenticated-orcid":false,"given":"Aashaka","family":"Shah","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2591-4012","authenticated-orcid":false,"given":"\u00cd\u00f1igo","family":"Goiri","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7998-3681","authenticated-orcid":false,"given":"Saeed","family":"Maleki","sequence":"additional","affiliation":[{"name":"xAI, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9662-2661","authenticated-orcid":false,"given":"Rodrigo","family":"Fonseca","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5971-5084","authenticated-orcid":false,"given":"Ricardo","family":"Bianchini","sequence":"additional","affiliation":[{"name":"Microsoft Corporation, Redmond, WA, USA"}]}],"member":"263","reference":[{"volume-title":"Add Splitwise implementation to vLLM","year":"2024","key":"ref1"},{"volume-title":"Azure public dataset: Azure LLM inference trace","year":"2024","key":"ref2"},{"volume-title":"Specialized cloud provider","year":"2023","key":"ref3"},{"volume-title":"Day 6: One more thing, DeepSeek-V3\/R1 inference system overview","year":"2025","key":"ref4"},{"volume-title":"How data centers and the energy sector can sate AIs hunger for power","year":"2024","key":"ref5"},{"article-title":"SARATHI: Efficient LLM inference by piggybacking decodes with chunked prefills","year":"2023","author":"Agrawal","key":"ref6"},{"key":"ref7","article-title":"Introducing NVIDIA dynamo, a low-latency distributed inference framework for scaling reasoning AI models","volume-title":"NVIDIA","author":"Elmeleegy","year":"2025"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"article-title":"Mooncake: A KVCache-centric disaggregated architecture for LLM serving","year":"2024","author":"Qin","key":"ref9"},{"article-title":"Llama 2: Open foundation and fine-tuned chat models","year":"2023","author":"Touvron","key":"ref10"},{"article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","year":"2024","author":"Zhong","key":"ref11"}],"container-title":["IEEE Micro"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/40\/11176860\/11024200.pdf?arnumber=11024200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T13:17:25Z","timestamp":1759238245000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11024200\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":11,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/mm.2025.3575361","relation":{},"ISSN":["0272-1732","1937-4143"],"issn-type":[{"type":"print","value":"0272-1732"},{"type":"electronic","value":"1937-4143"}],"subject":[],"published":{"date-parts":[[2025,7]]}}}