{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T09:32:56Z","timestamp":1773999176116,"version":"3.50.1"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T00:00:00Z","timestamp":1765152000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T00:00:00Z","timestamp":1765152000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,8]]},"DOI":"10.1109\/globecom59602.2025.11432531","type":"proceedings-article","created":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T20:04:01Z","timestamp":1773950641000},"page":"5689-5694","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Batch Processing for Private Cloud LLM Inference: Modeling and Performance Comparison"],"prefix":"10.1109","author":[{"given":"Hiroki","family":"Nakai","sequence":"first","affiliation":[{"name":"The University of Osaka,Graduate School of Engineering,Osaka,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yoshiaki","family":"Inoue","sequence":"additional","affiliation":[{"name":"The University of Osaka,Graduate School of Engineering,Osaka,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tetsuya","family":"Takine","sequence":"additional","affiliation":[{"name":"The University of Osaka,Graduate School of Engineering,Osaka,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","article-title":"The Llama 3 Herd of Models","author":"Grattafiori","year":"2024"},{"key":"ref3","article-title":"SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot","volume-title":"Proc. ICML 2023","author":"Frantar"},{"key":"ref4","article-title":"In-Context Learning Distillation: Transferring Few-Shot Learning Ability of PreTrained Language Models","author":"Huang","year":"2022"},{"key":"ref5","first-page":"27168","article-title":"ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers","volume-title":"Proc. NeurIPS 2022","author":"Yao"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref7","article-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve","volume-title":"Proc. USENIX OSDI 2024","author":"Agrawal"},{"key":"ref8","article-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models","volume-title":"Proc. USENIX OSDI 2022","author":"Yu"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640411"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"ref11","article-title":"Fast Distributed Inference Serving for Large Language Models","author":"Wu","year":"2023"},{"key":"ref12","first-page":"18015","article-title":"S3: Increasing GPU Utilization during Generative Inference for Higher Throughput","volume-title":"Proc. NeurIPS 2023","author":"Jin"},{"key":"ref13","article-title":"Llumnix: Dynamic Scheduling for Large Language Model Serving","volume-title":"Proc. USENIX OSDI 2024","author":"Sun"},{"key":"ref14","article-title":"dLoRA: Dynamically Orchestrating Requests and Adapters for LoRA LLM Serving","volume-title":"Proc. USENIX OSDI 2024","author":"Wu"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2023.3242724"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3582080"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2022.01.004"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3430063"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TCC.2024.3350561"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.peva.2025.102468"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11134-022-09794-3"},{"key":"ref22","article-title":"Retentive Network: A Successor to Transformer for Large Language Models","author":"Sun","year":"2023"},{"key":"ref23","article-title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone","author":"Abdin","year":"2024"}],"event":{"name":"GLOBECOM 2025 - 2025 IEEE Global Communications Conference","location":"Taipei, Taiwan","start":{"date-parts":[[2025,12,8]]},"end":{"date-parts":[[2025,12,12]]}},"container-title":["GLOBECOM 2025 - 2025 IEEE Global Communications Conference"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11431620\/11431622\/11432531.pdf?arnumber=11432531","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T05:58:25Z","timestamp":1773986305000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11432531\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,8]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/globecom59602.2025.11432531","relation":{},"subject":[],"published":{"date-parts":[[2025,12,8]]}}}