{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T07:22:12Z","timestamp":1772695332411,"version":"3.50.1"},"reference-count":58,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408556","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-14","source":"Crossref","is-referenced-by-count":0,"title":["eGPU: Production-Scale Elastic Sharing Over 10,000 GPUs"],"prefix":"10.1109","author":[{"given":"Xiaochuan","family":"Tang","sequence":"first","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Qi","sequence":"additional","affiliation":[{"name":"University of California,Merced"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianbo","family":"Dong","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yinghao","family":"Yu","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhennan","family":"Xue","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daocheng","family":"Ying","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheng","family":"Cao","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyi","family":"Lu","sequence":"additional","affiliation":[{"name":"University of California,Merced"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","journal-title":"A Krispr Approach to Kubernetes Infrastructure"},{"key":"ref2","journal-title":"Containerd"},{"key":"ref3","journal-title":"Google Kubernetes Engine"},{"key":"ref4","journal-title":"gRPC"},{"key":"ref5","journal-title":"MTIA: META\u2019S First Generation of AI Accelerators"},{"key":"ref6","journal-title":"NVIDIA Blackwell Architecture Technical Brief"},{"key":"ref7","journal-title":"NVIDIA Multi-Instance GPU User Guide"},{"key":"ref8","journal-title":"NVIDIA Multi-Process Service"},{"key":"ref9","journal-title":"NVLink"},{"key":"ref10","journal-title":"OCI Standards"},{"key":"ref11","journal-title":"PCIe Gen6"},{"key":"ref12","journal-title":"Podman"},{"key":"ref13","journal-title":"PouchContainer"},{"key":"ref14","author":"Bai","year":"2023","journal-title":"Qwen Technical Teport"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2890784"},{"key":"ref16","first-page":"199","article-title":"Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi","year":"2022"},{"key":"ref17","first-page":"199","article-title":"Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi","year":"2022"},{"key":"ref18","author":"Devlin","year":"2018","journal-title":"Bert: Pre-training of Deep Bidirectional Transformers for Language Understanding"},{"key":"ref19","journal-title":"Docker"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/1618525.1618534"},{"key":"ref21","first-page":"443","article-title":"Serving DNNs like Clockwork: Performance Predictability from the Bottom Up","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Gujarati","year":"2020"},{"key":"ref22","first-page":"539","article-title":"Microsecond-scale Preemption for Concurrent GPU-accelerated DNN Inferences","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Han","year":"2022"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3068281"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476223"},{"key":"ref26","first-page":"947","article-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon","year":"2019"},{"key":"ref27","first-page":"947","article-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19).","author":"Jeon"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2012.08.002"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0177459"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607054"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/NAS.2015.7255222"},{"key":"ref32","article-title":"Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training","volume-title":"International Conference on Learning Representations","author":"Lin","year":"2018"},{"key":"ref33","author":"Mao","year":"2020","journal-title":"Resource Management Schemes for Cloud-native Platforms with Computing Containers of Docker and Kubernetes"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2010.41"},{"key":"ref35","journal-title":"NCCL Communicators"},{"key":"ref36","journal-title":"NVIDIA Collective Communication Library (NCCL)"},{"key":"ref37","year":"2023","journal-title":"NVIDIA A100 TENSOR CORE GPU"},{"key":"ref38","year":"2023","journal-title":"NVIDIA H100 TENSOR CORE GPU"},{"key":"ref39","journal-title":"NVIDIA CONTAINERS AND DEEP LEARNING FRAMEWORKS"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00463-x"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2015.139"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2017.29"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/2897839.2927468"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161020"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2005.251"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629578"},{"key":"ref49","first-page":"109","article-title":"Gpuvm: Why not virtualizing gpus at the hypervisor?","volume-title":"USENIX Annual Technical Conference","author":"Suzuki","year":"2014"},{"key":"ref50","first-page":"7663","article-title":"Communication Compression for Decentralized Training","volume-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems, ser. NIPS\u201918.","author":"Tang","year":"2018"},{"key":"ref51","author":"Touvron","year":"2023","journal-title":"LLaMA: Open and Efficient Foundation Language Models"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.3390\/s23042215"},{"key":"ref53","author":"Wang","year":"2023","journal-title":"Fine-tuning Language Models over Slow Networks using Activation Compression with Guarantees"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.7873\/DATE.2014.358"},{"key":"ref55","first-page":"945","article-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng","year":"2022"},{"key":"ref56","first-page":"69","article-title":"Transparent GPU Sharing in Container Clouds for Deep Learning Workloads","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wu","year":"2023"},{"key":"ref57","first-page":"595","article-title":"Gandiva: Introspective Cluster Scheduling for Deep Learning","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Xiao","year":"2018"},{"key":"ref58","first-page":"533","article-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning","author":"Xiao","year":"2020","journal-title":"OSDI"},{"key":"ref59","first-page":"98","article-title":"Fine-grained GPU Sharing Primitives for Deep Learning Applications","volume-title":"Proceedings of Machine Learning and Systems","volume":"2","author":"Yu","year":"2020"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408556.pdf?arnumber=11408556","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:51:16Z","timestamp":1772693476000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408556\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408556","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}