{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T19:41:35Z","timestamp":1771702895929,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3716280","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"1348-1364","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Vela: A Virtualized LLM Training System with GPU Direct RoCE"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3789-5453","authenticated-orcid":false,"given":"Apoorve","family":"Mohan","sequence":"first","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0636-5439","authenticated-orcid":false,"given":"Robert","family":"Walkup","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8068-6165","authenticated-orcid":false,"given":"Bengi","family":"Karacali","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6946-2313","authenticated-orcid":false,"given":"Ming-hung","family":"Chen","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5909-9891","authenticated-orcid":false,"given":"Abdullah","family":"Kayi","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6163-0060","authenticated-orcid":false,"given":"Liran","family":"Schour","sequence":"additional","affiliation":[{"name":"IBM Research, Haifa, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1234-7657","authenticated-orcid":false,"given":"Shweta","family":"Salaria","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6945-0818","authenticated-orcid":false,"given":"Sophia","family":"Wen","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4555-9257","authenticated-orcid":false,"given":"I-hsin","family":"Chung","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8557-6783","authenticated-orcid":false,"given":"Abdul","family":"Alim","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7906-8651","authenticated-orcid":false,"given":"Constantinos","family":"Evangelinos","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9850-3201","authenticated-orcid":false,"given":"Lixiang","family":"Luo","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4294-4907","authenticated-orcid":false,"given":"Marc","family":"Dombrowa","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9506-8467","authenticated-orcid":false,"given":"Laurent","family":"Schares","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4410-2175","authenticated-orcid":false,"given":"Ali","family":"Sydney","sequence":"additional","affiliation":[{"name":"IBM Research, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4490-5253","authenticated-orcid":false,"given":"Pavlos","family":"Maniotis","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3182-219X","authenticated-orcid":false,"given":"Sandhya","family":"Koteshwara","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1227-3988","authenticated-orcid":false,"given":"Brent","family":"Tang","sequence":"additional","affiliation":[{"name":"IBM Cloud, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2349-3969","authenticated-orcid":false,"given":"Joel","family":"Belog","sequence":"additional","affiliation":[{"name":"IBM Cloud, Lowell, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4306-7574","authenticated-orcid":false,"given":"Rei","family":"Odaira","sequence":"additional","affiliation":[{"name":"IBM Cloud, Austin, TX, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1424-9977","authenticated-orcid":false,"given":"Vasily","family":"Tarasov","sequence":"additional","affiliation":[{"name":"IBM Research, Almaden, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2883-359X","authenticated-orcid":false,"given":"Eran","family":"Gampel","sequence":"additional","affiliation":[{"name":"IBM Cloud, Haifa, Israel"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0970-6750","authenticated-orcid":false,"given":"Drew","family":"Thorstensen","sequence":"additional","affiliation":[{"name":"IBM Cloud, Durham, NC, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9408-1080","authenticated-orcid":false,"given":"Talia","family":"Gershon","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7595-3477","authenticated-orcid":false,"given":"Seetharami","family":"Seelam","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Zane Adam. 2024. The convergence of HPC and AI: Driving innovation at speed. https:\/\/www.ibm.com\/blog\/the-convergence-of-hpc-andai- driving-innovation-at-speed\/"},{"key":"e_1_3_2_1_2_1","unstructured":"AMD. 2024. ROCmRDMA: Remote Device Programming. https: \/\/doc-ju-30.readthedocs.io\/en\/latest\/Remote_Device_Programming\/ Remote-Device-Programming.html"},{"key":"e_1_3_2_1_3_1","unstructured":"Rajesh Anantharaman. 2024. Google Cloud demonstrates the world's largest distributed training job for large language models across 50000 TPU v5e chips. https:\/\/cloud.google.com\/blog\/products\/compute\/theworlds- largest-distributed-llm-training-job-on-tpu-v5e"},{"key":"e_1_3_2_1_4_1","unstructured":"AWS. 2024. AWS Nitro System. https:\/\/aws.amazon.com\/ec2\/nitro\/"},{"key":"e_1_3_2_1_5_1","volume-title":"Azure: NDv2 sizes series. https:\/\/learn.microsoft.com\/enus\/ azure\/virtual-machines\/sizes\/gpu-accelerated\/ndv2-series?tabs= sizebasic","year":"2024","unstructured":"Azure. 2024. Azure: NDv2 sizes series. https:\/\/learn.microsoft.com\/enus\/ azure\/virtual-machines\/sizes\/gpu-accelerated\/ndv2-series?tabs= sizebasic"},{"key":"e_1_3_2_1_6_1","unstructured":"Ian Buck. 2020. NVIDIA A100 Launches on AWS Marking Dawn of Next Decade in Accelerated Cloud Computing. 
https: \/\/blogs.nvidia.com\/blog\/nvidia-a100-launches-on-aws\/"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.728"},{"key":"e_1_3_2_1_8_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Firestone Daniel","year":"2017","unstructured":"Daniel Firestone. 2017. {VFP}: A virtual switch platform for host {SDN} in the public cloud. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). 315--328."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672233"},{"key":"e_1_3_2_1_11_1","unstructured":"Talia Gershon Seetharami Seelam Brian Belgodere Milton Bonilla Lan Hoang Danny Barnett I Chung Apoorve Mohan Ming-Hung Chen Lixiang Luo et al. 2024. The infrastructure powering IBM's Gen AI model development. arXiv preprint arXiv:2407.05467 (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"Google. 2023. Announcing A3 supercomputers with NVIDIA H100 GPUs purpose-built for AI. https:\/\/cloud.google.com\/blog\/products\/ compute\/introducing-a3-supercomputers-with-nvidia-h100-gpus"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ANCS.2019.8901881"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_15_1","unstructured":"KVM. 2019. KVM: x86: Sync the pending Posted-Interrupts. https:\/\/patchwork.kernel.org\/project\/kvm\/patch\/1547793240--9157- 1-git-send-email-luwei.kang@intel.com\/"},{"key":"e_1_3_2_1_16_1","unstructured":"Nesreen K Ahmed Le Chen Akash Dutta et al. 2024. The Landscape and Challenges of HPC Research and LLMs. (2024). 
doi:10.48550\/ arXiv.2402.02018"},{"key":"e_1_3_2_1_17_1","unstructured":"John Lee. 2023. Microsoft Azure Eagle is a Paradigm Shifting Cloud Supercomputer. https:\/\/www.servethehome.com\/microsoft-azureeagle- is-a-paradigm-shifting-cloud-supercomputer-nvidia-intel\/"},{"key":"e_1_3_2_1_18_1","unstructured":"Meta. 2022. Introducing the AI Research SuperCluster - Meta's cutting-edge AI supercomputer for AI research. https:\/\/ai.meta.com\/ blog\/ai-rsc\/"},{"key":"e_1_3_2_1_19_1","unstructured":"Meta. 2022. Open hardware for AI infrastructure. https: \/\/engineering.fb.com\/2022\/10\/18\/open-source\/ocp-summit-2022- grand-teton\/"},{"key":"e_1_3_2_1_20_1","unstructured":"Meta. 2024. Building Meta's GenAI Infrastructure. https: \/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/sizes\/gpuaccelerated\/ ndv2-series?tabs=sizebasic"},{"key":"e_1_3_2_1_21_1","unstructured":"Microsoft. 2022. Hypervisor security on the Azure fleet. https:\/\/ learn.microsoft.com\/en-us\/azure\/security\/fundamentals\/hypervisor"},{"key":"e_1_3_2_1_22_1","unstructured":"Microsoft. 2024. Microsoft Azure Boost. https:\/\/learn.microsoft.com\/ en-us\/azure\/azure-boost\/overview"},{"key":"e_1_3_2_1_23_1","unstructured":"Microsoft. 2024. Virtual machines in Azure. https:\/\/ learn.microsoft.com\/en-us\/azure\/virtual-machines"},{"key":"e_1_3_2_1_24_1","unstructured":"Apoorve Mohan. 2022. NCCL Test Failing with newer NCCL versions for RoCE with Dual HCA inside VMs https:\/\/github.com\/NVIDIA\/ nccl\/issues\/749 ."},{"key":"e_1_3_2_1_25_1","volume-title":"Infrastructure as code","author":"Morris Kief","unstructured":"Kief Morris. 2020. Infrastructure as code (2nd ed.). O'Reilly Media. 
https:\/\/www.oreilly.com\/library\/view\/infrastructure-as-code\/ 9781098114664\/","edition":"2"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"e_1_3_2_1_27_1","volume-title":"Energy Sciences Network","year":"2023","unstructured":"ESnet: Energy Sciences Network. 2023. iPerf3: https:\/\/github.com\/ esnet\/iperf ."},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA. 2021. CUDA Samples: https:\/\/github.com\/NVIDIA\/cudasamples"},{"key":"e_1_3_2_1_29_1","unstructured":"NVIDIA. 2022. NVIDIA Teams With Microsoft to Build Massive Cloud AI Computer. https:\/\/nvidianews.nvidia.com\/news\/nvidia-microsoftaccelerate- cloud-enterprise-ai"},{"key":"e_1_3_2_1_30_1","unstructured":"NVIDIA. 2023. NVIDIA DGX SuperPOD: Next Generation Scalable Infrastructure for AI Leadership. https:\/\/docs.nvidia.com\/https: \/docs.nvidia.com\/dgx-superpod-reference-architecture-dgxh100. pdf"},{"key":"e_1_3_2_1_31_1","unstructured":"NVIDIA. 2024. Driver Persistence Mode: https:\/\/docs.nvidia.com\/ deploy\/driver-persistence\/index.html ."},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. 2024. iommu: Allow ATS to work on VFs when the PF uses IDENTITY. https:\/\/patchwork.kernel.org\/project\/linux-pci\/patch\/0- v1-0fb4d2ab67707e706-ats_vf_jgg@nvidia.com\/"},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA. 2024. NVIDIA GPU Admin Tool: https:\/\/github.com\/NVIDIA\/ gpu-admin-tools ."},{"key":"e_1_3_2_1_34_1","unstructured":"Nvidia. 2024. NVIDIA GPUDirect: Enhancing Data Movement and Access for GPUs. https:\/\/developer.nvidia.com\/gpudirect"},{"key":"e_1_3_2_1_35_1","unstructured":"Greg Pauloski. 2021. BERT-Pytorch: https:\/\/github.com\/gpauloski\/ BERT-PyTorch ."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672265"},{"key":"e_1_3_2_1_37_1","unstructured":"Linux RDMA. 2022. Linux RDMA Perftest: https:\/\/github.com\/linuxrdma\/ perftest ."},{"key":"e_1_3_2_1_38_1","unstructured":"Amazon Web Services. 2024. 
Recommended GPU Instances. https:\/\/docs.aws.amazon.com\/dlami\/latest\/devguide\/gpu.html"},{"key":"e_1_3_2_1_39_1","unstructured":"Hamid Shojanazeri. 2022. Getting Started with Fully Sharded Data Parallel (FSDP) https:\/\/pytorch.org\/tutorials\/intermediate\/FSDP_tutorial.html ."},{"key":"e_1_3_2_1_40_1","unstructured":"SIGPCI. 2024. PCI-SIG Specifications. https:\/\/pcisig.com\/specifications"},{"key":"e_1_3_2_1_41_1","unstructured":"A Tekin A Tuncer Durak C Piechurski D Kaliszan F Aylin Sungur F Roberts\u00e9n and P Gschwandtner. 2021. State-of-the-art and trends for computing and interconnect network solutions for HPC and AI. (2021). https:\/\/dps.uibk.ac.at\/~philipp\/publication\/tekin-2021-state\/tekin-2021-state.pdf"},{"key":"e_1_3_2_1_42_1","unstructured":"VMWare. 2024. VMware ESXi. https:\/\/www.vmware.com\/products\/cloud-infrastructure\/esxi-and-esx"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489525.3511693"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/JLT.2020.2966517"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 
2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716280","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3716280","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:07:44Z","timestamp":1755774464000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3716280"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":44,"alternative-id":["10.1145\/3676641.3716280","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3716280","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}