{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T05:41:34Z","timestamp":1760420494452,"version":"build-2065373602"},"reference-count":43,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,22]]},"DOI":"10.1109\/icnp65844.2025.11192450","type":"proceedings-article","created":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T17:38:54Z","timestamp":1760377134000},"page":"1-13","source":"Crossref","is-referenced-by-count":0,"title":["Symphony: Collective Coordination in Multi-Tenant GPU Clusters"],"prefix":"10.1109","author":[{"given":"Manaf","family":"Bin-Yahya","sequence":"first","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Amir","family":"Shani","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hossein","family":"Shafieirad","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seyed Hossein","family":"Mortazavi","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Ying","sequence":"additional","affiliation":[{"name":"University of Toronto"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aaron","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Waterloo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Geng","family":"Li","sequence":"additional","affiliation":[{"name":"China Telecom Cloud Computing Research Institute"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Majid","family":"Ghaderi","sequence":"additional","affiliation":[{"name":"University of Calgary"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"year":"2023","key":"ref1","article-title":"Alibaba gpu cluster dataset 2023"},{"year":"2019","key":"ref2","article-title":"The MOSEK Optimization Toolbox for MATLAB Manual"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737460"},{"key":"ref4","first-page":"2397","article-title":"Pythia: A Suite for Analyzing Large Language Models across Training and Scaling","volume-title":"International Conference on Machine Learning","author":"Biderman"},{"article-title":"Arcane: Adaptive routing with caching and network exploration","year":"2024","author":"Bonato","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672239"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3641289"},{"key":"ref8","first-page":"1327","article-title":"NetHint: White-Box Networking for Multi-Tenant Data Centers","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Chen"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787480"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626315"},{"year":"2023","key":"ref11","article-title":"Overview of and motivation for the forthcoming ultra ethernet consortium specification"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626322"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672233"},{"key":"ref14","first-page":"947","article-title":"Analysis of {Large-Scale}{Multi-Tenant}{GPU} clusters for {DNN} training workloads","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon"},{"key":"ref15","first-page":"1","article-title":"Beyond Data and Model Parallelism for Deep Neural Networks","volume-title":"Proc. Machine Learning and Systems (MLSys)","volume":"1","author":"Jia"},{"key":"ref16","first-page":"745","article-title":"{MegaScale}: Scaling large language model training to more than 10,000 {GPUs}","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Jiang"},{"key":"ref17","first-page":"741","article-title":"ATP: In-Network Aggregation for Multi-tenant Learning","volume-title":"Proc. 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"Lao"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587436"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672249"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2024.103082"},{"key":"ref21","first-page":"809","article-title":"Better Together: Jointly Optimizing ML Collective Scheduling and Execution Planning using Syndicate","volume-title":"Proc. 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"Mahajan"},{"key":"ref22","first-page":"289","article-title":"Themis: Fair and Efficient GPU Cluster Scheduling","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3538401.3546599"},{"year":"2017","key":"ref24","article-title":"Nvidia collective communications library (nccl)"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672265"},{"article-title":"Pollux: Co-Adaptive Cluster Scheduling for Goodput-Optimized Deep Learning","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Qiao","key":"ref27"},{"issue":"8","key":"ref28","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref29","first-page":"1403","article-title":"CASSINI: Network-Aware Job Scheduling in Machine Learning Clusters","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Rajasekaran"},{"article-title":"MLTCP: Congestion Control for DNN Training","year":"2024","author":"Rajasekaran","key":"ref30"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527382"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s10951-006-7042-y"},{"key":"ref33","first-page":"593","article-title":"TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches","volume-title":"Proc. 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"Shah"},{"year":"2024","key":"ref34","article-title":"Distributed data parallel implementation"},{"year":"2024","key":"ref35","article-title":"Fully sharded data parallel implementation"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"ref37","first-page":"172","article-title":"Blink: Fast and Generic Collectives for Distributed ML","volume-title":"Proc. 3rd Machine Learning and Systems Conference (MLSys)","author":"Wang"},{"key":"ref38","first-page":"739","article-title":"TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang"},{"key":"ref39","first-page":"945","article-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00068"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672252"},{"key":"ref42","first-page":"523","article-title":"Holmes: Localizing irregularities in {LLM} training with mega-scale {GPU} clusters","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Yao"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787484"}],"event":{"name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","start":{"date-parts":[[2025,9,22]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2025,9,25]]}},"container-title":["2025 IEEE 33rd International Conference on Network Protocols (ICNP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11192357\/11192322\/11192450.pdf?arnumber=11192450","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T05:17:24Z","timestamp":1760419044000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11192450\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,22]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/icnp65844.2025.11192450","relation":{},"subject":[],"published":{"date-parts":[[2025,9,22]]}}}