{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T05:43:27Z","timestamp":1760420607555,"version":"build-2065373602"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T00:00:00Z","timestamp":1758499200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,22]]},"DOI":"10.1109\/icnp65844.2025.11192367","type":"proceedings-article","created":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T17:38:54Z","timestamp":1760377134000},"page":"1-11","source":"Crossref","is-referenced-by-count":0,"title":["Canvas: Scalable Collective Communication Scheduling for Large-Scale GPU Clusters"],"prefix":"10.1109","author":[{"given":"Chenyang","family":"Hei","sequence":"first","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Zhao","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fuliang","family":"Li","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengxi","family":"Gao","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Shenzhen Institutes of Advanced Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tongrui","family":"Liu","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiuzhu","family":"Sha","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingwei","family":"Wang","sequence":"additional","affiliation":[{"name":"Northeastern University,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"volume-title":"ROCm Communication Collectives Library (RCCL)","year":"2024","key":"ref1"},{"key":"ref2","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. of NeurIPS","volume":"33","author":"Brown"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/1122971.1122975"},{"key":"ref5","first-page":"241","article-title":"Blue-connect: Decomposing all-reduce for deep learning on heterogeneous network hierarchy","volume-title":"Proc. of MLSys","volume":"1","author":"Cho"},{"issue":"240","key":"ref6","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575724"},{"issue":"120","key":"ref8","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"Journal of Machine Learning Research"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ipdps64566.2025.00089"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-8191(06)80021-9"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00023"},{"key":"ref12","article-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism","volume-title":"Proc. of NeurIPS","volume":"32","author":"Huang"},{"journal-title":"White paper, Intel Corporation","article-title":"Gaudi Training Platform White Paper","year":"2019","key":"ref13"},{"volume-title":"oneAPI Collective Communications Library (oneCCL)","year":"2024","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3360307"},{"key":"ref16","article-title":"Imagenet classification with deep convolutional neural networks","volume-title":"Proc. of NeurIPS","volume":"25","author":"Krizhevsky"},{"article-title":"Highly available data parallel ml training on mesh networks","year":"2020","author":"Kumar","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218538"},{"article-title":"Gshard: Scaling giant models with conditional computation and automatic sharding","year":"2020","author":"Lepikhin","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2023.3345387"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672249"},{"article-title":"Plink: Discovering and exploiting datacenter network locality for efficient cloud-based distributed training","volume-title":"Proc. of MLSys","author":"Luo","key":"ref22"},{"volume-title":"Microsoft Collective Communication Library (MSCCL)","year":"2024","key":"ref23"},{"article-title":"Massively distributed sgd: Imagenet\/resnet-50 training in a flash","year":"2018","author":"Mikami","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"journal-title":"White paper","article-title":"NVIDIA DGX SuperPOD: Instant Infrastructure for AI Leadership","year":"2020","key":"ref26"},{"journal-title":"Technical overview","article-title":"NVIDIA NVSWITCH The World\u2019s Highest-Bandwidth On-Node Switch","year":"2021","key":"ref27"},{"volume-title":"Nvidia NVLink and NVSwitch","year":"2021","key":"ref28"},{"volume-title":"NVIDIA Collective Communications Library (NCCL)","year":"2024","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672265"},{"key":"ref31","first-page":"613","article-title":"Synthesizing runtime programmable switch updates","volume-title":"Proc. of USENIX NSDI","author":"Qiu"},{"article-title":"Horovod: fast and easy distributed deep learning in tensorflow","year":"2018","author":"Sergeev","key":"ref32"},{"key":"ref33","first-page":"593","article-title":"{TACCL}: Guiding collective algorithm synthesis using communication sketches","volume-title":"Proc. of USENIX NSDI","author":"Shah"},{"article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","year":"2019","author":"Shoeybi","key":"ref34"},{"volume-title":"Program synthesis by sketching","year":"2008","author":"Solar-Lezama","key":"ref35"},{"article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","year":"2024","author":"Team","key":"ref36"},{"key":"ref37","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ml","volume-title":"Proc. of MLSys","volume":"2","author":"Wang"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00068"},{"key":"ref39","first-page":"548","article-title":"Synthesizing optimal parallelism placement and reduction strategies on hierarchical systems for deep learning","volume-title":"Proc. of MLSys","volume":"4","author":"Xie"}],"event":{"name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","start":{"date-parts":[[2025,9,22]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2025,9,25]]}},"container-title":["2025 IEEE 33rd International Conference on Network Protocols (ICNP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11192357\/11192322\/11192367.pdf?arnumber=11192367","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T05:08:24Z","timestamp":1760418504000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11192367\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,22]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/icnp65844.2025.11192367","relation":{},"subject":[],"published":{"date-parts":[[2025,9,22]]}}}