{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T04:06:56Z","timestamp":1749269216509,"version":"3.41.0"},"reference-count":57,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"publisher","award":["62272292"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Networks"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1016\/j.comnet.2025.111285","type":"journal-article","created":{"date-parts":[[2025,5,24]],"date-time":"2025-05-24T14:58:03Z","timestamp":1748098683000},"page":"111285","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["vClos: Network contention aware scheduling for distributed machine learning tasks in multi-tenant GPU clusters"],"prefix":"10.1016","volume":"268","author":[{"given":"Xinchi","family":"Han","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8395-5109","authenticated-orcid":false,"given":"Shizhen","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Yongxi","family":"Lv","sequence":"additional","affiliation":[]},{"given":"Peirui","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Weihao","family":"Jiang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0337-1812","authenticated-orcid":false,"given":"Qinwei","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yunzhuo","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2615-0525","authenticated-orcid":false,"given":"Shengkai","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Jiang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8956-5779","authenticated-orcid":false,"given":"Ximeng","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Cui","sequence":"additional","affiliation":[]},{"given":"Chenghu","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Xinbing","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"year":"2019","series-title":"The computing power needed to train AI is now rising seven times faster than ever before","author":"Hao","key":"10.1016\/j.comnet.2025.111285_b1"},{"year":"2018","series-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","key":"10.1016\/j.comnet.2025.111285_b2"},{"year":"2019","series-title":"Hello, it\u2019s GPT-2 - how can I help you? Towards the use of pretrained language models for task-oriented dialogue systems","author":"Budzianowski","key":"10.1016\/j.comnet.2025.111285_b3"},{"issue":"4","key":"10.1016\/j.comnet.2025.111285_b4","doi-asserted-by":"crossref","first-page":"681","DOI":"10.1007\/s11023-020-09548-1","article-title":"GPT-3: Its nature, scope, limits, and consequences","volume":"30","author":"Floridi","year":"2020","journal-title":"Minds Mach."},{"year":"2021","series-title":"How to take full advantage of GPUs in large language models","author":"Sagar","key":"10.1016\/j.comnet.2025.111285_b5"},{"key":"10.1016\/j.comnet.2025.111285_b6","doi-asserted-by":"crossref","unstructured":"Q. Hu, P. Sun, S. Yan, Y. Wen, T. Zhang, Characterization and Prediction of Deep Learning Workloads in Large-Scale GPU Datacenters, in: SC21: International Conference for High Performance Computing, Networking, Storage and Analysis, 2021, pp. 1\u201315.","DOI":"10.1145\/3458817.3476223"},{"key":"10.1016\/j.comnet.2025.111285_b7","unstructured":"Q. Weng, W. Xiao, Y. Yu, W. Wang, C. Wang, J. He, Y. Li, L. Zhang, W. Lin, Y. Ding, {MLaaS} in the Wild: Workload Analysis and Scheduling in {Large-Scale} Heterogeneous {GPU} Clusters, in: 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22), 2022, pp. 945\u2013960."},{"key":"10.1016\/j.comnet.2025.111285_b8","series-title":"MLSys","article-title":"Plink: Discovering and exploiting locality for accelerated distributed training on the public cloud","author":"Luo","year":"2020"},{"year":"2020","series-title":"Communication-efficient distributed deep learning: A comprehensive survey","author":"Tang","key":"10.1016\/j.comnet.2025.111285_b9"},{"key":"10.1016\/j.comnet.2025.111285_b10","doi-asserted-by":"crossref","unstructured":"Y. Hu, Y. Liu, Z. Liu, A Survey on Convolutional Neural Network Accelerators: GPU, FPGA and ASIC, in: 2022 14th International Conference on Computer Research and Development, ICCRD, 2022, pp. 100\u2013107.","DOI":"10.1109\/ICCRD54409.2022.9730377"},{"year":"2020","series-title":"Scaling laws for neural language models","author":"Kaplan","key":"10.1016\/j.comnet.2025.111285_b11"},{"year":"2024","series-title":"Evaluating emerging AI\/ML accelerators: IPU, RDU, and NVIDIA\/AMD GPUs","author":"Peng","key":"10.1016\/j.comnet.2025.111285_b12"},{"year":"2022","series-title":"Deep learning workload scheduling in GPU datacenters: Taxonomy, challenges and vision","author":"Gao","key":"10.1016\/j.comnet.2025.111285_b13"},{"key":"10.1016\/j.comnet.2025.111285_b14","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.comnet.2025.111285_b15","doi-asserted-by":"crossref","unstructured":"S. Rashidi, W. Won, S. Srinivasan, S. Sridharan, T. Krishna, Themis: A network bandwidth-aware collective scheduling policy for distributed training of DL models, in: Proceedings of the 49th Annual International Symposium on Computer Architecture, 2022, pp. 581\u2013596.","DOI":"10.1145\/3470496.3527382"},{"year":"2019","series-title":"Deep learning recommendation model for personalization and recommendation systems","author":"Naumov","key":"10.1016\/j.comnet.2025.111285_b16"},{"key":"10.1016\/j.comnet.2025.111285_b17","series-title":"2023 USENIX Annual Technical Conference","first-page":"961","article-title":"SmartMoE: Efficiently training Sparsely-Activated models through combining offline and online parallelization","author":"Zhai","year":"2023"},{"key":"10.1016\/j.comnet.2025.111285_b18","unstructured":"X. Liu, Y. Wang, F. Fu, X. Miao, S. Zhu, X. Nie, C. Bin, NetMoE: Accelerating MoE Training through Dynamic Sample Placement, in: The Thirteenth International Conference on Learning Representations."},{"key":"10.1016\/j.comnet.2025.111285_b19","doi-asserted-by":"crossref","unstructured":"Z. Zhang, C. Chang, H. Lin, Y. Wang, R. Arora, X. Jin, Is network the bottleneck of distributed training?, in: Proceedings of the Workshop on Network Meets AI & ML, 2020, pp. 8\u201313.","DOI":"10.1145\/3405671.3405810"},{"key":"10.1016\/j.comnet.2025.111285_b20","series-title":"2013 Proceedings IEEE INFOCOM","first-page":"2130","article-title":"On the impact of packet spraying in data center networks","author":"Dixit","year":"2013"},{"key":"10.1016\/j.comnet.2025.111285_b21","doi-asserted-by":"crossref","DOI":"10.1016\/j.comnet.2019.07.008","article-title":"Multipath transport and packet spraying for efficient data delivery in data centres","volume":"162","author":"Kheirkhah","year":"2019","journal-title":"Comput. Netw."},{"key":"10.1016\/j.comnet.2025.111285_b22","doi-asserted-by":"crossref","unstructured":"S. Ghorbani, Z. Yang, P.B. Godfrey, Y. Ganjali, A. Firoozshahian, DRILL: Micro Load Balancing for Low-latency Data Center Networks, in: The Conference of the ACM Special Interest Group, 2017.","DOI":"10.1145\/3098822.3098839"},{"key":"10.1016\/j.comnet.2025.111285_b23","unstructured":"E. Vanini, R. Pan, M. Alizadeh, P. Taheri, T. Edsall, Let it flow: Resilient asymmetric load balancing with flowlet switching, in: 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), 2017, pp. 407\u2013420."},{"key":"10.1016\/j.comnet.2025.111285_b24","series-title":"Proc. 3rd ACM Workshop on Hot Topics in Networks (Hotnets-III)","article-title":"Harnessing tcp\u2019s burstiness with flowlet switching","author":"Sinha","year":"2004"},{"key":"10.1016\/j.comnet.2025.111285_b25","series-title":"GLOBECOM 2020-2020 IEEE Global Communications Conference","first-page":"1","article-title":"PLB: Adaptive partial congestion-aware load balancing for datacenter networks","author":"Liu","year":"2020"},{"key":"10.1016\/j.comnet.2025.111285_b26","doi-asserted-by":"crossref","unstructured":"C.H. Song, X.Z. Khooi, R. Joshi, I. Choi, J. Li, M.C. Chan, Network Load Balancing with In-network Reordering Support for RDMA, in: Proceedings of the ACM SIGCOMM 2023 Conference, 2023, pp. 816\u2013831.","DOI":"10.1145\/3603269.3604849"},{"issue":"5","key":"10.1016\/j.comnet.2025.111285_b27","doi-asserted-by":"crossref","first-page":"85","DOI":"10.1109\/MM.2021.3091475","article-title":"ACCL: Architecting highly scalable distributed training systems with highly efficient collective communication library","volume":"41","author":"Dong","year":"2021","journal-title":"IEEE Micro"},{"key":"10.1016\/j.comnet.2025.111285_b28","series-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","first-page":"855","article-title":"Hashing linearity enables relative path control in data centers","author":"Zhang","year":"2021"},{"key":"10.1016\/j.comnet.2025.111285_b29","series-title":"Proceedings of the ACM SIGCOMM 2013 Conference on SIGCOMM","first-page":"327","article-title":"Participatory networking: an API for application control of SDNs","author":"Ferguson","year":"2013"},{"key":"10.1016\/j.comnet.2025.111285_b30","doi-asserted-by":"crossref","unstructured":"T. Yang, B. Yuan, S. Zhang, T. Zhang, B. Liu, Approaching optimal compression with fast update for large scale routing tables, in: IEEE International Workshop on Quality of Service, 2012.","DOI":"10.1109\/IWQoS.2012.6245978"},{"key":"10.1016\/j.comnet.2025.111285_b31","doi-asserted-by":"crossref","unstructured":"S. Soh, L. Hiryanto, R.P. Gopalan, S. Rai, Dynamic Router Tables for Full Expansion\/Compression IP Lookup, in: IEEE Region 10 Conference, 2005.","DOI":"10.1109\/TENCON.2005.300982"},{"year":"2007","series-title":"The Effect of Routing-Update Time on Network\u2019s Performability","author":"Zolpirani","key":"10.1016\/j.comnet.2025.111285_b32"},{"year":"2023","series-title":"NVIDIA DGX SuperPOD: Next generation scalable infrastructure for ai leadership","author":"NVIDIA","key":"10.1016\/j.comnet.2025.111285_b33"},{"year":"2022","series-title":"TopoOpt: Optimizing the network topology for distributed DNN training","author":"Wang","key":"10.1016\/j.comnet.2025.111285_b34"},{"issue":"6","key":"10.1016\/j.comnet.2025.111285_b35","first-page":"40","article-title":"A cloud-optimized transport protocol for elastic and scalable HPC","author":"Shalev","year":"2020","journal-title":"IEEE Micro"},{"issue":"2","key":"10.1016\/j.comnet.2025.111285_b36","doi-asserted-by":"crossref","DOI":"10.1007\/s42979-020-0114-9","article-title":"Detecting affect states using VGG16, ResNet50 and SE-ResNet50 networks","volume":"1","author":"Theckedath","year":"2020","journal-title":"SN Comput. Sci."},{"year":"2022","series-title":"DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale","author":"Rajbhandari","key":"10.1016\/j.comnet.2025.111285_b37"},{"year":"2019","series-title":"Large batch optimization for deep learning: Training BERT in 76 minutes","author":"You","key":"10.1016\/j.comnet.2025.111285_b38"},{"year":"2019","series-title":"Control batch size and learning rate to generalize well: Theoretical and empirical evidence","author":"He","key":"10.1016\/j.comnet.2025.111285_b39"},{"key":"10.1016\/j.comnet.2025.111285_b40","series-title":"Proceedings of the ACM SIGCOMM 2024 Conference","first-page":"691","article-title":"Alibaba HPN: A data center network for large language model training","author":"Qian","year":"2024"},{"key":"10.1016\/j.comnet.2025.111285_b41","doi-asserted-by":"crossref","unstructured":"A. Kabbani, B. Vamanan, J. Hasan, F. Duch\u00eane, Flowbender: Flow-level adaptive routing for improved latency and throughput in datacenter networks, in: ACM, 2014.","DOI":"10.1145\/2674005.2674985"},{"year":"2025","series-title":"VCLOS","author":"Hanxinchi","key":"10.1016\/j.comnet.2025.111285_b42"},{"key":"10.1016\/j.comnet.2025.111285_b43","series-title":"Proceedings of the Nineteenth European Conference on Computer Systems","first-page":"66","article-title":"Halflife: An adaptive flowlet-based load balancer with fading timeout in data center networks","author":"Liu","year":"2024"},{"key":"10.1016\/j.comnet.2025.111285_b44","unstructured":"S. Kassing, A. Valadarsky, A. Singla, Netbench, [Online]. https:\/\/github.com\/ndal-eth\/netbench."},{"key":"10.1016\/j.comnet.2025.111285_b45","series-title":"2023 Optical Fiber Communications Conference and Exhibition","first-page":"1","article-title":"Sip architecture for accelerating collective communication in distributed deep learning","author":"Wu","year":"2023"},{"key":"10.1016\/j.comnet.2025.111285_b46","doi-asserted-by":"crossref","unstructured":"N. Jouppi, G. Kurian, S. Li, P. Ma, R. Nagarajan, L. Nai, N. Patil, S. Subramanian, A. Swing, B. Towles, et al., Tpu v4: An optically reconfigurable supercomputer for machine learning with hardware support for embeddings, in: Proceedings of the 50th Annual International Symposium on Computer Architecture, 2023, pp. 1\u201314.","DOI":"10.1145\/3579371.3589350"},{"key":"10.1016\/j.comnet.2025.111285_b47","doi-asserted-by":"crossref","unstructured":"D. Raghavan, P.A. Levis, M.A. Zaharia, I. Zhang, Breakfast of champions: towards zero-copy serialization with NIC scatter-gather, in: Proceedings of the Workshop on Hot Topics in Operating Systems, 2021.","DOI":"10.1145\/3458336.3465287"},{"key":"10.1016\/j.comnet.2025.111285_b48","doi-asserted-by":"crossref","unstructured":"J. Dong, Z. Cao, T. Zhang, J. Ye, S. Wang, F. Feng, L. Zhao, X. Liu, L. Song, L. Peng, Y. Guo, X. Jiang, L. Tang, Y. Du, Y. Zhang, P. Pan, Y. Xie, EFLOPS: Algorithm and System Co-Design for a High Performance Distributed Training Platform, in: 2020 IEEE International Symposium on High Performance Computer Architecture, HPCA, 2020, pp. 610\u2013622.","DOI":"10.1109\/HPCA47549.2020.00056"},{"issue":"4","key":"10.1016\/j.comnet.2025.111285_b49","doi-asserted-by":"crossref","first-page":"242","DOI":"10.1145\/2043164.2018465","article-title":"Towards predictable datacenter networks","volume":"41","author":"Ant","year":"2011","journal-title":"Comput. Commun. Rev.: A Q. Publ. Spec. Interes. Group Data Commun."},{"year":"2024","series-title":"Rail-only: A low-cost high-performance network for training LLMs with trillion parameters","author":"Wang","key":"10.1016\/j.comnet.2025.111285_b50"},{"key":"10.1016\/j.comnet.2025.111285_b51","series-title":"Proceedings of the 2019 USENIX Conference on Usenix Annual Technical Conference","first-page":"947","article-title":"Analysis of large-scale multi-tenant GPU clusters for DNN training workloads","author":"Jeon","year":"2019"},{"key":"10.1016\/j.comnet.2025.111285_b52","series-title":"Networked Systems Design and Implementation","article-title":"F10: a fault-tolerant engineered network","author":"Liu","year":"2013"},{"key":"10.1016\/j.comnet.2025.111285_b53","unstructured":"W. Xiao, R. Bhardwaj, R. Ramjee, M. Sivathanu, N. Kwatra, Z. Han, P. Patel, X. Peng, H. Zhao, Q. Zhang, F. Yang, L. Zhou, Gandiva: Introspective Cluster Scheduling for Deep Learning, in: OSDI, 2018."},{"key":"10.1016\/j.comnet.2025.111285_b54","unstructured":"D. Narayanan, K. Santhanam, F. Kazhamiaka, A. Phanishayee, M. Zaharia, Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads, in: OSDI, 2020."},{"key":"10.1016\/j.comnet.2025.111285_b55","unstructured":"H. Zhao, Z. Han, Z. Yang, Q. Zhang, F. Yang, L. Zhou, M. Yang, F.C. Lau, Y. Wang, Y. Xiong, et al., Hived: sharing a {GPU} cluster for deep learning with guarantees, in: OSDI, 2020."},{"key":"10.1016\/j.comnet.2025.111285_b56","unstructured":"A. Qiao, S.K. Choe, S.J. Subramanya, W. Neiswanger, Q. Ho, H. Zhang, G.R. Ganger, E.P. Xing, Pollux: Co-adaptive Cluster Scheduling for Goodput-Optimized Deep Learning, in: OSDI, 2021."},{"key":"10.1016\/j.comnet.2025.111285_b57","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ml","volume":"2","author":"Wang","year":"2020","journal-title":"Proc. Mach. Learn. Syst."}],"container-title":["Computer Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1389128625002531?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1389128625002531?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T17:39:18Z","timestamp":1749231558000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1389128625002531"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":57,"alternative-id":["S1389128625002531"],"URL":"https:\/\/doi.org\/10.1016\/j.comnet.2025.111285","relation":{},"ISSN":["1389-1286"],"issn-type":[{"type":"print","value":"1389-1286"}],"subject":[],"published":{"date-parts":[[2025,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"vClos: Network contention aware scheduling for distributed machine learning tasks in multi-tenant GPU clusters","name":"articletitle","label":"Article Title"},{"value":"Computer Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.comnet.2025.111285","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"111285"}}