{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T08:06:52Z","timestamp":1782806812200,"version":"3.54.5"},"reference-count":76,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2024,7,31]],"date-time":"2024-07-31T00:00:00Z","timestamp":1722384000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,7,31]],"date-time":"2024-07-31T00:00:00Z","timestamp":1722384000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s11432-023-3894-4","type":"journal-article","created":{"date-parts":[[2024,8,7]],"date-time":"2024-08-07T07:02:27Z","timestamp":1723014147000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["SDCC: software-defined collective communication for distributed training"],"prefix":"10.1007","volume":"67","author":[{"given":"Xin","family":"Jin","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhen","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yunshan","family":"Jia","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yun","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuanzhe","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,7,31]]},"reference":[{"key":"3894_CR1","first-page":"1026","volume-title":"Proceedings of IEEE International Conference on Computer Vision (ICCV)","author":"K M He","year":"2015","unstructured":"He K M, Zhang X Y, Ren S Q, et al. Delving deep into rectifiers: surpassing human-level performance on ImageNet classification. In: Proceedings of IEEE International Conference on Computer Vision (ICCV), Santiago, 2015. 1026\u20131034"},{"key":"3894_CR2","volume-title":"Comparing deep neural networks against humans: object recognition when the signal gets weaker","author":"R Geirhos","year":"2018","unstructured":"Geirhos R, Janssen D H J, Sch\u00fctt H H, et al. Comparing deep neural networks against humans: object recognition when the signal gets weaker. 2018. ArXiv:1706.06969"},{"key":"3894_CR3","first-page":"5934","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"W Xiong","year":"2018","unstructured":"Xiong W, Wu L, Alleva F, et al. The Microsoft 2017 conversational speech recognition system. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Calgary, 2018. 5934\u20135938"},{"key":"3894_CR4","volume-title":"Achieving human parity in conversational speech recognition","author":"W Xiong","year":"2016","unstructured":"Xiong W, Droppo J, Huang X, et al. Achieving human parity in conversational speech recognition. 2016. ArXiv:1610.05256"},{"key":"3894_CR5","volume-title":"Measuring the algorithmic efficiency of neural networks","author":"D Hernandez","year":"2020","unstructured":"Hernandez D, Brown T B. Measuring the algorithmic efficiency of neural networks. 2020. ArXiv:2005.04305"},{"key":"3894_CR6","first-page":"1","volume-title":"Proceedings of the 47th International Conference on Parallel Processing (ICPP)","author":"Y You","year":"2018","unstructured":"You Y, Zhang Z, Hsieh C J, et al. Imagenet training in minutes. In: Proceedings of the 47th International Conference on Parallel Processing (ICPP), Eugene, 2018. 1\u201310"},{"key":"3894_CR7","first-page":"1","volume-title":"Proceedings of Workshop on Systems for ML and Open Source Software at the Annual Conference on Neural Information Processing Systems","author":"X Y Jia","year":"2018","unstructured":"Jia X Y, Song S T, He W, et al. Highly scalable deep learning training system with mixed-precision: training imageNet in four minutes. In: Proceedings of Workshop on Systems for ML and Open Source Software at the Annual Conference on Neural Information Processing Systems, Montr\u00e9al, 2018. 1\u20138"},{"key":"3894_CR8","first-page":"82","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"L Luo","year":"2020","unstructured":"Luo L, West P, Krishnamurthy A, et al. PLink: discovering and exploiting datacenter network locality for efficient cloud-based distributed training. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Austin, 2020. 82\u201397"},{"key":"3894_CR9","first-page":"533","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"W C Xiao","year":"2020","unstructured":"Xiao W C, Ren S R, Li Y, et al. AntMan: dynamic scaling on GPU clusters for deep learning. In: Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI), Banff, 2020. 533\u2013548"},{"key":"3894_CR10","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow","author":"A Sergeev","year":"2018","unstructured":"Sergeev A, Balso M D. Horovod: fast and easy distributed deep learning in TensorFlow. 2018. ArXiv:1802.05799"},{"key":"3894_CR11","first-page":"1","volume-title":"Proceedings of the 6th International Conference on Learning Representations (ICLR)","author":"Y J Lin","year":"2018","unstructured":"Lin Y J, Han S, Mao H Z, et al. Deep gradient compression: reducing the communication bandwidth for distributed training. In: Proceedings of the 6th International Conference on Learning Representations (ICLR), Vancouver, 2018. 1\u201314"},{"key":"3894_CR12","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1145\/3341301.3359642","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles Organizers (SOSP)","author":"Y H Peng","year":"2019","unstructured":"Peng Y H, Zhu Y B, Chen Y R. A generic communication scheduler for distributed DNN training acceleration. In: Proceedings of the 27th ACM Symposium on Operating Systems Principles Organizers (SOSP), Ontario, 2019. 16\u201329"},{"key":"3894_CR13","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1145\/3405671.3405810","volume-title":"Proceedings of the ACM SIGCOMM Workshop on Network Meets AI & ML (NetAI)","author":"Z Zhang","year":"2020","unstructured":"Zhang Z, Chang C K, Lin H B, et al. Is network the bottleneck of distributed training? In: Proceedings of the ACM SIGCOMM Workshop on Network Meets AI & ML (NetAI), 2020. 8\u201313"},{"key":"3894_CR14","first-page":"1","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"Z H Jia","year":"2019","unstructured":"Jia Z H, Zaharia M, Aiken A. Beyond data and model parallelism for deep neural networks. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Stanford, 2019. 1\u201313"},{"key":"3894_CR15","first-page":"2279","volume-title":"Proceedings of International Conference on Machine Learning (ICML)","author":"Z H Jia","year":"2018","unstructured":"Jia Z H, Lin S N, Qi C R, et al. Exploring hidden dimensions in parallelizing convolutional neural networks. In: Proceedings of International Conference on Machine Learning (ICML), Stockholm, 2018. 2279\u20132288"},{"key":"3894_CR16","first-page":"8026","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"A Paszke","year":"2019","unstructured":"Paszke A, Gross S, Massa F, et al. PyTorch: an imperative style, high-performance deep learning library. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems, Vancouver, 2019. 8026\u20138037"},{"key":"3894_CR17","first-page":"265","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation","author":"M Abadi","year":"2016","unstructured":"Abadi M, Barham P, Chen J M, et al. TensorFlow: a system for large-scale machine learning. In: Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation, Savannah, 2016. 265\u2013283"},{"key":"3894_CR18","doi-asserted-by":"publisher","first-page":"3505","DOI":"10.1145\/3394486.3406703","volume-title":"Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","author":"J Rasley","year":"2020","unstructured":"Rasley J, Rajbhandari S, Ruwase O, et al. DeepSpeed: system optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, 2020. 3505\u20133506"},{"key":"3894_CR19","first-page":"1058","volume-title":"Proceedings of the 15th Annual Conference of the International Speech Communication Association (ISCA)","author":"F Seide","year":"2014","unstructured":"Seide F, Fu H, Droppo J, et al. 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs. In: Proceedings of the 15th Annual Conference of the International Speech Communication Association (ISCA), Singapore, 2014. 1058\u20131062"},{"key":"3894_CR20","first-page":"53","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"H Lim","year":"2019","unstructured":"Lim H, Andersen D G, Kaminsky M. 3LC: lightweight and effective traffic compression for distributed machine learning. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Stanford, 2019. 53\u201364"},{"key":"3894_CR21","first-page":"132","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"A Jayarajan","year":"2019","unstructured":"Jayarajan A, Wei J L, Gibson G, et al. Priority-based parameter propagation for distributed DNN training. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Stanford, 2019. 132\u2013145"},{"key":"3894_CR22","first-page":"418","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"S H Hashemi","year":"2019","unstructured":"Hashemi S H, Jyothi S A, Campbell R. TicTac: accelerating distributed deep learning with communication scheduling. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Stanford, 2019. 418\u2013430"},{"key":"3894_CR23","first-page":"485","volume-title":"Proceedings of the 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"J C Gu","year":"2019","unstructured":"Gu J C, Chowdhury M, Shin K G, et al. Tiresias: a GPU cluster manager for distributed deep learning. In: Proceedings of the 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Boston, 2019. 485\u2013500"},{"key":"3894_CR24","first-page":"499","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Z H Bai","year":"2020","unstructured":"Bai Z H, Zhang Z, Zhu Y B, et al. PipeSwitch: fast pipelined context switching for deep learning applications. In: Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI), Banff, 2020. 499\u2013514"},{"key":"3894_CR25","first-page":"98","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"P F Yu","year":"2020","unstructured":"Yu P F, Chowdhury M. Fine-grained GPU sharing primitives for deep learning applications. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Austin, 2020. 98\u2013111"},{"key":"3894_CR26","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1145\/2934664","volume":"59","author":"M Zaharia","year":"2016","unstructured":"Zaharia M, Xin R S, Wendell P, et al. Apache spark: a unified engine for big data processing. Commun ACM, 2016, 59: 56\u201365","journal-title":"Commun ACM"},{"key":"3894_CR27","first-page":"599","volume-title":"Proceedings of the 11th USENIX symposium on operating systems design and implementation (OSDI)","author":"J E Gonzalez","year":"2014","unstructured":"Gonzalez J E, Xin R S, Dave A, et al. GraphX: graph processing in a distributed dataflow framework. In: Proceedings of the 11th USENIX symposium on operating systems design and implementation (OSDI), Broomfield, 2014. 599\u2013613"},{"key":"3894_CR28","first-page":"439","volume-title":"Proceedings of the 24th ACM Symposium on Operating Systems Principles (SOSP)","author":"D G Murray","year":"2013","unstructured":"Murray D G, McSherry F, Isaacs R, et al. Naiad: a timely dataflow system. In: Proceedings of the 24th ACM Symposium on Operating Systems Principles (SOSP), Farmington, 2013. 439\u2013455"},{"key":"3894_CR29","doi-asserted-by":"publisher","first-page":"35","DOI":"10.18653\/v1\/2020.ngt-1.4","volume-title":"Proceedings of the 4th Workshop on Neural Generation and Translation","author":"A F Aji","year":"2020","unstructured":"Aji A F, Heafield K. Compressing neural machine translation models with 4-bit precision. In: Proceedings of the 4th Workshop on Neural Generation and Translation, 2020. 35\u201342"},{"key":"3894_CR30","first-page":"97","volume-title":"Proceedings of European Parallel Virtual Machine\/Message Passing Interface Users\u2019 Group Meeting","author":"E Gabriel","year":"2004","unstructured":"Gabriel E, Fagg G E, Bosilca G, et al. Open MPI: goals, concept, and design of a next generation MPI implementation. In: Proceedings of European Parallel Virtual Machine\/Message Passing Interface Users\u2019 Group Meeting, Budapest, 2004. 97\u2013104"},{"key":"3894_CR31","first-page":"770","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"K M He","year":"2016","unstructured":"He K M, Zhang X Y, Ren S Q, et al. Deep residual learning for image recognition. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Las Vegas, 2016. 770\u2013778"},{"key":"3894_CR32","first-page":"1","volume-title":"Proceedings of the 3rd International Conference on Learning Representations (ICLR)","author":"K Simonyan","year":"2015","unstructured":"Simonyan K, Zisserman A. Very deep convolutional networks for large-scale image recognition. In: Proceedings of the 3rd International Conference on Learning Representations (ICLR), San Diego, 2015. 1\u201314"},{"key":"3894_CR33","first-page":"4171","volume-title":"Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"J Devlin","year":"2019","unstructured":"Devlin J, Chang M W, Lee K, et al. BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT), Minneapolis, 2019. 4171\u20134186"},{"key":"3894_CR34","first-page":"248","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"J Deng","year":"2009","unstructured":"Deng J, Dong W, Socher R, et al. ImageNet: a large-scale hierarchical image database. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Miami, 2009. 248\u2013255"},{"key":"3894_CR35","first-page":"1112","volume-title":"Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"A Williams","year":"2018","unstructured":"Williams A, Nangia N, Bowman S R. A broad-coverage challenge corpus for sentence understanding through inference. In: Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT), New Orleans, 2018. 1112\u20131122"},{"key":"3894_CR36","first-page":"241","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"M Cho","year":"2019","unstructured":"Cho M, Finkler U, Kung D. BlueConnect: novel hierarchical all-reduce on multi-tired network for deep learning. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Stanford, 2019. 241\u2013251"},{"key":"3894_CR37","first-page":"1","volume-title":"Proceedings of the 3rd ACM Symposium on Cloud Computing (SoCC)","author":"B Farley","year":"2012","unstructured":"Farley B, Juels A, Varadarajan V, et al. More for your money: exploiting performance heterogeneity in public clouds. In: Proceedings of the 3rd ACM Symposium on Cloud Computing (SoCC), New York, 2012. 1\u201314"},{"key":"3894_CR38","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1145\/1355734.1355746","volume":"38","author":"N McKeown","year":"2008","unstructured":"McKeown N, Anderson T, Balakrishnan H, et al. OpenFlow: enabling innovation in campus networks. ACM SIGCOMM Comput Commun Rev, 2008, 38: 69\u201374","journal-title":"ACM SIGCOMM Comput Commun Rev"},{"key":"3894_CR39","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1145\/2534169.2486019","volume":"43","author":"S Jain","year":"2013","unstructured":"Jain S, Kumar A, Mandal S, et al. B4: experience with a globally-deployed software defined WAN. ACM SIGCOMM Comput Commun Rev, 2013, 43: 3\u201314","journal-title":"ACM SIGCOMM Comput Commun Rev"},{"key":"3894_CR40","first-page":"74","volume-title":"Proceedings of Conference of the ACM Special Interest Group on Data Communication (SIGCOMM)","author":"C Y Hong","year":"2018","unstructured":"Hong C Y, Mandal S, Al-Fares M, et al. B4 and after: managing hierarchy, partitioning, and asymmetry for availability and scale in Google\u2019s software-defined WAN. In: Proceedings of Conference of the ACM Special Interest Group on Data Communication (SIGCOMM), Budapest, 2018. 74\u201387"},{"key":"3894_CR41","first-page":"15","volume-title":"Proceedings of the Conference of the ACM Special Interest Group on Data Communication (SIGCOMM)","author":"C Y Hong","year":"2013","unstructured":"Hong C Y, Kandula S, Mahajan R, et al. Achieving high utilization with software-driven WAN. In: Proceedings of the Conference of the ACM Special Interest Group on Data Communication (SIGCOMM), Hong Kong, 2013. 15\u201326"},{"key":"3894_CR42","first-page":"1","volume-title":"Proceedings of the 13th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"A Gupta","year":"2016","unstructured":"Gupta A, MacDavid R, Birkner R, et al. An industrial-scale software defined internet exchange point. In: Proceedings of the 13th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Boston, 2016. 1\u201314"},{"key":"3894_CR43","first-page":"35","volume-title":"Proceedings of the 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"X Jin","year":"2018","unstructured":"Jin X, Li X Z, Zhang H Y, et al. NetChain: scale-free sub-RTT coordination. In: Proceedings of the 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Renton, 2018. 35\u201349"},{"key":"3894_CR44","first-page":"126","volume-title":"Proceedings of Annual Conference of the ACM Special Interest Group on Data Communication (SIGCOMM)","author":"Z L Yu","year":"2020","unstructured":"Yu Z L, Zhang Y W, Braverman V, et al. NetLock: fast, centralized lock management using programmable switches. In: Proceedings of Annual Conference of the ACM Special Interest Group on Data Communication (SIGCOMM), 2020. 126\u2013138"},{"key":"3894_CR45","doi-asserted-by":"publisher","first-page":"376","DOI":"10.14778\/3368289.3368301","volume":"13","author":"H Zhu","year":"2019","unstructured":"Zhu H, Bai Z, Li J, et al. Harmonia: near-linear scalability for replicated storage with in-network conflict detection. Proc VLDB Endow, 2019, 13: 376\u2013389","journal-title":"Proc VLDB Endow"},{"key":"3894_CR46","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1145\/2656877.2656890","volume":"44","author":"P Bosshart","year":"2014","unstructured":"Bosshart P, Daly D, Gibb G, et al. P4: programming protocol-independent packet processors. ACM SIGCOMM Comput Commun Rev, 2014, 44: 87\u201395","journal-title":"ACM SIGCOMM Comput Commun Rev"},{"key":"3894_CR47","first-page":"1","volume-title":"Proceedings of IEEE Conference on Computer Communications (INFOCOM)","author":"A Shukla","year":"2021","unstructured":"Shukla A, Hudemann K, V\u00e1gi Z, et al. Fix with P6: verifying programmable switches at runtime. In: Proceedings of IEEE Conference on Computer Communications (INFOCOM), 2021. 1\u201310"},{"key":"3894_CR48","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1145\/3098822.3098829","volume-title":"Proceedings of Conference of the ACM Special Interest Group on Data Communication (SIGCOMM)","author":"S Narayana","year":"2017","unstructured":"Narayana S, Sivaraman A, Nathan V, et al. Language-directed hardware design for network performance monitoring. In: Proceedings of Conference of the ACM Special Interest Group on Data Communication (SIGCOMM), Los Angeles, 2017. 85\u201398"},{"key":"3894_CR49","first-page":"430","volume-title":"Proceedings of the 26th International Conference on Network Protocols (ICNP)","author":"R Shah","year":"2018","unstructured":"Shah R, Shirke A, Trehan A, et al. pcube: primitives for network data plane programming. In: Proceedings of the 26th International Conference on Network Protocols (ICNP), Cambridge, 2018. 430\u2013435"},{"key":"3894_CR50","first-page":"1","volume-title":"Proceedings of the 10th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"C Monsanto","year":"2013","unstructured":"Monsanto C, Reich J, Foster N, et al. Composing software defined networks. In: Proceedings of the 10th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Lombard, 2013. 1\u201314"},{"key":"3894_CR51","first-page":"351","volume-title":"Proceedings of the 9th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"T Koponen","year":"2010","unstructured":"Koponen T, Casado M, Gude N, et al. Onix: a distributed control platform for large-scale production networks. In: Proceedings of the 9th USENIX Symposium on Operating Systems Design and Implementation (OSDI), Vancouver, 2010. 351\u2013364"},{"key":"3894_CR52","first-page":"1","volume-title":"Proceedings of Symposium on SDN Research (SOSR)","author":"S H Yeganeh","year":"2016","unstructured":"Yeganeh S H, Ganjali Y. Beehive: simple distributed programming in software-defined networks. In: Proceedings of Symposium on SDN Research (SOSR), Santa Clare, 2016. 1\u201312"},{"key":"3894_CR53","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1145\/3422604.3425933","volume-title":"Proceedings of the 19th ACM Workshop on Hot Topics in Networks (HotNets)","author":"M Hogan","year":"2020","unstructured":"Hogan M, Landau-Feibish S, Arashloo M T, et al. Elastic switch programming with P4All. In: Proceedings of the 19th ACM Workshop on Hot Topics in Networks (HotNets), Chicago, 2020. 168\u2013174"},{"key":"3894_CR54","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1145\/3422604.3425941","volume-title":"Proceedings of the 19th ACM Workshop on Hot Topics in Networks (HotNets)","author":"P Wintermeyer","year":"2020","unstructured":"Wintermeyer P, Apostolaki M, Dietm\u00fcller A, et al. P2GO: P4 profile-guided optimizations. In: Proceedings of the 19th ACM Workshop on Hot Topics in Networks (HotNets), Chicago, 2020. 146\u2013152"},{"key":"3894_CR55","first-page":"1","volume-title":"Proceedings of ACM on Programming Languages (POPL)","author":"R Doenges","year":"2021","unstructured":"Doenges R, Arashloo M T, Bautista S, et al. Petr4: formal foundations for p4 data planes. In: Proceedings of ACM on Programming Languages (POPL), 2021. 1\u201332"},{"key":"3894_CR56","first-page":"296","volume-title":"Proceedings of Annual Conference of the ACM Special Interest Group on Data Communication (SIGCOMM)","author":"L C Yu","year":"2020","unstructured":"Yu L C, Sonchack J, Liu V. Mantis: reactive programmable switches. In: Proceedings of Annual Conference of the ACM Special Interest Group on Data Communication (SIGCOMM), 2020. 296\u2013309"},{"key":"3894_CR57","doi-asserted-by":"publisher","first-page":"156","DOI":"10.1145\/3342280.3342343","volume-title":"Proceedings of ACM SIGCOMM 2019 Conference Posters and Demos","author":"V Natesh","year":"2019","unstructured":"Natesh V, Kannan P G, Sivaraman A, et al. Sluice: network-wide data plane programming. In: Proceedings of ACM SIGCOMM 2019 Conference Posters and Demos, Beijing, 2019. 156\u2013158"},{"key":"3894_CR58","first-page":"701","volume-title":"Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"K F Hsu","year":"2020","unstructured":"Hsu K F, Beckett R, Chen A, et al. Contra: a programmable system for performance-aware routing. In: Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Santa Clare, 2020. 701\u2013722"},{"key":"3894_CR59","first-page":"30","volume-title":"Proceedings of Conference of the ACM Special Interest Group on Data Communication (SIGCOMM)","author":"A Narayan","year":"2018","unstructured":"Narayan A, Cangialosi F, Raghavan D, et al. Restructuring endpoint congestion control. In: Proceedings of Conference of the ACM Special Interest Group on Data Communication (SIGCOMM), Budapest, 2018. 30\u201343"},{"key":"3894_CR60","first-page":"93","volume-title":"Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"M T Arashloo","year":"2020","unstructured":"Arashloo M T, Lavrov A, Ghobadi M, et al. Enabling programmable transport protocols in high-speed NICs. In: Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Santa Clare, 2020. 93\u2013109"},{"key":"3894_CR61","first-page":"1","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles Organizers (SOSP)","author":"D Narayanan","year":"2019","unstructured":"Narayanan D, Harlap A, Phanishayee A, et al. PipeDream: generalized pipeline parallelism for DNN training. In: Proceedings of the 27th ACM Symposium on Operating Systems Principles Organizers (SOSP), Ontario, 2019. 1\u201315"},{"key":"3894_CR62","first-page":"103","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS)","author":"Y P Huang","year":"2019","unstructured":"Huang Y P, Cheng Y L, Bapna A, et al. GPipe: efficient training of giant neural networks using pipeline parallelism. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems (NeurIPS), Vancouver, 2019. 103\u2013112"},{"key":"3894_CR63","doi-asserted-by":"publisher","first-page":"207101","DOI":"10.1007\/s11432-021-3416-y","volume":"65","author":"Y R Liu","year":"2022","unstructured":"Liu Y R, Hu Y Q, Qian H, et al. ZOOpt: a toolbox for derivative-free optimization. Sci China Inf Sci, 2022, 65: 207101","journal-title":"Sci China Inf Sci"},{"key":"3894_CR64","first-page":"1","volume-title":"Proceedings of International Conference for High Performance Computing, Networking, Storage, and Analysis","author":"S Rajbhandari","year":"2020","unstructured":"Rajbhandari S, Rasley J, Ruwase O, et al. ZeRO: memory optimizations toward training trillion parameter models. In: Proceedings of International Conference for High Performance Computing, Networking, Storage, and Analysis, 2020. 1\u201316"},{"key":"3894_CR65","first-page":"463","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Y M Jiang","year":"2020","unstructured":"Jiang Y M, Zhu Y B, Lan C, et al. A unified architecture for accelerating distributed DNN training in heterogeneous GPU\/CPU clusters. In: Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI), Banff, 2020. 463\u2013479"},{"key":"3894_CR66","first-page":"561","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"P Moritz","year":"2018","unstructured":"Moritz P, Nishihara R, Wang S, et al. Ray: a distributed framework for emerging AI applications. In: Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (OSDI), Carlsbad, 2018. 561\u2013577"},{"key":"3894_CR67","doi-asserted-by":"publisher","first-page":"112103","DOI":"10.1007\/s11432-020-3182-1","volume":"65","author":"H L Dai","year":"2022","unstructured":"Dai H L, Peng X, Shi X H, et al. Reveal training performance mystery between TensorFlow and PyTorch in the single GPU environment. Sci China Inf Sci, 2022, 65: 112103","journal-title":"Sci China Inf Sci"},{"key":"3894_CR68","first-page":"1","volume-title":"Proceedings of the 8th International Workshop on Interconnection Network Architecture-On-Chip, Multi-Chip","author":"B Prisacari","year":"2014","unstructured":"Prisacari B, Rodriguez G, Garcia M, et al. Performance implications of remote-only load balancing under adversarial traffic in dragonflies. In: Proceedings of the 8th International Workshop on Interconnection Network Architecture-On-Chip, Multi-Chip, Vienna, 2014. 1\u20134"},{"key":"3894_CR69","volume-title":"MSCCL: microsoft collective communication library","author":"M Cowan","year":"2022","unstructured":"Cowan M, Maleki S, Musuvathi M, et al. MSCCL: microsoft collective communication library. 2022. ArXiv:2201.11840"},{"key":"3894_CR70","first-page":"593","volume-title":"Proceedings of the 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"A Shah","year":"2023","unstructured":"Shah A, Chidambaram V, Cowan M, et al. TACCL: guiding collective algorithm synthesis using communication sketches. In: Proceedings of the 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Boston, 2023. 593\u2013612"},{"key":"3894_CR71","first-page":"62","volume-title":"Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)","author":"Z X Cai","year":"2021","unstructured":"Cai Z X, Liu Z Y, Maleki S, et al. Synthesizing optimal collective algorithms. In: Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), Virtual Event, 2021. 62\u201375"},{"key":"3894_CR72","first-page":"172","volume-title":"Proceedings of Conference on Machine Learning and Systems (MLSys)","author":"G H Wang","year":"2020","unstructured":"Wang G H, Venkataraman S, Phanishayee A, et al. Blink: fast and generic collectives for distributed ML. In: Proceedings of Conference on Machine Learning and Systems (MLSys), Austin, 2020. 172\u2013186"},{"key":"3894_CR73","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1145\/3411029.3411037","volume-title":"Proceedings of the 4th Asia-Pacific Workshop on Networking (APNet)","author":"X C Wan","year":"2020","unstructured":"Wan X C, Zhang H, Wang H, et al. Rat-resilient allreduce tree for distributed machine learning. In: Proceedings of the 4th Asia-Pacific Workshop on Networking (APNet), 2020. 52\u201357"},{"key":"3894_CR74","first-page":"507","volume-title":"Proceedings of the 11th ACM Symposium on Cloud Computing (SoCC)","author":"Y R Chen","year":"2020","unstructured":"Chen Y R, Peng Y H, Bao Y X, et al. Elastic parameter server load distribution in deep learning clusters. In: Proceedings of the 11th ACM Symposium on Cloud Computing (SoCC), 2020. 507\u2013521"},{"key":"3894_CR75","first-page":"785","volume-title":"Proceedings of the 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"A Sapio","year":"2021","unstructured":"Sapio A, Canini M, Ho C Y, et al. Scaling distributed machine learning with in-network aggregation. In: Proceedings of the 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Boston, 2021. 785\u2013808"},{"key":"3894_CR76","first-page":"741","volume-title":"Proceedings of the 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI)","author":"C L Lao","year":"2021","unstructured":"Lao C L, Le Y, Mahajan K, et al. ATP: in-network aggregation for multi-tenant learning. In: Proceedings of the 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI), Boston, 2021. 741\u2013761"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-023-3894-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-023-3894-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-023-3894-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,7]],"date-time":"2024-08-07T08:38:59Z","timestamp":1723019939000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-023-3894-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,31]]},"references-count":76,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["3894"],"URL":"https:\/\/doi.org\/10.1007\/s11432-023-3894-4","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7,31]]},"assertion":[{"value":"5 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 August 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 October 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 July 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"192104"}}