{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T18:41:38Z","timestamp":1767292898108,"version":"3.48.0"},"reference-count":78,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Hong Kong RGC TRS","award":["T41-603\/20R"],"award-info":[{"award-number":["T41-603\/20R"]}]},{"name":"GRF","award":["16213621"],"award-info":[{"award-number":["16213621"]}]},{"name":"ITC ACCESS"},{"name":"TACC"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Netw."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/ton.2025.3605231","type":"journal-article","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T17:33:18Z","timestamp":1758043998000},"page":"653-667","source":"Crossref","is-referenced-by-count":0,"title":["High-Performance RoCE-Capable Multicast for Commodity RDMA Datacenters"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8228-2552","authenticated-orcid":false,"given":"Wenxue","family":"Li","sequence":"first","affiliation":[{"name":"iSING Laboratory, The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5530-5659","authenticated-orcid":false,"given":"Junyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Company Ltd., Beijing, China"}]},{"given":"Yufei","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Technologies Company Ltd., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1876-0329","authenticated-orcid":false,"given":"Gaoxiong","family":"Zeng","sequence":"additional","affiliation":[{"name":"Huawei Technologies Company Ltd., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3184-4081","authenticated-orcid":false,"given":"Zilong","family":"Wang","sequence":"additional","affiliation":[{"name":"iSING Laboratory, The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5151-0997","authenticated-orcid":false,"given":"Chaoliang","family":"Zeng","sequence":"additional","affiliation":[{"name":"iSING Laboratory, The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]},{"given":"Pengpeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Huawei Technologies Company Ltd., Beijing, China"}]},{"given":"Qiaoling","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Company Ltd., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2587-6028","authenticated-orcid":false,"given":"Kai","family":"Chen","sequence":"additional","affiliation":[{"name":"iSING Laboratory, The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]}],"member":"263","reference":[{"volume-title":"Using Reliable Multicast for Data Distribution with Opendds","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2774993.2774999"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378496"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00033"},{"volume-title":"HPL\u2014A Portable Implementation of the High-Performance Linpack Benchmark for Distributed-Memory Computers","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2004.04.001"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.17487\/rfc3176"},{"issue":"2011","key":"ref8","first-page":"1","article-title":"Kafka: A distributed messaging system for log processing","volume-title":"Proc. NetDB","volume":"11","author":"Kreps"},{"volume-title":"Learning Apache Kafka","year":"2015","author":"Garg","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ALLERTON.2015.7447112"},{"key":"ref11","first-page":"519","article-title":"When cloud storage meets RDMA","volume-title":"Proc. 18th USENIX Symp. NSDI","author":"Gao"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787484"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787510"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"ref15","first-page":"463","article-title":"A unified architecture for accelerating distributed DNN training in heterogeneous GPU\/CPU clusters","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Design Implement.","author":"Jiang"},{"key":"ref16","first-page":"317","article-title":"Fast and concurrent RDF queries with RDMA-based distributed graph exploration","volume-title":"Proc. 12th USENIX Symp. Operating Syst. Design Implement.","author":"Shi"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626299"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303967"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/52324.52349"},{"key":"ref20","first-page":"1075","article-title":"Orca: Server-assisted multicast for datacenter networks","volume-title":"Proc. 19th USENIX Symp. Networked Syst. Design Implement.","author":"Diab"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2020.3020869"},{"volume-title":"OpenMPI: Open Source High Performance Computing","key":"ref22"},{"volume-title":"NVIDIA Collective Communication Library (NCCL)","key":"ref23"},{"issue":"10","key":"ref24","first-page":"95","article-title":"Spark: Cluster computing with working sets","volume":"10","author":"Zaharia","year":"2010","journal-title":"HotCloud"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/2043164.2018448"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00074"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2018.00020"},{"volume-title":"Top 500 List","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544238"},{"key":"ref30","article-title":"Bloom: A 176B-parameter open-access multilingual language model","author":"Workshop","year":"2022","journal-title":"arXiv:2211.05100"},{"key":"ref31","first-page":"19","article-title":"Communication efficient distributed machine learning with the parameter server","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"27","author":"Li"},{"key":"ref32","first-page":"741","article-title":"ATP: In-network aggregation for multi-tenant learning","volume-title":"Proc. NSDI","volume":"21","author":"Lao"},{"key":"ref33","first-page":"401","article-title":"FaRM: Fast remote memory","volume-title":"Proc. 11th USENIX Symp. Networked Syst. Design Implement.","author":"Dragojevi{\u0107}"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483576"},{"key":"ref35","first-page":"1","article-title":"SRNIC: A scalable architecture for RDMA NICs","volume-title":"Proc. 20th USENIX Symp. Networked Syst. Design Implement.","author":"Wang"},{"key":"ref36","first-page":"71","article-title":"RDMA is Turing complete, we just did not know it yet!","volume-title":"Proc. NSDI","author":"Reda"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582037"},{"volume-title":"BlueField","key":"ref38"},{"article-title":"Bringing hpc techniques to deep learning","year":"2017","author":"Gibiansky","key":"ref39"},{"volume-title":"Supplement To InfiniBand Architecture Specification Volume 1 Release 1.2.2 Annex A17: RoCEv2 (IP Routable RoCE)","key":"ref40"},{"volume-title":"Infiniband Architecture Volume 1, General Specifications","key":"ref41"},{"volume-title":"Linux Manual Page","key":"ref42"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/347057.347390"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/964723.383081"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.1998.662916"},{"volume-title":"PFC-Free Low Delay Control Protocol","author":"Dai","key":"ref46"},{"journal-title":"802.1Qbb\u2014Priority-Based Flow Control","key":"ref47"},{"volume-title":"Dynamic Connected Transport","key":"ref48"},{"volume-title":"Virtex UltraScale","key":"ref49"},{"volume-title":"Unifiex Communication X","key":"ref50"},{"volume-title":"Libibverbs","key":"ref51"},{"key":"ref52","volume":"1","author":"Crowley","year":"2003","journal-title":"Network Processor Design: Issues and Practices."},{"volume-title":"Edge-Core Networks","key":"ref53"},{"key":"ref54","first-page":"785","article-title":"Scaling distributed machine learning with in-network aggregation","volume-title":"Proc. NSDI","author":"Sapio"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3067731"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2013.6702619"},{"volume-title":"Ns-3, a Discrete-Event Network Simulator for Internet Systems","key":"ref59"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2019.2902875"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230557"},{"volume-title":"Virtex UltraScale+","key":"ref62"},{"volume-title":"BT IPTV","key":"ref63"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2013.SUP.0513014"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2009.5062120"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TCC.2022.3142066"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2018.8486290"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2016.7524383"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICNP49622.2020.9259408"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2018.00023"},{"key":"ref71","first-page":"1093","article-title":"Yeti: Stateless and generalized multicast forwarding","volume-title":"Proc. 19th USENIX Symp. Networked Syst. Design Implement.","author":"Diab"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS54860.2022.00108"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3464298.3493393"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/90.650139"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2015.7218381"},{"volume-title":"Scalable Hierarchical Aggregation and Reduction Protocol (SHARP)","key":"ref76"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/2535372.2535380"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707266"}],"container-title":["IEEE Transactions on Networking"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10723154\/11317935\/11165220.pdf?arnumber=11165220","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T18:36:39Z","timestamp":1767292599000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11165220\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":78,"URL":"https:\/\/doi.org\/10.1109\/ton.2025.3605231","relation":{},"ISSN":["2998-4157"],"issn-type":[{"type":"electronic","value":"2998-4157"}],"subject":[],"published":{"date-parts":[[2026]]}}}