{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,28]],"date-time":"2025-08-28T00:05:35Z","timestamp":1756339535955,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3718958.3750480","type":"proceedings-article","created":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:54:11Z","timestamp":1756313651000},"page":"85-98","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Revisiting RDMA Reliability for Lossy Fabrics"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8228-2552","authenticated-orcid":false,"given":"Wenxue","family":"Li","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology and Huawei"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2156-9683","authenticated-orcid":false,"given":"Xiangzhou","family":"Liu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"given":"Yunxuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"given":"Zihao","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"given":"Wei","family":"Gu","sequence":"additional","affiliation":[{"name":"Huawei"}]},{"given":"Tao","family":"Qian","sequence":"additional","affiliation":[{"name":"Huawei"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1876-0329","authenticated-orcid":false,"given":"Gaoxiong","family":"Zeng","sequence":"additional","affiliation":[{"name":"Huawei"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8530-5259","authenticated-orcid":false,"given":"Shoushou","family":"Ren","sequence":"additional","affiliation":[{"name":"Huawei"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4830-7808","authenticated-orcid":false,"given":"Xinyang","family":"Huang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8779-4768","authenticated-orcid":false,"given":"Zhenghang","family":"Ren","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5949-9756","authenticated-orcid":false,"given":"Bowen","family":"Liu","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6926-7801","authenticated-orcid":false,"given":"Junxue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2587-6028","authenticated-orcid":false,"given":"Kai","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4572-4956","authenticated-orcid":false,"given":"Bingyang","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei"}]}],"member":"320","published-online":{"date-parts":[[2025,8,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2020. 802.1Qbb - Priority-based Flow Control. https:\/\/1.ieee802.org\/dcb\/802-1qbb\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2023. EdgeCore AS9516. https:\/\/www.edge-core.com\/_upload\/images\/2023-061-DCS810_AS9516-32D_DS_R07_20230503.pdf."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. NVIDIA InfiniBand Adaptive Routing Technology - Accelerating HPC and AI Applications. https:\/\/resources.nvidia.com\/en-us-cloud-native-supercomputing-dpus-campaign\/infiniband-white-paper-adaptive-routing."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. AMD Alveo\u2122 U250 Data Center Accelerator Card. https:\/\/www.amd.com\/en\/products\/accelerators\/alveo\/u250\/a-u250-a64g-pq-g.html."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Google Falcon. https:\/\/cloud.google.com\/blog\/topics\/systems\/introducing-falcon-a-reliable-low-latency-hardware-transport."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Libibverbs. https:\/\/github.com\/linux-rdma\/rdma-core\/blob\/master\/Documentation\/libibverbs.md."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. NVIDIA ConnectX-5. https:\/\/www.nvidia.com\/en-sg\/networking\/ethernet\/connectx-5\/."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. NVIDIA ConnectX-6 Dx. https:\/\/resources.nvidia.com\/en-us-accelerated-networking-resource-library\/networking-overal-dp."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. NVIDIA ConnectX-7. https:\/\/resources.nvidia.com\/en-us-accelerated-networking-resource-library\/connectx-7-datasheet."},{"key":"e_1_3_2_1_10_1","unstructured":"2024. OpenMPI. https:\/\/www.open-mpi.org\/."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. Ultra Ethernet Consortium. https:\/\/ultraethernet.org\/wp-content\/uploads\/sites\/20\/2023\/10\/23.07.12-UEC-1.0-Overview-FINAL-WITH-LOGO.pdf."},{"key":"e_1_3_2_1_12_1","unstructured":"2024. Zero Touch RoCE. https:\/\/docs.nvidia.com\/networking\/display\/winof2v237\/ethernet+network."},{"key":"e_1_3_2_1_13_1","unstructured":"2025. NVIDIA Spectrum Platform. https:\/\/www.nvidia.com\/en-us\/networking\/spectrumx\/."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626316"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"e_1_3_2_1_16_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Bai Wei","year":"2023","unstructured":"Wei Bai, Shanim Sainul Abdeen, Ankit Agrawal, Krishan Kumar Attre, Paramvir Bahl, Ameya Bhagat, Gowri Bhaskara, Tanya Brokhman, Lei Cao, Ahmad Cheema, et al. 2023. Empowering azure storage with {RDMA}. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 49\u201367."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544235"},{"key":"e_1_3_2_1_18_1","volume-title":"11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14)","author":"Cheng Peng","year":"2014","unstructured":"Peng Cheng, Fengyuan Ren, Ran Shu, and Chuang Lin. 2014. Catch the whole lot in an action: Rapid precise packet loss notification in data center. In 11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14). 17\u201328."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","unstructured":"Yuchung Cheng Neal Cardwell Nandita Dukkipati and Priyaranjan Jha. 2021. The RACK-TLP Loss Detection Algorithm for TCP. RFC 8985. 10.17487\/RFC8985","DOI":"10.17487\/RFC8985"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098840"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2013.6567015"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672233"},{"key":"e_1_3_2_1_23_1","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Gao Yixiao","year":"2021","unstructured":"Yixiao Gao, Qiang Li, Lingbo Tang, Yongqing Xi, Pengcheng Zhang, Wenwen Peng, Bo Li, Yaohui Wu, Shaozong Liu, Lei Yan, et al. 2021. When cloud storage meets {RDMA}. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 519\u2013533."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3375235.3375239"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098825"},{"key":"e_1_3_2_1_27_1","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Hu Jinbin","year":"2025","unstructured":"Jinbin Hu, Wenxue Li, Xiangzhou Liu, Junfeng Wang, Bowen Liu, Ping Yin, Jianxin Wang, Jiawei Huang, and Kai Chen. 2025. {FLB}: Fine-grained Load Balancing for Lossless Datacenter Networks. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 365\u2013380."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3405878"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3143361.3143382"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3663408.3663418"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. 149\u2013160","author":"Kabbani Abdul","year":"2014","unstructured":"Abdul Kabbani, Balajee Vamanan, Jahangir Hasan, and Fabien Duchene. 2014. Flowbender: Flow-level adaptive routing for improved latency and throughput in datacenter networks. In Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. 149\u2013160."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2890955.2890968"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles. 756\u2013771","author":"Kim Jongyul","year":"2021","unstructured":"Jongyul Kim, Insu Jang, Waleed Reda, Jaeseong Im, Marco Canini, Dejan Kosti\u0107, Youngjin Kwon, Simon Peter, and Emmett Witchel. 2021. Linefs: Efficient smart-nic offload of a distributed file system with pipeline parallelism. In Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles. 756\u2013771."},{"key":"e_1_3_2_1_34_1","volume-title":"STrack: A Reliable Multipath Transport for AI\/ML Clusters. arXiv preprint arXiv:2407.15266","author":"Le Yanfang","year":"2024","unstructured":"Yanfang Le, Rong Pan, Peter Newman, Jeremias Blendin, Abdul Kabbani, Vipin Jain, Raghava Sivaramu, and Francis Matus. 2024. STrack: A Reliable Multipath Transport for AI\/ML Clusters. arXiv preprint arXiv:2407.15266 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3663408.3663409"},{"key":"e_1_3_2_1_36_1","volume-title":"2023 IEEE 31st International Conference on Network Protocols (ICNP). IEEE, 1\u201311","author":"Li Wenxue","year":"2023","unstructured":"Wenxue Li, Chaoliang Zeng, Jinbin Hu, and Kai Chen. 2023. Towards finegrained and practical flow control for datacenter networks. In 2023 IEEE 31st International Conference on Network Protocols (ICNP). IEEE, 1\u201311."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342085"},{"key":"e_1_3_2_1_38_1","volume-title":"15th USENIX symposium on networked systems design and implementation (NSDI 18)","author":"Lu Yuanwei","year":"2018","unstructured":"Yuanwei Lu, Guo Chen, Bojie Li, Kun Tan, Yongqiang Xiong, Peng Cheng, Jiansong Zhang, Enhong Chen, and Thomas Moscibroda. 2018. {Multi-Path} transport for {RDMA} in datacenters. In 15th USENIX symposium on networked systems design and implementation (NSDI 18). 357\u2013371."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3106989.3106993"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544238"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787510"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230557"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230564"},{"key":"e_1_3_2_1_44_1","unstructured":"NCCL. 2024. https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 2014 ACM conference on SIGCOMM. 307\u2013318","author":"Perry Jonathan","year":"2014","unstructured":"Jonathan Perry, Amy Ousterhout, Hari Balakrishnan, Devavrat Shah, and Hans Fugal. 2014. Fastpass: A centralized\" zero-queue\" datacenter network. In Proceedings of the 2014 ACM conference on SIGCOMM. 307\u2013318."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the ACM SIGCOMM 2022 Conference. 207\u2013218","author":"Qureshi Mubashir Adnan","year":"2022","unstructured":"Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, David Wetherall, and Abdul Kabbani. 2022. PLB: congestion signals are simple and effective for network load balancing. In Proceedings of the ACM SIGCOMM 2022 Conference. 207\u2013218."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2043164.2018467"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787472"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.3016891"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604849"},{"key":"e_1_3_2_1_51_1","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Vanini Erico","year":"2017","unstructured":"Erico Vanini, Rong Pan, Mohammad Alizadeh, Parvin Taheri, and Tom Edsall. 2017. Let it flow: Resilient asymmetric load balancing with flowlet switching. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). 407\u2013420."},{"key":"e_1_3_2_1_52_1","volume-title":"IEEE INFOCOM 2024-IEEE Conference on Computer Communications. IEEE, 1381\u20131390","author":"Wan Zirui","year":"2024","unstructured":"Zirui Wan, Jiao Zhang, Mingxuan Yu, Junwei Liu, Jun Yao, Xinghua Zhao, and Tao Huang. 2024. Bicc: Bilateral congestion control in cross-datacenter rdma networks. In IEEE INFOCOM 2024-IEEE Conference on Computer Communications. IEEE, 1381\u20131390."},{"key":"e_1_3_2_1_53_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Zilong","year":"2023","unstructured":"Zilong Wang, Layong Luo, Qingsong Ning, Chaoliang Zeng, Wenxue Li, Xinchen Wan, Peng Xie, Tao Feng, Ke Cheng, Xiongfei Geng, et al. 2023. {SRNIC}: A Scalable Architecture for {RDMA}{NICs}. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 1\u201314."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Xu Kaiqiang","year":"2025","unstructured":"Kaiqiang Xu, Decang Sun, Hao Wang, Zhenghang Ren, Xinchen Wan, Xudong Liao, Zilong Wang, Junxue Zhang, and Kai Chen. 2025. Design and Operation of Shared Machine Learning Clusters on Campus. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 295\u2013310."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098841"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787484"}],"event":{"name":"SIGCOMM '25: ACM SIGCOMM 2025 Conference","location":"S\u00e3o Francisco Convent Coimbra Portugal","acronym":"SIGCOMM '25","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the ACM SIGCOMM 2025 Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3718958.3750480","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T17:00:50Z","timestamp":1756314050000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3718958.3750480"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,27]]},"references-count":56,"alternative-id":["10.1145\/3718958.3750480","10.1145\/3718958"],"URL":"https:\/\/doi.org\/10.1145\/3718958.3750480","relation":{},"subject":[],"published":{"date-parts":[[2025,8,27]]},"assertion":[{"value":"2025-08-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}