{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T03:35:26Z","timestamp":1768534526515,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,3]],"date-time":"2024-08-03T00:00:00Z","timestamp":1722643200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62132022, No.62372053"],"award-info":[{"award-number":["No.62132022, No.62372053"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"Natural Science Foundation of Shandong Province","doi-asserted-by":"publisher","award":["No.ZR2023LZH011"],"award-info":[{"award-number":["No.ZR2023LZH011"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,3]]},"DOI":"10.1145\/3663408.3663426","type":"proceedings-article","created":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T12:23:29Z","timestamp":1719923009000},"page":"122-128","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Hostmesh: Monitor and Diagnose Networks in Rail-optimized RoCE Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5874-6610","authenticated-orcid":false,"given":"Kefei","family":"Liu","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5614-3420","authenticated-orcid":false,"given":"Jiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6144-7899","authenticated-orcid":false,"given":"Zhuo","family":"Jiang","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8781-2700","authenticated-orcid":false,"given":"Xuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0542-8281","authenticated-orcid":false,"given":"Shixian","family":"Guo","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1201-3293","authenticated-orcid":false,"given":"Yangyang","family":"Bai","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8576-9109","authenticated-orcid":false,"given":"Yongbin","family":"Dong","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4427-7015","authenticated-orcid":false,"given":"Zhang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6179-4332","authenticated-orcid":false,"given":"Xiang","family":"Shi","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3809-1879","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4093-3187","authenticated-orcid":false,"given":"Haoran","family":"Wei","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8575-7195","authenticated-orcid":false,"given":"Zicheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Douyin Vision Co., Ltd., China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4628-7535","authenticated-orcid":false,"given":"Yongchen","family":"Pan","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7718-0669","authenticated-orcid":false,"given":"Tian","family":"Pan","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3545-1122","authenticated-orcid":false,"given":"Tao","family":"Huang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18)","author":"Arzani Behnaz","year":"2018","unstructured":"Behnaz Arzani, Selim Ciraci, Luiz Chamon, Yibo Zhu, Hongqiang\u00a0Harry Liu, Jitu Padhye, Boon\u00a0Thau Loo, and Geoff Outhred. 2018. 007: Democratically finding the cause of packet drops. In 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18). 419\u2013435."},{"key":"e_1_3_2_1_2_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Bai Wei","year":"2023","unstructured":"Wei Bai, Shanim\u00a0Sainul Abdeen, Ankit Agrawal, Krishan\u00a0Kumar Attre, Paramvir Bahl, Ameya Bhagat, Gowri Bhaskara, Tanya Brokhman, Lei Cao, Ahmad Cheema, 2023. Empowering azure storage with RDMA. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 49\u201367."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3405894"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1644893.1644924"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00056"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.2006.885460"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3405851"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026895"},{"key":"e_1_3_2_1_9_1","volume-title":"When Cloud Storage Meets RDMA. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Gao Yixiao","year":"2021","unstructured":"Yixiao Gao, Qiang Li, Lingbo Tang, Yongqing Xi, Pengcheng Zhang, Wenwen Peng, Bo Li, Yaohui Wu, Shaozong Liu, Lei Yan, 2021. When Cloud Storage Meets RDMA. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). 519\u2013533."},{"key":"e_1_3_2_1_10_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Geng Yilong","year":"2019","unstructured":"Yilong Geng, Shiyu Liu, Zi Yin, Ashish Naik, Balaji Prabhakar, Mendel Rosenblum, and Amin Vahdat. 2019. SIMON: A simple and scalable method for sensing, inference and measurement in data center networks. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). 549\u2013564."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787496"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230555"},{"key":"e_1_3_2_1_14_1","volume-title":"11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14)","author":"Handigol Nikhil","year":"2014","unstructured":"Nikhil Handigol, Brandon Heller, Vimalkumar Jeyakumar, David Mazi\u00e8res, and Nick McKeown. 2014. I know what your packet did last hop: Using packet histories to troubleshoot networks. In 11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14). 71\u201385."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3405877"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/1452335.1452343"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_18_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Kalia Anuj","year":"2019","unstructured":"Anuj Kalia, Michael Kaminsky, and David Andersen. 2019. Datacenter RPCs can be General and Fast. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). 1\u201316."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626299"},{"key":"e_1_3_2_1_20_1","volume-title":"Scalable and Simple Distributed Transactions with Two-Sided Datagram RPCs. In 12th USENIX Symposium on Operating Systems Design and Implementation. 185\u2013201","author":"Kalia Anuj","year":"2016","unstructured":"Anuj Kalia, Michael Kaminsky, and David\u00a0G Andersen. 2016. FaSST: Fast, Scalable and Simple Distributed Transactions with Two-Sided Datagram RPCs. In 12th USENIX Symposium on Operating Systems Design and Implementation. 185\u2013201."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604827"},{"key":"e_1_3_2_1_22_1","unstructured":"Linux. 2024. rdma-core. https:\/\/github.com\/linux-rdma\/rdma-core."},{"key":"e_1_3_2_1_23_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Liu Kefei","year":"2023","unstructured":"Kefei Liu, Zhuo Jiang, Jiao Zhang, Haoran Wei, Xiaolong Zhong, Lizhuang Tan, Tian Pan, and Tao Huang. 2023. Hostping: Diagnosing intra-host network bottlenecks in RDMA servers. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 15\u201329."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2019.8891004"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934879"},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA. 2023. Doubling all2all Performance with NVIDIA Collective Communication Library 2.12. https:\/\/developer.nvidia.com\/blog\/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12\/."},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA. 2023. NCCL Environment Variables. https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/env.html."},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA. 2023. NVIDIA DGX SuperPOD: Next Generation Scalable Infrastructure for AI Leadership. https:\/\/docs.nvidia.com\/https:\/docs.nvidia.com\/dgx-superpod-reference-architecture-dgx-h100.pdf."},{"key":"e_1_3_2_1_29_1","unstructured":"NVIDIA. 2024. NCCL. https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_2_1_30_1","volume-title":"2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Peng Yanghua","year":"2017","unstructured":"Yanghua Peng, Ji Yang, Chuan Wu, Chuanxiong Guo, Chengchen Hu, and Zongpeng Li. 2017. deTector: a Topology-aware Monitoring System for Data Center Networks. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). 55\u201368."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626310"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the Internet Measurement Conference","author":"Roy Arjun","year":"2018","unstructured":"Arjun Roy, Deepak Bansal, David Brumley, Harish\u00a0Kumar Chandrappa, Parag Sharma, Rishabh Tewari, Behnaz Arzani, and Alex\u00a0C Snoeren. 2018. Cloud datacenter sdn monitoring: Experiences and challenges. In Proceedings of the Internet Measurement Conference 2018. 464\u2013470."},{"key":"e_1_3_2_1_33_1","volume-title":"NetBouncer: Active Device and Link Failure Localization in Data Center Networks. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Tan Cheng","year":"2019","unstructured":"Cheng Tan, Ze Jin, Chuanxiong Guo, Tianrong Zhang, Haitao Wu, Karl Deng, Dongming Bi, and Dong Xiang. 2019. NetBouncer: Active Device and Link Failure Localization in Data Center Networks. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). 599\u2013614."},{"key":"e_1_3_2_1_34_1","volume-title":"Rat-Resilient Allreduce Tree for Distributed Machine Learning. In 4th Asia-Pacific Workshop on Networking. 52\u201357","author":"Wan Xinchen","year":"2020","unstructured":"Xinchen Wan, Hong Zhang, Hao Wang, Shuihai Hu, Junxue Zhang, and Kai Chen. 2020. Rat-Resilient Allreduce Tree for Distributed Machine Learning. In 4th Asia-Pacific Workshop on Networking. 52\u201357."},{"key":"e_1_3_2_1_35_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Wang Weitao","year":"2022","unstructured":"Weitao Wang, Xinyu\u00a0Crystal Wu, Praveen Tammana, Ang Chen, and TS\u00a0Eugene Ng. 2022. Closed-loop network performance monitoring and diagnosis with SpiderMon. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 267\u2013285."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230544"},{"key":"e_1_3_2_1_37_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Yu Da","year":"2019","unstructured":"Da Yu, Yibo Zhu, Behnaz Arzani, Rodrigo Fonseca, Tianrong Zhang, Karl Deng, and Lihua Yuan. 2019. dShark: A General, Easy to Program and Scalable Framework for Analyzing In-network Packet Traces. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). 207\u2013220."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604837"},{"key":"e_1_3_2_1_39_1","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation. 991\u20131010","author":"Zhao Yikai","year":"2021","unstructured":"Yikai Zhao, Kaicheng Yang, Zirui Liu, Tong Yang, Li Chen, Shiyi Liu, Naiqian Zheng, Ruixin Wang, Hanbo Wu, Yi Wang, 2021. LightGuardian: A full-visibility, lightweight, in-band telemetry system using sketchlets. In 18th USENIX Symposium on Networked Systems Design and Implementation. 991\u20131010."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3387514.3406214"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3555050.3569116"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787483"}],"event":{"name":"APNet 2024: The 8th Asia-Pacific Workshop on Networking","location":"Sydney Australia","acronym":"APNet 2024"},"container-title":["Proceedings of the 8th Asia-Pacific Workshop on Networking"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3663408.3663426","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3663408.3663426","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T23:32:38Z","timestamp":1755905558000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3663408.3663426"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,3]]},"references-count":42,"alternative-id":["10.1145\/3663408.3663426","10.1145\/3663408"],"URL":"https:\/\/doi.org\/10.1145\/3663408.3663426","relation":{},"subject":[],"published":{"date-parts":[[2024,8,3]]},"assertion":[{"value":"2024-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}