{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:26:07Z","timestamp":1774599967021,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3718958.3750488","type":"proceedings-article","created":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:54:11Z","timestamp":1756313651000},"page":"381-394","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["CEIO: A Cache-Efficient Network I\/O Architecture for NIC-CPU Data Paths"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5949-9756","authenticated-orcid":false,"given":"Bowen","family":"Liu","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong, Hong Kong"},{"name":"Nanjing University, State Key Laboratory for Novel Software Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4830-7808","authenticated-orcid":false,"given":"Xinyang","family":"Huang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8875-1278","authenticated-orcid":false,"given":"Qijing","family":"Li","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9984-5438","authenticated-orcid":false,"given":"Zhuobin","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2831-0012","authenticated-orcid":false,"given":"Yijun","family":"Sun","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8228-2552","authenticated-orcid":false,"given":"Wenxue","family":"Li","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6926-7801","authenticated-orcid":false,"given":"Junxue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2115-1965","authenticated-orcid":false,"given":"Ping","family":"Yin","sequence":"additional","affiliation":[{"name":"Inspur, Jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2587-6028","authenticated-orcid":false,"given":"Kai","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,8,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604878"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2534169.2486031"},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2024. AMD Pensando\u2122 Pollara 400 Adapters. https:\/\/www.amd.com\/en\/products\/accelerators\/pensando.html. Accessed 2025-01-01."},{"key":"e_1_3_2_1_5_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Anderson Thomas E","year":"2020","unstructured":"Thomas E Anderson, Marco Canini, Jongyul Kim, Dejan Kosti\u0107, Youngjin Kwon, Simon Peter, Waleed Reda, Henry N Schuh, and Emmett Witchel. 2020. Assise: Performance and availability via client-local {NVM} in a distributed file system. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 1011\u20131027."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 13th ACM workshop on hot topics in networks. 1\u20137.","author":"Bai Wei","year":"2014","unstructured":"Wei Bai, Li Chen, Kai Chen, Dongsu Han, Chen Tian, and Weicheng Sun. 2014. PIAS: Practical information-agnostic flow scheduling for data center networks. In Proceedings of the 13th ACM workshop on hot topics in networks. 1\u20137."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543668"},{"key":"e_1_3_2_1_8_1","volume-title":"ARM cache stashing","author":"ARM.","year":"2017","unstructured":"ARM. ARM cache stashing. 2017. https:\/\/developer.arm.com\/documentation\/102407\/0100\/Cache-stashing. Accessed 2025-01-01."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472888"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the ACM SIGCOMM 2022 Conference. 767\u2013779","author":"Cai Qizhe","year":"2022","unstructured":"Qizhe Cai, Midhul Vuppalapati, Jaehyun Hwang, Christos Kozyrakis, and Rachit Agarwal. 2022. Towards \u03bc s tail latency and terabit ethernet: disaggregating the host network stack. In Proceedings of the ACM SIGCOMM 2022 Conference. 767\u2013779."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2022.3198843"},{"key":"e_1_3_2_1_12_1","volume-title":"Demystifying datapath accelerator enhanced off-path smartnic. arXiv preprint arXiv:2402.03041","author":"Chen Xuzheng","year":"2024","unstructured":"Xuzheng Chen, Jie Zhang, Ting Fu, Yifan Shen, Shu Ma, Kun Qian, Lingjun Zhu, Chao Shi, Yin Zhang, Ming Liu, and Zeke Wang. 2024. Demystifying datapath accelerator enhanced off-path smartnic. arXiv preprint arXiv:2402.03041 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303968"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the Conference of the ACM Special Interest Group on Data Communication. 1\u201314","author":"Chole Sharad","year":"2017","unstructured":"Sharad Chole, Andy Fingerhut, Sha Ma, Anirudh Sivaraman, Shay Vargaftik, Alon Berger, Gal Mendelson, Mohammad Alizadeh, Shang-Tse Chuang, Isaac Keslassy, Ariel Orda, and Tom Edsall. 2017. drmt: Disaggregated programmable switching. In Proceedings of the Conference of the ACM Special Interest Group on Data Communication. 1\u201314."},{"key":"e_1_3_2_1_15_1","volume-title":"Intel data direct i\/o technology (intel DDIO): A primer","author":"Intel Corporation","year":"2012","unstructured":"Intel Corporation. Intel data direct i\/o technology (intel DDIO): A primer. 2012. https:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/technology-briefs\/data-direct-i-o-technology-brief.pdf. Accessed 2025-01-01."},{"key":"e_1_3_2_1_16_1","volume-title":"11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14)","author":"Dragojevi\u0107 Aleksandar","year":"2014","unstructured":"Aleksandar Dragojevi\u0107, Dushyanth Narayanan, Miguel Castro, and Orion Hodson. 2014. FaRM: Fast remote memory. In 11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14). 401\u2013414."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3649455"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446724"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303977"},{"key":"e_1_3_2_1_20_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Farshin Alireza","year":"2020","unstructured":"Alireza Farshin, Amir Roozbeh, Gerald Q Maguire Jr, and Dejan Kosti\u0107. 2020. Reexamining Direct Cache Access to Optimize I\/O Intensive Applications for Multi-hundred-gigabit Networks. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). 673\u2013689."},{"key":"e_1_3_2_1_21_1","volume-title":"2020 IEEE 28th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM). IEEE, 38\u201346","author":"Forencich Alex","year":"2020","unstructured":"Alex Forencich, Alex C Snoeren, George Porter, and George Papen. 2020. Corundum: An open-source 100-gbps nic. In 2020 IEEE 28th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM). IEEE, 38\u201346."},{"key":"e_1_3_2_1_22_1","volume-title":"DPDK: Data plane development kit","author":"Foundation Linux","year":"2024","unstructured":"Linux Foundation. 2024. DPDK: Data plane development kit. http:\/\/dpdk.org. Accessed 2025-01-01."},{"key":"e_1_3_2_1_23_1","unstructured":"Linux Foundation. 2025. RDMA-Core github repository. https:\/\/github.com\/linux-rdma\/rdma-core. Accessed 2025-01-01."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691825.3691829"},{"key":"e_1_3_2_1_25_1","unstructured":"Google. 2025. gRPC github repository. https:\/\/github.com\/grpc\/grpc. Accessed 2025-01-01."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3281411.3281443"},{"key":"e_1_3_2_1_28_1","volume-title":"Understanding Routable PCIe Performance for Composable Infrastructures. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hou Wentao","year":"2024","unstructured":"Wentao Hou, Jie Zhang, Zeke Wang, and Ming Liu. 2024. Understanding Routable PCIe Performance for Composable Infrastructures. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 297\u2013312."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2005.23"},{"key":"e_1_3_2_1_30_1","unstructured":"The implementation of low-level I\/O operations in eRPC. Line 110\u2013141. https:\/\/github.com\/erpc-io\/eRPC\/blob\/master\/src\/transport_impl\/dpdk\/dpdk_transport_datapath.cc. Accessed 2025-01-01."},{"key":"e_1_3_2_1_31_1","unstructured":"Baidu Inc. 2025. dPerf github repository. https:\/\/github.com\/baidu\/dperf. Accessed 2025-01-01."},{"key":"e_1_3_2_1_32_1","unstructured":"Intel. 2021. Intel Xeon Silver 4309Y Specification. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/sku\/215275\/intel-xeon-silver-4309y-processor-12m-cache-2-80-ghz\/specifications.html. Accessed 2025-01-01."},{"key":"e_1_3_2_1_33_1","unstructured":"Intel. 2024. ntel\u00ae Infrastructure Processing Unit (Intel\u00ae IPU). https:\/\/www.intel.com\/content\/www\/us\/en\/products\/details\/network-io\/ipu.html. Accessed 2025-01-01."},{"key":"e_1_3_2_1_34_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU clusters for DNN training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 947\u2013960."},{"key":"e_1_3_2_1_35_1","volume-title":"11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14)","author":"Jeong EunYoung","year":"2014","unstructured":"EunYoung Jeong, Shinae Wood, Muhammad Jamshed, Haewon Jeong, Sunghwan Ihm, Dongsu Han, and KyoungSoo Park. 2014. mTCP: a Highly Scalable User-level TCP Stack for Multicore Systems. In 11th USENIX Symposium on Networked Systems Design and Implementation (NSDI 14). 489\u2013502."},{"key":"e_1_3_2_1_36_1","volume-title":"SOSP '21: ACM SIGOPS 28th Symposium on Operating Systems Principles, Virtual Event \/ Koblenz, Germany, October 26\u201329","author":"Jongyul Kim","year":"2021","unstructured":"Kim Jongyul, Jang Insu, Reda Waleed, Im Jaeseong, Canini Marco, Kostic Dejan, Kwon Youngjin, Peter Simon, and Witchel Emmett. 2021. LineFS: Efficient Smart-NIC Offload of a Distributed File System with Pipeline Parallelism. In SOSP '21: ACM SIGOPS 28th Symposium on Operating Systems Principles, Virtual Event \/ Koblenz, Germany, October 26\u201329, 2021. 756\u2013771."},{"key":"e_1_3_2_1_37_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Kalia Anuj","year":"2019","unstructured":"Anuj Kalia, Michael Kaminsky, and David Andersen. 2019. Datacenter RPC can be general and fast. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). 1\u201316."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626299"},{"key":"e_1_3_2_1_39_1","volume-title":"2016 USENIX Annual Technical Conference (USENIX ATC 16)","author":"Kalia Anuj","year":"2016","unstructured":"Anuj Kalia, Michael Kaminsky, and David G Andersen. 2016. Design guidelines for high performance RDMA systems. In 2016 USENIX Annual Technical Conference (USENIX ATC 16). 437\u2013450."},{"key":"e_1_3_2_1_40_1","volume-title":"Scalable and Simple Distributed Transactions with Two-Sided RDMA Datagram RPCs. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Kalia Anuj","year":"2016","unstructured":"Anuj Kalia, Michael Kaminsky, and David G Andersen. 2016. FaSST: fast, Scalable and Simple Distributed Transactions with Two-Sided RDMA Datagram RPCs. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 185\u2013201."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897393"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691825.3691831"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3663408.3663409"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342079"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the Nineteenth European Conference on Computer Systems. 132\u2013147","author":"Lu Fangming","year":"2024","unstructured":"Fangming Lu, Xingda Wei, Zhuobin Huang, Rong Chen, Minyu Wu, and Haibo Chen. 2024. Serialization\/Deserialization-free State Transfer in Serverless Work-flows. In Proceedings of the Nineteenth European Conference on Computer Systems. 132\u2013147."},{"key":"e_1_3_2_1_46_1","unstructured":"Marvell. 2020. Marvell LiquidIO\u2122 III. https:\/\/www.marvell.com\/content\/dam\/marvell\/en\/public-collateral\/embedded-processors\/marvell-liquidio-III-solutions-brief.pdf. Accessed 2025-01-01."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230543.3230560"},{"key":"e_1_3_2_1_48_1","unstructured":"NVIDIA. 2020. https:\/\/www.nvidia.com\/en-us\/geforce\/graphics-cards\/30-series\/rtx-3090-3090ti\/. Accessed 2025-01-01."},{"key":"e_1_3_2_1_49_1","unstructured":"NVIDIA. 2022. https:\/\/www.nvidia.com\/en-us\/geforce\/graphics-cards\/40-series\/rtx-4090\/. Accessed 2025-01-01."},{"key":"e_1_3_2_1_50_1","unstructured":"NVIDIA. 2024. https:\/\/docs.nvidia.com\/cuda\/gpudirect-rdma\/. Accessed 2025-01-01."},{"key":"e_1_3_2_1_51_1","unstructured":"NVIDIA. 2024. DOCA Software Development Kit v2.9.2 LTS. https:\/\/docs.nvidia.com\/doca\/archive\/2-9-2-lts-ovs-update\/index.html. Accessed 2025-01-01."},{"key":"e_1_3_2_1_52_1","unstructured":"NVIDIA. 2024. NVIDIA ConnectX-7 Network Adapter. https:\/\/docs.nvidia.com\/networking\/display\/mlnxofedv24100700\/flow+steering. Accessed 2025-01-01."},{"key":"e_1_3_2_1_53_1","unstructured":"NVIDIA. 2024. OVS Offload Using ASAP Direct. https:\/\/docs.nvidia.com\/networking\/display\/mlnxofedv24100700\/ovs+offload+using+asap2+direct. Accessed 2025-01-01."},{"key":"e_1_3_2_1_54_1","unstructured":"NVIDIA. 2024. Performance Tuning for Mellanox Adapters. https:\/\/enterprise-support.nvidia.com\/s\/article\/performance-tuning-for-mellanox-adapters. Accessed 2025-01-01."},{"key":"e_1_3_2_1_55_1","unstructured":"NVIDIA. 2024. Shared Receive Queues in BlueField-3 Platform. https:\/\/docs.nvidia.com\/networking\/display\/bluefielddpuosv452\/shared+rq+mode. Accessed 2025-01-01."},{"key":"e_1_3_2_1_56_1","unstructured":"NVIDIA. 2025. NCCL github repository. https:\/\/github.com\/NVIDIA\/nccl. Accessed 2025-01-01."},{"key":"e_1_3_2_1_57_1","unstructured":"OpenMPI. 2025. OpenMPI github repository. https:\/\/github.com\/open-mpi\/ompi. Accessed 2025-01-01."},{"key":"e_1_3_2_1_58_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Ousterhout Amy","year":"2019","unstructured":"Amy Ousterhout, Joshua Fried, Jonathan Behrens, Adam Belay, and Hari Balakrishnan. 2019. Shenango: Achieving high CPU efficiency for latency-sensitive datacenter workloads. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). 361\u2013378."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.5555\/2789770.2789779"},{"key":"e_1_3_2_1_60_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Pirelli Solal","year":"2020","unstructured":"Solal Pirelli and George Candea. 2020. A Simpler and Faster NIC Driver Model for Network Functions. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 225\u2013241."},{"key":"e_1_3_2_1_61_1","volume-title":"ShRing: Networking with Shared Receive Rings. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Pismenny Boris","year":"2023","unstructured":"Boris Pismenny, Adam Morrison, and Dan Tsafrir. 2023. ShRing: Networking with Shared Receive Rings. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 949\u2013968."},{"key":"e_1_3_2_1_62_1","unstructured":"NVIDIA BlueField Networking Platform. 2025. https:\/\/www.nvidia.com\/en-us\/networking\/products\/data-processing-unit\/. Accessed 2025-01-01."},{"key":"e_1_3_2_1_63_1","unstructured":"Redis. 2025. Redis github repository. https:\/\/github.com\/redis\/redis. Accessed 2025-01-01."},{"key":"e_1_3_2_1_64_1","volume-title":"19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)","author":"Ren Zhenghang","year":"2025","unstructured":"Zhenghang Ren, Yuxuan Li, Zilong Wang, Xinyang Huang, Wenxue Li, Kaiqiang Xu, Xudong Liao, Yijun Sun, Bowen Liu, Han Tian, Junxue Zhang, Mingfei Wang, Zhizhen Zhong, Guyue Liu, Ying Zhang, and Kai. Chen. 2025. Enabling Efficient GPU Communication over Multiple NICs with FuseLink. In 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25). 91\u2013108."},{"key":"e_1_3_2_1_65_1","volume-title":"Ens\u03c9: A Streaming Interface for NIC-Application Communication. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Sadok Hugo","year":"2023","unstructured":"Hugo Sadok, Nirav Atre, Zhipeng Zhao, Daniel S Berger, James C Hoe, Aurojit Panda, Justine Sherry, and Ren Wang. 2023. Ens\u03c9: A Streaming Interface for NIC-Application Communication. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 1005\u20131025."},{"key":"e_1_3_2_1_66_1","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Schuh Henry N","year":"2024","unstructured":"Henry N Schuh, Arvind Krishnamurthy, David Culler, Henry M Levy, Luigi Rizzo, Samira Khan, and Brent E Stephens. 2024. CC-NIC: a Cache-Coherent Interface to the NIC. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 52\u201368."},{"key":"e_1_3_2_1_67_1","unstructured":"Apache Spark. 2025. Spark github repository. https:\/\/github.com\/apache\/spark. Accessed 2025-01-01."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2010.5416638"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3230718.3230727"},{"key":"e_1_3_2_1_70_1","volume-title":"CacheCloud: Towards Speed-of-light Datacenter Communication. In 10th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 18)","author":"Thomas Shelby","year":"2018","unstructured":"Shelby Thomas, Geoffrey M Voelker, and George Porter. 2018. CacheCloud: Towards Speed-of-light Datacenter Communication. In 10th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 18)."},{"key":"e_1_3_2_1_71_1","volume-title":"Introduction to receive side scaling","year":"2017","unstructured":"Microsoft. Introduction to receive side scaling. 2017. https:\/\/docs.microsoft.com\/en-us\/windows-hardware\/drivers\/network\/introduction-to-receive-side-scaling. Accessed 2025-01-01."},{"key":"e_1_3_2_1_72_1","volume-title":"Scaling in the linux networking stack","author":"Tom Herbert","year":"2011","unstructured":"Herbert Tom and de Bruijn Willem. Scaling in the linux networking stack. 2011. https:\/\/www.kernel.org\/doc\/Documentation\/networking\/scaling.txt. Accessed 2025-01-01."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672271"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/TON.2024.3524247"},{"key":"e_1_3_2_1_75_1","volume-title":"Towards Domain-Specific Network Transport for Distributed DNN Training. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Wang Hao","year":"2024","unstructured":"Hao Wang, Han Tian, Jingrong Chen, Xinchen Wan, Jiacheng Xia, Gaoxiong Zeng, Wei Bai, Junchen Jiang, Yong Wang, and Kai Chen. 2024. Towards Domain-Specific Network Transport for Distributed DNN Training. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). 1421\u20131443."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3508042"},{"key":"e_1_3_2_1_77_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Zilong","year":"2023","unstructured":"Zilong Wang, Layong Luo, Qingsong Ning, Chaoliang Zeng, Wenxue Li, Xinchen Wan, Peng Xie, Tao Feng, Ke Cheng, Xiongfei Geng, Tianhao Wang, Weicheng Ling, Kejia Huo, Pingbo An, Kui Ji, Shideng Zhang, Bin Xu, Ruiqing Feng, Tao Ding, Kai Chen, and Chuanxiong Guo. 2023. SRNIC: A scalable architecture for RDMA NICs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 1\u201314."},{"key":"e_1_3_2_1_78_1","volume-title":"Characterizing Off-path SmartNIC for Accelerating Distributed Systems. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Wei Xingda","year":"2023","unstructured":"Xingda Wei, Rongxin Cheng, Yuhan Yang, Rong Chen, and Haibo Chen. 2023. Characterizing Off-path SmartNIC for Accelerating Distributed Systems. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 987\u20131004."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/2815400.2815419"},{"key":"e_1_3_2_1_80_1","volume-title":"Cepheus: Accelerating Datacenter Applications with High-Performance RoCE-Capable Multicast. In IEEE International Symposium on High-Performance Computer Architecture, HPCA 2024","author":"Wenxue Li","year":"2024","unstructured":"Li Wenxue, Zhang Junyi, Liu Yufei, Zeng Gaoxiong, Wang Zilong, Zeng Chaoliang, Zhou Pengpeng, Qiaoling Wang, and Kai Chen. 2024. Cepheus: Accelerating Datacenter Applications with High-Performance RoCE-Capable Multicast. In IEEE International Symposium on High-Performance Computer Architecture, HPCA 2024, Edinburgh, United Kingdom, March 2\u20136, 2024. IEEE, 908\u2013921."},{"key":"e_1_3_2_1_81_1","volume-title":"Proceedings of the ACM SIGCOMM 2023 Conference. 1028\u20131042","author":"Xing Jiarong","year":"2023","unstructured":"Jiarong Xing, Yiming Qiu, Kuo-Feng Hsu, Songyuan Sui, Khalid Manaa, Omer Shabtai, Yonatan Piasetzky, Matty Kadosh, Arvind Krishnamurthy, TS Eugene Ng, and Ang Chen. 2023. Unleashing SmartNIC packet processing performance in P4. In Proceedings of the ACM SIGCOMM 2023 Conference. 1028\u20131042."},{"key":"e_1_3_2_1_82_1","volume-title":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"1","author":"Xu Kaiqiang","year":"2025","unstructured":"Kaiqiang Xu, Decang Sun, Hao Wang, Zhenghang Ren, Xinchen Wan, Xudong Liao, Zilong Wang, Junxue Zhang, and Kai Chen. 2025. Design and Operation of Shared Machine Learning Clusters on Campus. In Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1. 295\u2013310."},{"key":"e_1_3_2_1_83_1","volume-title":"17th USENIX Conference on File and Storage Technologies (FAST 19)","author":"Yang Jian","year":"2019","unstructured":"Jian Yang, Joseph Izraelevitz, and Steven Swanson. 2019. Orion: A distributed file system for Non-Volatile main memory and RDMA-Capable networks. In 17th USENIX Conference on File and Storage Technologies (FAST 19). 221\u2013234."},{"key":"e_1_3_2_1_84_1","volume-title":"2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, 112\u2013125","author":"Yuan Yifan","year":"2021","unstructured":"Yifan Yuan, Mohammad Alian, Yipeng Wang, Ren Wang, Ilia Kurakin, Charlie Tai, and Nam Sung Kim. 2021. Don't forget the I\/O when allocating your LLC. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, 112\u2013125."}],"event":{"name":"SIGCOMM '25: ACM SIGCOMM 2025 Conference","location":"S\u00e3o Francisco Convent Coimbra Portugal","acronym":"SIGCOMM '25","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the ACM SIGCOMM 2025 Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3718958.3750488","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T17:00:25Z","timestamp":1756314025000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3718958.3750488"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,27]]},"references-count":84,"alternative-id":["10.1145\/3718958.3750488","10.1145\/3718958"],"URL":"https:\/\/doi.org\/10.1145\/3718958.3750488","relation":{},"subject":[],"published":{"date-parts":[[2025,8,27]]},"assertion":[{"value":"2025-08-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}