{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T08:02:50Z","timestamp":1773734570119,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Research Grants Council, University Grants Committee","doi-asserted-by":"publisher","award":["Hong Kong RGC TRS T41-603\/20-R"],"award-info":[{"award-number":["Hong Kong RGC TRS T41-603\/20-R"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707266","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"295-310","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Design and Operation of Shared Machine Learning Clusters on Campus"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0501-5968","authenticated-orcid":false,"given":"Kaiqiang","family":"Xu","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3442-4656","authenticated-orcid":false,"given":"Decang","family":"Sun","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9883-2400","authenticated-orcid":false,"given":"Hao","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8779-4768","authenticated-orcid":false,"given":"Zhenghang","family":"Ren","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6503-5309","authenticated-orcid":false,"given":"Xinchen","family":"Wan","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8380-1879","authenticated-orcid":false,"given":"Xudong","family":"Liao","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3184-4081","authenticated-orcid":false,"given":"Zilong","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6926-7801","authenticated-orcid":false,"given":"Junxue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2587-6028","authenticated-orcid":false,"given":"Kai","family":"Chen","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"e_1_3_2_1_2_1","volume-title":"Retrieved","author":"Services Amazon Web","year":"2024","unstructured":"Amazon Web Services 2024. Amazon Machine Images in Amazon EC2. Retrieved December 13, 2024 from hhttps:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/AMIs.html"},{"key":"e_1_3_2_1_3_1","volume-title":"Retrieved","year":"2024","unstructured":"Anaconda. 2024. User guide - conda documentation. Retrieved December 13, 2024 from https:\/\/docs.conda.io\/projects\/conda\/en\/latest\/userguide\/index.html"},{"key":"e_1_3_2_1_4_1","volume-title":"Information-Agnostic Flow Scheduling for Commodity Data Centers. In 12th USENIX Symposium on Networked Systems Design and Implementation, NSDI 15","author":"Bai Wei","year":"2015","unstructured":"Wei Bai, Kai Chen, Hao Wang, Li Chen, Dongsu Han, and Chen Tian. 2015. Information-Agnostic Flow Scheduling for Commodity Data Centers. In 12th USENIX Symposium on Networked Systems Design and Implementation, NSDI 15, Oakland, CA, USA, May 4--6, 2015. USENIX Association, 455--468. https:\/\/www.usenix.org\/conference\/nsdi15\/technical-sessions\/presentation\/bai"},{"key":"e_1_3_2_1_5_1","volume-title":"Retrieved","author":"Brophy Bill","year":"2014","unstructured":"Bill Brophy, Martin Perry, Moe Jette, Yiannis Georgiou, and Matthieu Hautreux. 2014. Slurm Processes Isolation. Retrieved December 13, 2024 from https:\/\/slurm.schedmd.com\/SUG14\/process_isolation.pdf"},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6--12, 2020, virtual. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2012","author":"Chen Kai","year":"2012","unstructured":"Kai Chen, Ankit Singla, Atul Singh, Kishore Ramachandran, Lei Xu, Yueping Zhang, Xitao Wen, and Yan Chen. 2012. OSA: An Optical Switching Architecture for Data Center Networks with Unprecedented Flexibility. In Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2012, San Jose, CA, USA, April 25--27, 2012. USENIX Association, 239--252. https:\/\/www.usenix.org\/conference\/nsdi12\/technical-sessions\/presentation\/chen_kai"},{"key":"e_1_3_2_1_8_1","volume-title":"Retrieved","author":"Containerd","year":"2024","unstructured":"Containerd 2024. An industry-standard container runtime with an emphasis on simplicity, robustness and portability. Retrieved December 13, 2024 from https:\/\/containerd.io\/"},{"key":"e_1_3_2_1_9_1","unstructured":"DaoCloud. 2024. https:\/\/github.com\/daocloud."},{"key":"e_1_3_2_1_10_1","volume-title":"Retrieved","year":"2024","unstructured":"Docker. 2024. Dockerfile overview. Retrieved December 13, 2024 from https:\/\/docs.docker.com\/build\/concepts\/dockerfile\/"},{"key":"e_1_3_2_1_11_1","volume-title":"Retrieved","author":"Edmon Paul","year":"2022","unstructured":"Paul Edmon. 2022. Cluster Fragmentation. Retrieved December 13, 2024 from https:\/\/www.rc.fas.harvard.edu\/blog\/cluster-fragmentation\/"},{"key":"e_1_3_2_1_12_1","volume-title":"Retrieved","year":"2024","unstructured":"GitHub 2024. Github Pull Request: Submit kubernetes job without changing tuxiv.conf format. Retrieved December 13, 2024 from https:\/\/github.com\/turingaicloud\/tcloud-sdk\/pull\/2"},{"key":"e_1_3_2_1_13_1","volume-title":"d.]. Gluster Documentation. Retrieved","author":"FS.","year":"2024","unstructured":"GlusterFS. [n. d.]. Gluster Documentation. Retrieved December 13, 2024 from https:\/\/docs.gluster.org\/en\/main\/"},{"key":"e_1_3_2_1_14_1","volume-title":"d.]. Get started | Cloud TPU | Google Cloud. Retrieved","year":"2024","unstructured":"Google. [n. d.]. Get started | Cloud TPU | Google Cloud. Retrieved December 13, 2024 from https:\/\/cloud.google.com\/tpu\/docs\/quick-starts"},{"key":"e_1_3_2_1_15_1","volume-title":"Retrieved","author":"Google Groups","year":"2022","unstructured":"Google Groups. 2022. What is an easy way to prevent users from running programs on the master\/login node. Retrieved December 13, 2024 from https:\/\/groups.google.com\/g\/slurm-users\/c\/HKMTjPQN9l0"},{"key":"e_1_3_2_1_16_1","volume-title":"Retrieved","author":"Haleva Raz","year":"2021","unstructured":"Raz Haleva. 2021. The Challenges of Sharing GPUs and How to Solve Them. Retrieved December 13, 2024 from https:\/\/developer.hpe.com\/blog\/the-challenges-of-sharing-gpus-and-how-to-solve-them\/"},{"key":"e_1_3_2_1_17_1","volume-title":"High Performance TCP\/HTTP Load Balancer. Retrieved","year":"2024","unstructured":"HAProxy 2024. HAProxy, The Reliable, High Performance TCP\/HTTP Load Balancer. Retrieved December 13, 2024 from https:\/\/www.haproxy.org\/"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575705"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439295"},{"key":"e_1_3_2_1_20_1","volume-title":"Retrieved","author":"Face Hugging","year":"2024","unstructured":"Hugging Face. 2024. Datasets - Hugging Face. Retrieved December 13, 2024 from https:\/\/huggingface.co\/datasets'sort=most_rows"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 2019 USENIX Annual Technical Conference, USENIX ATC 2019","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In Proceedings of the 2019 USENIX Annual Technical Conference, USENIX ATC 2019, Renton, WA, USA, July 10--12, 2019. USENIX Association, 947--960. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/jeon"},{"key":"e_1_3_2_1_22_1","unstructured":"Microsoft 2024. Overview of Single Root I\/O Virtualization (SRIOV). https:\/\/learn.microsoft.com\/en-us\/windows-hardware\/drivers\/network\/overview-of-single-root-i-o-virtualization--sr-iov-."},{"key":"e_1_3_2_1_23_1","volume-title":"Ray: A Distributed Framework for Emerging AI Applications. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I. Jordan, and Ion Stoica. 2018. Ray: A Distributed Framework for Emerging AI Applications. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8--10, 2018. USENIX Association, 561--577. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/nishihara"},{"key":"e_1_3_2_1_24_1","volume-title":"Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Narayanan Deepak","year":"2020","unstructured":"Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. 2020. Heterogeneity-Aware Cluster Scheduling Policies for Deep Learning Workloads. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4--6, 2020. USENIX Association, 481--498. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/narayanan-deepak"},{"key":"e_1_3_2_1_25_1","volume-title":"Retrieved","author":"NVIDIA","year":"2020","unstructured":"NVIDIA 2020. Overview of NCCL - NCCL documentation. Retrieved December 13, 2024 from https:\/\/docs.nvidia.com\/deeplearning\/nccl\/user-guide\/docs\/overview.html"},{"key":"e_1_3_2_1_26_1","volume-title":"Retrieved","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2024. About CUDA | NVIDIA Developer. Retrieved December 13, 2024 from https:\/\/developer.nvidia.com\/about-cuda"},{"key":"e_1_3_2_1_27_1","volume-title":"Retrieved","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. 2024. Data Center Networking Concepts. Retrieved December 13, 2024 from https:\/\/docs.nvidia.com\/networkingethernet-software\/guides\/EVPN-Network-Reference\/Data-Center-Networking-Concepts\/"},{"key":"e_1_3_2_1_28_1","volume-title":"Retrieved","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2024. NVIDIA Solutions for Higher Education and Research. Retrieved December 13, 2024 from https:\/\/www.nvidia.com\/en-us\/industries\/higher-education-research\/"},{"key":"e_1_3_2_1_29_1","volume-title":"Retrieved","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2024. Single Root IO Virtualization (SR-IOV) - NVIDIA Docs. Retrieved December 13, 2024 from https:\/\/docs.nvidia.com\/networking\/display\/mlnxofedv24010331\/singlerootiovirtualization(sr-iov)"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437359.3465603"},{"key":"e_1_3_2_1_31_1","volume-title":"Retrieved","author":"Pan Peter","year":"2024","unstructured":"Peter Pan and Kaiqiang Xu. 2024. Breaking Boundaries: TACC as an Unified Cloud-Native Infra for AI HPC. Retrieved December 13, 2024 from https:\/\/www.classcentral.com\/course\/youtube-breakingboundaries-tacc-as-an-unified-cloud-native-infra-for-ai-hpc-peterpan-kaiqiang-xu-312983"},{"key":"e_1_3_2_1_32_1","volume-title":"Retrieved","author":"Computing Princeton Research","year":"2024","unstructured":"Princeton Research Computing. [n. d.]. Top 10 Mistakes to Avoid on the Research Computing Clusters. Retrieved December 13, 2024 from https: \/\/researchcomputing.princeton.edu\/get-started\/mistakes-avoid"},{"key":"e_1_3_2_1_33_1","volume-title":"22nd USENIX Conference on File and Storage Technologies, FAST 2024","author":"Qian Yingjin","year":"2024","unstructured":"Yingjin Qian, Marc-Andr\u00e9 Vef, Patrick Farrell, Andreas Dilger, Xi Li, Shuichi Ihara, Yinjin Fu, Wei Xue, and Andr\u00e9 Brinkmann. 2024. Combining Buffered I\/O and Direct I\/O in Distributed File Systems. In 22nd USENIX Conference on File and Storage Technologies, FAST 2024, Santa Clara, CA, USA, February 27--29, 2024. USENIX Association, 17--33. https:\/\/www.usenix.org\/conference\/fast24\/presentation\/qian"},{"key":"e_1_3_2_1_34_1","volume-title":"Pollux: Co-adaptive Cluster Scheduling for Goodput- Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2021","author":"Qiao Aurick","year":"2021","unstructured":"Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R. Ganger, and Eric P. Xing. 2021. Pollux: Co-adaptive Cluster Scheduling for Goodput- Optimized Deep Learning. In 15th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2021, July 14--16, 2021. USENIX Association. https:\/\/www.usenix.org\/conference\/osdi21\/ presentation\/qiao"},{"key":"e_1_3_2_1_35_1","volume-title":"Retrieved","year":"2024","unstructured":"Ray. [n. d.]. Ray Clusters Overview. Retrieved December 13, 2024 from https:\/\/docs.ray.io\/en\/latest\/cluster\/getting-started.html"},{"key":"e_1_3_2_1_36_1","volume-title":"Retrieved","author":"Reddit","year":"2021","unstructured":"Reddit 2021. Research Group GPU Sharing: some opensource tool. Retrieved December 13, 2024 from https: \/\/www.reddit.com\/r\/MachineLearning\/comments\/k07jn9\/d_ research_group_gpu_sharing_some_opensource_tool\/"},{"key":"e_1_3_2_1_37_1","volume-title":"Retrieved","author":"Reddit","year":"2022","unstructured":"Reddit 2022. What is the best way to manage GPU server for multi-users? Retrieved December 13, 2024 from https:\/\/www.reddit.com\/r\/MachineLearning\/comments\/ve987y\/ d_what_is_the_best_way_to_manage_gpu_server_for\/"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/S10664-024--10471--7"},{"key":"e_1_3_2_1_40_1","volume-title":"Retrieved","year":"2024","unstructured":"Run:ai 2024. Simplify GPU Sharing in Multi-GPU Environments. Retrieved December 13, 2024 from https:\/\/www.run.ai\/guides\/multigpu\/ simplify-gpu-sharing-part-1"},{"key":"e_1_3_2_1_41_1","volume-title":"Retrieved","year":"2021","unstructured":"SchedMD 2021. Slurm Workload Manager - Overview. Retrieved December 13, 2024 from https:\/\/slurm.schedmd.com\/overview.html"},{"key":"e_1_3_2_1_42_1","volume-title":"Retrieved","year":"2024","unstructured":"SchedMD 2024. Slurm Workload Manager - strigger. Retrieved December 13, 2024 from https:\/\/slurm.schedmd.com\/strigger.html"},{"key":"e_1_3_2_1_43_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. CoRR abs\/1802.05799 (2018). arXiv:1802.05799 http:\/\/arxiv.org\/abs\/1802.05799"},{"key":"e_1_3_2_1_44_1","volume-title":"d.]. Spark - Cluster Mode Overview. Retrieved","year":"2024","unstructured":"Spark. [n. d.]. Spark - Cluster Mode Overview. Retrieved December 13, 2024 from https:\/\/spark.apache.org\/docs\/latest\/cluster-overview.html"},{"key":"e_1_3_2_1_45_1","volume-title":"Retrieved","author":"SSH Academy [n. d.]. SSH Tunneling","year":"2024","unstructured":"SSH Academy [n. d.]. SSH Tunneling: Client Command and Server Configuration. Retrieved December 13, 2024 from https:\/\/www.ssh. com\/academy\/ssh\/tunneling-example"},{"key":"e_1_3_2_1_46_1","volume-title":"Retrieved","author":"Stanford Research Computing Center","year":"2024","unstructured":"Stanford Research Computing Center. 2024. Running Jobs - Sherlock. Retrieved December 13, 2024 from https:\/\/www.sherlock.stanford.edu\/ docs\/user-guide\/running-jobs\/"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613175"},{"key":"e_1_3_2_1_48_1","volume-title":"Retrieved","author":"Authors TACC","year":"2024","unstructured":"TACC Authors. 2024. Open Source Artifacts: TACC. Retrieved December 13, 2024 from https:\/\/tacc.ust.hk\/#opensource"},{"key":"e_1_3_2_1_49_1","volume-title":"Retrieved","author":"Authors TACC","year":"2024","unstructured":"TACC Authors. 2024. Scalable AI Infrastructure Designed for Evolving Machine Learning Research. Retrieved December 13, 2024 from https:\/\/tacc.ust.hk\/"},{"key":"e_1_3_2_1_50_1","volume-title":"Retrieved","author":"Authors The Kubernetes","year":"2024","unstructured":"The Kubernetes Authors 2024. Overview | Kubernetes. Retrieved December 13, 2024 from https:\/\/kubernetes.io\/docs\/concepts\/overview\/"},{"key":"e_1_3_2_1_51_1","volume-title":"Retrieved","author":"Vondrus Vladimir","year":"2022","unstructured":"Vladimir Vondrus. 2022. Zero-waste single-pass packing of power-of-two textures - blog.magnum.graphics. Retrieved December 13, 2024 from https:\/\/blog.magnum.graphics\/backstage\/pot-array-packing\/"},{"key":"e_1_3_2_1_52_1","volume-title":"Towards Domain-Specific Network Transport for Distributed DNN Training. In 21st USENIX Symposium on Networked Systems Design and Implementation, NSDI 2024","author":"Wang Hao","year":"2024","unstructured":"Hao Wang, Han Tian, Jingrong Chen, Xinchen Wan, Jiacheng Xia, Gaoxiong Zeng, Wei Bai, Junchen Jiang, Yong Wang, and Kai Chen. 2024. Towards Domain-Specific Network Transport for Distributed DNN Training. In 21st USENIX Symposium on Networked Systems Design and Implementation, NSDI 2024, Santa Clara, CA, April 15--17, 2024. USENIX Association. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/wang-hao"},{"key":"e_1_3_2_1_53_1","volume-title":"SRNIC: A Scalable Architecture for RDMA NICs. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023","author":"Luo Layong","year":"2023","unstructured":"ZilongWang, Layong Luo, Qingsong Ning, Chaoliang Zeng,Wenxue Li, Xinchen Wan, Peng Xie, Tao Feng, Ke Cheng, Xiongfei Geng, Tianhao Wang, Weicheng Ling, Kejia Huo, Pingbo An, Kui Ji, Shideng Zhang, Bin Xu, Ruiqing Feng, Tao Ding, Kai Chen, and Chuanxiong Guo. 2023. SRNIC: A Scalable Architecture for RDMA NICs. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023, Boston, MA, April 17--19, 2023. USENIX Association, 1--14. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/wang-zilong"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.932708"},{"key":"e_1_3_2_1_55_1","volume-title":"MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2022","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In 19th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2022, Renton,WA, USA, April 4--6, 2022. USENIX Association, 945--960. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/weng"},{"key":"e_1_3_2_1_56_1","volume-title":"TACC: A Full-stack Cloud Computing Infrastructure for Machine Learning Tasks. CoRR abs\/2110.01556","author":"Xu Kaiqiang","year":"2021","unstructured":"Kaiqiang Xu, XinchenWan, HaoWang, Zhenghang Ren, Xudong Liao, Decang Sun, Chaoliang Zeng, and Kai Chen. 2021. TACC: A Full-stack Cloud Computing Infrastructure for Machine Learning Tasks. CoRR abs\/2110.01556 (2021). arXiv:2110.01556 https:\/\/arxiv.org\/abs\/2110.01556"},{"key":"e_1_3_2_1_57_1","volume-title":"Zeus: Understanding and Optimizing GPU Energy Consumption of DNN Training. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023","author":"You Jie","year":"2023","unstructured":"Jie You, Jae-Won Chung, and Mosharaf Chowdhury. 2023. Zeus: Understanding and Optimizing GPU Energy Consumption of DNN Training. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023, Boston, MA, April 17--19, 2023. USENIX Association, 119--139. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/you"},{"key":"e_1_3_2_1_58_1","volume-title":"Retrieved","year":"2024","unstructured":"Yushan. 2024. Managing Multiple CUDA cuDNN Installations. Retrieved December 13, 2024 from https:\/\/medium.com\/@yushantripleseven\/managing-multiple-cudacudnn-installations-ba9cdc5e2654"},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2012","author":"Zaharia Matei","year":"2012","unstructured":"Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauly, Michael J. Franklin, Scott Shenker, and Ion Stoica. 2012. Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing. In Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2012, San Jose, CA, USA, April 25--27, 2012. USENIX Association, 15--28. https:\/\/www.usenix.org\/conference\/nsdi12\/technicalsessions\/presentation\/zaharia"},{"key":"e_1_3_2_1_60_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023","author":"Zhang Junxue","year":"2023","unstructured":"Junxue Zhang, Xiaodian Cheng, Wei Wang, Liu Yang, Jinbin Hu, and Kai Chen. 2023. FLASH: Towards a High-performance Hardware Acceleration Architecture for Cross-silo Federated Learning. In 20th USENIX Symposium on Networked Systems Design and Implementation, NSDI 2023, Boston, MA, April 17--19, 2023. USENIX Association, 1057--1079. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhangjunxue"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544229"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707266","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707266","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:46:38Z","timestamp":1755787598000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707266"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":61,"alternative-id":["10.1145\/3669940.3707266","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707266","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}