{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:40:16Z","timestamp":1782834016229,"version":"3.54.5"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1109\/icnp61940.2024.10858546","type":"proceedings-article","created":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T13:29:45Z","timestamp":1738675785000},"page":"1-12","source":"Crossref","is-referenced-by-count":3,"title":["Kspeed: Beating I\/O Bottlenecks of Data Provisioning for RDMA Training Clusters"],"prefix":"10.1109","author":[{"given":"Jianbo","family":"Dong","sequence":"first","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hao","family":"Qi","sequence":"additional","affiliation":[{"name":"University of California, Merced,Department of Computer Science,Merced,CA,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tianjing","family":"Xu","sequence":"additional","affiliation":[{"name":"China Construction Bank Operations Data Center,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoli","family":"Liu","sequence":"additional","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chen","family":"Wei","sequence":"additional","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rongyao","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoyi","family":"Lu","sequence":"additional","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zheng","family":"Cao","sequence":"additional","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Binzhang","family":"Fu","sequence":"additional","affiliation":[{"name":"Alibaba Group,Beijing,China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","journal-title":"Nvidia teases its most powerful GPU ever."},{"key":"ref2","volume-title":"Alluxio","year":"2022"},{"key":"ref3","volume-title":"GlusterFS","year":"2022"},{"key":"ref4","volume-title":"LITS 2017 Dataset","year":"2022"},{"key":"ref5","volume-title":"WAV File Format","year":"2022"},{"key":"ref6","volume-title":"Convolutional Network for Image Classification in PyTorch","year":"2023"},{"key":"ref7","volume-title":"GPT-4","year":"2023"},{"key":"ref8","volume-title":"NVIDIA DALI User Guide","year":"2023"},{"key":"ref9","first-page":"265","article-title":"TensorFlow: A System for Large-Scale Machine Learning","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi","year":"2016"},{"key":"ref10","volume-title":"YouTube-8M: A Large-Scale Video Classification Benchmark","author":"Abu-El-Haija","year":"2016"},{"key":"ref11","first-page":"267","article-title":"Pacman: Coordinated Memory Caching for Parallel Jobs","volume-title":"9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)","author":"Ananthanarayanan","year":"2012"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.299"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155446"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220622"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46723-8_49"},{"key":"ref16","article-title":"Quartet: Harmonizing Task Scheduling and Caching for Cluster Computing","volume-title":"8th USENIX Workshop on Hot Topics in Storage and File Systems (HotStorage 16)","author":"Deslauriers","year":"2016"},{"key":"ref17","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","author":"Devlin","year":"2018"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00056"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476181"},{"key":"ref20","first-page":"689","article-title":"Cachew: Machine Learning Input Data Processing as a Service","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Graur"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00112"},{"key":"ref23","first-page":"1","article-title":"Beyond Data and Model Parallelism for Deep Neural Networks","volume-title":"Proceedings of Machine Learning and Systems","volume":"1","author":"Jia","year":"2019"},{"key":"ref24","first-page":"463","article-title":"A Unified Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU\/CPU Clusters","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association","author":"Jiang"},{"key":"ref25","first-page":"770","article-title":"Deep Residual Learning for Image Recognition","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Kaiming","year":"2016"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.5555\/2999134.2999257"},{"key":"ref27","first-page":"283","article-title":"Quiver: An Informed Storage Cache for Deep Learning","volume-title":"18th USENIX Conference on File and Storage Technologies (FAST 20)","author":"Kumar","year":"2020"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref29","volume-title":"Alluxio: A virtual distributed file system","author":"Li","year":"2018"},{"key":"ref30","first-page":"1","article-title":"Tachyon: Reliable, Memory Speed Storage for Cluster Computing Frameworks","volume-title":"Proceedings of the ACM Symposium on Cloud Computing","author":"Li","year":"2014"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685095"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2623330.2623612"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/sc.2018.00068"},{"issue":"5","key":"ref34","first-page":"771","article-title":"Analyzing and Mitigating Data Stalls in DNN Training","volume-title":"Proc. VLDB Endow.","volume":"14","author":"Mohan","year":"2021"},{"key":"ref35","volume-title":"DALI","year":"2022"},{"key":"ref36","volume-title":"Inside Volta: The World\u2019s Most Advanced Data Center GPU","year":"2022"},{"key":"ref37","volume-title":"The NVIDIA Collective Communication Library (NCCL)","year":"2022"},{"key":"ref38","volume-title":"GPT-3 Powers the Next Generation of Apps","year":"2022"},{"key":"ref39","article-title":"Pytorch: An Imperative Style, High-Performance Deep Learning Library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref41","volume":"abs\/1802.05799","author":"Sergeev","year":"2018","journal-title":"Horovod: Fast and Easy Distributed Deep Learning in TensorFlow"},{"key":"ref42","author":"Shoeybi","year":"2019","journal-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism"},{"issue":"1","key":"ref43","first-page":"38","article-title":"Crail: A High-Performance I\/O Architecture for Distributed Data Processing","volume":"40","author":"Stuedi","year":"2017","journal-title":"IEEE Data Eng. Bull."},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"ref45","volume-title":"Pushing the limits of GPU performance with XLA","year":"2022"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3572833"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533044"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/MASCOTS.2018.00023"}],"event":{"name":"2024 IEEE 32nd International Conference on Network Protocols (ICNP)","location":"Charleroi, Belgium","start":{"date-parts":[[2024,10,28]]},"end":{"date-parts":[[2024,10,31]]}},"container-title":["2024 IEEE 32nd International Conference on Network Protocols (ICNP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10858485\/10858498\/10858546.pdf?arnumber=10858546","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T19:52:58Z","timestamp":1778788378000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10858546\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/icnp61940.2024.10858546","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]}}}