{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,2]],"date-time":"2025-10-02T00:47:42Z","timestamp":1759366062734,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CAREER CNS-2144796","CCF-2107470"],"award-info":[{"award-number":["CAREER CNS-2144796","CCF-2107470"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3731569.3764857","type":"proceedings-article","created":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:43:24Z","timestamp":1759322604000},"page":"979-995","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Managing Scalable Direct Storage Accesses for GPUs with GoFS"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8883-5392","authenticated-orcid":false,"given":"Shaobo","family":"Li","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6195-2269","authenticated-orcid":false,"given":"Yirui Eric","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0363-9486","authenticated-orcid":false,"given":"Yuqi","family":"Xue","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9653-3283","authenticated-orcid":false,"given":"Yuan","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1125-671X","authenticated-orcid":false,"given":"Jian","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2021. Optane SSD. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/memory-and-storage\/optane-ssd\/optane-ssd-overview.html."},{"key":"e_1_3_2_1_2_1","unstructured":"2022. Samsung V-NAND SSD 990 PRO Datasheet Rev. 1.0. https:\/\/download.semiconductor.samsung.com\/resources\/data-sheet\/Samsung_NVMe_SSD_990_PRO_Datasheet_Rev.1.0.pdf."},{"key":"e_1_3_2_1_3_1","unstructured":"Andy Adinets. [n. d.]. CUDA Dynamic Parallelism API and Principles. https:\/\/developer.nvidia.com\/blog\/cuda-dynamic-parallelism-api-principles\/."},{"key":"e_1_3_2_1_4_1","unstructured":"AMD DirectGMA. [n. d.]. https:\/\/www.bitflow.com\/technology\/directgma\/."},{"key":"e_1_3_2_1_5_1","volume-title":"FlashNeuron: SSD-Enabled Large-Batch Training of Very Deep Neural Networks. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Bae Jonghyun","unstructured":"Jonghyun Bae, Jongsung Lee, Yunho Jin, Sam Son, Shine Kim, Hakbeom Jang, Tae Jun Ham, and Jae W. Lee. 2021. FlashNeuron: SSD-Enabled Large-Batch Training of Very Deep Neural Networks. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 387\u2013401. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/bae"},{"key":"e_1_3_2_1_6_1","volume-title":"The GAP benchmark suite. arXiv preprint arXiv:1508.03619","author":"Beamer Scott","year":"2015","unstructured":"Scott Beamer, Krste Asanovi\u0107, and David Patterson. 2015. The GAP benchmark suite. arXiv preprint arXiv:1508.03619 (2015)."},{"key":"e_1_3_2_1_7_1","volume-title":"SPIN: Seamless Operating System Integration of Peer-to-Peer DMA Between SSDs and GPUs. In 2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Bergman Shai","year":"2017","unstructured":"Shai Bergman, Tanya Brokhman, Tzachi Cohen, and Mark Silberstein. 2017. SPIN: Seamless Operating System Integration of Peer-to-Peer DMA Between SSDs and GPUs. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). USENIX Association, Santa Clara, CA, 167\u2013179. https:\/\/www.usenix.org\/conference\/atc17\/technical-sessions\/presentation\/bergman"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 26th Symposium on Operating Systems Principles","author":"Bhat Srivatsa S.","year":"2017","unstructured":"Srivatsa S. Bhat, Rasha Eqbal, Austin T. Clements, M. Frans Kaashoek, and Nickolai Zeldovich. 2017. Scaling a file system to many cores using an operation log. In Proceedings of the 26th Symposium on Operating Systems Principles (Shanghai, China) (SOSP '17). Association for Computing Machinery, New York, NY, USA, 69\u201386. 10.1145\/3132747.3132779"},{"key":"e_1_3_2_1_9_1","volume-title":"Proc. of the Thirteenth International World Wide Web Conference (WWW","author":"Boldi Paolo","year":"2004","unstructured":"Paolo Boldi and Sebastiano Vigna. 2004. The WebGraph Framework I: Compression Techniques. In Proc. of the Thirteenth International World Wide Web Conference (WWW 2004). ACM Press, Manhattan, USA, 595\u2013601."},{"key":"e_1_3_2_1_10_1","volume-title":"Proc. of the 2nd Usenix Conference on File and Storage Technologies","volume":"215","author":"Bonwick Jeff","year":"2003","unstructured":"Jeff Bonwick, Matt Ahrens, Val Henson, Mark Maybee, and Mark Shellenbaum. 2003. The zettabyte file system. In Proc. of the 2nd Usenix Conference on File and Storage Technologies, Vol. 215. 1."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD'18)","author":"Borisyuk Fedor","year":"2018","unstructured":"Fedor Borisyuk, Albert Gordo, and Viswanath Sivakumar. 2018. Rosetta: Large Scale System for Text Detection and Recognition in Images. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD'18). London, United Kingdom."},{"key":"e_1_3_2_1_12_1","volume-title":"Scalable Persistent Memory File System with Kernel-Userspace Collaboration. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Chen Youmin","year":"2021","unstructured":"Youmin Chen, Youyou Lu, Bohong Zhu, Andrea C. Arpaci-Dusseau, Remzi H. Arpaci-Dusseau, and Jiwu Shu. 2021. Scalable Persistent Memory File System with Kernel-Userspace Collaboration. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 81\u201395. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/chen-youmin"},{"key":"e_1_3_2_1_13_1","volume-title":"Optimistic Crash Consistency. In The 24th ACM Symposium on Operating System Principles (SOSP'13)","author":"Chidambaram Vijay","unstructured":"Vijay Chidambaram, Thanu S. Pillai, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. 2013. Optimistic Crash Consistency. In The 24th ACM Symposium on Operating System Principles (SOSP'13). Farmington, PA."},{"key":"e_1_3_2_1_14_1","unstructured":"Advanced Micro Devices. 2025. Troubleshoot BAR access limitation. https:\/\/rocm.docs.amd.com\/en\/latest\/how-to\/Bar-Memory.html."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (Washington DC, USA) (KDD '22)","author":"Du Ming","year":"2022","unstructured":"Ming Du, Arnau Ramisa, Amit Kumar K C, Sampath Chanda, Mengjiao Wang, Neelakandan Rajesh, Shasha Li, Yingchuan Hu, Tao Zhou, Nagashri Lakshminarayana, Son Tran, and Doug Gray. 2022. Amazon Shop the Look: A Visual Search System for Fashion and Home. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (Washington DC, USA) (KDD '22). Association for Computing Machinery, New York, NY, USA, 2822\u20132830. 10.1145\/3534678.3539071"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","author":"Eran Haggai","year":"2022","unstructured":"Haggai Eran, Maxim Fudim, Gabi Malka, Gal Shalom, Noam Cohen, Amit Hermony, Dotan Levi, Liran Liss, and Mark Silberstein. 2022. FlexDriver: a network driver for your accelerator. In Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (Lausanne, Switzerland) (ASPLOS '22). Association for Computing Machinery, New York, NY, USA, 1115\u20131129. 10.1145\/3503222.3507776"},{"key":"e_1_3_2_1_17_1","unstructured":"Matthias Fey and Jan Eric Lenssen. 2019. Fast Graph Representation Learning with PyTorch Geometric. https:\/\/arxiv.org\/abs\/1903.02428"},{"key":"e_1_3_2_1_18_1","unstructured":"FIO Benchmarks. [n. d.]. https:\/\/linux.die.net\/man\/1\/fio."},{"key":"e_1_3_2_1_19_1","unstructured":"Yunfan Gao Yun Xiong Xinyu Gao Kangxiang Jia Jinliu Pan Yuxi Bi Yi Dai Jiawei Sun Meng Wang and Haofen Wang. 2024. Retrieval-Augmented Generation for Large Language Models: A Survey. arXiv:2312.10997 [cs.CL] https:\/\/arxiv.org\/abs\/2312.10997"},{"key":"e_1_3_2_1_20_1","unstructured":"Google. 2024. Better performance with the tf.data API | TensorFlow Core. https:\/\/www.tensorflow.org\/guide\/data_performance."},{"key":"e_1_3_2_1_21_1","unstructured":"GPUDirect Storage: A Direct Path Between Storage and GPU Memory. [n. d.]. https:\/\/developer.nvidia.com\/blog\/gpudirect-storage\/."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE international conference on computer vision (ICCV'15)","author":"Kiapour M Hadi","year":"2015","unstructured":"M Hadi Kiapour, Xufeng Han, Svetlana Lazebnik, Alexander C Berg, and Tamara L Berg. 2015. Where to Buy It: Matching Street Clothing Photos in Online Shops. In Proceedings of the IEEE international conference on computer vision (ICCV'15). Santiago, Chile."},{"key":"e_1_3_2_1_23_1","volume-title":"Inductive representation learning on large graphs. Advances in neural information processing systems 30","author":"Hamilton Will","year":"2017","unstructured":"Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017. Inductive representation learning on large graphs. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_24_1","volume-title":"Curriculum audiovisual learning. arXiv preprint arXiv:2001.09414","author":"Hu Di","year":"2020","unstructured":"Di Hu, Zheng Wang, Haoyi Xiong, Dong Wang, Feiping Nie, and Dejing Dou. 2020. Curriculum audiovisual learning. arXiv preprint arXiv:2001.09414 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"Ogb-lsc: A large-scale challenge for machine learning on graphs. arXiv preprint arXiv:2103.09430","author":"Hu Weihua","year":"2021","unstructured":"Weihua Hu, Matthias Fey, Hongyu Ren, Maho Nakata, Yuxiao Dong, and Jure Leskovec. 2021. Ogb-lsc: A large-scale challenge for machine learning on graphs. arXiv preprint arXiv:2103.09430 (2021)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys'20)","author":"Jain Paras","year":"2020","unstructured":"Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Joseph Gonzalez, Kurt Keutzer, and Ion Stoica. 2020. Breaking the Memory Wall with Optimal Tensor Rematerialization. In Proceedings of Machine Learning and Systems (MLSys'20)."},{"key":"e_1_3_2_1_27_1","volume-title":"Ravishankar Krishnawamy, and Rohan Kadekodi.","author":"Subramanya Suhas Jayaram","year":"2019","unstructured":"Suhas Jayaram Subramanya, Fnu Devvrit, Harsha Vardhan Simhadri, Ravishankar Krishnawamy, and Rohan Kadekodi. 2019. DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD'15)","author":"Jing Yushi","year":"2015","unstructured":"Yushi Jing, David Liu, Dmitry Kislyuk, Andrew Zhai, Jiajing Xu, Jeff Donahue, and Sarah Tavel. 2015. Visual Search at Pinterest. In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD'15). Sydney, Australia."},{"key":"e_1_3_2_1_29_1","volume-title":"Billion-scale similarity search with gpus. arXiv preprint arXiv:1702.08734","author":"Johnson Jeff","year":"2017","unstructured":"Jeff Johnson, Matthijs Douze, and Herv\u00e9 J\u00e9gou. 2017. Billion-scale similarity search with gpus. arXiv preprint arXiv:1702.08734 (2017)."},{"key":"e_1_3_2_1_30_1","volume-title":"SpanFS: A Scalable File System on Fast Storage Devices. In 2015 USENIX Annual Technical Conference (USENIX ATC 15)","author":"Kang Junbin","year":"2015","unstructured":"Junbin Kang, Benlong Zhang, Tianyu Wo, Weiren Yu, Lian Du, Shuai Ma, and Jinpeng Huai. 2015. SpanFS: A Scalable File System on Fast Storage Devices. In 2015 USENIX Annual Technical Conference (USENIX ATC 15). USENIX Association, Santa Clara, CA, 249\u2013261. https:\/\/www.usenix.org\/conference\/atc15\/technical-session\/presentation\/kang"},{"key":"e_1_3_2_1_31_1","volume-title":"20th USENIX Conference on File and Storage Technologies (FAST 22)","author":"Kim Dohyun","year":"2022","unstructured":"Dohyun Kim, Kwangwon Min, Joontaek Oh, and Youjip Won. 2022. ScaleXFS: Getting scalability of XFS back on the ring. In 20th USENIX Conference on File and Storage Technologies (FAST 22). USENIX Association, Santa Clara, CA, 329\u2013344. https:\/\/www.usenix.org\/conference\/fast22\/presentation\/kim-dohyun"},{"key":"e_1_3_2_1_32_1","volume-title":"GPUnet: Networking Abstractions for GPU Programs. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Kim Sangman","year":"2014","unstructured":"Sangman Kim, Seonggu Huh, Xinya Zhang, Yige Hu, Amir Wated, Emmett Witchel, and Mark Silberstein. 2014. GPUnet: Networking Abstractions for GPU Programs. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14). USENIX Association, Broomfield, CO, 201\u2013216. https:\/\/www.usenix.org\/conference\/osdi14\/technical-sessions\/presentation\/kim"},{"key":"e_1_3_2_1_33_1","volume-title":"Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys'20)","author":"Kogan Alex","year":"2020","unstructured":"Alex Kogan, Dave Dice, and Shady Issa. 2020. Scalable range locks for scalable address spaces and beyond. In Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys'20). Heraklion, Greece."},{"key":"e_1_3_2_1_35_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the 29th Symposium on Operating Systems Principles","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. In Proceedings of the 29th Symposium on Operating Systems Principles (Koblenz, Germany) (SOSP '23). Association for Computing Machinery, New York, NY, USA, 611\u2013626. 10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 13th USENIX Conference on File and Storage Technologies (FAST'15)","author":"Lee Changman","year":"2015","unstructured":"Changman Lee, Dongbo Sim, Joo-Young Hwang, and Sangyeun Cho. 2015. F2FS: A New File System for Flash Storage. In Proceedings of the 13th USENIX Conference on File and Storage Technologies (FAST'15). Santa Clara, CA."},{"key":"e_1_3_2_1_38_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs.CL] https:\/\/arxiv.org\/abs\/2005.11401"},{"key":"e_1_3_2_1_39_1","unstructured":"Shuying Liang. 2024. Simple is beautiful: Revolutionizing GNN Training Infrastructure at LinkedIn. https:\/\/flyte.org\/case-study\/simple-is-beautiful-revolutionizing-gnn-training-infrastructure-at-linkedin."},{"key":"e_1_3_2_1_40_1","volume-title":"Max: A Multicore-Accelerated File System for Flash Storage. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Liao Xiaojian","year":"2021","unstructured":"Xiaojian Liao, Youyou Lu, Erci Xu, and Jiwu Shu. 2021. Max: A Multicore-Accelerated File System for Flash Storage. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 877\u2013891. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/liao"},{"key":"e_1_3_2_1_41_1","unstructured":"Yuan Lin and Vinod Grover. 2018. Using CUDA Warp-Level Primitives. https:\/\/developer.nvidia.com\/blog\/using-cuda-warp-level-primitives."},{"key":"e_1_3_2_1_42_1","volume-title":"2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 121\u2013125","author":"Lu Rui","year":"2017","unstructured":"Rui Lu, Kailun Wu, Zhiyao Duan, and Changshui Zhang. 2017. Deep ranking: Triplet MatchNet for music metric learning. In 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 121\u2013125."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 52nd IEEE\/ACM International Symposium on Microarchitecture (MICRO'19)","author":"Mailthoday Vikram Sharma","year":"2019","unstructured":"Vikram Sharma Mailthoday, Zaid Qureshi, Weixin Liang, Ziyan Feng, Simon Garcia de Gonzalo, Youjie Li, Hubertus Franke, Jinjun Xiong, Jian Huang, and Wenmei Hwu. 2019. DeepStore: In-Storage Acceleration for Intelligent Queries. In Proceedings of the 52nd IEEE\/ACM International Symposium on Microarchitecture (MICRO'19). Columbus, OH."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3462545"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles (SOSP'19)","author":"Mawhirter Daniel","year":"2019","unstructured":"Daniel Mawhirter and Bo Wu. 2019. AutoMine: Harmonizing High-Level Abstraction and High Performance for Graph Mining. In Proceedings of the 27th ACM Symposium on Operating Systems Principles (SOSP'19). Huntsville, Ontario, Canada."},{"key":"e_1_3_2_1_46_1","unstructured":"P. E. McKenney D. Sarma and M. Soni. 2004. Scaling dcache with RCU. Linux Journal (2004)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2717511"},{"key":"e_1_3_2_1_48_1","unstructured":"Microsoft. 2025. Support boundary for high accuracy time - Windows Server | Microsoft Learn. https:\/\/learn.microsoft.com\/en-us\/troubleshoot\/windows-server\/active-directory\/support-boundary-high-accuracy-time."},{"key":"e_1_3_2_1_49_1","volume-title":"Understanding Manycore Scalability of File Systems. In 2016 USENIX Annual Technical Conference (USENIX ATC 16)","author":"Min Changwoo","year":"2016","unstructured":"Changwoo Min, Sanidhya Kashyap, Steffen Maass, and Taesoo Kim. 2016. Understanding Manycore Scalability of File Systems. In 2016 USENIX Annual Technical Conference (USENIX ATC 16). USENIX Association, Denver, CO, 71\u201385. https:\/\/www.usenix.org\/conference\/atc16\/technical-sessions\/presentation\/min"},{"key":"e_1_3_2_1_50_1","unstructured":"National Institute of Standards and Technology (US). 2008. The keyed-hash message authentication code (HMAC). Technical Report. Washington D.C."},{"key":"e_1_3_2_1_51_1","unstructured":"NVIDIA. 2024. Multi-Process Service r555 documentation. https:\/\/docs.nvidia.com\/deploy\/mps\/index.html."},{"key":"e_1_3_2_1_52_1","unstructured":"NVIDIA. 2024. Time Library \u2013 libcudacxx 2.5 documentation. https:\/\/nvidia.github.io\/cccl\/libcudacxx\/standard_api\/time_library.html."},{"key":"e_1_3_2_1_53_1","unstructured":"NVIDIA. 2025. 1. Overview \u2014 GPUDirect RDMA 13.0 documentation. https:\/\/docs.nvidia.com\/cuda\/gpudirect-rdma\/."},{"key":"e_1_3_2_1_54_1","unstructured":"NVIDIA. 2025. Field Identifiers \u2014 NVIDIA DCGM Documentation latest documentation. https:\/\/docs.nvidia.com\/datacenter\/dcgm\/latest\/dcgm-api\/dcgm-api-field-ids.html."},{"key":"e_1_3_2_1_55_1","unstructured":"NVIDIA Corporation. 2018. NVIDIA CUDA C Programming Guide."},{"key":"e_1_3_2_1_56_1","unstructured":"NVIDIA Magnum IO GPUDirect Stoage CuFile API. [n. d.]. https:\/\/docs.nvidia.com\/gpudirect-storage\/pdf\/api-reference-guide.pdf."},{"key":"e_1_3_2_1_57_1","volume-title":"GeminiFS: A Companion File System for GPUs. In 23rd USENIX Conference on File and Storage Technologies (FAST 25)","author":"Qiu Shi","year":"2025","unstructured":"Shi Qiu, Weinan Liu, Yifan Hu, Jianqin Yan, Zhirong Shen, Xin Yao, Renhai Chen, Gong Zhang, and Yiming Zhang. 2025. GeminiFS: A Companion File System for GPUs. In 23rd USENIX Conference on File and Storage Technologies (FAST 25). USENIX Association, Santa Clara, CA, 221\u2013236. https:\/\/www.usenix.org\/conference\/fast25\/presentation\/qiu"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"2","author":"Qureshi Zaid","year":"2023","unstructured":"Zaid Qureshi, Vikram Sharma Mailthody, Isaac Gelado, Seungwon Min, Amna Masood, Jeongmin Park, Jinjun Xiong, C. J. Newburn, Dmitri Vainbrand, I-Hsin Chung, Michael Garland, William Dally, and Wen-mei Hwu. 2023. GPU-Initiated On-Demand High-Throughput Storage Access in the BaM System Architecture. In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (Vancouver, BC, Canada) (ASPLOS 2023). Association for Computing Machinery, New York, NY, USA, 325\u2013339. 10.1145\/3575693.3575748"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Samyam Rajbhandari Olatunji Ruwase Jeff Rasley Shaden Smith and Yuxiong He. 2021. ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. arXiv:2104.07857 [cs.DC] https:\/\/arxiv.org\/abs\/2104.07857","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_60_1","volume-title":"CrossFS: A Cross-layered Direct-Access File System. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ren Yujie","year":"2020","unstructured":"Yujie Ren, Changwoo Min, and Sudarsun Kannan. 2020. CrossFS: A Cross-layered Direct-Access File System. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 137\u2013154. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/ren"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/2501620.2501623"},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the 18th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS'13)","author":"Silberstein Mark","year":"2013","unstructured":"Mark Silberstein, Bryan Ford, Idit Keidar, and Emmett Witchel. 2013. GPUfs: integrating file systems with GPUs. In Proceedings of the 18th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS'13). Houston, Texas, USA."},{"key":"e_1_3_2_1_63_1","volume-title":"Facebook users are uploading 350 million new photos each day. Business insider 18","author":"Smith Cooper","year":"2013","unstructured":"Cooper Smith. 2013. Facebook users are uploading 350 million new photos each day. Business insider 18 (2013)."},{"key":"e_1_3_2_1_64_1","volume-title":"FED: Fast and Efficient Dataset Deduplication Framework with GPU Acceleration. arXiv:2501.01046 [cs.CL] https:\/\/arxiv.org\/abs\/2501.01046","author":"Son Youngjun","year":"2025","unstructured":"Youngjun Son, Chaewon Kim, and Jaejin Lee. 2025. FED: Fast and Efficient Dataset Deduplication Framework with GPU Acceleration. arXiv:2501.01046 [cs.CL] https:\/\/arxiv.org\/abs\/2501.01046"},{"key":"e_1_3_2_1_65_1","article-title":"Filebench: A flexible framework for file system benchmarking","volume":"41","author":"Tarasov Vasily","year":"2016","unstructured":"Vasily Tarasov, Erez Zadok, and Spencer Shepler. 2016. Filebench: A flexible framework for file system benchmarking. The USENIX Magazine 41, 1 (2016).","journal-title":"The USENIX Magazine"},{"key":"e_1_3_2_1_66_1","unstructured":"Adam Thompson and CJ Newburn. 2019. GPUDirect Storage: A Direct Path Between Storage and GPU Memory. https:\/\/developer.nvidia.com\/blog\/gpudirect-storage\/."},{"key":"e_1_3_2_1_67_1","volume-title":"Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems","author":"Tork Maroun","year":"2020","unstructured":"Maroun Tork, Lina Maudlej, and Mark Silberstein. 2020. Lynx: A SmartNIC-driven Accelerator-centric Architecture for Network Servers. In Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems (Lausanne, Switzerland) (ASPLOS '20). Association for Computing Machinery, New York, NY, USA, 117\u2013131. 10.1145\/3373376.3378528"},{"key":"e_1_3_2_1_68_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings of the 25th Symposium on Operating Systems Principles","author":"Tsai Chia-Che","unstructured":"Chia-Che Tsai, Yang Zhan, Jayashree Reddy, Yizheng Jiao, Tao Zhang, and Donald E. Porter. 2015. How to get more value from your file system directory cache. In Proceedings of the 25th Symposium on Operating Systems Principles (Monterey, California) (SOSP '15). Association for Computing Machinery, New York, NY, USA, 441\u2013456. 10.1145\/2815400.2815405"},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the 15th USENIX Conference on File and Storage Technologies (FAST'17)","author":"Reddy Vangoor Bharath Kumar","year":"2017","unstructured":"Bharath Kumar Reddy Vangoor, Vasily Tarasov, and Erez Zadok. 2017. To FUSE or Not to FUSE: Performance of User-Space File Systems. In Proceedings of the 15th USENIX Conference on File and Storage Technologies (FAST'17). Santa Clara, CA."},{"key":"e_1_3_2_1_71_1","volume-title":"Generic System Calls for GPUs. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA'18)","author":"Vesel\u00fd J\u00e1n","year":"2018","unstructured":"J\u00e1n Vesel\u00fd, Arkaprava Basu, Abhishek Bhattacharjee, Gabriel H. Loh, Mark Oskin, and Steven K. Reinhardt. 2018. Generic System Calls for GPUs. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA'18). 843\u2013856. 10.1109\/ISCA.2018.00075"},{"key":"e_1_3_2_1_72_1","volume-title":"Learning Two-branch Neural Networks for Image-text Matching Tasks","author":"Wang Liwei","year":"2018","unstructured":"Liwei Wang, Yin Li, Jing Huang, and Svetlana Lazebnik. 2018. Learning Two-branch Neural Networks for Image-text Matching Tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018)."},{"key":"e_1_3_2_1_73_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Wang Zeke","year":"2022","unstructured":"Zeke Wang, Hongjing Huang, Jie Zhang, Fei Wu, and Gustavo Alonso. 2022. FpgaNIC: An FPGA-based Versatile 100Gb Smart-NIC for GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). USENIX Association, Carlsbad, CA, 967\u2013986. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/wang-zeke"},{"key":"e_1_3_2_1_74_1","volume-title":"NOVA: A Log-structured File System for Hybrid Volatile\/Non-Volatile Main Memories. In The 14th USENIX Conference on File and Storage Technologies (FAST'16)","author":"Xu Jian","year":"2016","unstructured":"Jian Xu and Steven Swanson. 2016. NOVA: A Log-structured File System for Hybrid Volatile\/Non-Volatile Main Memories. In The 14th USENIX Conference on File and Storage Technologies (FAST'16). Santa Clara, CA."},{"key":"e_1_3_2_1_75_1","volume-title":"LSUN: Construction of a Large-scale Image Dataset using Deep Learning with Humans in the Loop. arXiv preprint arXiv:1506.03365","author":"Yu Fisher","year":"2015","unstructured":"Fisher Yu, Yinda Zhang, Shuran Song, Ari Seff, and Jianxiong Xiao. 2015. LSUN: Construction of a Large-scale Image Dataset using Deep Learning with Humans in the Loop. arXiv preprint arXiv:1506.03365 (2015)."},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO'23)","author":"Zhang Haoyang","year":"2023","unstructured":"Haoyang Zhang, Yirui Zhou, Yuqi Xue, Yiqi Liu, and Jian Huang. 2023. G10: Enabling An Efficient Unified GPU Memory and Storage Architecture with Smart Tensor Migrations. In Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO'23). Toronto, ON, Canada."},{"key":"e_1_3_2_1_77_1","volume-title":"NVMMU: A Non-volatile Memory Management Unit for Heterogeneous GPU-SSD Architectures. In 2015 International Conference on Parallel Architecture and Compilation (PACT). 13\u201324","author":"Zhang Jie","year":"2015","unstructured":"Jie Zhang, David Donofrio, John Shalf, Mahmut T. Kandemir, and Myoungsoo Jung. 2015. NVMMU: A Non-volatile Memory Management Unit for Heterogeneous GPU-SSD Architectures. In 2015 International Conference on Parallel Architecture and Compilation (PACT). 13\u201324. 10.1109\/PACT.2015.43"}],"event":{"name":"SOSP '25: ACM SIGOPS 31st Symposium on Operating Systems Principles","location":"Lotte Hotel World Seoul Republic of Korea","acronym":"SOSP '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 31st Symposium on Operating Systems Principles"],"original-title":[],"deposited":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T12:52:11Z","timestamp":1759323131000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731569.3764857"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":77,"alternative-id":["10.1145\/3731569.3764857","10.1145\/3731569"],"URL":"https:\/\/doi.org\/10.1145\/3731569.3764857","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}