{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T09:28:38Z","timestamp":1780478918276,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":75,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSF","award":["CCF-1919130, CCF-2151021, CCF-2326494"],"award-info":[{"award-number":["CCF-1919130, CCF-2151021, CCF-2326494"]}]},{"DOI":"10.13039\/100000015","name":"DOE U.S. Department of Energy","doi-asserted-by":"publisher","award":["DE-AC05-76RL01830"],"award-info":[{"award-number":["DE-AC05-76RL01830"]}],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3729514","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"837-852","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["SmartNIC-GPU-CPU Heterogeneous System for Large Machine Learning Model with Software-Hardware Codesign"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5872-4464","authenticated-orcid":false,"given":"Anqi","family":"Guo","sequence":"first","affiliation":[{"name":"Department of Electrical &amp; Computer Engineering, Boston University, Boston, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8513-9566","authenticated-orcid":false,"given":"Yuchen","family":"Hao","sequence":"additional","affiliation":[{"name":"Meta Platforms, Menlo Park, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9373-6755","authenticated-orcid":false,"given":"Xiteng","family":"Yao","sequence":"additional","affiliation":[{"name":"Department of Electrical &amp; Computer Engineering, Boston University, Boston, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5968-5476","authenticated-orcid":false,"given":"Shining","family":"Yang","sequence":"additional","affiliation":[{"name":"Boston University, Boston, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7595-5539","authenticated-orcid":false,"given":"Jianyu","family":"Huang","sequence":"additional","affiliation":[{"name":"Meta Platforms, Menlo Park, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3644-2922","authenticated-orcid":false,"given":"Tony (Tong)","family":"Geng","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Rochester, Rochester, New York, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3443-9113","authenticated-orcid":false,"given":"Martin","family":"Herbordt","sequence":"additional","affiliation":[{"name":"Department of Electrical &amp; Computer Engineering, Boston University, Boston, Massachusetts, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"C. Bobda J. Mandebi P. Chow M. Ewais N. Tarafdar J.C. Vega K. Eguro D. Koch S. Handagala M. Leeser M.C. Herbordt H. Shahzad P. Hofstee B. Ringlein J. Szefer A. Sanaullah and R. Tessier. 2022. The Future of FPGA Acceleration in Datacenters and the Cloud. ACM Transactions on Reconfigurable Technology and Systems 15 3 (2022) 1\u201342. https:\/\/doi.org\/10.1145\/3506713","DOI":"10.1145\/3506713"},{"key":"e_1_3_3_1_3_2","unstructured":"Broadcom. 2019. Stingray PS250 2x50-Gb High-Performance Data Center SmartNIC. https:\/\/docs.broadcom.com\/doc\/PS250-PB."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783710"},{"key":"e_1_3_3_1_5_2","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. CoRR abs\/1604.06174 (2016). arXiv:https:\/\/arXiv.org\/abs\/1604.06174http:\/\/arxiv.org\/abs\/1604.06174"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476178"},{"key":"e_1_3_3_1_7_2","unstructured":"Facebook Research. 2019. DLRM: Deep Learning Recommendation Model. https:\/\/github.com\/facebookresearch\/dlrm"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Jiarui Fang Zilin Zhu Shenggui Li Hui Su Yang Yu Jie Zhou and Yang You. 2023. Parallel Training of Pre-Trained Models via Chunk-Based Dynamic Memory Management. IEEE Transactions on Parallel and Distributed Systems 34 1 (Jan. 2023) 304\u2013315. https:\/\/doi.org\/10.1109\/tpds.2022.3219819","DOI":"10.1109\/TPDS.2022.3219819"},{"key":"e_1_3_3_1_9_2","first-page":"51","volume-title":"15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18)","author":"Firestone Daniel","year":"2018","unstructured":"Daniel Firestone, Andrew Putnam, Sambhrama Mundkur, Derek Chiou, Alireza Dabagh, Mike Andrewartha, Hari Angepat, Vivek Bhanu, Adrian Caulfield, Eric Chung, Harish\u00a0Kumar Chandrappa, Somesh Chaturmohta, Matt Humphrey, Jack Lavier, Norman Lam, Fengfen Liu, Kalin Ovtcharov, Jitu Padhye, Gautham Popuri, Shachar Raindel, Tejas Sapre, Mark Shaw, Gabriel Silva, Madhan Sivakumar, Nisheeth Srivastava, Anshuman Verma, Qasim Zuhair, Deepak Bansal, Doug Burger, Kushagra Vaid, David\u00a0A. Maltz, and Albert Greenberg. 2018. Azure Accelerated Networking: SmartNICs in the Public Cloud. In 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18). USENIX Association, Renton, WA, 51\u201366. https:\/\/www.usenix.org\/conference\/nsdi18\/presentation\/firestone"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM53951.2022.9786193"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPL57034.2022.00071"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593724"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Pouya Haghi Anqi Guo Qingqing Xiong Chen Yang Tong Geng Justin\u00a0T. Broaddus Ryan Marshall Derek Schafer Anthony Skjellum and Martin\u00a0C. Herbordt. 2022. Reconfigurable switches for high performance and flexible MPI collectives. Concurrency and Computation: Practice and Experience 34 6 (2022) e6769. https:\/\/doi.org\/10.1002\/cpe.6769 arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1002\/cpe.6769","DOI":"10.1002\/cpe.6769"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593739"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3650200.3656616"},{"key":"e_1_3_3_1_16_2","unstructured":"Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil\u00a0R. Devanur Gregory\u00a0R. Ganger and Phillip\u00a0B. Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. ArXiv (2018). https:\/\/api.semanticscholar.org\/CorpusID:47016772"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378465"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126970"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"e_1_3_3_1_20_2","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia\u00a0Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. In Proceedings of the 33rd International Conference on Neural Information Processing Systems. Curran Associates Inc., Red Hook, NY, USA, Article 10, 10\u00a0pages. https:\/\/doi.org\/10.5555\/3454287.3454297"},{"key":"e_1_3_3_1_21_2","unstructured":"Intel. 2021. Intel\u00ae Infrastructure Processing Unit (Intel\u00ae IPU). https:\/\/www.intel.com\/content\/www\/us\/en\/products\/network-io\/smartnic.html."},{"key":"e_1_3_3_1_22_2","unstructured":"Intel. 2022. Intel FPGA SmartNIC. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/details\/fpga\/platforms\/smartnic\/n6000-pl-platform.html."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/1048935.1050173"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Hai Jin Bo Liu Wenbin Jiang Yang Ma Xuanhua Shi Bingsheng He and Shaofeng Zhao. 2018. Layer-Centric Memory Reuse and Data Migration for Extreme-Scale Deep Learning on Many-Core Architectures. ACM Trans. Archit. Code Optim. 15 3 Article 37 (Sept. 2018) 26\u00a0pages. https:\/\/doi.org\/10.1145\/3243904","DOI":"10.1145\/3243904"},{"key":"e_1_3_3_1_25_2","unstructured":"Jared Kaplan Sam McCandlish Tom Henighan Tom\u00a0B. Brown Benjamin Chess Rewon Child Scott Gray Alec Radford Jeff Wu and Dario Amodei. 2020. Scaling Laws for Neural Language Models. https:\/\/api.semanticscholar.org\/CorpusID:210861095"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Zhiquan Lai Shengwei Li Xudong Tang Keshi Ge Weijie Liu Yabo Duan Linbo Qiao and Dongsheng Li. 2023. Merak: An Efficient Distributed DNN Training Framework With Automated 3D Parallelism for Giant Foundation Models. IEEE Transactions on Parallel and Distributed Systems 34 5 (2023) 1466\u20131478. https:\/\/doi.org\/10.1109\/TPDS.2023.3247001","DOI":"10.1109\/TPDS.2023.3247001"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Ang Li Shuaiwen\u00a0Leon Song Jieyang Chen Jiajia Li Xu Liu Nathan\u00a0R. Tallent and Kevin\u00a0J. Barker. 2020. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. IEEE Transactions on Parallel and Distributed Systems 31 1 (2020) 94\u2013110. https:\/\/doi.org\/10.1109\/TPDS.2019.2928289","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934897"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. Proc. VLDB Endow. 13 12 (2020) 3005\u20133018. https:\/\/doi.org\/10.14778\/3415478.3415530","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_3_1_30_2","unstructured":"Meta. 2025. LLaMA Github. https:\/\/github.com\/meta-llama\/llama"},{"key":"e_1_3_3_1_31_2","volume-title":"6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, April 30 - May 3, 2018, Conference Track Proceedings","author":"Micikevicius Paulius","year":"2018","unstructured":"Paulius Micikevicius, Sharan Narang, Jonah Alben, Gregory\u00a0F. Diamos, Erich Elsen, David Garc\u00eda, Boris Ginsburg, Michael Houston, Oleksii Kuchaiev, Ganesh Venkatesh, and Hao Wu. 2018. Mixed Precision Training. In 6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, April 30 - May 3, 2018, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=r1gs9JgRZ"},{"key":"e_1_3_3_1_32_2","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun\u00a0Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson\u00a0G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arxiv:https:\/\/arXiv.org\/abs\/1906.00091\u00a0[cs.IR] https:\/\/arxiv.org\/abs\/1906.00091"},{"key":"e_1_3_3_1_33_2","unstructured":"Nvidia. 2021. NVIDIA BLUEFIELD-2 DPU. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/documents\/datasheet-nvidia-bluefield-2-dpu.pdf."},{"key":"e_1_3_3_1_34_2","unstructured":"NVIDIA. 2023. What Is NVLink. https:\/\/blogs.nvidia.com\/blog\/2023\/03\/06\/what-is-nvidia-nvlink\/."},{"key":"e_1_3_3_1_35_2","unstructured":"OCT-FPGA. 2025. OCT Tutorials. https:\/\/github.com\/OCT-FPGA\/OCT-Tutorials"},{"key":"e_1_3_3_1_36_2","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat Red Avila and et\u00a0al. Igor\u00a0Babuschkin. 2024. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_1_37_2","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga Alban Desmaison Andreas K\u00f6pf Edward Yang Zach DeVito Martin Raison Alykhan Tejani Sasank Chilamkurthy Benoit Steiner Lu Fang Junjie Bai and Soumith Chintala. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. arxiv:https:\/\/arXiv.org\/abs\/1912.01703\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1912.01703"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378505"},{"key":"e_1_3_3_1_39_2","unstructured":"Bharadwaj Pudipeddi Maral Mesmakhosroshahi Jinwen Xi and Sujeeth Bharadwaj. 2020. Training Large Neural Networks with Constant Memory using a New Execution Algorithm. arxiv:https:\/\/arXiv.org\/abs\/2002.05645\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2002.05645"},{"key":"e_1_3_3_1_40_2","series-title":"(SC \u201920)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: memory optimizations toward training trillion parameter models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC \u201920). IEEE Press, Article 20, 16\u00a0pages. https:\/\/doi.org\/10.5555\/3433701.3433727"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00057"},{"key":"e_1_3_3_1_43_2","unstructured":"Jie Ren Samyam Rajbhandari Reza\u00a0Yazdani Aminabadi Olatunji Ruwase Shuangyan Yang Minjia Zhang Dong Li and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. arxiv:https:\/\/arXiv.org\/abs\/2101.06840\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2101.06840"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Whit Schonbein Ryan\u00a0E. Grant Matthew G.\u00a0F. Dosanjh and Dorian\u00a0C. Arnold. 2019. INCA: In-Network Compute Assistance. SC19: International Conference for High Performance Computing Networking Storage and Analysis (2019) 1\u201313. https:\/\/api.semanticscholar.org\/CorpusID:207940021","DOI":"10.1145\/3295500.3356153"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC49654.2021.9622807"},{"key":"e_1_3_3_1_47_2","volume-title":"Advances in Neural Information Processing Systems","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, Ryan Sepassi, and Blake Hechtman. 2018. Mesh-TensorFlow: Deep Learning for Supercomputers. In Advances in Neural Information Processing Systems , S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett (Eds.), Vol.\u00a031. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/3a37abdeefe1dab1b30f7c5c7e581b93-Paper.pdf"},{"key":"e_1_3_3_1_48_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. ArXiv abs\/1909.08053 (2019). https:\/\/api.semanticscholar.org\/CorpusID:202660670"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575712"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507726"},{"key":"e_1_3_3_1_51_2","unstructured":"Romal Thoppilan Daniel\u00a0De Freitas Jamie Hall Noam Shazeer Apoorv Kulshreshtha Heng-Tze Cheng Alicia Jin Taylor Bos Leslie Baker Yu Du YaGuang Li Hongrae Lee Huaixiu\u00a0Steven Zheng Amin Ghafouri Marcelo Menegali Yanping Huang Maxim Krikun Dmitry Lepikhin James Qin and et\u00a0al. Dehao\u00a0Chen. 2022. LaMDA: Language Models for Dialog Applications. arxiv:https:\/\/arXiv.org\/abs\/2201.08239\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2201.08239"},{"key":"e_1_3_3_1_52_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_1_53_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian\u00a0Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa and Isabel\u00a0Kloumann et al.2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv:https:\/\/arXiv.org\/abs\/2307.09288\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2307.09288"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"crossref","unstructured":"Linnan Wang Jinmian Ye Yiyang Zhao Wei Wu Ang Li Shuaiwen\u00a0Leon Song Zenglin Xu and Tim Kraska. 2018. Superneurons: dynamic GPU memory management for training deep neural networks. SIGPLAN Not. 53 1 (Feb. 2018) 41\u201353. https:\/\/doi.org\/10.1145\/3200691.3178491","DOI":"10.1145\/3200691.3178491"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"e_1_3_3_1_56_2","unstructured":"Xilinx. 2020. Alveo U25 SmartNIC Accelerator Card. https:\/\/www.xilinx.com\/products\/boards-and-kits\/alveo\/u25.html."},{"key":"e_1_3_3_1_57_2","unstructured":"Xilinx. 2021. ALVEO\u2122 SN1000 SmartNICs. https:\/\/www.xilinx.com\/content\/dam\/xilinx\/publications\/product-briefs\/xilinx-alveo-sn1000-product-brief.pdf."},{"key":"e_1_3_3_1_58_2","unstructured":"Xilinx. 2025. XUP Vitis Network Example. https:\/\/github.com\/Xilinx\/xup_vitis_network_example Accessed: 2025-03-16."},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604882"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00042"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/FPL.2018.00039"},{"key":"e_1_3_3_1_62_2","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arxiv:https:\/\/arXiv.org\/abs\/2304.11277\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2304.11277"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3729514","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3729514","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:00:16Z","timestamp":1755867616000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3729514"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":61,"alternative-id":["10.1145\/3721145.3729514","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3729514","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}