{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T15:43:35Z","timestamp":1781797415925,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","funder":[{"DOI":"10.13039\/100002418","name":"Intel Corporation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100002418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000028","name":"Semiconductor Research Corporation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000028","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731055","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:46:17Z","timestamp":1750437977000},"page":"34-48","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["FRED: A Wafer-scale Fabric for 3D Parallel DNN Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6472-9920","authenticated-orcid":false,"given":"Saeed","family":"Rashidi","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1715-9144","authenticated-orcid":false,"given":"William","family":"Won","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8662-5820","authenticated-orcid":false,"given":"Sudarshan","family":"Srinivasan","sequence":"additional","affiliation":[{"name":"Intel, Bangalore, Karnataka, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6188-1134","authenticated-orcid":false,"given":"Puneet","family":"Gupta","sequence":"additional","affiliation":[{"name":"UCLA, Los Angeles, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5738-6942","authenticated-orcid":false,"given":"Tushar","family":"Krishna","sequence":"additional","affiliation":[{"name":"Georgia Tech, Atlanta, GA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2015. MPI: A Message-Passing Interface Standard. https:\/\/www.mpi-forum.org\/docs\/mpi-3.1\/mpi31-report.pdf."},{"key":"e_1_3_3_2_3_2","unstructured":"2020. ASTRA-SIM: Enabling SW\/HW Co-Design Exploration for Distributed DL Training Platforms. https:\/\/github.com\/astra-sim\/astra-sim.git."},{"key":"e_1_3_3_2_4_2","unstructured":"2021. Scale-Out Packageless Processing. https:\/\/nanocad.ee.ucla.edu\/wp-content\/papercite-data\/pdf\/phdth11.pdf."},{"key":"e_1_3_3_2_5_2","unstructured":"2022. Compute Express Link 3.0. https:\/\/www.computeexpresslink.org\/_files\/ugd\/0c1418_a8713008916044ae9604405d10a7773b.pdf."},{"key":"e_1_3_3_2_6_2","unstructured":"2023. IEEE Heterogeneous Integration Roadmap. https:\/\/eps.ieee.org\/images\/files\/HIR_2023\/ch20_thermalfinal.pdf."},{"key":"e_1_3_3_2_7_2","unstructured":"2024. Heterogeneous Integration Roadmap. https:\/\/eps.ieee.org\/images\/files\/HIR_2024\/HIR_2024_ch22_2D-3D.pdf."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1117\/12.868482"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","unstructured":"Pat Bosshart Dan Daly Glen Gibb Martin Izzard Nick McKeown Jennifer Rexford Cole Schlesinger Dan Talayco Amin Vahdat George Varghese and David Walker. 2014. P4: Programming Protocol-Independent Packet Processors. SIGCOMM Comput. Commun. Rev. 44 3 (jul 2014) 87\u201395. 10.1145\/2656877.2656890","DOI":"10.1145\/2656877.2656890"},{"key":"e_1_3_3_2_10_2","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel\u00a0M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv:https:\/\/arXiv.org\/abs\/2005.14165\u00a0[cs.CL]"},{"key":"e_1_3_3_2_11_2","unstructured":"Cerebras. 2021. Cerebras Systems: Achieving Industry Best AI Performance Through A Systems Approach. https:\/\/f.hubspotusercontent30.net\/hubfs\/8968533\/Cerebras-CS-2-Whitepaper.pdf."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Ernie Chan Marcel Heimlich Avi Purkayastha and Robert van\u00a0de Geijn. 2007. Collective Communication: Theory Practice and Experience: Research Articles. Concurrency and Computation: Practice and Experience 19 13 (Sep. 2007) 1749\u20131783.","DOI":"10.1002\/cpe.1206"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/1122971.1122975"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","unstructured":"Chihming Chang and Rami Melhem. 1997. Arbitrary Size Benes Networks. Parallel Processing Letters 07 03 (1997) 279\u2013284. 10.1142\/S0129626497000292 arXiv:10.1142\/S0129626497000292","DOI":"10.1142\/S0129626497000292"},{"key":"e_1_3_3_2_15_2","series-title":"(ISCA \u201924)","volume-title":"Proceedings of the 51th Annual International Symposium on Computer Architecture","author":"Chen Shuangliang","year":"2024","unstructured":"Shuangliang Chen, Saptadeep Pal, and Rakesh Kumar. 2024. Wafterscale Network Switches. In Proceedings of the 51th Annual International Symposium on Computer Architecture(ISCA \u201924). Association for Computing Machinery."},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Yu-Hsin Chen Tien-Ju Yang Joel Emer and Vivienne Sze. 2019. Eyeriss v2: A flexible accelerator for emerging deep neural networks on mobile devices. IEEE Journal on Emerging and Selected Topics in Circuits and Systems 9 2 (2019) 292\u2013308.","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"M. Cho U. Finkler M. Serrano D. Kung and H. Hunter. 2019. BlueConnect: Decomposing all-reduce for deep learning on heterogeneous network hierarchy. IBM Journal of Research and Development 63 6 (Oct. 2019) 1:1\u201311. 10.1147\/JRD.2019.2947013","DOI":"10.1147\/JRD.2019.2947013"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"Charles Clos. 1953. A study of non-blocking switching networks. Bell System Technical Journal 32 2 (1953) 406\u2013424.","DOI":"10.1002\/j.1538-7305.1953.tb01433.x"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.5555\/2821589"},{"key":"e_1_3_3_2_20_2","unstructured":"Google. 2021. Google Open-Sources Trillion-Parameter AI Language Model Switch Transformer. https:\/\/www.infoq.com\/news\/2021\/02\/google-trillion-parameter-ai\/."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247947"},{"key":"e_1_3_3_2_22_2","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arxiv:https:\/\/arXiv.org\/abs\/1512.03385\u00a0[cs.CV]"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Reza Hojabr Mehdi Modarressi Masoud Daneshtalab Ali Yasoubi and Ahmad Khonsari. 2017. Customizing clos network-on-chip for neural networks. IEEE Trans. Comput. 66 11 (2017) 1865\u20131877.","DOI":"10.1109\/TC.2017.2715158"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","unstructured":"Yanping Huang Youlong Cheng Ankur Bapna Orhan Firat Mia\u00a0Xu Chen Dehao Chen HyoukJoong Lee Jiquan Ngiam Quoc\u00a0V. Le Yonghui Wu and Zhifeng Chen. 2018. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. 10.48550\/ARXIV.1811.06965","DOI":"10.48550\/ARXIV.1811.06965"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00083"},{"key":"e_1_3_3_2_26_2","unstructured":"Deca\u00a0Technologies Inc.2024. Adaptive Patterning. https:\/\/thinkdeca.com\/adaptive-patterning\/"},{"key":"e_1_3_3_2_27_2","unstructured":"Zhihao Jia Matei Zaharia and Alex Aiken. 2018. Beyond Data and Model Parallelism for Deep Neural Networks. CoRR abs\/1807.05358 (2018). arxiv:https:\/\/arXiv.org\/abs\/1807.05358http:\/\/arxiv.org\/abs\/1807.05358"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"e_1_3_3_2_29_2","unstructured":"Sameer Kumar and Norm Jouppi. 2020. Highly Available Data Parallel ML training on Mesh Networks. arxiv:https:\/\/arXiv.org\/abs\/2011.03605\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2011.03605"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173176"},{"key":"e_1_3_3_2_31_2","first-page":"741","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Lao ChonLam","year":"2021","unstructured":"ChonLam Lao, Yanfang Le, Kshiteej Mahajan, Yixi Chen, Wenfei Wu, Aditya Akella, and Michael Swift. 2021. ATP: In-network Aggregation for Multi-tenant Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). USENIX Association, 741\u2013761. https:\/\/www.usenix.org\/conference\/nsdi21\/presentation\/lao"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00069"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","unstructured":"Dmitry Lepikhin HyoukJoong Lee Yuanzhong Xu Dehao Chen Orhan Firat Yanping Huang Maxim Krikun Noam Shazeer and Zhifeng Chen. 2020. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. 10.48550\/ARXIV.2006.16668","DOI":"10.48550\/ARXIV.2006.16668"},{"key":"e_1_3_3_2_34_2","unstructured":"Ang Li Shuaiwen\u00a0Leon Song Jieyang Chen Jiajia Li Xu Liu Nathan\u00a0R. Tallent and Kevin\u00a0J. Barker. 2019. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. arxiv:https:\/\/arXiv.org\/abs\/1903.04611\u00a0[cs.AR]"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. 10.48550\/ARXIV.2006.15704","DOI":"10.48550\/ARXIV.2006.15704"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322259"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ECTC.2019.00093"},{"key":"e_1_3_3_2_38_2","unstructured":"Meta. 2025. [Distributed w\/ TorchTitan] Breaking Barriers: Training Long Context LLMs with 1M Sequence Length in PyTorch Using Context Parallel. https:\/\/discuss.pytorch.org\/t\/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel\/215082."},{"key":"e_1_3_3_2_39_2","unstructured":"Microsoft. 2020. Turing-NLG: A 17-billion-parameter language model by Microsoft. https:\/\/www.microsoft.com\/en-us\/research\/blog\/turing-nlg-a-17-billion-parameter-language-model-by-microsoft\/."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582069"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","unstructured":"Deepak Narayanan Aaron Harlap Amar Phanishayee Vivek Seshadri Nikhil\u00a0R. Devanur Gregory\u00a0R. Ganger Phillip\u00a0B. Gibbons and Matei Zaharia. 2019. PipeDream: Generalized Pipeline Parallelism for DNN Training(SOSP \u201919). Association for Computing Machinery New York NY USA 1\u201315. 10.1145\/3341301.3359646","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_2_43_2","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun\u00a0Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson\u00a0G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arxiv:https:\/\/arXiv.org\/abs\/1906.00091\u00a0[cs.IR]"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Dimitris Nikolaidis Panos Groumas Christos Kouloumentas and Hercules Avramopoulos. 2022. Novel Benes Network Routing Algorithm and Hardware Implementation. Technologies 10 1 (2022). https:\/\/www.mdpi.com\/2227-7080\/10\/1\/16","DOI":"10.3390\/technologies10010016"},{"key":"e_1_3_3_2_45_2","unstructured":"NVIDIA. 2020. Mellanox SHARP. https:\/\/docs.mellanox.com\/display\/sharpv214."},{"key":"e_1_3_3_2_46_2","unstructured":"NVIDIA. 2022. NVIDIA H100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3414622.3431906"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586194"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00042"},{"key":"e_1_3_3_2_50_2","unstructured":"Huwan Peng Scott Davidson Richard Shi Shuaiwen\u00a0Leon Song and Michael Taylor. 2024. Chiplet Cloud: Building AI Supercomputers for Serving Large Generative Language Models. arxiv:https:\/\/arXiv.org\/abs\/2307.02666\u00a0[cs.AR] https:\/\/arxiv.org\/abs\/2307.02666"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672265"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arxiv:https:\/\/arXiv.org\/abs\/1910.02054\u00a0[cs.LG]","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI51249.2020.00020"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00018"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527382"},{"key":"e_1_3_3_2_58_2","first-page":"785","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Sapio Amedeo","year":"2021","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan Ports, and Peter Richtarik. 2021. Scaling Distributed Machine Learning with In-Network Aggregation. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). USENIX Association, 785\u2013808. https:\/\/www.usenix.org\/conference\/nsdi21\/presentation\/sapio"},{"key":"e_1_3_3_2_59_2","unstructured":"Sean Lie. 2021. Thinking Outside the Die: Architecting the ML Accelerator of the Future. https:\/\/www.microarch.org\/micro54\/media\/lie-keynote.pdf."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","unstructured":"Debendra\u00a0Das Sharma and Thomas Coughlin. 2024. Universal Chiplet Interconnect Express: An Open Industry Standard for Memory and Storage Applications. Computer 57 01 (Jan. 2024) 75\u201381. 10.1109\/MC.2023.3318769","DOI":"10.1109\/MC.2023.3318769"},{"key":"e_1_3_3_2_62_2","unstructured":"Anton Shilov. 2024. TSMC\u2019s System-on-Wafer Platform Goes 3D: CoW-SoW Stacks Up the Chips. https:\/\/www.anandtech.com\/show\/21372\/tsmcs-system-on-wafer-platform-goes-3d-cow-sow."},{"key":"e_1_3_3_2_63_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arxiv:https:\/\/arXiv.org\/abs\/1909.08053\u00a0[cs.CL]"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593704"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","unstructured":"Rajeev Thakur Rolf Rabenseifner and William Gropp. 2005. Optimization of Collective Communication Operations in MPICH. International Journal of High Performance Computing Applications 19 1 (Feb. 2005) 49\u201366. 10.1177\/1094342005051521","DOI":"10.1177\/1094342005051521"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:05:31Z","timestamp":1750503931000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731055"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":64,"alternative-id":["10.1145\/3695053.3731055","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731055","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}