{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T00:18:33Z","timestamp":1777421913414,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,11]],"date-time":"2022-06-11T00:00:00Z","timestamp":1654905600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,18]]},"DOI":"10.1145\/3470496.3527405","type":"proceedings-article","created":{"date-parts":[[2022,5,31]],"date-time":"2022-05-31T19:06:01Z","timestamp":1654023961000},"page":"567-580","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":39,"title":["A software-defined tensor streaming multiprocessor for large-scale machine learning"],"prefix":"10.1145","author":[{"given":"Dennis","family":"Abts","sequence":"first","affiliation":[{"name":"Groq Inc."}]},{"given":"Garrin","family":"Kimmell","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Andrew","family":"Ling","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"John","family":"Kim","sequence":"additional","affiliation":[{"name":"KAIST\/Groq Inc."}]},{"given":"Matt","family":"Boyd","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Andrew","family":"Bitar","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Sahil","family":"Parmar","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Ibrahim","family":"Ahmed","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Roberto","family":"DiCecco","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"David","family":"Han","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"John","family":"Thompson","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Michael","family":"Bye","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Jennifer","family":"Hwang","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Jeremy","family":"Fowers","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Peter","family":"Lillian","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Ashwin","family":"Murthy","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Elyas","family":"Mehtabuddin","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Chetan","family":"Tekur","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Thomas","family":"Sohmers","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Kris","family":"Kang","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Stephen","family":"Maresh","sequence":"additional","affiliation":[{"name":"Groq Inc."}]},{"given":"Jonathan","family":"Ross","sequence":"additional","affiliation":[{"name":"Groq Inc."}]}],"member":"320","published-online":{"date-parts":[[2022,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00023"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1362622.1362630"},{"key":"e_1_3_2_1_3_1","volume-title":"Conference on High Performance Computing Networking, Storage and Analysis (SC). 1--11","author":"Ahn Jung Ho","unstructured":"Jung Ho Ahn , Nathan Binkert , Al Davis , Moray McLaren , and Robert S. Schreiber . 2009. HyperX: Topology, Routing, and Packaging of Efficient Large-Scale Networks . In Conference on High Performance Computing Networking, Storage and Analysis (SC). 1--11 . Jung Ho Ahn, Nathan Binkert, Al Davis, Moray McLaren, and Robert S. Schreiber. 2009. HyperX: Topology, Routing, and Packaging of Efficient Large-Scale Networks. In Conference on High Performance Computing Networking, Storage and Analysis (SC). 1--11."},{"key":"e_1_3_2_1_4_1","volume-title":"The Cray XC Scaleable System","author":"Alverson Bob","unstructured":"Bob Alverson , Edwin Froese , Larry Kaplan , and Duncan Roweth . 2012. The Cray XC Scaleable System . In Cray Inc. White Paper . Bob Alverson, Edwin Froese, Larry Kaplan, and Duncan Roweth. 2012. The Cray XC Scaleable System. In Cray Inc. White Paper."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736029"},{"key":"e_1_3_2_1_6_1","unstructured":"Tom B Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell etal 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020).  Tom B Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_7_1","unstructured":"Cerebras CS-1. 2021. http:\/\/cerebras.net. http:\/\/cerebras.net (2021).  Cerebras CS-1. 2021. http:\/\/cerebras.net. http:\/\/cerebras.net (2021)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00083"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.127260"},{"key":"e_1_3_2_1_11_1","unstructured":"W. J. Dally and B. Towles. 2004. Principles and Practices of Interconnection Networks. Morgan Kaufmann San Francisco CA.  W. J. Dally and B. Towles. 2004. Principles and Practices of Interconnection Networks. Morgan Kaufmann San Francisco CA."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.15439\/2015F86"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/1508244.1508255"},{"key":"e_1_3_2_1_14_1","volume-title":"Parker","author":"Flajslik Mario","year":"2018","unstructured":"Mario Flajslik , Eric Borch , and Mike A . Parker . 2018 . Megafly: a topology for exascale systems. In High Performance Computing, Rio Yokota, Mich\u00e8le Weiland, David Keyes, and Carsten Trinitis (Eds.). Springer International Publishing , Cham, 289--310. Mario Flajslik, Eric Borch, and Mike A. Parker. 2018. Megafly: a topology for exascale systems. In High Performance Computing, Rio Yokota, Mich\u00e8le Weiland, David Keyes, and Carsten Trinitis (Eds.). Springer International Publishing, Cham, 289--310."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the International Symposium on Computer Architecture.","author":"Christopher","unstructured":"Christopher J. Glass and Lionel M. Ni. 1992. The turn model for adaptive routing . In Proceedings of the International Symposium on Computer Architecture. Christopher J. Glass and Lionel M. Ni. 1992. The turn model for adaptive routing. In Proceedings of the International Symposium on Computer Architecture."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038228.3038237"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ScalA49573.2019.00010"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749741"},{"key":"e_1_3_2_1_19_1","volume-title":"Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al.","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang , Youlong Cheng , Ankur Bapna , Orhan Firat , Dehao Chen , Mia Chen , Hyouk Joong Lee , Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019 . Gpipe : Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019), 103--112. Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, Hyouk Joong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019), 103--112."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of ISCA'09","author":"Jiang Nan","unstructured":"Nan Jiang , John Kim , and William J. Dally . 2009. Indirect Adaptive Routing on Large Scale Interconnection Networks . In Proceedings of ISCA'09 . Austin, TX, 220--231. Nan Jiang, John Kim, and William J. Dally. 2009. Indirect Adaptive Routing on Large Scale Interconnection Networks. In Proceedings of ISCA'09. Austin, TX, 220--231."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451118"},{"key":"e_1_3_2_1_22_1","unstructured":"Andrej Karpathy. 2021. Keynote at Workshop on Autonomous Driving.  Andrej Karpathy. 2021. Keynote at Workshop on Autonomous Driving."},{"key":"e_1_3_2_1_23_1","volume-title":"Virtual cut-through: A new computer communication switching technique. Computer Networks (1976) 3, 4","author":"Kermani Parviz","year":"1979","unstructured":"Parviz Kermani and Leonard Kleinrock . 1979. Virtual cut-through: A new computer communication switching technique. Computer Networks (1976) 3, 4 ( 1979 ), 267--286. Parviz Kermani and Leonard Kleinrock. 1979. Virtual cut-through: A new computer communication switching technique. Computer Networks (1976) 3, 4 (1979), 267--286."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.15"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/1394608.1382129"},{"key":"e_1_3_2_1_26_1","volume-title":"ISCA '05: Proceedings of the 32nd Annual International Symposium on Computer Architecture. IEEE Computer Society","author":"Kim John","unstructured":"John Kim , William J. Dally , Brian Towles , and Amit K. Gupta . 2005. Microarchitecture of a High-Radix Router . In ISCA '05: Proceedings of the 32nd Annual International Symposium on Computer Architecture. IEEE Computer Society , Madison, WI, USA, 420--431. John Kim, William J. Dally, Brian Towles, and Amit K. Gupta. 2005. Microarchitecture of a High-Radix Router. In ISCA '05: Proceedings of the 32nd Annual International Symposium on Computer Architecture. IEEE Computer Society, Madison, WI, USA, 420--431."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.18"},{"key":"e_1_3_2_1_29_1","volume-title":"Hot Chips 31 Symposium","author":"Medina Eitan","year":"2019","unstructured":"Eitan Medina . 2019 . Habana Labs Approach to Scaling AI Training . In Hot Chips 31 Symposium , Palo Alto, CA, USA. Eitan Medina. 2019. Habana Labs Approach to Scaling AI Training. In Hot Chips 31 Symposium, Palo Alto, CA, USA."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_31_1","volume-title":"Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al.","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan , Mohammad Shoeybi , Jared Casper , Patrick LeGresley , Mostofa Patwary , Vijay Anand Korthikanti , Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al. 2021 . Efficient large-scale language model training on gpu clusters. arXiv preprint arXiv:2104.04473 (2021). Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Anand Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, et al. 2021. Efficient large-scale language model training on gpu clusters. arXiv preprint arXiv:2104.04473 (2021)."},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. 2011. NVidia DGX POD. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-pod-reference-architecture\/  NVIDIA. 2011. NVidia DGX POD. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-pod-reference-architecture\/"},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA. 2021. NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests\/  NVIDIA. 2021. NCCL Tests. https:\/\/github.com\/NVIDIA\/nccl-tests\/"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1508244.1508256"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080256"},{"key":"e_1_3_2_1_37_1","volume-title":"Rosetta: A 64-port Switch for Cray's Slingshot Interconnect. Keynote 2, HOTI.","author":"Scott Steve","year":"2019","unstructured":"Steve Scott . 2019 . Rosetta: A 64-port Switch for Cray's Slingshot Interconnect. Keynote 2, HOTI. Steve Scott. 2019. Rosetta: A 64-port Switch for Cray's Slingshot Interconnect. Keynote 2, HOTI."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2006.40"},{"key":"e_1_3_2_1_39_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi , Mostofa Patwary , Raul Puri , Patrick LeGresley , Jared Casper , and Bryan Catanzaro . 2019 . Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019). Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPINEB.2017.11"},{"key":"e_1_3_2_1_41_1","volume-title":"Overcoming Far-end Congestion in Large-Scale Networks. In IEEE 21st International Symposium on High Performance Computer Architecture (HPCA).","author":"Won Jongmin","year":"2015","unstructured":"Jongmin Won , Gwangsun Kim , John Kim , Ted Jiang , Mike Parker , and Steve Scott . 2015 . Overcoming Far-end Congestion in Large-Scale Networks. In IEEE 21st International Symposium on High Performance Computer Architecture (HPCA). Jongmin Won, Gwangsun Kim, John Kim, Ted Jiang, Mike Parker, and Steve Scott. 2015. Overcoming Far-end Congestion in Large-Scale Networks. In IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_42_1","volume-title":"Evaluation of the Tensor Processing Unit: A Deep Neural Network Accelerator for the Datacenter. In Hot Chips 29 Symposium","author":"Young Cliff","year":"2017","unstructured":"Cliff Young . 2017 . Evaluation of the Tensor Processing Unit: A Deep Neural Network Accelerator for the Datacenter. In Hot Chips 29 Symposium , Palo Alto, CA, USA. Cliff Young. 2017. Evaluation of the Tensor Processing Unit: A Deep Neural Network Accelerator for the Datacenter. In Hot Chips 29 Symposium, Palo Alto, CA, USA."}],"event":{"name":"ISCA '22: The 49th Annual International Symposium on Computer Architecture","location":"New York New York","acronym":"ISCA '22","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE CS TCAA IEEE CS technical committee on architectural acoustics"]},"container-title":["Proceedings of the 49th Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3470496.3527405","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3470496.3527405","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:28Z","timestamp":1750188628000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3470496.3527405"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,11]]},"references-count":40,"alternative-id":["10.1145\/3470496.3527405","10.1145\/3470496"],"URL":"https:\/\/doi.org\/10.1145\/3470496.3527405","relation":{},"subject":[],"published":{"date-parts":[[2022,6,11]]},"assertion":[{"value":"2022-06-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}