{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T05:42:24Z","timestamp":1767850944697,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3625549.3658693","type":"proceedings-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T15:55:29Z","timestamp":1725033329000},"page":"334-347","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Near-Optimal Wafer-Scale Reduce"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8779-4223","authenticated-orcid":false,"given":"Piotr","family":"Luczynski","sequence":"first","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5975-4526","authenticated-orcid":false,"given":"Lukas","family":"Gianinazzi","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5979-4915","authenticated-orcid":false,"given":"Patrick","family":"Iff","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1676-8156","authenticated-orcid":false,"given":"Leighton","family":"Wilson","sequence":"additional","affiliation":[{"name":"Cerebras Systems, Sunnyvale, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7244-639X","authenticated-orcid":false,"given":"Daniele","family":"De Sensi","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Sapienza University of Rome, Rome, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1333-9797","authenticated-orcid":false,"given":"Torsten","family":"Hoefler","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1006\/jpdc.1995.1018"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Yves Baumann Tal Ben-Nun Maciej Besta Lukas Gianinazzi Torsten Hoefler and Piotr Luczynski. 2024. Low-Depth Spatial Tree Algorithms. arXiv:2404.12953 [cs.DC]","DOI":"10.1109\/IPDPS57955.2024.00024"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3320060"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.34"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2205.09702"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/0021-9991(90)90171-V"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.1206"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2017.7995277"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476146"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, SC 2018","author":"Chunduri Sudheer","year":"2018","unstructured":"Sudheer Chunduri, Scott Parker, Pavan Balaji, Kevin Harms, and Kalyan Kumaran. 2018. Characterization of MPI usage on a production supercomputer. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, SC 2018, Dallas, TX, USA, November 11--16, 2018. IEEE \/ ACM, 30:1--30:15. http:\/\/dl.acm.org\/citation.cfm?id=3291696"},{"key":"e_1_3_2_1_11_1","volume-title":"Swing: Short-cutting Rings for Higher Bandwidth Allreduce. In 21th USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Sensi Daniele De","year":"2024","unstructured":"Daniele De Sensi, Tommaso Bonato, David Saam, and Torsten Hoefler. 2024. Swing: Short-cutting Rings for Higher Bandwidth Allreduce. In 21th USENIX Symposium on Networked Systems Design and Implementation (NSDI 24). USENIX Association, Santa Clara, CA."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476178"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/sc41405.2020.00039"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.03208"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2021.3057203"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3289602.3293906"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2205.04934"},{"key":"e_1_3_2_1_18_1","first-page":"41","article-title":"Scalable Hierarchical Aggregation and Reduction Protocol (SHARP)TM Streaming-Aggregation Hardware Design and Evaluation","volume":"12151","author":"Graham Richard L.","year":"2020","unstructured":"Richard L. Graham, Lion Levi, Devendar Bureddy, Gil Bloch, Gilad Shainer, David Cho, George Elias, Daniel Klein, Joshua Ladd, Ophir Maor, Ami Marelli, Valentin Petrov, Evyatar Romlet, Yong Qin, and Ido Zemah. 2020. Scalable Hierarchical Aggregation and Reduction Protocol (SHARP)TM Streaming-Aggregation Hardware Design and Evaluation. High Performance Computing 12151 (2020), 41--59.","journal-title":"High Performance Computing"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/0167-8191(96)00024-5"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807644"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00016"},{"key":"e_1_3_2_1_22_1","first-page":"2","article-title":"Energy, Memory, and Runtime Tradeoffs for Implementing Collective Communication Operations","volume":"1","author":"Hoefler Torsten","year":"2014","unstructured":"Torsten Hoefler and D. Moor. 2014. Energy, Memory, and Runtime Tradeoffs for Implementing Collective Communication Operations. Journal of Supercomputing Frontiers and Innovations 1, 2 (Oct. 2014), 58--75.","journal-title":"Journal of Supercomputing Frontiers and Innovations"},{"key":"e_1_3_2_1_23_1","first-page":"4","article-title":"Accurately Measuring Overhead, Communication Time and Progression of Blocking and Nonblocking Collective Operations at Massive Scale","volume":"25","author":"Hoefler Torsten","year":"2010","unstructured":"Torsten Hoefler, Timo Schneider, and Andrew Lumsdaine. 2010. Accurately Measuring Overhead, Communication Time and Progression of Blocking and Nonblocking Collective Operations at Massive Scale. International Journal of Parallel, Emergent and Distributed Systems 25, 4 (Jul. 2010), 241--258.","journal-title":"International Journal of Parallel, Emergent and Distributed Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"Cerebras Systems: Achieving Industry Best AI Performance Through A Systems Approach.","author":"Cerebras Systems Inc.","year":"2021","unstructured":"Cerebras Systems Inc. 2021. Cerebras Systems: Achieving Industry Best AI Performance Through A Systems Approach. (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2204.03775"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1810085.1810093"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/12.29465"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2000.846009"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250662.1250679"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.19"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2488551.2488557"},{"key":"e_1_3_2_1_32_1","volume-title":"Highly Available Data Parallel ML training on Mesh Networks. CoRR abs\/2011.03605","author":"Kumar Sameer","year":"2020","unstructured":"Sameer Kumar and Norm Jouppi. 2020. Highly Available Data Parallel ML training on Mesh Networks. CoRR abs\/2011.03605 (2020). arXiv:2011.03605 https:\/\/arxiv.org\/abs\/2011.03605"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356176"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567153"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2023.3256384"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3627042"},{"key":"e_1_3_2_1_37_1","volume-title":"MPI: A Message-Passing Interface Standard Version 4.0. https:\/\/www.mpi-forum.org\/docs\/mpi-4.0\/mpi40-report.pdf","author":"Interface Forum Message Passing","year":"2021","unstructured":"Message Passing Interface Forum. 2021. MPI: A Message-Passing Interface Standard Version 4.0. https:\/\/www.mpi-forum.org\/docs\/mpi-4.0\/mpi40-report.pdf"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3202350"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593708"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_2_1_41_1","volume-title":"Johan Bj\u00f6rkegren, and Tom Michoel.","author":"Qi Jianlong","year":"2014","unstructured":"Jianlong Qi, Hassan Foroughi Asl, Johan Bj\u00f6rkegren, and Tom Michoel. 2014. kruX: matrix-based non-parametric eQTL discovery. BMC bioinformatics 15 (2014), 1--7."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24685-5_1"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-30218-6_13"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00062"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","unstructured":"Niklas Roemer. 2023-08-06. Designing of a communication library for Versal devices using Stream-Based API. Bachelor Thesis. ETH Zurich Zurich. 10.3929\/ethz-b-000635928","DOI":"10.3929\/ethz-b-000635928"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/0167-8191(89)90024-0"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2686882"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/bts163"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2023.109072"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2000.10024"},{"key":"e_1_3_2_1_54_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4--9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 5998--6008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/27633.28055"},{"key":"e_1_3_2_1_56_1","volume-title":"A Survey of Methods for Collective Communication Optimization and Tuning. CoRR abs\/1611.06334","author":"Wickramasinghe Udayanga","year":"2016","unstructured":"Udayanga Wickramasinghe and Andrew Lumsdaine. 2016. A Survey of Methods for Collective Communication Optimization and Tuning. CoRR abs\/1611.06334 (2016). arXiv:1611.06334 http:\/\/arxiv.org\/abs\/1611.06334"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","unstructured":"Max Wierse. 2023-02. Evaluation of Xilinx Versal Device. Bachelor Thesis. ETH Zurich Zurich. 10.3929\/ethz-b-000600880","DOI":"10.3929\/ethz-b-000600880"},{"key":"e_1_3_2_1_58_1","unstructured":"Leighton Wilson. 2023. What's New in R0.6 of the Cerebras SDK. https:\/\/www.cerebras.net\/blog\/whats-new-in-r0.6-of-the-cerebras-sdk. Accessed: 2023-08-09."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2209.13768"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45825-5_43"}],"event":{"name":"HPDC '24: 33rd International Symposium on High-Performance Parallel and Distributed Computing","location":"Pisa Italy","acronym":"HPDC '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658693","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3625549.3658693","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:38Z","timestamp":1750287038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658693"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":58,"alternative-id":["10.1145\/3625549.3658693","10.1145\/3625549"],"URL":"https:\/\/doi.org\/10.1145\/3625549.3658693","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}