{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T18:36:32Z","timestamp":1772303792692,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T00:00:00Z","timestamp":1717372800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Defense Advanced Research Projects Agency (DARPA)","award":["HR001120C0089"],"award-info":[{"award-number":["HR001120C0089"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,3]]},"DOI":"10.1145\/3625549.3658656","type":"proceedings-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T15:55:29Z","timestamp":1725033329000},"page":"28-41","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Efficient all-to-all Collective Communication Schedules for Direct-connect Topologies"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8295-1878","authenticated-orcid":false,"given":"Prithwish","family":"Basu","sequence":"first","affiliation":[{"name":"RTX BBN Technologies, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5794-0326","authenticated-orcid":false,"given":"Liangyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7369-5585","authenticated-orcid":false,"given":"Jason","family":"Fantl","sequence":"additional","affiliation":[{"name":"RTX BBN Technologies, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6509-6267","authenticated-orcid":false,"given":"Siddharth","family":"Pal","sequence":"additional","affiliation":[{"name":"RTX BBN Technologies, Cambridge, MA, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9505-9528","authenticated-orcid":false,"given":"Arvind","family":"Krishnamurthy","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, WA, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9030-5208","authenticated-orcid":false,"given":"Joud","family":"Khoury","sequence":"additional","affiliation":[{"name":"RTX BBN Technologies, Cambridge, MA, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2021. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2021. TACC establishes new Center of Excellence with Rockport Networks. https:\/\/rockportnetworks.com\/tacc-establishes-new-center-of-excellence-with-rockport-networks\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2023. Cerio. https:\/\/www.cerio.io\/"},{"key":"e_1_3_2_1_4_1","unstructured":"2023. Parallelization of Particle-mesh Ewald (PME) in GROMACS for Molecular Dynamics. Available at https:\/\/manual.gromacs.org\/documentation\/current\/user-guide\/mdrun-performance.html accessed on 7.15.2023."},{"key":"e_1_3_2_1_5_1","unstructured":"2023. Texas Advanced Computing Center. https:\/\/www.tacc.utexas.edu\/."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Fastest Fourier Transform in the West. https:\/\/fftw.org\/."},{"key":"e_1_3_2_1_7_1","unstructured":"AMD. 2023. ROCm Communication Collectives Library. Available at https:\/\/github.com\/ROCmSoftwarePlatform\/rccl accessed on 7.15.2023."},{"key":"e_1_3_2_1_8_1","unstructured":"MOSEK ApS. 2023. MOSEK Solver version 10.1. https:\/\/www.mosek.com\/"},{"key":"e_1_3_2_1_9_1","volume-title":"Miguel Castro, Srikanth Kandula, Saeed Maleki, and Luke Marshall.","author":"Arzani Behnaz","year":"2023","unstructured":"Behnaz Arzani, Siva Kesava Reddy Kakarla, Miguel Castro, Srikanth Kandula, Saeed Maleki, and Luke Marshall. 2023. Rethinking Machine Learning Collective Communication as a Multi-Commodity Flow Problem. arXiv:2305.13479 [cs.NI]"},{"key":"e_1_3_2_1_10_1","volume-title":"2019 IEEE\/ACM Workshop on Exascale MPI (ExaMPI). IEEE, 12--18","author":"Ayala Alan","year":"2019","unstructured":"Alan Ayala, Stanimire Tomov, Xi Luo, Hejer Shaeik, Azzam Haidar, George Bosilca, and Jack Dongarra. 2019. Impacts of multi-gpu mpi collective communications on large fft computation. In 2019 IEEE\/ACM Workshop on Exascale MPI (ExaMPI). IEEE, 12--18."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","unstructured":"Prithwish Basu Feng Yu Amotz Bar-Noy and Dror Rawitz. [n. d.]. To Sample or To Smash? Estimating reachability in large time-varying graphs. 983--991. arXiv:https:\/\/epubs.siam.org\/doi\/pdf\/10.1137\/1.9781611973440.112 10.1137\/1.9781611973440.112","DOI":"10.1137\/1.9781611973440.112"},{"key":"e_1_3_2_1_12_1","volume-title":"GROMACS: A message-passing parallel molecular dynamics implementation. Computer physics communications 91, 1--3","author":"Berendsen Herman JC","year":"1995","unstructured":"Herman JC Berendsen, David van der Spoel, and Rudi van Drunen. 1995. GROMACS: A message-passing parallel molecular dynamics implementation. Computer physics communications 91, 1--3 (1995), 43--56."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.34"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313276.3316303"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304604"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1987.1676939"},{"key":"e_1_3_2_1_18_1","volume-title":"Principles and practices of interconnection networks","author":"Dally William James","unstructured":"William James Dally and Brian Patrick Towles. 2004. Principles and practices of interconnection networks. Elsevier."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.65"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1137\/S0895480199355754"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1983.1676323"},{"key":"e_1_3_2_1_22_1","unstructured":"Intel. 2023. oneAPI Collective Communications Library (oneCCL). Available at https:\/\/github.com\/oneapi-src\/oneCCL accessed on 07.15.2023."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/12.29465"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.64"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1328911.1328924"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472900"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/1394608.1382129"},{"key":"e_1_3_2_1_30_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 2021 ACM SIGCOMM 2021 Conference (SIGCOMM '23)","author":"Liu Hong","year":"2023","unstructured":"Hong Liu, Ryohei Urata, Kevin Yasumura, Xiang Zhou, Roy Bannon, Jill Berger, Pedram Dashti, Norm Jouppi, Cedric Lam, Sheng Li, Erji Mao, Daniel Nelson, George Papen, Mukarram Tariq, and Amin Vahdat. 2023. Lightwave Fabrics: At-Scale Optical Circuit Switching for Datacenter and Machine Learning Systems. In Proceedings of the 2021 ACM SIGCOMM 2021 Conference (SIGCOMM '23). Association for Computing Machinery, New York, NY, USA."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/JLT.2021.3073277"},{"key":"e_1_3_2_1_33_1","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mellette William M","year":"2020","unstructured":"William M Mellette, Rajdeep Das, Yibo Guo, Rob McGuinness, Alex C Snoeren, and George Porter. 2020. Expanding across time to deliver bandwidth efficiency and low latency. In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20). 1--18."},{"key":"e_1_3_2_1_34_1","unstructured":"Microsoft. 2022. Microsoft Collective Communication Library. Available at https:\/\/github.com\/microsoft\/msccl."},{"key":"e_1_3_2_1_35_1","volume-title":"Direct numerical simulation: a tool in turbulence research. Annual review of fluid mechanics 30, 1","author":"Moin Parviz","year":"1998","unstructured":"Parviz Moin and Krishnan Mahesh. 1998. Direct numerical simulation: a tool in turbulence research. Annual review of fluid mechanics 30, 1 (1998), 539--578."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"e_1_3_2_1_37_1","unstructured":"Maxim Naumov John Kim Dheevatsa Mudigere Srinivas Sridharan Xiaodong Wang Whitney Zhao Serhat Yilmaz Changkyu Kim Hector Yuen Mustafa Ozdal et al. 2020. Deep learning training in facebook data centers: Design of scale-up and scale-out systems. arXiv preprint arXiv:2003.09518 (2020)."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. 2023. NVIDIA Collective Communication Library (NCCL). Available at https:\/\/github.com\/NVIDIA\/nccl accessed on 7.15.2023."},{"key":"e_1_3_2_1_39_1","unstructured":"Open MPI. [n. d.]. Open Source High Performance Computing. https:\/\/www.open-mpi.org\/."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1137\/11082748X"},{"key":"e_1_3_2_1_41_1","unstructured":"Polatis 2023. Polatis Optical Circuit Switch. Available at https:\/\/www.polatis.com\/series-7000-384x384-port-software-controlled-optical-circuitswitch-sdn-enabled.asp."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544265"},{"key":"e_1_3_2_1_43_1","volume-title":"2021 IEEE\/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC). IEEE, 1--8.","author":"Pumma Sarunya","year":"2021","unstructured":"Sarunya Pumma and Abhinav Vishnu. 2021. Semantic-Aware Lossless Data Compression for Deep Learning Recommendation Model (DLRM). In 2021 IEEE\/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC). IEEE, 1--8."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2018436.2018467"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/DMCC.1991.633174"},{"key":"e_1_3_2_1_46_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Shah Aashaka","year":"2023","unstructured":"Aashaka Shah, Vijay Chidambaram, Meghan Cowan, Saeed Maleki, Madan Musuvathi, Todd Mytkowicz, Jacob Nelson, Olli Saarikivi, and Rachee Singh. 2023. TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 593--612. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/shah"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/77600.77620"},{"key":"e_1_3_2_1_48_1","volume-title":"Jellyfish: Networking data centers randomly. In Presented as part of the 9th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 12). 225--238.","author":"Singla Ankit","year":"2012","unstructured":"Ankit Singla, Chi-Yao Hong, Lucian Popa, and P Brighten Godfrey. 2012. Jellyfish: Networking data centers randomly. In Presented as part of the 9th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 12). 225--238."},{"key":"e_1_3_2_1_49_1","volume-title":"Layered Shortest Path (LASH) Routing in Irregular System Area Networks.. In ipdps","author":"Skeie Tor","unstructured":"Tor Skeie, Olav Lysne, and Ingebj\u00f8rg Theiss. 2002. Layered Shortest Path (LASH) Routing in Irregular System Area Networks.. In ipdps, Vol. 2. 194."},{"key":"e_1_3_2_1_50_1","unstructured":"Evandro DE SOUZA. 2022. Deadlock-free multipath routing for direct interconnect networks. Available at https:\/\/patents.google.com\/patent\/WO2022269357A1\/en."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.679215"},{"key":"e_1_3_2_1_52_1","unstructured":"Telescent. 2021. G4 Network Topology Manager. https:\/\/www.telescent.com\/products"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2020EDP7201"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/2999572.2999580"},{"key":"e_1_3_2_1_55_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang, Moein Khazraee, Zhizhen Zhong, Manya Ghobadi, Zhihao Jia, Dheevatsa Mudigere, Ying Zhang, and Anthony Kewitsch. 2023. {TopoOpt}: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 739--767."},{"key":"e_1_3_2_1_56_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang, Moein Khazraee, Zhizhen Zhong, Manya Ghobadi, Zhihao Jia, Dheevatsa Mudigere, Ying Zhang, and Anthony Kewitsch. 2023. {TopoOpt}: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). 739--767."},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings Fifth International Symposium on High-Performance Computer Architecture. IEEE, 290--299","author":"Yang Yuanyuan","year":"1999","unstructured":"Yuanyuan Yang and Jianchao Wang. 1999. Efficient all-to-all broadcast in all-port mesh and torus networks. In Proceedings Fifth International Symposium on High-Performance Computer Architecture. IEEE, 290--299."},{"key":"e_1_3_2_1_58_1","volume-title":"2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1040--1050","author":"Young Stephen","year":"2022","unstructured":"Stephen Young, Sinan Aksoy, Jesun Firoz, Roberto Gioiosa, Tobias Hagge, Mark Kempton, Juan Escobedo, and Mark Raugas. 2022. Spectralfly: Ramanujan graphs as flexible and efficient interconnection networks. In 2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1040--1050."},{"key":"e_1_3_2_1_59_1","volume-title":"Optimal Direct-Connect Topologies for Collective Communications. arXiv preprint arXiv:2202.03356","author":"Zhao Liangyu","year":"2022","unstructured":"Liangyu Zhao, Siddharth Pal, Tapan Chugh, Weiyang Wang, Prithwish Basu, Joud Khoury, and Arvind Krishnamurthy. 2022. Optimal Direct-Connect Topologies for Collective Communications. arXiv preprint arXiv:2202.03356 (2022)."},{"key":"e_1_3_2_1_60_1","volume-title":"Efficient Direct-Connect Topologies for Collective Communications. arXiv preprint arXiv:2202.03356","author":"Zhao Liangyu","year":"2022","unstructured":"Liangyu Zhao, Siddharth Pal, Tapan Chugh, Weiyang Wang, Jason Fantl, Prithwish Basu, Joud Khoury, and Arvind Krishnamurthy. 2022. Efficient Direct-Connect Topologies for Collective Communications. arXiv preprint arXiv:2202.03356 (2022)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1063\/5.0070711"}],"event":{"name":"HPDC '24: 33rd International Symposium on High-Performance Parallel and Distributed Computing","location":"Pisa Italy","acronym":"HPDC '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 33rd International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658656","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3625549.3658656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:50:38Z","timestamp":1750287038000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3625549.3658656"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,3]]},"references-count":60,"alternative-id":["10.1145\/3625549.3658656","10.1145\/3625549"],"URL":"https:\/\/doi.org\/10.1145\/3625549.3658656","relation":{},"subject":[],"published":{"date-parts":[[2024,6,3]]},"assertion":[{"value":"2024-08-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}