{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:44:32Z","timestamp":1766220272492,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2311830"],"award-info":[{"award-number":["2311830"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2312927"],"award-info":[{"award-number":["2312927"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2323116"],"award-info":[{"award-number":["2323116"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2415201"],"award-info":[{"award-number":["2415201"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"XRAC","award":["#NCR-130002"],"award-info":[{"award-number":["#NCR-130002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754666","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"784-793","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Design and Optimization of GPU-Aware MPI Allreduce Using Direct Sendrecv Communication"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7471-7552","authenticated-orcid":false,"given":"Chen-Chun","family":"Chen","sequence":"first","affiliation":[{"name":"The Ohio State University, Columbus, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7129-9508","authenticated-orcid":false,"given":"Jinghan","family":"Yao","sequence":"additional","affiliation":[{"name":"The Ohio State University, Columbus, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1200-2754","authenticated-orcid":false,"given":"Hari","family":"Subramoni","sequence":"additional","affiliation":[{"name":"The Ohio State University, Columbus, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0356-1781","authenticated-orcid":false,"given":"Dhabaleswar K.","family":"Panda","sequence":"additional","affiliation":[{"name":"The Ohio State University, Columbus, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Amber Team. 2025. Amber. https:\/\/ambermd.org. Accessed: 2025-04-29."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","unstructured":"Schott Atchley and Matthias Maiterth. 2023. OLCF Frontier Supercomputer 2023-04-29 HPL Power Data used for Top500\/Green500 Submission. (5 2023). 10.13139\/OLCF\/1975494","DOI":"10.13139\/OLCF\/1975494"},{"key":"e_1_3_3_1_4_2","unstructured":"Microsoft Azure. 2023. Eagle HPC Cluster on Microsoft Azure. https:\/\/www.servethehome.com\/microsoft-azure-eagle-is-a-paradigm-shifting-cloud-supercomputer. Ranked #3 on the TOP500 list featuring NVIDIA H100 GPUs and Intel Xeon Sapphire Rapids CPUs. Provides cloud-based HPC and AI capabilities.."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/SHPCC.1994.296665"},{"key":"e_1_3_3_1_6_2","first-page":"110","volume-title":"Proceedings of the 19th European Conference on Recent Advances in the Message Passing Interface (EuroMPI)","author":"Bureddy D.","year":"2012","unstructured":"D. Bureddy, H. Wang, A. Venkatesh, S. Potluri, and D.\u00a0K. Panda. 2012. OMB-GPU: A Micro-benchmark Suite for Evaluating MPI Libraries on GPU Clusters. In Proceedings of the 19th European Conference on Recent Advances in the Message Passing Interface (EuroMPI) (Vienna, Austria). 110\u2013120."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC62374.2024.00022"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS64566.2025.00088"},{"key":"e_1_3_3_1_9_2","unstructured":"NVIDIA\u00a0NCCL Developers. 2020. Issue #320: Hierarchical AllReduce Implementation. https:\/\/github.com\/NVIDIA\/nccl\/issues\/320. Accessed: 2025-04-29."},{"key":"e_1_3_3_1_10_2","unstructured":"Argonne Leadership\u00a0Computing Facility. 2024. Aurora Supercomputer. https:\/\/www.anl.gov\/aurora. Sponsored by the U.S. Department of Energy designed by Intel and Cray. Expected to achieve over 2 ExaFLOPS of performance upon optimization.."},{"key":"e_1_3_3_1_11_2","unstructured":"IBM. [n. d.]. IBM Spectrum MPI. https:\/\/www.ibm.com\/products\/spectrum-mpi."},{"key":"e_1_3_3_1_12_2","unstructured":"Sylvain Jeaugey. 2019. Massively Scale Your Deep Learning Training with NCCL 2.4. https:\/\/developer.nvidia.com\/blog\/massively-scale-deep-learning-training-nccl-2-4\/. Accessed: 2025-04-29."},{"key":"e_1_3_3_1_13_2","unstructured":"Sylvain Jeaugey Giuseppe Congiu Thomas Gillis Ben Williams and Fred Oh. 2025. New Scaling Algorithm and Initialization with NVIDIA Collective Communications Library 2.23. https:\/\/developer.nvidia.com\/blog\/new-scaling-algorithm-and-initialization-with-nvidia-collective-communications-library-2-23\/. Accessed: 2025-04-29."},{"key":"e_1_3_3_1_14_2","unstructured":"Andrej Karpathy. 2022. NanoGPT. https:\/\/github.com\/karpathy\/nanoGPT."},{"key":"e_1_3_3_1_15_2","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.15704 (2020). https:\/\/arxiv.org\/abs\/2006.15704"},{"key":"e_1_3_3_1_16_2","unstructured":"NERSC. [n. d.]. Cray MPICH - NERSC Documentation. https:\/\/docs.nersc.gov\/development\/programming-models\/mpi\/cray-mpich\/."},{"key":"e_1_3_3_1_17_2","unstructured":"Ohio State University. [n. d.]. MVAPICH2-GDR: High-Performance MPI for GPU Clusters. https:\/\/mvapich.cse.ohio-state.edu\/."},{"key":"e_1_3_3_1_18_2","unstructured":"Open MPI Project. [n. d.]. Open MPI: Open Source High Performance Computing. https:\/\/www.open-mpi.org\/."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Pitch Patarasuk and Xin Yuan. 2009. Bandwidth optimal all-reduce algorithms for clusters of workstations. J. Parallel and Distrib. Comput. 69 2 (2009) 117\u2013124.","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_3_1_20_2","first-page":"1","volume-title":"Proceedings of the International Conference on Computational Science (ICCS)","author":"Rabenseifner Rolf","year":"2004","unstructured":"Rolf Rabenseifner. 2004. Optimization of collective reduction operations. In Proceedings of the International Conference on Computational Science (ICCS). Springer, 1\u20139."},{"key":"e_1_3_3_1_21_2","unstructured":"Erich Strohmaier Jack Dongarra Horst Simon and Martin Meuer. 1993. TOP 500 Supercomputer Sites. http:\/\/www.top500.org."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI52880.2021.00018"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","unstructured":"Yiltan\u00a0Hassan Temu\u00e7in AmirHossein Sojoodi Pedram Alizadeh Benjamin Kitor and Ahmad Afsahi. 2022. Accelerating Deep Learning Using Interconnect-Aware UCX Communication for MPI Collectives. IEEE Micro 42 2 (2022) 68\u201376. 10.1109\/MM.2022.3148670","DOI":"10.1109\/MM.2022.3148670"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Rajeev Thakur Rolf Rabenseifner and William Gropp. 2005. Optimization of collective communication operations in MPICH. The International Journal of High Performance Computing Applications 19 1 (2005) 49\u201366.","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2019.00057"}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754666","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:40:01Z","timestamp":1766220001000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754666"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":24,"alternative-id":["10.1145\/3754598.3754666","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754666","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}