{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:54:32Z","timestamp":1776930872533,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","funder":[{"name":"Natural Sciences and Engineering Research Council of Canada (NSERC)","award":["5785392022"],"award-info":[{"award-number":["5785392022"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767392","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:18:44Z","timestamp":1762532324000},"page":"449-460","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Accelerating Intra-Node GPU Communication: A Performance Model for Multi-Path Transfers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9877-3201","authenticated-orcid":false,"given":"Amirhossein","family":"Sojoodi","sequence":"first","affiliation":[{"name":"Queens University, Kingston, Ontario, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3041-629X","authenticated-orcid":false,"given":"Mohammad","family":"Akbari","sequence":"additional","affiliation":[{"name":"Queen's University, Kingston, Ontario, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2275-3313","authenticated-orcid":false,"given":"Hamed","family":"Sharifian","sequence":"additional","affiliation":[{"name":"Queen's University, Kingston, Ontario, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7112-5080","authenticated-orcid":false,"given":"Ali","family":"Farazdaghi","sequence":"additional","affiliation":[{"name":"Queen's University, Kingston, Ontario, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0163-3892","authenticated-orcid":false,"given":"Ryan E.","family":"Grant","sequence":"additional","affiliation":[{"name":"Queen's University, Kingston, Ontario, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2924-6851","authenticated-orcid":false,"given":"Ahmad","family":"Afsahi","sequence":"additional","affiliation":[{"name":"Queen's University, Kingston, Ontario, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","unstructured":"Albert Alexandrov Mihai\u00a0F. Ionescu Klaus\u00a0E. Schauser and Chris Scheiman. 1995. LogGP: Incorporating Long Messages into the LogP Model. J. Parallel and Distrib. Comput. (1995) 95\u2013105. 10.1145\/215399.215427","DOI":"10.1145\/215399.215427"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS51385.2021.00015"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","unstructured":"Petros Anastasiadis Nikela Papadopoulou Georgios Goumas Nectarios Koziris Dennis Hoppe and Li Zhong. 2023. PARALiA: A Performance Aware Runtime for Auto-tuning Linear Algebra on Heterogeneous Systems. ACM Transactions on Architecture and Code Optimization 20 4 (2023) 1\u201325. 10.1145\/3624569","DOI":"10.1145\/3624569"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC49654.2021.9622742"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2013.236"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33518-1_16"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"Jes\u00fas C\u00e1mara Javier Cuenca Victor Galindo Arturo Vicente and Murilo Boratto. 2025. An autotuning approach to select the inter-GPU communication library on heterogeneous systems. Journal of Supercomputing 81 1 (2025) 1\u201316. 10.1007\/s11227-024-06794-3","DOI":"10.1007\/s11227-024-06794-3"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/155332.155333"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00039"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Fr\u00e9d\u00e9ric Desprez Pierre Ramet and Jean Roman. 1996. Optimal grain size computation for pipelined algorithms. Lecture Notes in Computer Science (1996) 165\u2013172. 10.1007\/3-540-61626-8_21","DOI":"10.1007\/3-540-61626-8_21"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Juan G\u00f3mez-Luna Jos\u00e9\u00a0Mar\u00eda Gonz\u00e1lez-Linares Jos\u00e9\u00a0Ignacio Benavides and Nicol\u00e1s Guil. 2012. Performance models for asynchronous data transfers on consumer Graphics Processing Units. J. Parallel and Distrib. Comput. 72 9 (2012) 1117\u20131126. 10.1016\/j.jpdc.2011.07.011","DOI":"10.1016\/j.jpdc.2011.07.011"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","unstructured":"Roger\u00a0W. Hockney. 1994. The communication challenge for MPP: Intel Paragon and Meiko CS-2. Parallel Comput. 20 3 (1994) 389\u2013398. 10.1016\/S0167-8191(06)80021-9","DOI":"10.1016\/S0167-8191(06)80021-9"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","unstructured":"T. Hoefler T. Schneider and A. Lumsdaine. 2009. LogGP in theory and practice - An in-depth analysis of modern interconnection networks and benchmarking methods for collective operations. Simulation Modelling Practice and Theory 17 9 (2009) 1511\u20131521. 10.1016\/j.simpat.2009.06.007","DOI":"10.1016\/j.simpat.2009.06.007"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"Harsh Khetawat Nikhil Jain Abhinav Bhatele and Frank Mueller. 2024. Predicting GPUDirect Benefits for HPC Workloads. Proceedings of Euromicro International Conference on Parallel Distributed and Network-Based Processing (PDP) (2024) 88\u201397. 10.1109\/PDP62718.2024.00020","DOI":"10.1109\/PDP62718.2024.00020"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651362"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","unstructured":"Bozhong Liu Weidong Qiu Lin Jiang and Zheng Gong. 2016. Software pipelining for graphic processing unit acceleration: Partition scheduling and granularity. International Journal of High Performance Computing Applications 30 2 (2016) 169\u2013185. 10.1177\/1094342015585845","DOI":"10.1177\/1094342015585845"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"Gangfeng Liu Yunlan Wang Tianhai Zhao Jianhua Gu and Dongyang Li. 2012. mHLogGP: A Parallel Computation Model for CPU\/GPU. Network and Parallel Computing (2012) 217\u2013224. 10.1007\/978-3-642-35606-3_25","DOI":"10.1007\/978-3-642-35606-3_25"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","unstructured":"Shelby Lockhart Amanda Bienz William Gropp and Luke Olson. 2023. Performance Analysis and Optimal Node-aware Communication for Enlarged Conjugate Gradient Methods. ACM Transactions on Parallel Computing 10 1 (2023) 1\u201325. 10.1145\/3580003","DOI":"10.1145\/3580003"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"Shelby Lockhart Amanda Bienz William\u00a0D. Gropp and Luke\u00a0N. Olson. 2023. Characterizing the performance of node-aware strategies for irregular point-to-point communication on heterogeneous architectures. Parallel Comput. 116 September 2022 (2023) 1\u201312. 10.1016\/j.parco.2023.103021","DOI":"10.1016\/j.parco.2023.103021"},{"key":"e_1_3_3_1_21_2","unstructured":"MPI Forum. 2025. https:\/\/www.mpi-forum.org\/ [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_22_2","unstructured":"MPICH. 2025. https:\/\/www.mpich.org\/ [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/BigData52589.2021.9672073"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/bigdata52589.2021.9672073"},{"key":"e_1_3_3_1_25_2","unstructured":"NVIDIA. 2025. https:\/\/www.nvidia.com\/ [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_26_2","unstructured":"NVIDIA. 2025. GPUDirect Technologies. https:\/\/developer.nvidia.com\/gpudirect [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_27_2","unstructured":"NVIDIA. 2025. NVIDIA Collective Communications Library. https:\/\/github.com\/NVIDIA\/nccl [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_28_2","unstructured":"Open MPI. 2025. https:\/\/www.open-mpi.org\/ [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","unstructured":"Ali Riahi Abdorreza Savadi and Mahmoud Naghibzadeh. 2020. Comparison of analytical and ML-based models for predicting CPU\u2013GPU data transfer time. Computing 102 9 (2020) 2099\u20132116. 10.1007\/s00607-019-00780-x","DOI":"10.1007\/s00607-019-00780-x"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","unstructured":"Juan\u00a0Antonio Rico-Gallego and Juan\u00a0Carlos D\u00edaz-Mart\u00edn. 2015. \u03c4 -Lop: Modeling performance of shared memory MPI. Parallel Comput. 46 (2015) 14\u201331. 10.1016\/j.parco.2015.02.006","DOI":"10.1016\/j.parco.2015.02.006"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","unstructured":"Juan\u00a0A. Rico-Gallego Juan\u00a0C. D\u00edaz-Mart\u00edn Ravi\u00a0Reddy Manumachu and Alexey\u00a0L. Lastovetsky. 2019. A survey of communication performance models for high-performance computing. Comput. Surveys 51 6 (2019) 1\u201336. 10.1145\/3284358","DOI":"10.1145\/3284358"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","unstructured":"Whit Schonbein Scott Levy Matthew\u00a0G.F. Dosanjh W.\u00a0Pepper Marts Elizabeth Reid and Ryan\u00a0E. Grant. 2023. Modeling and Benchmarking the Potential Benefit of Early-Bird Transmission in Fine-Grained Communication. Proceedings of the International Conference on Parallel Processing (ICPP) (2023) 306\u2013316. 10.1145\/3605573.3605618","DOI":"10.1145\/3605573.3605618"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI.2015.13"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER59578.2024.00019"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","unstructured":"Shaohuai Shi Qiang Wang and Xiaowen Chu. 2018. Performance modeling and evaluation of distributed deep learning frameworks on GPUs. Proceedings of the IEEE International Conference on Dependable Autonomic and Secure Computing (DASC) (2018) 943\u2013948. 10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4","DOI":"10.1109\/DASC\/PiCom\/DataCom\/CyberSciTec.2018.000-4"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3642961.3643800"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI52880.2021.00018"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","unstructured":"Y\u0131ltan\u00a0Hassan Temucin Amirhossein Sojoodi Pedram Alizadeh Benjamin\u00a0W Kitor and Ahmad Afsahi. 2021. Accelerating Deep Learning using Interconnect-Aware UCX Communication for MPI Collectives. IEEE Micro (2021) 1\u20139. 10.1109\/MM.2022.3148670","DOI":"10.1109\/MM.2022.3148670"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","unstructured":"Andreas Thune Sven-arne Reinemo Tor Skeie and Xing Cai. 2023. Detailed modeling of heterogeneous and communication. IEEE Transactions on Parallel and Distributed Systems (2023) 1\u201314. 10.1109\/TPDS.2023.3253881","DOI":"10.1109\/TPDS.2023.3253881"},{"key":"e_1_3_3_1_40_2","unstructured":"Top500. 2025. https:\/\/top500.org\/ [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","unstructured":"Tu Tran Bharath Ramesh Benjamin Michalowicz Mustafa Abduljabbar Hari Subramoni Aamir Shafi and Dhabaleswar\u00a0K. Panda. 2024. Accelerating communication with multi-HCA aware collectives in MPI. Concurrency and Computation: Practice and Experience 36 1 (2024) 1\u201320. 10.1002\/cpe.7879","DOI":"10.1002\/cpe.7879"},{"key":"e_1_3_3_1_42_2","unstructured":"Didem Unat Ilyas Turimbetov Mohammed Kefah Taha Issa Flavio Vella Daniele D\u00a0E Sensi and Ismayil Ismayilov. 2024. The Landscape of GPU-Centric Communication. arXiv (2024) 1\u201325. arxiv:arXiv:2409.09874v2"},{"key":"e_1_3_3_1_43_2","unstructured":"Unified Communication Framework Consortium. 2025. Unified Collective Communication (UCC). https:\/\/github.com\/openucx\/ucc [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_44_2","unstructured":"Unified Communication Framework Consortium. 2025. Unified Communication X (UCX). https:\/\/openucx.org\/ [Accessed: 2025-04-01]."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI63208.2024.00018"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","unstructured":"Jingyuan Wang Tianhai Zhao and Yunlan Wang. 2024. Network states-aware collective communication optimization. Cluster Computing (2024) 1\u201319. 10.1007\/s10586-024-04330-9","DOI":"10.1007\/s10586-024-04330-9"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","unstructured":"Ziheng Wang Heng Chen Xiaoshe Dong Weilin Cai and Xingjun Zhang. 2022. LogSC: Model-based one-sided communication performance estimation. Future Generation Computer Systems 132 (2022) 25\u201339. 10.1016\/j.future.2022.02.004","DOI":"10.1016\/j.future.2022.02.004"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid.2014.16"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767392","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:34:31Z","timestamp":1767987271000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767392"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":47,"alternative-id":["10.1145\/3731599.3767392","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767392","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}