{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:29:58Z","timestamp":1773318598356,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"European High Performance Computing Joint Undertaking","award":["101118139"],"award-info":[{"award-number":["101118139"]}]},{"DOI":"10.13039\/501100000781","name":"European Research Council","doi-asserted-by":"publisher","award":["949587"],"award-info":[{"award-number":["949587"]}],"id":[{"id":"10.13039\/501100000781","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Joint Usage\/Research Center for Interdisciplinary Large-scale Information Infrastructures","award":["jh240030"],"award-info":[{"award-number":["jh240030"]}]},{"DOI":"10.13039\/501100005416","name":"Norges Forskningsr\u00e5d","doi-asserted-by":"publisher","award":["270053"],"award-info":[{"award-number":["270053"]}],"id":[{"id":"10.13039\/501100005416","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759774","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"298-315","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["CPU- and GPU-initiated Communication Strategies for Conjugate Gradient Methods on Large GPU Clusters"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4498-020X","authenticated-orcid":false,"given":"James D.","family":"Trotter","sequence":"first","affiliation":[{"name":"Simula Research Laboratory, Oslo, Norway"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5377-6339","authenticated-orcid":false,"given":"Sinan","family":"Ekmek\u00e7iba\u015f\u0131","sequence":"additional","affiliation":[{"name":"Ko\u00e7 University, Istanbul, Turkiye"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9603-2466","authenticated-orcid":false,"given":"Do\u011fan","family":"Sa\u011fbili","sequence":"additional","affiliation":[{"name":"Ko\u00e7 University, Istanbul, Turkiye"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4200-511X","authenticated-orcid":false,"given":"Johannes","family":"Langguth","sequence":"additional","affiliation":[{"name":"Simula Research Laboratory, Oslo, Norway and University of Bergen, Bergen, Norway"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3706-4414","authenticated-orcid":false,"given":"Xing","family":"Cai","sequence":"additional","affiliation":[{"name":"University of Oslo, Oslo, Norway and Simula Research Laboratory, Oslo, Norway"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2351-0770","authenticated-orcid":false,"given":"Didem","family":"Unat","sequence":"additional","affiliation":[{"name":"Ko\u00e7 University, Istanbul, Turkiye"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"E. Agostini D. Rossetti and S. Potluri. 2018. GPUDirect Async: Exploring GPU synchronous communication techniques for InfiniBand clusters. J. Parallel and Distrib. Comput. 114 (2018) 28\u201345. 10.1016\/j.jpdc.2017.12.007","DOI":"10.1016\/j.jpdc.2017.12.007"},{"key":"e_1_3_3_2_3_2","unstructured":"AMD. 2023. ROCnRDMA. https:\/\/github.com\/rocmarchive\/ROCnRDMA."},{"key":"e_1_3_3_2_4_2","unstructured":"AMD. 2023. ROC_SHMEM. https:\/\/github.com\/ROCm-Developer-Tools\/ROC_SHMEM."},{"key":"e_1_3_3_2_5_2","unstructured":"AMD. 2025. hipBLAS documentation. https:\/\/rocm.docs.amd.com\/projects\/hipBLAS\/en\/latest\/index.html"},{"key":"e_1_3_3_2_6_2","unstructured":"AMD. 2025. hipSPARSE User Guide. https:\/\/rocm.docs.amd.com\/projects\/hipSPARSE\/en\/latest\/basics.html"},{"key":"e_1_3_3_2_7_2","unstructured":"AMD. 2025. RCCL. https:\/\/rocm.docs.amd.com\/projects\/rccl\/en\/latest\/."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.5555\/266469.266486"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Fabio Banchelli Marta Garcia-Gasulla Filippo Mantovani Joan Vinyals Josep Pocurull David Vicente Beatriz Eguzkitza Flavio\u00a0CC Galeazzo Mario\u00a0C Acosta and Sergi Girona. 2025. Introducing MareNostrum5: A European pre-exascale energy-efficient system designed to serve a broad spectrum of scientific workloads. arxiv:https:\/\/arXiv.org\/abs\/2503.09917\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2503.09917","DOI":"10.1016\/j.future.2025.108125"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Erin Carson and James Demmel. 2014. A residual replacement strategy for improving the maximum attainable accuracy of s-step Krylov subspace methods. SIAM J. Matrix Anal. Appl. 35 1 (2014) 22\u201343.","DOI":"10.1137\/120893057"},{"key":"e_1_3_3_2_11_2","unstructured":"Erin\u00a0Claire Carson. 2015. Communication-Avoiding Krylov Subspace Methods in Theory and Practice. Ph.\u00a0D. Dissertation. University of California Berkeley."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Erin\u00a0C. Carson. 2018. The adaptive s-step conjugate gradient method. SIAM J. Matrix Anal. Appl. 39 3 (2018) 1318\u20131338.","DOI":"10.1137\/16M1107942"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Erin\u00a0C. Carson Miroslav Rozlo\u017en\u00edk Zden\u011bk Strako\u0161 Petr Tich\u00fd and Miroslav T\u016fma. 2018. The numerical stability analysis of pipelined conjugate gradient methods: Historical context and methodology. SIAM Journal on Scientific Computing 40 5 (2018) A3549\u2013A3580.","DOI":"10.1137\/16M1103361"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Anthony\u00a0T. Chronopoulos and C.\u00a0William Gear. 1989. On the efficient implementation of preconditioned s-step conjugate gradient methods on multiprocessors with memory hierarchy. Parallel computing 11 1 (1989) 37\u201353.","DOI":"10.1016\/0167-8191(89)90062-8"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Siegfried Cools Emrullah\u00a0Fatih Yetkin Emmanuel Agullo Luc Giraud and Wim Vanroose. 2018. Analyzing the effect of local rounding error propagation on the maximal attainable accuracy of the pipelined conjugate gradient method. SIAM J. Matrix Anal. Appl. 39 1 (2018) 426\u2013450.","DOI":"10.1137\/17M1117872"},{"key":"e_1_3_3_2_16_2","volume-title":"cuSPARSE Library","author":"Corporation NVIDIA","year":"2022","unstructured":"NVIDIA Corporation. 2022. cuSPARSE Library. NVIDIA Corporation."},{"key":"e_1_3_3_2_17_2","volume-title":"cuBLAS, Release 12.8","author":"Corporation NVIDIA","year":"2025","unstructured":"NVIDIA Corporation. 2025. cuBLAS, Release 12.8. NVIDIA Corporation."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The University of Florida Sparse Matrix Collection. ACM Trans. Math. Softw. 38 1 Article 1 (dec 2011) 25\u00a0pages. 10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00039"},{"key":"e_1_3_3_2_20_2","unstructured":"Jacob Faibussowitsch Mark\u00a0F. Adams Richard\u00a0Tran Mills Stefano Zampini and Junchao Zhang. 2023. Safe Seamless And Scalable Integration Of Asynchronous GPU Streams In PETSc. arxiv:https:\/\/arXiv.org\/abs\/2306.17801\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2306.17801"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-47789-6_66"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"P. Ghysels and W. Vanroose. 2014. Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm. Parallel Comput. 40 7 (2014) 224\u2013238. 10.1016\/j.parco.2013.06.0017th Workshop on Parallel Matrix Algorithms and Applications.","DOI":"10.1016\/j.parco.2013.06.001"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS51919.2020.00016"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339596"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.51"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374544"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"Michael Heroux Hui Zhou Ken Raffenetti Yanfei Guo Thomas Gillis Robert Latham and Rajeev Thakur. 2024. Designing and prototyping extensions to the Message Passing Interface in MPICH. Int. J. High Perform. Comput. Appl. 38 5 (sep 2024) 527\u2013545. 10.1177\/10943420241263544","DOI":"10.1177\/10943420241263544"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"M.\u00a0R. Hestenes and E. Stiefel. 1952. Methods of conjugate gradients for solving linear systems. J. Res. Nat. Bur. Standards 49 6 (1952) 409\u2013436.","DOI":"10.6028\/jres.049.044"},{"key":"e_1_3_3_2_29_2","unstructured":"HPE. 2021. Cray MPICH Documentation. https:\/\/cpe.ext.hpe.com\/docs\/mpt\/mpich\/intro_mpi.html."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW50202.2020.00104"},{"key":"e_1_3_3_2_31_2","unstructured":"Intel. 2023. Intel\u00ae SHMEM. https:\/\/github.com\/oneapi-src\/ishmem."},{"key":"e_1_3_3_2_32_2","unstructured":"Intel. 2025. oneCCL. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/oneccl.html."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593713"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","unstructured":"George Karypis and Vipin Kumar. 1998. A Fast and High Quality Multilevel Scheme for Partitioning Irregular Graphs. SIAM Journal on Scientific Computing 20 1 (1998) 359\u2013392. 10.1137\/S1064827595287997","DOI":"10.1137\/S1064827595287997"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00070"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","unstructured":"Tailai Ma Zhihong Gou Ningyi Xu and Shuli Sun. 2024. Efficient Multi-Gpu Implementations of Preconditioned Conjugate Gradient Method Using Tensor Cores. 10.2139\/ssrn.4985526","DOI":"10.2139\/ssrn.4985526"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","unstructured":"Tailai Ma Zhihong Gou Ningyi Xu and Shuli Sun. 2025. Efficient multi-GPU implementations of preconditioned conjugate gradient method. Advances in Engineering Software 207 (2025) 103936. 10.1016\/j.advengsoft.2025.103936","DOI":"10.1016\/j.advengsoft.2025.103936"},{"key":"e_1_3_3_2_38_2","unstructured":"H Martinez-Navarro B Rodriguez A Bueno-Orovio and A Minchole. 2019. Repository for modelling acute myocardial ischemia: simulation scripts and torso-heart mesh. https:\/\/ora.ox.ac.uk\/objects\/uuid:951b086c-c4ba-41ef-b967-c2106d87ee06"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.57"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","unstructured":"Richard\u00a0Tran Mills Mark\u00a0F. Adams Satish Balay Jed Brown Alp Dener Matthew Knepley Scott\u00a0E. Kruger Hannah Morgan Todd Munson Karl Rupp Barry\u00a0F. Smith Stefano Zampini Hong Zhang and Junchao Zhang. 2021. Toward performance-portable PETSc for GPU-based exascale systems. Parallel Comput. 108 (2021) 102831. 10.1016\/j.parco.2021.102831","DOI":"10.1016\/j.parco.2021.102831"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","unstructured":"Richard\u00a0Tran Mills Mark\u00a0F. Adams Satish Balay Jed Brown Jacob Faibussowitsch Toby Isaac Matthew\u00a0G. Knepley Todd Munson Hansol Suh Stefano Zampini Hong Zhang and Junchao Zhang. 2025. PETSc\/TAO developments for GPU-based early exascale systems. The International Journal of High Performance Computing Applications (2025). 10.1177\/10943420241303710","DOI":"10.1177\/10943420241303710"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/2159430.2159433"},{"key":"e_1_3_3_2_43_2","unstructured":"Naveen Namashivayam Krishna Kandalla Trey White Nick Radcliffe Larry Kaplan and Mark Pagel. 2022. Exploring GPU Stream-Aware Message Passing using Triggered Operations. arxiv:https:\/\/arXiv.org\/abs\/2208.04817\u00a0[cs.DC]"},{"key":"e_1_3_3_2_44_2","unstructured":"NVIDIA. 2023. GPUDirect RDMA. https:\/\/docs.nvidia.com\/cuda\/gpudirect-rdma\/."},{"key":"e_1_3_3_2_45_2","unstructured":"NVIDIA. 2023. NVSHMEM. https:\/\/developer.nvidia.com\/nvshmem."},{"key":"e_1_3_3_2_46_2","unstructured":"NVIDIA. 2025. NCCL. https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2014.111"},{"key":"e_1_3_3_2_48_2","unstructured":"OpenMPI. 2023. Open MPI v5.0.x Documentation: CUDA. https:\/\/docs.open-mpi.org\/en\/v5.0.x\/tuning-apps\/networking\/cuda.html."},{"key":"e_1_3_3_2_49_2","unstructured":"OpenMPI. 2023. Open MPI v5.0.x Documentation: ROCm. https:\/\/docs.open-mpi.org\/en\/v5.0.x\/tuning-apps\/networking\/rocm.html."},{"key":"e_1_3_3_2_50_2","unstructured":"Sreeram\u00a0Potluri Pak\u00a0Markthub Jim\u00a0Dinan and Seth Howell. 2022. Improving Network Performance of HPC Systems Using NVIDIA Magnum IO NVSHMEM and GPUDirect Async. https:\/\/developer.nvidia.com\/blog\/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async\/."},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2017.00037"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2012.228"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","unstructured":"Markus Steinberger Michael Kenzel Pedro Boechat Bernhard Kerbl Mark Dokter and Dieter Schmalstieg. 2014. Whippletree: Task-Based Scheduling of Dynamic Workloads on the GPU. ACM Trans. Graph. 33 6 Article 228 (nov 2014) 11\u00a0pages. 10.1145\/2661229.2661250","DOI":"10.1145\/2661229.2661250"},{"key":"e_1_3_3_2_54_2","unstructured":"Didem Unat Ilyas Turimbetov Mohammed Kefah\u00a0Taha Issa Do\u011fan Sa\u011fbili Flavio Vella Daniele\u00a0De Sensi and Ismayil Ismayilov. 2024. The Landscape of GPU-Centric Communication. arxiv:https:\/\/arXiv.org\/abs\/2409.09874v2\u00a0[cs] https:\/\/arxiv.org\/abs\/2409.09874v2"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","unstructured":"Hao Wang Sreeram Potluri Devendar Bureddy Carlos Rosales and Dhabaleswar\u00a0K. Panda. 2014. GPU-Aware MPI on RDMA-Enabled Clusters: Design Implementation and Evaluation. IEEE Transactions on Parallel and Distributed Systems 25 10 (2014) 2595\u20132605. 10.1109\/TPDS.2013.222","DOI":"10.1109\/TPDS.2013.222"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","unstructured":"Hao Wang Sreeram Potluri Miao Luo Ashish Singh Sayantan Sur and D.K. Panda. 2011. MVAPICH2GPU: optimized GPU to GPU communication for InfiniBand clusters. Computer Science - Research and Development 26 (06 2011) 257\u2013266. 10.1007\/s00450-011-0171-3","DOI":"10.1007\/s00450-011-0171-3"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2011.42"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","unstructured":"Adam Weingram Yuke Li Hao Qi Darren Ng Liuyao Dai and Xiaoyi Lu. 2023. xCCL: A Survey of Industry-Led Collective Communication Libraries for Deep Learning. Journal of Computer Science and Technology 38 1 (Feb 2023) 166\u2013195. 10.1007\/s11390-023-2894-6","DOI":"10.1007\/s11390-023-2894-6"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"crossref","unstructured":"Junchao Zhang Jed Brown Satish Balay Jacob Faibussowitsch Matthew Knepley Oana Marin Richard\u00a0Tran Mills Todd Munson Barry\u00a0F Smith and Stefano Zampini. 2021. The PetscSF scalable communication layer. IEEE Transactions on Parallel and Distributed Systems 33 4 (2021) 842\u2013853.","DOI":"10.1109\/TPDS.2021.3084070"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:46:08Z","timestamp":1773254768000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759774"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":58,"alternative-id":["10.1145\/3712285.3759774","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759774","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}