{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:09Z","timestamp":1755870009306,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730425","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"73-87","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CoLa: Towards Communication-efficient Distributed Sparse Matrix-Matrix Multiplication on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7377-9269","authenticated-orcid":false,"given":"Lixing","family":"Zhang","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8559-2628","authenticated-orcid":false,"given":"Yingxia","family":"Shao","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0022-7865","authenticated-orcid":false,"given":"Shigang","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2024. cuSPARSE. https:\/\/docs.nvidia.com\/cuda\/cusparse\/index.html. Accessed: 2024-05-08."},{"key":"e_1_3_3_2_3_2","unstructured":"2024. GPUDirect | NVIDIA Developer. https:\/\/developer.nvidia.com\/gpudirect. Accessed: 2024-05-13."},{"key":"e_1_3_3_2_4_2","unstructured":"2024. NVSHMEM | NVIDIA Developer. https:\/\/developer.nvidia.com\/nvshmem. Accessed: 2024-05-08."},{"key":"e_1_3_3_2_5_2","unstructured":"2024. Unified Memory for CUDA Beginners | NVIDIA Technical Blog. https:\/\/developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/. Accessed: 2024-05-13."},{"key":"e_1_3_3_2_6_2","unstructured":"2025. Device APIs on Proxy-Based Transport. https:\/\/docs.nvidia.com\/nvshmem\/release-notes-install-guide\/best-practice-guide\/apis.html. Accessed: 2025-01-13."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.110"},{"key":"e_1_3_3_2_8_2","series-title":"(ICPP \u201922)","volume-title":"Proceedings of the 51st International Conference on Parallel Processing","author":"Balin Muhammed\u00a0Fatih","year":"2023","unstructured":"Muhammed\u00a0Fatih Balin, Kaan Sancak, and Umit\u00a0V. Catalyurek. 2023. MG-GCN: A Scalable multi-GPU GCN Training Framework. In Proceedings of the 51st International Conference on Parallel Processing (Bordeaux, France) (ICPP \u201922). Association for Computing Machinery, New York, NY, USA, Article 79, 11\u00a0pages. https:\/\/doi.org\/10.1145\/3545008.3545082"},{"key":"e_1_3_3_2_9_2","series-title":"(SC \u201912)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Bauer Michael","year":"2012","unstructured":"Michael Bauer, Sean Treichler, Elliott Slaughter, and Alex Aiken. 2012. Legion: expressing locality and independence with logical regions. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Salt Lake City, Utah) (SC \u201912). IEEE Computer Society Press, Washington, DC, USA, Article 66, 11\u00a0pages."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00014"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640427"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456233"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","unstructured":"Zhaodong Chen Zheng Qu Liu Liu Yufei Ding and Yuan Xie. 2021. Efficient tensor core-based GPU kernels for structured sparsity under reduced precision(SC \u201921). Association for Computing Machinery New York NY USA Article 78 14\u00a0pages. 10.1145\/3458817.3476182","DOI":"10.1145\/3458817.3476182"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/1557019.1557049"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. NVIDIA A100 Tensor Core GPU: Performance and Innovation. IEEE Micro 41 2 (2021) 29\u201335.","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530508"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The university of Florida sparse matrix collection. ACM Trans. Math. Softw. 38 1 Article 1 (dec 2011) 25\u00a0pages. https:\/\/doi.org\/10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00057"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295265"},{"key":"e_1_3_3_2_20_2","unstructured":"Torsten Hoefler Dan Alistarh Tal Ben-Nun Nikoli Dryden and Alexandra Peste. 2021. Sparsity in deep learning: pruning and growth for efficient inference and training in neural networks. J. Mach. Learn. Res. 22 1 Article 241 (jan 2021) 124\u00a0pages."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295712"},{"key":"e_1_3_3_2_22_2","series-title":"(NIPS \u201920)","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Hu Weihua","year":"2020","unstructured":"Weihua Hu, Matthias Fey, Marinka Zitnik, Yuxiao Dong, Hongyu Ren, Bowen Liu, Michele Catasta, and Jure Leskovec. 2020. Open graph benchmark: datasets for machine learning on graphs. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS \u201920). Curran Associates Inc., Red Hook, NY, USA, Article 1855, 16\u00a0pages."},{"key":"e_1_3_3_2_23_2","unstructured":"Zhihao Jia Sina Lin Mingyu Gao Matei Zaharia and Alex Aiken. 2020. Improving the accuracy scalability and performance of graph neural networks with roc. Proceedings of Machine Learning and Systems 2 (2020) 187\u2013198."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.80"},{"key":"e_1_3_3_2_25_2","unstructured":"George Karypis and Vipin Kumar. 1997. METIS: A software package for partitioning unstructured graphs partitioning meshes and computing fill-reducing orderings of sparse matrices. (1997)."},{"key":"e_1_3_3_2_26_2","unstructured":"Thomas\u00a0N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.02907 (2016)."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.117"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Amy\u00a0N. Langville and Carl\u00a0D. Meyer. 2006. A Reordering for the PageRank Problem. SIAM J. Sci. Comput. 27 6 (jan 2006) 2112\u20132120. https:\/\/doi.org\/10.1137\/040607551","DOI":"10.1137\/040607551"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Ang Li Shuaiwen\u00a0Leon Song Jieyang Chen Jiajia Li Xu Liu Nathan\u00a0R. Tallent and Kevin\u00a0J. Barker. 2020. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. IEEE Trans. Parallel Distrib. Syst. 31 1 (jan 2020) 94\u2013110. https:\/\/doi.org\/10.1109\/TPDS.2019.2928289","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_3_2_30_2","series-title":"(SC \u201922)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Li Shigang","year":"2022","unstructured":"Shigang Li, Kazuki Osawa, and Torsten Hoefler. 2022. Efficient quantized sparse matrix operations on tensor cores. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC \u201922). IEEE Press, Article 37, 15\u00a0pages."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Seung\u00a0Won Min Kun Wu Sitao Huang Mert Hidayeto\u011flu Jinjun Xiong Eiman Ebrahimi Deming Chen and Wen-mei Hwu. 2021. Large graph convolutional network training with GPU-oriented data communication architecture. Proc. VLDB Endow. 14 11 (jul 2021) 2087\u20132100. https:\/\/doi.org\/10.14778\/3476249.3476264","DOI":"10.14778\/3476249.3476264"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447818.3461472"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Dahai Tang Jiali Wang Rong Chen Lei Wang Wenyuan Yu Jingren Zhou and Kenli Li. 2024. XGNN: Boosting Multi-GPU GNN Training via Global GNN Memory Store. Proc. VLDB Endow. 17 5 (may 2024) 1105\u20131118. https:\/\/doi.org\/10.14778\/3641204.3641219","DOI":"10.14778\/3641204.3641219"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433794"},{"key":"e_1_3_3_2_35_2","volume-title":"SUMMA: Scalable Universal Matrix Multiplication Algorithm","author":"Geijn Robert\u00a0A. van\u00a0de","year":"1995","unstructured":"Robert\u00a0A. van\u00a0de Geijn and Jerrell Watts. 1995. SUMMA: Scalable Universal Matrix Multiplication Algorithm. Technical Report. USA."},{"key":"e_1_3_3_2_36_2","unstructured":"Petar Velickovic Guillem Cucurull Arantxa Casanova Adriana Romero Pietro Lio Yoshua Bengio et\u00a0al. 2017. Graph attention networks. stat 1050 20 (2017) 10\u201348550."},{"key":"e_1_3_3_2_37_2","unstructured":"Minjie Wang Da Zheng Zihao Ye Quan Gan Mufei Li Xiang Song Jinjing Zhou Chao Ma Lingfan Yu Yu Gai et\u00a0al. 2019. Deep graph library: A graph-centric highly-performant package for graph neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.01315 (2019)."},{"key":"e_1_3_3_2_38_2","first-page":"515","volume-title":"15th USENIX symposium on operating systems design and implementation (OSDI 21)","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang, Boyuan Feng, Gushu Li, Shuangchen Li, Lei Deng, Yuan Xie, and Yufei Ding. 2021. { GNNAdvisor} : An adaptive and efficient runtime system for { GNN} acceleration on { GPUs}. In 15th USENIX symposium on operating systems design and implementation (OSDI 21). 515\u2013531."},{"key":"e_1_3_3_2_39_2","first-page":"779","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Tong Geng, Kevin Barker, Ang Li, and Yufei Ding. 2023. MGG: Accelerating Graph Neural Networks with Fine-Grained Intra-Kernel Communication-Computation Pipelining on Multi-GPU Platforms. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 779\u2013795. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/wang-yuke"},{"key":"e_1_3_3_2_40_2","first-page":"149","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Guyue Huang, and Yufei Ding. 2023. TC-GNN: Bridging Sparse GNN Computation and Dense Tensor Cores on GPUs. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). USENIX Association, Boston, MA, 149\u2013164. https:\/\/www.usenix.org\/conference\/atc23\/presentation\/wang-yuke"},{"key":"e_1_3_3_2_41_2","unstructured":"Keyulu Xu Weihua Hu Jure Leskovec and Stefanie Jegelka. 2018. How powerful are graph neural networks? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.00826 (2018)."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-96983-1_48"},{"key":"e_1_3_3_2_43_2","series-title":"(SC \u201922)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Yang Dongxu","year":"2022","unstructured":"Dongxu Yang, Junhong Liu, Jiaxing Qi, and Junjie Lai. 2022. WholeGraph: a fast graph neural network training framework with multi-GPU distributed shared memory architecture. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC \u201922). IEEE Press, Article 54, 14\u00a0pages."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Xintian Yang Srinivasan Parthasarathy and P. Sadayappan. 2011. Fast Sparse Matrix-Vector Multiplication on GPUs: Implications for Graph Mining. Proc. VLDB Endow. 4 4 (jan 2011) 231\u2013242.","DOI":"10.14778\/1938545.1938548"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/1278177.1278183"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577506"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532369"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.5555\/3327345.3327423"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11782"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730425","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:58:26Z","timestamp":1755867506000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730425"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":48,"alternative-id":["10.1145\/3721145.3730425","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730425","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}