{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T08:24:13Z","timestamp":1768033453193,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2018631,2316201"],"award-info":[{"award-number":["2018631,2316201"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656593","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"511-524","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["FASTEN: Fast GPU-accelerated Segmented Matrix Multiplication for Heterogenous Graph Neural Networks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7977-3182","authenticated-orcid":false,"given":"Keren","family":"Zhou","sequence":"first","affiliation":[{"name":"George Mason University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7709-1184","authenticated-orcid":false,"given":"Karthik Ganapathi","family":"Subramanian","sequence":"additional","affiliation":[{"name":"North Carolina State University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6466-276X","authenticated-orcid":false,"given":"Po-Hsun","family":"Lin","sequence":"additional","affiliation":[{"name":"North Carolina State University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5727-0701","authenticated-orcid":false,"given":"Matthias","family":"Fey","sequence":"additional","affiliation":[{"name":"Kumo.AI, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7336-0507","authenticated-orcid":false,"given":"Binqian","family":"Yin","sequence":"additional","affiliation":[{"name":"George Mason University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1270-4147","authenticated-orcid":false,"given":"Jiajia","family":"Li","sequence":"additional","affiliation":[{"name":"North Carolina State University, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Learning to represent programs with graphs. arXiv preprint arXiv:1711.00740","author":"Allamanis Miltiadis","year":"2017","unstructured":"Miltiadis Allamanis, Marc Brockschmidt, and Mahmoud Khademi. 2017. Learning to represent programs with graphs. arXiv preprint arXiv:1711.00740 (2017)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3289600.3290967"},{"key":"e_1_3_2_1_3_1","volume-title":"Network biology: understanding the cell\u2019s functional organization. Nature reviews genetics 5, 2","author":"Barabasi Albert-Laszlo","year":"2004","unstructured":"Albert-Laszlo Barabasi and Zoltan\u00a0N Oltvai. 2004. Network biology: understanding the cell\u2019s functional organization. Nature reviews genetics 5, 2 (2004), 101\u2013113."},{"key":"e_1_3_2_1_4_1","unstructured":"Ganesh Bikshandi and Jay Shah. 2023. A Case Study in CUDA Kernel Fusion: Implementing FlashAttention-2 on NVIDIA Hopper Architecture using the CUTLASS Library. arxiv:2312.11918\u00a0[cs.LG]"},{"key":"e_1_3_2_1_5_1","unstructured":"Alpheus Bingham and Dwayne Spradlin. 2011. The long tail of expertise. Pearson Education."},{"key":"e_1_3_2_1_6_1","volume-title":"Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26","author":"Bordes Antoine","year":"2013","unstructured":"Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, and Oksana Yakhnenko. 2013. Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_2_1_7_1","volume-title":"Relational graph attention networks. arXiv preprint arXiv:1904.05811","author":"Busbridge Dan","year":"2019","unstructured":"Dan Busbridge, Dane Sherburn, Pietro Cavallo, and Nils\u00a0Y Hammerla. 2019. Relational graph attention networks. arXiv preprint arXiv:1904.05811 (2019)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415610"},{"key":"e_1_3_2_1_9_1","volume-title":"Networks, crowds, and markets","author":"Easley David","year":"2012","unstructured":"David Easley, Jon Kleinberg, 2012. Networks, crowds, and markets. Cambridge Books (2012)."},{"key":"e_1_3_2_1_10_1","volume-title":"Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:1903.02428","author":"Fey Matthias","year":"2019","unstructured":"Matthias Fey and Jan\u00a0Eric Lenssen. 2019. Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:1903.02428 (2019)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380297"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Gale Trevor","year":"2023","unstructured":"Trevor Gale, Deepak Narayanan, Cliff Young, and Matei Zaharia. 2023. MegaBlocks: Efficient Sparse Training with Mixture-of-Experts. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Utilizing graph machine learning within drug discovery and development. Briefings in bioinformatics 22, 6","author":"Gaudelet Thomas","year":"2021","unstructured":"Thomas Gaudelet, Ben Day, Arian\u00a0R Jamasb, Jyothish Soman, Cristian Regep, Gertrude Liu, Jeremy\u00a0BR Hayter, Richard Vickers, Charles Roberts, Jian Tang, 2021. Utilizing graph machine learning within drug discovery and development. Briefings in bioinformatics 22, 6 (2021), bbab159."},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 1263\u20131272","author":"Gilmer Justin","year":"2017","unstructured":"Justin Gilmer, Samuel\u00a0S Schoenholz, Patrick\u00a0F Riley, Oriol Vinyals, and George\u00a0E Dahl. 2017. Neural message passing for quantum chemistry. In International conference on machine learning. PMLR, 1263\u20131272."},{"key":"e_1_3_2_1_15_1","unstructured":"Scott Gray Alec Radford and Diederik\u00a0P. Kingma. 2017. Block-Sparse GPU Kernels. https:\/\/blog.openai.com\/block-sparse-gpu-kernels\/. Accessed: 1-14-2024."},{"key":"e_1_3_2_1_16_1","volume-title":"SC22: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201315","author":"Gui Yuntao","year":"2022","unstructured":"Yuntao Gui, Yidi Wu, Han Yang, Tatiana Jin, Boyang Li, Qihui Zhou, James Cheng, and Fan Yu. 2022. HGL: accelerating heterogeneous GNN training with holistic representation and optimization. In SC22: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201315."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524610.3527900"},{"key":"e_1_3_2_1_18_1","volume-title":"A study of persistent threads style GPU programming for GPGPU workloads","author":"Gupta Kshitij","unstructured":"Kshitij Gupta, Jeff\u00a0A Stuart, and John\u00a0D Owens. 2012. A study of persistent threads style GPU programming for GPGPU workloads. IEEE."},{"key":"e_1_3_2_1_19_1","volume-title":"Open graph benchmark: Datasets for machine learning on graphs. Advances in neural information processing systems 33","author":"Hu Weihua","year":"2020","unstructured":"Weihua Hu, Matthias Fey, Marinka Zitnik, Yuxiao Dong, Hongyu Ren, Bowen Liu, Michele Catasta, and Jure Leskovec. 2020. Open graph benchmark: Datasets for machine learning on graphs. Advances in neural information processing systems 33 (2020), 22118\u201322133."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380027"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Huang Guyue","year":"2023","unstructured":"Guyue Huang, Yang Bai, Liu Liu, Yuke Wang, Bei Yu, Yufei Ding, and Yuan Xie. 2023. ALCOP: Automatic Load-Compute Pipelining in Deep Learning Compiler for AI-GPUs. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201312","author":"Huang Guyue","year":"2020","unstructured":"Guyue Huang, Guohao Dai, Yu Wang, and Huazhong Yang. 2020. Ge-spmm: General-purpose sparse matrix-matrix multiplication on gpus for graph neural networks. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201312."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441585"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3247808"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835937"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037709"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926255"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295734"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534136"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 27th ACM SIGKDD conference on knowledge discovery & data mining. 1150\u20131160","author":"Lv Qingsong","year":"2021","unstructured":"Qingsong Lv, Ming Ding, Qiang Liu, Yuxiang Chen, Wenzheng Feng, Siming He, Chang Zhou, Jianguo Jiang, Yuxiao Dong, and Jie Tang. 2021. Are we really making much progress? revisiting, benchmarking and refining heterogeneous graph neural networks. In Proceedings of the 27th ACM SIGKDD conference on knowledge discovery & data mining. 1150\u20131160."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00041"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342010385729"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2015.2483592"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW59300.2023.00042"},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2023. Parallel Thread Execution ISA. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html Accessed: 12-25-2023."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA Corporation. 2023. cuBLAS: The NVIDIA CUDA Basic Linear Algebra Subroutines library. https:\/\/developer.nvidia.com\/cublas. Accessed: 12-16-2023."},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA Corporation. 2023. cuSPARSE: Basic Linear Algebra for Sparse Matrices on NVIDIA GPUs. https:\/\/developer.nvidia.com\/cusparse. Accessed: 12-16-2023."},{"key":"e_1_3_2_1_38_1","volume-title":"CUTLASS: CUDA C++ template abstractions for implementing high-performance matrix-matrix multiplication. https:\/\/github.com\/NVIDIA\/cutlass. Accessed: 12-16-2023.","author":"NVIDIA Corporation","year":"2023","unstructured":"NVIDIA Corporation. 2023. CUTLASS: CUDA C++ template abstractions for implementing high-performance matrix-matrix multiplication. https:\/\/github.com\/NVIDIA\/cutlass. Accessed: 12-16-2023."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA Corporation. 2023. CUTLASS Grouped Kernel Schedulers. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/media\/docs\/grouped_scheduler.md. Accessed: 12-16-2023."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA Corporation. 2024. CUDA Toolkit Documentation. https:\/\/developer.nvidia.com\/cuda-toolkit Accessed: 01-06-2024."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"e_1_3_2_1_42_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_43_1","unstructured":"PyTorch\u00a0Geometric Team. 2024. PyTorch Geometric (PyG). https:\/\/github.com\/pyg-team\/pytorch_geometric\/blob\/master\/torch_geometric\/nn\/conv\/rgcn_conv.py Accessed: 04-04-2024."},{"key":"e_1_3_2_1_44_1","unstructured":"PyTorch\u00a0Geometric Team. 2024. PyTorch Geometric (PyG) Lib. https:\/\/github.com\/pyg-team\/pyg-lib\/blob\/master\/pyg_lib\/csrc\/ops\/autograd\/matmul_kernel.cpp Accessed: 04-04-2024."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.1073"},{"key":"e_1_3_2_1_46_1","first-page":"1","article-title":"Locality-aware cta scheduling for gaming applications","volume":"19","author":"Ukarande Aditya","year":"2021","unstructured":"Aditya Ukarande, Suryakant Patidar, and Ram Rangan. 2021. Locality-aware cta scheduling for gaming applications. ACM Transactions on Architecture and Code Optimization (TACO) 19, 1 (2021), 1\u201326.","journal-title":"ACM Transactions on Architecture and Code Optimization (TACO)"},{"key":"e_1_3_2_1_47_1","volume-title":"Deep graph library: A graph-centric, highly-performant package for graph neural networks. arXiv preprint arXiv:1909.01315","author":"Wang Minjie","year":"2019","unstructured":"Minjie Wang, Da Zheng, Zihao Ye, Quan Gan, Mufei Li, Xiang Song, Jinjing Zhou, Chao Ma, Lingfan Yu, Yu Gai, 2019. Deep graph library: A graph-centric, highly-performant package for graph neural networks. arXiv preprint arXiv:1909.01315 (2019)."},{"key":"e_1_3_2_1_48_1","volume-title":"15th USENIX symposium on operating systems design and implementation (OSDI 21)","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang, Boyuan Feng, Gushu Li, Shuangchen Li, Lei Deng, Yuan Xie, and Yufei Ding. 2021. GNNAdvisor: An adaptive and efficient runtime system for GNN acceleration on GPUs. In 15th USENIX symposium on operating systems design and implementation (OSDI 21). 515\u2013531."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2011.24"},{"key":"e_1_3_2_1_51_1","volume-title":"PIGEON: Optimizing CUDA Code Generator for End-to-End Training and Inference of Relational Graph Neural Networks. arXiv preprint arXiv:2301.06284","author":"Wu Kun","year":"2023","unstructured":"Kun Wu, Mert Hidayeto\u011flu, Xiang Song, Sitao Huang, Da Zheng, Israt Nisa, and Wen-mei Hwu. 2023. PIGEON: Optimizing CUDA Code Generator for End-to-End Training and Inference of Relational Graph Neural Networks. arXiv preprint arXiv:2301.06284 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3494523"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2978386"},{"key":"e_1_3_2_1_54_1","volume-title":"MoleculeNet: a benchmark for molecular machine learning. Chemical science 9, 2","author":"Wu Zhenqin","year":"2018","unstructured":"Zhenqin Wu, Bharath Ramsundar, Evan\u00a0N Feinberg, Joseph Gomes, Caleb Geniesse, Aneesh\u00a0S Pappu, Karl Leswing, and Vijay Pande. 2018. MoleculeNet: a benchmark for molecular machine learning. Chemical science 9, 2 (2018), 513\u2013530."},{"key":"e_1_3_2_1_55_1","first-page":"515","article-title":"Graphiler: Optimizing graph neural networks with message passing data flow graph","volume":"4","author":"Xie Zhiqiang","year":"2022","unstructured":"Zhiqiang Xie, Minjie Wang, Zihao Ye, Zheng Zhang, and Rui Fan. 2022. Graphiler: Optimizing graph neural networks with message passing data flow graph. Proceedings of Machine Learning and Systems 4 (2022), 515\u2013528.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582047"},{"key":"e_1_3_2_1_57_1","volume-title":"Graph neural networks: A review of methods and applications. AI open 1","author":"Zhou Jie","year":"2020","unstructured":"Jie Zhou, Ganqu Cui, Shengding Hu, Zhengyan Zhang, Cheng Yang, Zhiyuan Liu, Lifeng Wang, Changcheng Li, and Maosong Sun. 2020. Graph neural networks: A review of methods and applications. AI open 1 (2020), 57\u201381."},{"key":"e_1_3_2_1_58_1","volume-title":"Proton: A Profiler for Triton. https:\/\/github.com\/openai\/triton\/tree\/main\/third_party\/proton Accessed: 04-021-2024.","author":"Zhou Keren","year":"2024","unstructured":"Keren Zhou. 2024. Proton: A Profiler for Triton. https:\/\/github.com\/openai\/triton\/tree\/main\/third_party\/proton Accessed: 04-021-2024."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370339"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656593","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656593","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:25:04Z","timestamp":1755876304000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656593"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":59,"alternative-id":["10.1145\/3650200.3656593","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656593","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}