{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:11Z","timestamp":1755870011604,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730430","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"104-118","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Fused3S: Fast Sparse Attention on Tensor Cores"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3029-2214","authenticated-orcid":false,"given":"Zitong","family":"Li","sequence":"first","affiliation":[{"name":"University of California, Irvine, Irvine, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0840-4192","authenticated-orcid":false,"given":"Aparna","family":"Chandramowlishwaran","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Iz Beltagy Matthew\u00a0E Peters and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2004.05150 (2020)."},{"key":"e_1_3_3_2_3_2","unstructured":"Rewon Child Scott Gray Alec Radford and Ilya Sutskever. 2019. Generating Long Sequences with Sparse Transformers. CoRR abs\/1904.10509 (2019). arXiv:https:\/\/arXiv.org\/abs\/1904.10509http:\/\/arxiv.org\/abs\/1904.10509"},{"key":"e_1_3_3_2_4_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec"},{"key":"e_1_3_3_2_5_2","unstructured":"Tri Dao Daniel\u00a0Y. Fu Stefano Ermon Atri Rudra and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. arxiv:https:\/\/arXiv.org\/abs\/2205.14135\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2205.14135"},{"key":"e_1_3_3_2_6_2","unstructured":"Vijay\u00a0Prakash Dwivedi and Xavier Bresson. 2021. A Generalization of Transformer Networks to Graphs. AAAI Workshop on Deep Learning on Graphs: Methods and Applications (2021)."},{"key":"e_1_3_3_2_7_2","volume-title":"Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track","author":"Dwivedi Vijay\u00a0Prakash","year":"2022","unstructured":"Vijay\u00a0Prakash Dwivedi, Ladislav Ramp\u00e1\u0161ek, Mikhail Galkin, Ali Parviz, Guy Wolf, Anh\u00a0Tuan Luu, and Dominique Beaini. 2022. Long Range Graph Benchmark. In Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=in7XC5RcjEn"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651378"},{"key":"e_1_3_3_2_9_2","volume-title":"ICLR Workshop on Representation Learning on Graphs and Manifolds","author":"Fey Matthias","year":"2019","unstructured":"Matthias Fey and Jan\u00a0E. Lenssen. 2019. Fast Graph Representation Learning with PyTorch Geometric. In ICLR Workshop on Representation Learning on Graphs and Manifolds."},{"key":"e_1_3_3_2_10_2","series-title":"(SC \u201920)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Gale Trevor","year":"2020","unstructured":"Trevor Gale, Matei Zaharia, Cliff Young, and Erich Elsen. 2020. Sparse GPU kernels for deep learning. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC \u201920). IEEE Press, Article 17, 14\u00a0pages."},{"key":"e_1_3_3_2_11_2","unstructured":"Alicia Golden Samuel Hsia Fei Sun Bilge Acun Basil Hosmer Yejin Lee Zachary DeVito Jeff Johnson Gu-Yeon Wei David Brooks and Carole-Jean Wu. 2024. Is Flash Attention Stable? arxiv:https:\/\/arXiv.org\/abs\/2405.02803\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2405.02803"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1007\/s10710-017-9314-z"},{"key":"e_1_3_3_2_13_2","unstructured":"William\u00a0L. Hamilton Rex Ying and Jure Leskovec. 2018. Inductive Representation Learning on Large Graphs. arxiv:https:\/\/arXiv.org\/abs\/1706.02216\u00a0[cs.SI] https:\/\/arxiv.org\/abs\/1706.02216"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Yoonsang Han Inseo Kim Jinsung Kim and Gordon\u00a0Euhyun Moon. 2024. Tensor Core-Adapted Sparse Matrix Multiplication for Accelerating Sparse Deep Neural Networks. Electronics 13 20 (Jan. 2024) 3981. doi:10.3390\/electronics13203981 Number: 20 Publisher: Multidisciplinary Digital Publishing Institute.","DOI":"10.3390\/electronics13203981"},{"key":"e_1_3_3_2_15_2","unstructured":"Weihua Hu Matthias Fey Marinka Zitnik Yuxiao Dong Hongyu Ren Bowen Liu Michele Catasta and Jure Leskovec. 2020. Open Graph Benchmark: Datasets for Machine Learning on Graphs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.00687 (2020)."},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Eun-Jin Im Katherine Yelick and Richard Vuduc. 2004. Sparsity: Optimization framework for sparse matrix kernels. The International Journal of High Performance Computing Applications 18 1 (2004) 135\u2013158.","DOI":"10.1177\/1094342004041296"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599843"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-69583-4_1"},{"key":"e_1_3_3_2_19_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Lee Heejun","year":"2024","unstructured":"Heejun Lee, Jina Kim, Jeffrey Willette, and Sung\u00a0Ju Hwang. 2024. SEA: Sparse Linear Attention with Estimated Attention Mask. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=JbcwfmYrob"},{"key":"e_1_3_3_2_20_2","unstructured":"Jure Leskovec and Andrej Krevl. 2014. SNAP Datasets: Stanford Large Network Dataset Collection. http:\/\/snap.stanford.edu\/data."},{"key":"e_1_3_3_2_21_2","series-title":"(SC \u201922)","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Li Shigang","year":"2022","unstructured":"Shigang Li, Kazuki Osawa, and Torsten Hoefler. 2022. Efficient quantized sparse matrix operations on tensor cores. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC \u201922). IEEE Press, Article 37, 15\u00a0pages."},{"key":"e_1_3_3_2_22_2","volume-title":"The Third Learning on Graphs Conference","author":"Liu Jiahui","year":"2024","unstructured":"Jiahui Liu, Zhenkun Cai, Zhiyong Chen, and Minjie Wang. 2024. DF-GNN: Dynamic Fusion Framework for Attention Graph Neural Networks on GPUs. In The Third Learning on Graphs Conference. https:\/\/openreview.net\/forum?id=8GNDnBbUfF"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Liu Liu Zheng Qu Zhaodong Chen Fengbin Tu Yufei Ding and Yuan Xie. 2022. Dynamic Sparse Attention for Scalable Transformer Acceleration. IEEE Trans. Comput. 71 12 (2022) 3165\u20133178. doi:10.1109\/TC.2022.3208206","DOI":"10.1109\/TC.2022.3208206"},{"key":"e_1_3_3_2_24_2","unstructured":"Luis M\u00fcller Mikhail Galkin Christopher Morris and Ladislav Ramp\u00e1\u0161ek. 2024. Attending to Graph Transformers. Transactions on Machine Learning Research (2024). https:\/\/openreview.net\/forum?id=HhbqHBBrfZ"},{"key":"e_1_3_3_2_25_2","volume-title":"NVIDIA TESLA V100","year":"2018","unstructured":"NVIDIA Corporation 2018. NVIDIA TESLA V100. NVIDIA Corporation. https:\/\/images.nvidia.com\/content\/technologies\/volta\/pdf\/tesla-volta-v100-datasheet-letter-fnl-web.pdf"},{"key":"e_1_3_3_2_26_2","volume-title":"NVIDIA A30 TENSOR CORE GPU","year":"2022","unstructured":"NVIDIA Corporation 2022. NVIDIA A30 TENSOR CORE GPU. NVIDIA Corporation. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/data-center\/products\/a30-gpu\/pdf\/a30-datasheet.pdf"},{"key":"e_1_3_3_2_27_2","volume-title":"NVIDIA GH200 Grace Hopper Superchip","year":"2025","unstructured":"NVIDIA Corporation 2025. NVIDIA GH200 Grace Hopper Superchip. NVIDIA Corporation. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/grace-hopper-superchip?ncid=no-ncid"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Patrik Okanovic Grzegorz Kwasniewski Paolo\u00a0Sylos Labini Maciej Besta Flavio Vella and Torsten Hoefler. 2024. High Performance Unstructured SpMM Computation Using Tensor Cores. arxiv:https:\/\/arXiv.org\/abs\/2408.11551\u00a0[cs.DC]","DOI":"10.1109\/SC41406.2024.00060"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Meng Pang Xiang Fei Peng Qu Youhui Zhang and Zhaolin Li. 2024. A Row Decomposition-based Approach for Sparse Matrix Multiplication on GPUs(PPoPP \u201924). Association for Computing Machinery New York NY USA 377\u2013389. doi:10.1145\/3627535.3638470","DOI":"10.1145\/3627535.3638470"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00034"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9277"},{"key":"e_1_3_3_2_32_2","unstructured":"Ahsan Shehzad Feng Xia Shagufta Abid Ciyuan Peng Shuo Yu Dongyu Zhang and Karin Verspoor. 2024. Graph Transformers: A Survey. arxiv:https:\/\/arXiv.org\/abs\/2407.09777\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2407.09777"},{"key":"e_1_3_3_2_33_2","unstructured":"Jinliang Shi Shigang Li Youxuan Xu Rongtian Fu Xueying Wang and Tong Wu. 2024. FlashSparse: Minimizing Computation Redundancy for Fast Sparse Matrix Multiplications on Tensor Cores. arxiv:https:\/\/arXiv.org\/abs\/2412.11007\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2412.11007"},{"key":"e_1_3_3_2_34_2","volume-title":"International Conference on Machine Learning","author":"Shirzad Hamed","year":"2023","unstructured":"Hamed Shirzad, Ameya Velingker, Balaji Venkatachalam, Danica\u00a0J Sutherland, and Ali\u00a0Kemal Sinop. 2023. Exphormer: Sparse transformers for graphs. In International Conference on Machine Learning. arXiv:https:\/\/arXiv.org\/abs\/2303.06147"},{"key":"e_1_3_3_2_35_2","unstructured":"Kiran\u00a0K Thekumparampil Chong Wang Sewoong Oh and Li-Jia Li. 2018. Attention-based graph neural network for semi-supervised learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1803.03735 (2018)."},{"key":"e_1_3_3_2_36_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_37_2","volume-title":"International Conference on Learning Representations","author":"Veli\u010dkovi\u0107 Petar","year":"2018","unstructured":"Petar Veli\u010dkovi\u0107, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Li\u00f2, and Yoshua Bengio. 2018. Graph Attention Networks. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rJXMpikCZ"},{"key":"e_1_3_3_2_38_2","unstructured":"Minjie Wang Da Zheng Zihao Ye Quan Gan Mufei Li Xiang Song Jinjing Zhou Chao Ma Lingfan Yu Yu Gai et\u00a0al. 2019. Deep graph library: A graph-centric highly-performant package for graph neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.01315 (2019)."},{"key":"e_1_3_3_2_39_2","first-page":"149","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Guyue Huang, and Yufei Ding. 2023. TC-GNN: Bridging Sparse GNN Computation and Dense Tensor Cores on GPUs. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). USENIX Association, Boston, MA, 149\u2013164. https:\/\/www.usenix.org\/conference\/atc23\/presentation\/wang-yuke"},{"key":"e_1_3_3_2_40_2","unstructured":"Mark Weber Giacomo Domeniconi Jie Chen Daniel Karl\u00a0I. Weidele Claudio Bellei Tom Robinson and Charles\u00a0E. Leiserson. 2019. Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional Networks for Financial Forensics. arxiv:https:\/\/arXiv.org\/abs\/1908.02591\u00a0[cs.SI] https:\/\/arxiv.org\/abs\/1908.02591"},{"key":"e_1_3_3_2_41_2","unstructured":"Haojun Xia Zhen Zheng Yuchao Li Donglin Zhuang Zhongzhu Zhou Xiafei Qiu Yong Li Wei Lin and Shuaiwen\u00a0Leon Song. 2023. Flash-LLM: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity. doi:10.48550\/arXiv.2309.10285 arXiv:https:\/\/arXiv.org\/abs\/2309.10285 [cs]."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.5555\/3045390.3045396"},{"key":"e_1_3_3_2_43_2","series-title":"(NIPS \u201920)","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, and Amr Ahmed. 2020. Big bird: transformers for longer sequences. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS \u201920). Curran Associates Inc., Red Hook, NY, USA, Article 1450, 15\u00a0pages."},{"key":"e_1_3_3_2_44_2","unstructured":"Hanqing Zeng Hongkuan Zhou Ajitesh Srivastava Rajgopal Kannan and Viktor Prasanna. 2020. GraphSAINT: Graph Sampling Based Inductive Learning Method. arxiv:https:\/\/arXiv.org\/abs\/1907.04931\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1907.04931"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673108"},{"key":"e_1_3_3_2_46_2","series-title":"(SC \u201924)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis","author":"Zhang Meng","year":"2024","unstructured":"Meng Zhang, Jie Sun, Qinghao Hu, Peng Sun, Zeke Wang, Yonggang Wen, and Tianwei Zhang. 2024. TorchGT: A Holistic System for Large-Scale Graph Transformer Training. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (Atlanta, GA, USA) (SC \u201924). IEEE Press, Article 77, 17\u00a0pages. doi:10.1109\/SC41406.2024.00083"},{"key":"e_1_3_3_2_47_2","unstructured":"Haisha Zhao San Li Jiaheng Wang Chunbao Zhou Jue Wang Zhikuang Xin Shunde Li Zhiqiang Liang Zhijie Pan Fang Liu Yan Zeng Yangang Wang and Xuebin Chi. 2024. Acc-SpMM: Accelerating General-purpose Sparse Matrix-Matrix Multiplication with GPU Tensor Cores. arxiv:https:\/\/arXiv.org\/abs\/2501.09251\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/2501.09251"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730430","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:01:36Z","timestamp":1755867696000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730430"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":46,"alternative-id":["10.1145\/3721145.3730430","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730430","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}