{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T15:31:06Z","timestamp":1773588666149,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","funder":[{"name":"NSF &#x28;National Science Foundation&#x29;","award":["CCF-2217064"],"award-info":[{"award-number":["CCF-2217064"]}]},{"name":"NSF &#x28;National Science Foundation&#x29;","award":["CCF-2107244"],"award-info":[{"award-number":["CCF-2107244"]}]},{"name":"NSF &#x28;National Science Foundation&#x29;","award":["CCF-2217099"],"award-info":[{"award-number":["CCF-2217099"]}]},{"name":"Defense Sciences Office, DARPA","award":["PROWESS HR0011-23-C-0101"],"award-info":[{"award-number":["PROWESS HR0011-23-C-0101"]}]},{"name":"Defense Sciences Office, DARPA","award":["SBIR HR001123C0139"],"award-info":[{"award-number":["SBIR HR001123C0139"]}]},{"name":"DOE U.S. Department of Energy","award":["PSAAP DE-NA0003965"],"award-info":[{"award-number":["PSAAP DE-NA0003965"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3779212.3790176","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:55:26Z","timestamp":1773150926000},"page":"993-1006","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Insum: Sparse GPU Kernels Simplified and Optimized with Indirect Einsums"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3082-4348","authenticated-orcid":false,"given":"Jaeyeon","family":"Won","sequence":"first","affiliation":[{"name":"CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4963-0869","authenticated-orcid":false,"given":"Willow","family":"Ahrens","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7231-7643","authenticated-orcid":false,"given":"Saman","family":"Amarasinghe","sequence":"additional","affiliation":[{"name":"CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3459-5466","authenticated-orcid":false,"given":"Joel S.","family":"Emer","sequence":"additional","affiliation":[{"name":"CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA and Architecture Research Group, NVIDIA, Westford, MA, USA"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3720473"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580020"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3554733"},{"key":"e_1_3_2_1_5_1","volume-title":"Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang.","author":"Bradbury James","year":"2018","unstructured":"James Bradbury, Roy Frostig, Peter Hawkins, Matthew James Johnson, Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang. 2018. JAX: composable transformations of PythonNumPy programs. http:\/\/github.com\/jax-ml\/jax"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2008.4536313"},{"key":"e_1_3_2_1_7_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al., 2018. TVM: An automated End-to-End optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578-594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3276493"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1002\/andp.19163540702"},{"key":"e_1_3_2_1_10_1","volume-title":"Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:1903.02428","author":"Fey Matthias","year":"2019","unstructured":"Matthias Fey and Jan Eric Lenssen. 2019. Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:1903.02428 (2019)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00021"},{"key":"e_1_3_2_1_12_1","volume-title":"e3nn: Euclidean neural networks. arXiv preprint arXiv:2207.09453","author":"Geiger Mario","year":"2022","unstructured":"Mario Geiger and Tess Smidt. 2022. e3nn: Euclidean neural networks. arXiv preprint arXiv:2207.09453 (2022)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-2649-2"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"e_1_3_2_1_15_1","first-page":"428","article-title":"Exploiting hardware utilization and adaptive dataflow for efficient sparse convolution in 3D point clouds","volume":"5","author":"Hong Ke","year":"2023","unstructured":"Ke Hong, Zhongming Yu, Guohao Dai, Xinhao Yang, Yaoxiu Lian, Ningyi Xu, and Yu Wang. 2023. Exploiting hardware utilization and adaptive dataflow for efficient sparse convolution in 3D point clouds. Proceedings of Machine Learning and Systems, Vol. 5 (2023), 428-441.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_16_1","unstructured":"Andrew James. 2023. PyTorch 2.1: Quansight's Improvements to BSR Sparse Matrix Multiplication. https:\/\/quansight.com\/post\/pytorch-2-1-quansights-improvements-to-bsr-sparse-matrix-multiplication\/."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623791"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00107"},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA. 2025a. cuEquivariance: High-Performance Equivariant Neural Network Library. https:\/\/github.com\/NVIDIA\/cuEquivariance."},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2025b. cuSPARSE: A CUDA Library for Sparse Matrix Computations. https:\/\/docs.nvidia.com\/cuda\/cusparse\/."},{"key":"e_1_3_2_1_23_1","volume-title":"Owens","author":"Odemuyiwa Toluwanimi O.","year":"2024","unstructured":"Toluwanimi O. Odemuyiwa, Joel S. Emer, and John D. Owens. 2024. The EDGE Language: Extended General Einsums for Graph Algorithms. http:\/\/arxiv.org\/abs\/2404.11591"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_2_1_25_1","volume-title":"PyTorch: an imperative style, high-performance deep learning library","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: an imperative style, high-performance deep learning library. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_27_1","first-page":"638","article-title":"torch. fx: Practical program capture and transformation for deep learning in python","volume":"4","author":"Reed James","year":"2022","unstructured":"James Reed, Zachary DeVito, Horace He, Ansley Ussery, and Jason Ansel. 2022. torch. fx: Practical program capture and transformation for deep learning in python. Proceedings of Machine Learning and Systems, Vol. 4 (2022), 638-651.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-5018-0"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-739X(00)00076-5"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-01766-7"},{"key":"e_1_3_2_1_31_1","first-page":"302","article-title":"Torchsparse: Efficient point cloud inference engine","volume":"4","author":"Tang Haotian","year":"2022","unstructured":"Haotian Tang, Zhijian Liu, Xiuyu Li, Yujun Lin, and Song Han. 2022. Torchsparse: Efficient point cloud inference engine. Proceedings of Machine Learning and Systems, Vol. 4 (2022), 302-315.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_41"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00025"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614303"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/LLVMHPC54804.2021.00009"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_37_1","volume-title":"ICLR workshop on representation learning on graphs and manifolds.","author":"Wang Minjie Yu","year":"2019","unstructured":"Minjie Yu Wang. 2019. Deep graph library: Towards efficient and scalable deep learning on graphs. In ICLR workshop on representation learning on graphs and manifolds."},{"key":"e_1_3_2_1_38_1","first-page":"149","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Guyue Huang, and Yufei Ding. 2023. TC-GNN: Bridging sparse GNN computation and dense tensor cores on GPUs. In 2023 USENIX Annual Technical Conference (USENIX ATC 23). 149-164."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3763146"},{"key":"e_1_3_2_1_40_1","first-page":"666","article-title":"Unified Convolution Framework: A compiler-based approach to support sparse convolutions","volume":"5","author":"Won Jaeyeon","year":"2023","unstructured":"Jaeyeon Won, Changwan Hong, Charith Mendis, Joel Emer, and Saman Amarasinghe. 2023a. Unified Convolution Framework: A compiler-based approach to support sparse convolutions. Proceedings of Machine Learning and Systems, Vol. 5 (2023), 666-679.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575742"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623786"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00096"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582047"}],"event":{"name":"ASPLOS '26: 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"deposited":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T14:03:32Z","timestamp":1773583412000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3779212.3790176"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":44,"alternative-id":["10.1145\/3779212.3790176","10.1145\/3779212"],"URL":"https:\/\/doi.org\/10.1145\/3779212.3790176","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}