{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:09Z","timestamp":1755870009550,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","funder":[{"name":"NSERC-DGECR","award":["-2023-0013"],"award-info":[{"award-number":["-2023-0013"]}]},{"name":"NSERC-RGPIN","award":["2023-0489"],"award-info":[{"award-number":["2023-0489"]}]},{"name":"NSFRC-ALLRP","award":["586319-2"],"award-info":[{"award-number":["586319-2"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3730427","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"625-639","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Loop Fusion in Matrix Multiplications with Sparse Dependence"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3637-4473","authenticated-orcid":false,"given":"Mohammad Mahdi","family":"Salehi","sequence":"first","affiliation":[{"name":"McMaster University, Hamilton, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2968-5176","authenticated-orcid":false,"given":"Kazem","family":"Cheshmi","sequence":"additional","affiliation":[{"name":"McMaster University, Hamilton, Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Khalid Ahmad Cris Cecka Michael Garland and Mary Hall. 2024. Exploring data layout for sparse tensor times dense matrix on GPUs. ACM Transactions on Architecture and Code Optimization 21 1 (2024) 1\u201320.","DOI":"10.1145\/3633462"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","unstructured":"Arash Ashari Shirish Tatikonda Matthias Boehm Berthold Reinwald Keith Campbell John Keenleyside and P. Sadayappan. 2015. On optimizing machine learning workloads via kernel fusion. SIGPLAN Not. 50 8 (jan 2015) 173\u2013182. 10.1145\/2858788.2688521","DOI":"10.1145\/2858788.2688521"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3168804"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00014"},{"key":"e_1_3_3_2_6_2","unstructured":"Aleksandar Bojchevski and Stephan G\u00fcnnemann. 2018. Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking. arxiv:https:\/\/arXiv.org\/abs\/1707.03815\u00a0[stat.ML] https:\/\/arxiv.org\/abs\/1707.03815"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456233"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.5555\/AAI28774188"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.5555\/3571885.3571927"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126936"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00065"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Kazem Cheshmi Danny\u00a0M Kaufman Shoaib Kamil and Maryam\u00a0Mehri Dehnavi. 2020. NASOQ: numerically accurate sparsity-oriented QP solver. ACM Transactions on Graphics (TOG) 39 4 (2020) 96\u20131.","DOI":"10.1145\/3386569.3392486"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607097"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508439"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/800195.805928"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0A Davis and Yifan Hu. 2011. The University of Florida sparse matrix collection. ACM Transactions on Mathematical Software (TOMS) 38 1 (2011) 1\u201325.","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2008.4536305"},{"key":"e_1_3_3_2_18_2","unstructured":"Mohammad Mahdi\u00a0Salehi Dezfuli and Kazem Cheshmi. 2024. Improving Locality in Sparse and Dense Matrix Multiplications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.00243 (2024)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532386"},{"key":"e_1_3_3_2_20_2","unstructured":"Matthias Fey and Jan\u00a0Eric Lenssen. 2019. Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1903.02428 (2019)."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Mahdi Ghorbani Emilien Bauer Tobias Grosser and Amir Shaikhha. 2025. Compressed and Parallelized Structured Tensor Algebra. Proceedings of the ACM on Programming Languages 9 OOPSLA1 (2025) 1717\u20131745.","DOI":"10.1145\/3720506"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Mahdi Ghorbani Mathieu Huot Shideh Hashemian and Amir Shaikhha. 2023. Compiling structured tensor algebra. Proceedings of the ACM on Programming Languages 7 OOPSLA2 (2023) 204\u2013233.","DOI":"10.1145\/3622804"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527403"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Ahan Gupta Yueming Yuan Devansh Jain Yuhao Ge David Aponte Yanqi Zhou and Charith Mendis. 2025. SPLAT: A framework for optimised GPU code-generation for SParse reguLar ATtention. Proceedings of the ACM on Programming Languages 9 OOPSLA1 (2025) 1632\u20131660.","DOI":"10.1145\/3720503"},{"key":"e_1_3_3_2_25_2","unstructured":"Will Hamilton Zhitao Ying and Jure Leskovec. 2017. Inductive representation learning on large graphs. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295712"},{"key":"e_1_3_3_2_27_2","unstructured":"Weihua Hu Matthias Fey Marinka Zitnik Yuxiao Dong Hongyu Ren Bowen Liu Michele Catasta and Jure Leskovec. 2021. Open Graph Benchmark: Datasets for Machine Learning on Graphs. arxiv:https:\/\/arXiv.org\/abs\/2005.00687\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2005.00687"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00076"},{"key":"e_1_3_3_2_29_2","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC \u201920)","author":"Keahey Kate","year":"2020","unstructured":"Kate Keahey, Jason Anderson, Zhuo Zhen, Pierre Riteau, Paul Ruth, Dan Stanzione, Mert Cevik, Jacob Colleran, Haryadi\u00a0S. Gunawi, Cody Hammock, Joe Mambretti, Alexander Barnes, Fran\u00e7ois Halbach, Alex Rocha, and Joe Stubbs. 2020. Lessons Learned from the Chameleon Testbed. In Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC \u201920). USENIX Association."},{"key":"e_1_3_3_2_30_2","unstructured":"Thomas\u00a0N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.02907 (2016)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Fredrik Kjolstad Shoaib Kamil Stephen Chou David Lugato and Saman Amarasinghe. 2017. The tensor algebra compiler. Proceedings of the ACM on Programming Languages 1 OOPSLA (2017) 1\u201329.","DOI":"10.1145\/3133901"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","unstructured":"Boris Krasnopolsky and Alexey Medvedev. 2021. XAMG: A library for solving linear systems with multiple right-hand side vectors. SoftwareX 14 (June 2021) 100695. 10.1016\/j.softx.2021.100695","DOI":"10.1016\/j.softx.2021.100695"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2013.68"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476166"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433816"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/2833157.2833162"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Peiming Liu Alexander\u00a0J Root Anlun Xu Yinying Li Fredrik Kjolstad and Aart\u00a0JC Bik. 2024. Compiler Support for Sparse Tensor Convolutions. Proceedings of the ACM on Programming Languages 8 OOPSLA2 (2024) 275\u2013303.","DOI":"10.1145\/3689721"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3480856"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","unstructured":"Aaron Meurer Christopher\u00a0P. Smith Mateusz Paprocki Ond\u0159ej \u010cert\u00edk Sergey\u00a0B. Kirpichev Matthew Rocklin Amit Kumar Sergiu Ivanov Jason\u00a0K. Moore Sartaj Singh Thilina Rathnayake Sean Vig Brian\u00a0E. Granger Richard\u00a0P. Muller Francesco Bonazzi Harsh Gupta Shivam Vats Fredrik Johansson Fabian Pedregosa Matthew\u00a0J. Curry Andy\u00a0R. Terrel \u0160t\u011bp\u00e1n Rou\u010dka Ashutosh Saboo Isuru Fernando Sumith Kulal Robert Cimrman and Anthony Scopatz. 2017. SymPy: symbolic computing in Python. PeerJ Computer Science 3 (Jan. 2017) e103. 10.7717\/peerj-cs.103","DOI":"10.7717\/peerj-cs.103"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3314221.3314646"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-95953-1_7"},{"key":"e_1_3_3_2_43_2","unstructured":"Vikas Natesh Andrew Sabot HT Kung and Mark Ting. 2023. Rosko: Row Skipping Outer Products for Sparse Matrix Multiplication Kernels. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.03930 (2023)."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1018"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Dianne\u00a0P O\u2019Leary. 1980. The block conjugate gradient algorithm and related methods. Linear algebra and its applications 29 (1980) 293\u2013322.","DOI":"10.1016\/0024-3795(80)90247-5"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC43674.2020.9286154"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00034"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Benedek Rozemberczki Carl Allen and Rik Sarkar. 2021. Multi-scale attributed node embedding. Journal of Complex Networks 9 2 (2021) cnab014.","DOI":"10.1093\/comnet\/cnab014"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3411866"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.5555\/829576"},{"key":"e_1_3_3_2_52_2","unstructured":"Michael Schlichtkrull Thomas\u00a0N. Kipf Peter Bloem Rianne van\u00a0den Berg Ivan Titov and Max Welling. 2017. Modeling Relational Data with Graph Convolutional Networks. arxiv:https:\/\/arXiv.org\/abs\/1703.06103\u00a0[stat.ML] https:\/\/arxiv.org\/abs\/1703.06103"},{"key":"e_1_3_3_2_53_2","unstructured":"Oleksandr Shchur Maximilian Mumme Aleksandar Bojchevski and Stephan G\u00fcnnemann. 2018. Pitfalls of Graph Neural Network Evaluation. ArXiv abs\/1811.05868 (2018). https:\/\/api.semanticscholar.org\/CorpusID:53303554"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"crossref","unstructured":"Michelle\u00a0Mills Strout Larry Carter Jeanne Ferrante and Barbara Kreaseck. 2004. Sparse tiling for stationary iterative methods. The International Journal of High Performance Computing Applications 18 1 (2004) 95\u2013113.","DOI":"10.1177\/1094342004041294"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-11261-4_11"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Field\u00a0G Van\u00a0Zee and Robert\u00a0A Van De\u00a0Geijn. 2015. BLIS: A framework for rapidly instantiating BLAS functionality. ACM Transactions on Mathematical Software (TOMS) 41 3 (2015) 1\u201333.","DOI":"10.1145\/2764454"},{"key":"e_1_3_3_2_57_2","unstructured":"Petar Veli\u010dkovi\u0107 Guillem Cucurull Arantxa Casanova Adriana Romero Pietro Li\u00f2 and Yoshua Bengio. 2018. Graph Attention Networks. arxiv:https:\/\/arXiv.org\/abs\/1710.10903\u00a0[stat.ML] https:\/\/arxiv.org\/abs\/1710.10903"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"crossref","unstructured":"Endong Wang Qing Zhang Bo Shen Guangyong Zhang Xiaowei Lu Qing Wu Yajuan Wang Endong Wang Qing Zhang Bo Shen et\u00a0al. 2014. Intel math kernel library. High-Performance Computing on the Intel\u00ae Xeon Phi\u2122: How to Fully Exploit MIC Architectures (2014) 167\u2013188.","DOI":"10.1007\/978-3-319-06486-4_7"},{"key":"e_1_3_3_2_59_2","unstructured":"Minjie Wang Da Zheng Zihao Ye Quan Gan Mufei Li Xiang Song Jinjing Zhou Chao Ma Lingfan Yu Yu Gai et\u00a0al. 2019. Deep graph library: A graph-centric highly-performant package for graph neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.01315 (2019)."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"Lucas Wilkinson Kazem Cheshmi and Maryam\u00a0Mehri Dehnavi. 2023. Register Tiling for Unstructured Sparsity in Neural Network Inference. Proceedings of the ACM on Programming Languages 7 PLDI (2023) 1995\u20132020.","DOI":"10.1145\/3591302"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575742"},{"key":"e_1_3_3_2_62_2","unstructured":"Wenchao Wu Xuanhua Shi Ligang He and Hai Jin. 2023. TurboMGNN: Improving Concurrent GNN Training Tasks on GPU With Fine-Grained Kernel Fusion. IEEE Transactions on Parallel and Distributed Systems (2023)."},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00121"},{"key":"e_1_3_3_2_64_2","unstructured":"Hanqing Zeng Hongkuan Zhou Ajitesh Srivastava Rajgopal Kannan and Viktor Prasanna. 2019. Graphsaint: Graph sampling based inductive learning method. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.04931 (2019)."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"crossref","unstructured":"Jie Zhou Ganqu Cui Shengding Hu Zhengyan Zhang Cheng Yang Zhiyuan Liu Lifeng Wang Changcheng Li and Maosong Sun. 2020. Graph neural networks: A review of methods and applications. AI open 1 (2020) 57\u201381.","DOI":"10.1016\/j.aiopen.2021.01.001"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","unstructured":"Marinka Zitnik and Jure Leskovec. 2017. Predicting multicellular function through multi-layer tissue networks. Bioinformatics 33 14 (July 2017) i190\u2013i198. 10.1093\/bioinformatics\/btx252","DOI":"10.1093\/bioinformatics\/btx252"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3730427","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:58:59Z","timestamp":1755867539000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3730427"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":65,"alternative-id":["10.1145\/3721145.3730427","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3730427","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}