{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:10Z","timestamp":1755870010630,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725773","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"442-457","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MAGNUS: Generating Data Locality to Accelerate Sparse Matrix-Matrix Multiplication on CPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2952-2996","authenticated-orcid":false,"given":"Jordi","family":"Wolfson-Pou","sequence":"first","affiliation":[{"name":"Intel Labs, Santa Clara, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3776-9353","authenticated-orcid":false,"given":"Jan","family":"Laukemann","sequence":"additional","affiliation":[{"name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4977-7107","authenticated-orcid":false,"given":"Fabrizio","family":"Petrini","sequence":"additional","affiliation":[{"name":"Intel Labs, Santa Clara, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC53243.2021.00034"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926273"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","unstructured":"Ariful Azad Grey Ballard Aydin Bulu\u00e7 James Demmel Laura Grigori Oded Schwartz Sivan Toledo and Samuel Williams. 2016. Exploiting Multiple Levels of Parallelism in Sparse Matrix-Matrix Multiplication. SIAM J. Sci. Statist. Comput. 38 6 (2016) C624\u2013C651. 10.1137\/15M104253X","DOI":"10.1137\/15M104253X"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2015.75"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","unstructured":"Ariful Azad Georgios\u00a0A Pavlopoulos Christos\u00a0A Ouzounis Nikos\u00a0C Kyrpides and Aydin Bulu\u00e7. 2018. HipMCL: a high-performance parallel implementation of the Markov clustering algorithm for large-scale networks. Nucleic Acids Res. 46 6 (January 2018) e33\u2013e33. 10.1093\/nar\/gkx1313","DOI":"10.1093\/nar\/gkx1313"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","unstructured":"Berenger Bramas. 2017. A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel Skylake. Int. J. Adv. Comput. Sci. Appl. 8 10 (2017). 10.14569\/IJACSA.2017.081044","DOI":"10.14569\/IJACSA.2017.081044"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611972740.43"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605611"},{"key":"e_1_3_3_1_10_2","unstructured":"Intel Corporation. 2023. Developer Reference for Intel oneAPI Math Kernel Library (MKL) for C."},{"key":"e_1_3_3_1_11_2","unstructured":"NVIDIA Corporation. 2023. cuSPARSE Library."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Steven Dalton Luke Olson and Nathan Bell. 2015. Optimizing Sparse Matrix-Matrix Multiplication for the GPU. ACM Trans. Math. Software 41 4 Article 25 (October 2015) 20\u00a0pages. 10.1145\/2699470","DOI":"10.1145\/2699470"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","unstructured":"Timothy\u00a0A. Davis. 2019. Algorithm 1000: SuiteSparse:GraphBLAS: Graph Algorithms in the Language of Sparse Linear Algebra. ACM Trans. Math. Software 45 4 Article 44 (December 2019) 25\u00a0pages. 10.1145\/3322125","DOI":"10.1145\/3322125"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The university of Florida sparse matrix collection. ACM Trans. Math. Software 38 1 Article 1 (December 2011) 25\u00a0pages. 10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"Mehmet Deveci Christian Trott and Siva Rajamanickam. 2018. Multi-threaded Sparse Matrix-Matrix Multiplication for Many-Core and GPU Architectures. Parallel Comput. 78 (January 2018). 10.1016\/j.parco.2018.06.009","DOI":"10.1016\/j.parco.2018.06.009"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","unstructured":"Zhaoyang Du Yijin Guan Tianchan Guan Dimin Niu Linyong Huang Hongzhong Zheng and Yuan Xie. 2022. OpSparse: A Highly Optimized Framework for Sparse General Matrix Multiplication on GPUs. IEEE Access 10 (2022) 85960\u201385974. 10.1109\/ACCESS.2022.3196940","DOI":"10.1109\/ACCESS.2022.3196940"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"P. Erd\u00f6s and A. R\u00e9nyi. 1959. On Random Graphs I. Publicationes Mathematicae Debrecen 6 (1959) 290.","DOI":"10.5486\/PMD.1959.6.3-4.12"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"R.D. Falgout. 2006. An introduction to algebraic multigrid. Comput. Sci. Eng. 8 6 (2006) 24\u201333. 10.1109\/MCSE.2006.105","DOI":"10.1109\/MCSE.2006.105"},{"key":"e_1_3_3_1_19_2","first-page":"710","volume-title":"Asian conference on machine learning","author":"Feng Xu","year":"2018","unstructured":"Xu Feng, Yuyang Xie, Mingye Song, Wenjian Yu, and Jie Tang. 2018. Fast randomized PCA for sparse data. In Asian conference on machine learning. PMLR, 710\u2013725. https:\/\/proceedings.mlr.press\/v95\/feng18a.html"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"Jianhua Gao Weixing Ji Fangli Chang Shiyu Han Bingxin Wei Zeming Liu and Yizhuo Wang. 2023. A Systematic Survey of General Sparse Matrix-matrix Multiplication. Comput. Surveys 55 12 Article 244 (March 2023) 36\u00a0pages. 10.1145\/3571157","DOI":"10.1145\/3571157"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-75755-9_32"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC43674.2020.9286191"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2014.34"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3350755.3400216"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611976830.12"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00060"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","unstructured":"Fred\u00a0G. Gustavson. 1978. Two Fast Algorithms for Sparse Matrices: Multiplication and Permuted Transposition. ACM Trans. Math. Software 4 3 (September 1978) 250\u2013269. 10.1145\/355791.355796","DOI":"10.1145\/355791.355796"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","unstructured":"Torsten Hoefler Dan Alistarh Tal Ben-Nun Nikoli Dryden and Alexandra Peste. 2021. Sparsity in deep learning: pruning and growth for efficient inference and training in neural networks. J. Mach. Learn. Res. 22 1 Article 241 (Jan. 2021) 124\u00a0pages. 10.5555\/3546258.3546499","DOI":"10.5555\/3546258.3546499"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/1137856.1137866"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3588195.3593000"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/MCHPC49590.2019.00012"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","unstructured":"Ruipeng Li Bj\u00f6rn Sj\u00f6green and Ulrike\u00a0Meier Yang. 2021. A New Class of AMG Interpolation Methods Based on Matrix-Matrix Multiplications. SIAM J. Sci. Comput. 43 5 (2021) S540\u2013S564. 10.1137\/20M134931X","DOI":"10.1137\/20M134931X"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","unstructured":"Junhong Liu Xin He Weifeng Liu and Guangming Tan. 2019. Register-Aware Optimizations for Parallel Sparse Matrix-Matrix Multiplication. Int. J. Parallel Program. 47 3 (June 2019) 403\u2013417. 10.1007\/s10766-018-0604-8","DOI":"10.1007\/s10766-018-0604-8"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.47"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","unstructured":"Weifeng Liu and Brian Vinter. 2015. A framework for general sparse matrix\u2013matrix multiplication on GPUs and heterogeneous processors. J. Parallel and Distrib. Comput. 85 (2015) 47\u201361. 10.1016\/j.jpdc.2015.06.010","DOI":"10.1016\/j.jpdc.2015.06.010"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3229710.3229720"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","unstructured":"Yusuke Nagasaka Satoshi Matsuoka Ariful Azad and Ayd\u0131n Bulu\u00e7. 2019. Performance optimization modeling and analysis of sparse matrix-matrix products on multi-core and many-core processors. Parallel Comput. 90 (2019) 102545. 10.1016\/j.parco.2019.102545","DOI":"10.1016\/j.parco.2019.102545"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2017.19"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374521"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20119-1_4"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_3_1_42_2","unstructured":"Sivasankaran Rajamanickam Seher Acer Luc Berger-Vergiat Vinh Dang Nathan Ellingwood Evan Harvey Brian Kelley Christian\u00a0R. Trott Jeremiah Wilke and Ichitaro Yamazaki. 2021. Kokkos Kernels: Performance Portable Sparse\/Dense Linear Algebra and Graph Kernels. arxiv:https:\/\/arXiv.org\/abs\/2103.11991"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00022"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3625131"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295701"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2015.7322450"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","unstructured":"Kathy Yelick Ayd\u0131n Bulu\u00e7 Muaaz Awan Ariful Azad Bowei Brock Rob Egan Saliya Ekanayake Marquita Ellis Evangelos Georganas Giulia Guidi Steven Hofmeyr Oguz Selvitopi Cristina Teodoropol and Leonid Oliker. 2020. The parallelism motifs of genomic data analysis. Philos. Trans. R. Soc. A Math. Phys. Eng. Sci. 378 2166 (2020) 20190394. 10.1098\/rsta.2019.0394","DOI":"10.1098\/rsta.2019.0394"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","unstructured":"Chao Zhang Maximilian Bremer Cy Chan John Shalf and Xiaochen Guo. 2022. ASA: Accelerating Sparse Accumulation in Column-wise SpGEMM. ACM Trans. Archit. Code Optim. 19 4 Article 49 (September 2022) 24\u00a0pages. 10.1145\/3543068","DOI":"10.1145\/3543068"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446702"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725773","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:02:55Z","timestamp":1755867775000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725773"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":48,"alternative-id":["10.1145\/3721145.3725773","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725773","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}