{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:52:08Z","timestamp":1780674728538,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,17]],"date-time":"2023-06-17T00:00:00Z","timestamp":1686960000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"DARPA","award":["ACE, one of the seven centers in JUMP 2.0, a Semiconductor Research Corporation (SRC) program"],"award-info":[{"award-number":["ACE, one of the seven centers in JUMP 2.0, a Semiconductor Research Corporation (SRC) program"]}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS 1956007"],"award-info":[{"award-number":["CNS 1956007"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CCF 2107470"],"award-info":[{"award-number":["CCF 2107470"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,17]]},"DOI":"10.1145\/3579371.3589054","type":"proceedings-article","created":{"date-parts":[[2023,6,16]],"date-time":"2023-06-16T20:25:28Z","timestamp":1686947128000},"page":"1-15","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["SPADE: A Flexible and Scalable Accelerator for SpMM and SDDMM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7946-2683","authenticated-orcid":false,"given":"Gerasimos","family":"Gerogiannis","sequence":"first","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7947-2451","authenticated-orcid":false,"given":"Serif","family":"Yesil","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9804-3994","authenticated-orcid":false,"given":"Damitha","family":"Lenadora","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5975-4884","authenticated-orcid":false,"given":"Dingyuan","family":"Cao","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8140-2321","authenticated-orcid":false,"given":"Charith","family":"Mendis","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2595-5228","authenticated-orcid":false,"given":"Josep","family":"Torrellas","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana, IL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,6,17]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Sriram Aananthakrishnan Nesreen K. Ahmed Vincent Cave Marcelo Cintra Yigit Demir Kristof Du Bois Stijn Eyerman Joshua B. Fryman Ivan Ganev Wim Heirman Hans-Christian Hoppe Jason Howard Ibrahim Hur MidhunChandra Kodiyath Samkit Jain Daniel S. Klowden Marek M. Landowski Laurent Montigny Ankit More Przemyslaw Ossowski Robert Pawlowski Nick Pepperling Fabrizio Petrini Mariusz Sikora Balasubramanian Seshasayee Shaden Smith Sebastian Szkoda Sanjaya Tayal Jesmin Jahan Tithi Yves Vandriessche and Izajasz P. Wrosz. 2020. PIUMA: Programmable Integrated Unified Memory Architecture. arXiv:2010.06277 [cs.AR]  Sriram Aananthakrishnan Nesreen K. Ahmed Vincent Cave Marcelo Cintra Yigit Demir Kristof Du Bois Stijn Eyerman Joshua B. Fryman Ivan Ganev Wim Heirman Hans-Christian Hoppe Jason Howard Ibrahim Hur MidhunChandra Kodiyath Samkit Jain Daniel S. Klowden Marek M. Landowski Laurent Montigny Ankit More Przemyslaw Ossowski Robert Pawlowski Nick Pepperling Fabrizio Petrini Mariusz Sikora Balasubramanian Seshasayee Shaden Smith Sebastian Szkoda Sanjaya Tayal Jesmin Jahan Tithi Yves Vandriessche and Izajasz P. Wrosz. 2020. PIUMA: Programmable Integrated Unified Memory Architecture. arXiv:2010.06277 [cs.AR]"},{"key":"e_1_3_2_1_2_1","volume-title":"Characterizing the Scalability of Graph Convolutional Networks on Intel\u00ae PIUMA. In IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)","author":"Adiletta Matthew","year":"2023","unstructured":"Matthew Adiletta , Jesmin Jahan Tithi , Emmanouil-Ioannis Farsarakis , Gerasimos Gerogiannis , Robert Adolf , Robert Benke , Sidharth Kashyap , Samuel Hsia , Kartik Lakhotia , Fabrizio Petrini , Gu-Yeon Wei , and David Brooks . 2023 . Characterizing the Scalability of Graph Convolutional Networks on Intel\u00ae PIUMA. In IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS) . Raleigh, North Carolina. Matthew Adiletta, Jesmin Jahan Tithi, Emmanouil-Ioannis Farsarakis, Gerasimos Gerogiannis, Robert Adolf, Robert Benke, Sidharth Kashyap, Samuel Hsia, Kartik Lakhotia, Fabrizio Petrini, Gu-Yeon Wei, and David Brooks. 2023. Characterizing the Scalability of Graph Convolutional Networks on Intel\u00ae PIUMA. In IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). Raleigh, North Carolina."},{"key":"e_1_3_2_1_3_1","volume-title":"2014 IEEE 28th International Parallel and Distributed Processing Symposium. IEEE, 1213--1222","author":"Aktulga Hasan Metin","year":"2014","unstructured":"Hasan Metin Aktulga , Aydin Bulu\u00e7 , Samuel Williams , and Chao Yang . 2014 . Optimizing sparse matrix-multiple vectors multiplication for nuclear configuration interaction calculations . In 2014 IEEE 28th International Parallel and Distributed Processing Symposium. IEEE, 1213--1222 . Hasan Metin Aktulga, Aydin Bulu\u00e7, Samuel Williams, and Chao Yang. 2014. Optimizing sparse matrix-multiple vectors multiplication for nuclear configuration interaction calculations. In 2014 IEEE 28th International Parallel and Distributed Processing Symposium. IEEE, 1213--1222."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3480855"},{"key":"e_1_3_2_1_5_1","unstructured":"Hartwig Anzt Stanimire Tomov and Jack J Dongarra. 2015. Accelerating the LOBPCG method on GPUs using a blocked sparse matrix vector product. In SpringSim (HPS). 75--82.  Hartwig Anzt Stanimire Tomov and Jack J Dongarra. 2015. Accelerating the LOBPCG method on GPUs using a blocked sparse matrix vector product. In SpringSim (HPS). 75--82."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.110"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00029"},{"key":"e_1_3_2_1_8_1","volume-title":"2020 57th ACM\/IEEE Design Automation Conference (DAC). IEEE, 1--6.","author":"Auten Adam","year":"2020","unstructured":"Adam Auten , Matthew Tomei , and Rakesh Kumar . 2020 . Hardware acceleration of graph neural networks . In 2020 57th ACM\/IEEE Design Automation Conference (DAC). IEEE, 1--6. Adam Auten, Matthew Tomei, and Rakesh Kumar. 2020. Hardware acceleration of graph neural networks. In 2020 57th ACM\/IEEE Design Automation Conference (DAC). IEEE, 1--6."},{"key":"e_1_3_2_1_9_1","unstructured":"Zhaojun Bai James Demmel Jack Dongarra Axel Ruhe and Henk van der Vorst. 2000. Templates for the solution of algebraic eigenvalue problems: a practical guide. SIAM.  Zhaojun Bai James Demmel Jack Dongarra Axel Ruhe and Henk van der Vorst. 2000. Templates for the solution of algebraic eigenvalue problems: a practical guide. SIAM."},{"key":"e_1_3_2_1_10_1","volume-title":"2018 IEEE International Symposium on Workload Characterization (IISWC). IEEE, 203--214","author":"Balaji Vignesh","year":"2018","unstructured":"Vignesh Balaji and Brandon Lucia . 2018 . When is graph reordering an optimization? Studying the effect of lightweight graph reordering across applications and input graphs . In 2018 IEEE International Symposium on Workload Characterization (IISWC). IEEE, 203--214 . Vignesh Balaji and Brandon Lucia. 2018. When is graph reordering an optimization? Studying the effect of lightweight graph reordering across applications and input graphs. In 2018 IEEE International Symposium on Workload Characterization (IISWC). IEEE, 203--214."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3085572"},{"key":"e_1_3_2_1_12_1","volume-title":"2021 IEEE High Performance Extreme Computing Conference (HPEC). 1--7.","author":"Cabrera Anthony","year":"2021","unstructured":"Anthony Cabrera , Seth Hitefield , Jungwon Kim , Seyong Lee , Narasinga Rao Miniskar , and Jeffrey S Vetter . 2021 . Toward performance portable programming for heterogeneous systems on a chip: A case study with Qualcomm Snapdragon SoC . In 2021 IEEE High Performance Extreme Computing Conference (HPEC). 1--7. Anthony Cabrera, Seth Hitefield, Jungwon Kim, Seyong Lee, Narasinga Rao Miniskar, and Jeffrey S Vetter. 2021. Toward performance portable programming for heterogeneous systems on a chip: A case study with Qualcomm Snapdragon SoC. In 2021 IEEE High Performance Extreme Computing Conference (HPEC). 1--7."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2015.7363760"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_15_1","volume-title":"Adaptive universal generalized pagerank graph neural network. arXiv preprint arXiv:2006.07988","author":"Chien Eli","year":"2020","unstructured":"Eli Chien , Jianhao Peng , Pan Li , and Olgica Milenkovic . 2020. Adaptive universal generalized pagerank graph neural network. arXiv preprint arXiv:2006.07988 ( 2020 ). Eli Chien, Jianhao Peng, Pan Li, and Olgica Milenkovic. 2020. Adaptive universal generalized pagerank graph neural network. arXiv preprint arXiv:2006.07988 (2020)."},{"key":"e_1_3_2_1_16_1","volume-title":"2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, 595--608","author":"Dadu Vidushi","year":"2021","unstructured":"Vidushi Dadu , Sihao Liu , and Tony Nowatzki . 2021 . Polygraph: Exposing the value of flexibility for graph processing accelerators . In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, 595--608 . Vidushi Dadu, Sihao Liu, and Tony Nowatzki. 2021. Polygraph: Exposing the value of flexibility for graph processing accelerators. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, 595--608."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2049662.2049670"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2022.3163817"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586114"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","first-page":"913","DOI":"10.1109\/TC.2010.121","article-title":"Energy-efficient floating-point unit design","volume":"60","author":"Galal Sameh","year":"2010","unstructured":"Sameh Galal and Mark Horowitz . 2010 . Energy-efficient floating-point unit design . IEEE Transactions on computers 60 , 7 (2010), 913 -- 922 . Sameh Galal and Mark Horowitz. 2010. Energy-efficient floating-point unit design. IEEE Transactions on computers 60, 7 (2010), 913--922.","journal-title":"IEEE Transactions on computers"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00062"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00079"},{"key":"e_1_3_2_1_23_1","volume-title":"SAVE: Sparsity-Aware Vector Engine for Accelerating DNN Training and Inference on CPUs. In 2020 53rd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 796--810","author":"Gong Zhangxiaowen","year":"2020","unstructured":"Zhangxiaowen Gong , Houxiang Ji , Christopher W Fletcher , Christopher J Hughes , Sara Baghsorkhi , and Josep Torrellas . 2020 . SAVE: Sparsity-Aware Vector Engine for Accelerating DNN Training and Inference on CPUs. In 2020 53rd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 796--810 . Zhangxiaowen Gong, Houxiang Ji, Christopher W Fletcher, Christopher J Hughes, Sara Baghsorkhi, and Josep Torrellas. 2020. SAVE: Sparsity-Aware Vector Engine for Accelerating DNN Training and Inference on CPUs. In 2020 53rd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). IEEE, 796--810."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527403"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems. 1025--1035","author":"Hamilton William L","year":"2017","unstructured":"William L Hamilton , Rex Ying , and Jure Leskovec . 2017 . Inductive representation learning on large graphs . In Proceedings of the 31st International Conference on Neural Information Processing Systems. 1025--1035 . William L Hamilton, Rex Ying, and Jure Leskovec. 2017. Inductive representation learning on large graphs. In Proceedings of the 31st International Conference on Neural Information Processing Systems. 1025--1035."},{"key":"e_1_3_2_1_26_1","volume-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding. arXiv preprint arXiv:1510.00149","author":"Han Song","year":"2015","unstructured":"Song Han , Huizi Mao , and William J Dally . 2015. Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding. arXiv preprint arXiv:1510.00149 ( 2015 ). Song Han, Huizi Mao, and William J Dally. 2015. Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding. arXiv preprint arXiv:1510.00149 (2015)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392751"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295712"},{"key":"e_1_3_2_1_30_1","unstructured":"Guyue Huang Guohao Dai Yu Wang Yufei Ding and Yuan Xie. 2021. Efficient Sparse Matrix Kernels Based on Adaptive Workload-Balancing and Parallel-Reduction. arXiv:2106.16064 [cs.DC]  Guyue Huang Guohao Dai Yu Wang Yufei Ding and Yuan Xie. 2021. Efficient Sparse Matrix Kernels Based on Adaptive Workload-Balancing and Parallel-Reduction. arXiv:2106.16064 [cs.DC]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441585"},{"key":"e_1_3_2_1_32_1","unstructured":"Intel. 2021. Intel Xeon Gold 6348 Processor 42M Cache 2.60 GHz Product Specifications. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/sku\/212456\/intel-xeon-gold-6348-processor-42m-cache-2-60-ghz\/specifications.html  Intel. 2021. Intel Xeon Gold 6348 Processor 42M Cache 2.60 GHz Product Specifications. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/sku\/212456\/intel-xeon-gold-6348-processor-42m-cache-2-60-ghz\/specifications.html"},{"key":"e_1_3_2_1_33_1","unstructured":"Intel. 2022. Intel 64 and IA-32 Architectures Software Developer's Manual. https:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/manuals\/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf  Intel. 2022. Intel 64 and IA-32 Architectures Software Developer's Manual. https:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/manuals\/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf"},{"key":"e_1_3_2_1_34_1","volume-title":"VEGETA: Vertically-Integrated Extensions for Sparse\/Dense GEMM Tile Acceleration on CPUs. In 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 259--272","author":"Jeong Geonhwa","year":"2023","unstructured":"Geonhwa Jeong , Sana Damani , Abhimanyu Rajeshkumar Bambhaniya , Eric Qin , Christopher J Hughes , Sreenivas Subramoney , Hyesoon Kim , and Tushar Krishna . 2023 . VEGETA: Vertically-Integrated Extensions for Sparse\/Dense GEMM Tile Acceleration on CPUs. In 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 259--272 . Geonhwa Jeong, Sana Damani, Abhimanyu Rajeshkumar Bambhaniya, Eric Qin, Christopher J Hughes, Sreenivas Subramoney, Hyesoon Kim, and Tushar Krishna. 2023. VEGETA: Vertically-Integrated Extensions for Sparse\/Dense GEMM Tile Acceleration on CPUs. In 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 259--272."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3016078.2851152"},{"key":"e_1_3_2_1_36_1","volume-title":"GRIP: A graph neural network accelerator architecture","author":"Kiningham Kevin","year":"2022","unstructured":"Kevin Kiningham , Philip Levis , and Christopher R\u00e9 . 2022 . GRIP: A graph neural network accelerator architecture . IEEE Trans. Comput . (2022). Kevin Kiningham, Philip Levis, and Christopher R\u00e9. 2022. GRIP: A graph neural network accelerator architecture. IEEE Trans. Comput. (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling . 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 ( 2016 ). Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_39_1","series-title":"SIAM journal on scientific computing 23, 2","volume-title":"Toward the optimal preconditioned eigensolver: Locally optimal block preconditioned conjugate gradient method","author":"Knyazev Andrew V","year":"2001","unstructured":"Andrew V Knyazev . 2001. Toward the optimal preconditioned eigensolver: Locally optimal block preconditioned conjugate gradient method . SIAM journal on scientific computing 23, 2 ( 2001 ), 517--541. Andrew V Knyazev. 2001. Toward the optimal preconditioned eigensolver: Locally optimal block preconditioned conjugate gradient method. SIAM journal on scientific computing 23, 2 (2001), 517--541."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00070"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.2973991"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575706"},{"key":"e_1_3_2_1_43_1","volume-title":"Engn: A high-throughput and energy-efficient accelerator for large graph neural networks","author":"Liang Shengwen","year":"2020","unstructured":"Shengwen Liang , Ying Wang , Cheng Liu , Lei He , LI Huawei , Dawen Xu , and Xiaowei Li . 2020 . Engn: A high-throughput and energy-efficient accelerator for large graph neural networks . IEEE Trans. Comput . (2020). Shengwen Liang, Ying Wang, Cheng Liu, Lei He, LI Huawei, Dawen Xu, and Xiaowei Li. 2020. Engn: A high-throughput and energy-efficient accelerator for large graph neural networks. IEEE Trans. Comput. (2020)."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 2022 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays. 123--133","author":"Lin Yi-Chien","year":"2022","unstructured":"Yi-Chien Lin , Bingyi Zhang , and Viktor Prasanna . 2022 . HP-GNN: Generating high throughput GNN training implementation on CPU-FPGA heterogeneous platform . In Proceedings of the 2022 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays. 123--133 . Yi-Chien Lin, Bingyi Zhang, and Viktor Prasanna. 2022. HP-GNN: Generating high throughput GNN training implementation on CPU-FPGA heterogeneous platform. In Proceedings of the 2022 ACM\/SIGDA International Symposium on Field-Programmable Gate Arrays. 123--133."},{"key":"e_1_3_2_1_45_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Ma Lingxiao","year":"2019","unstructured":"Lingxiao Ma , Zhi Yang , Youshan Miao , Jilong Xue , Ming Wu , Lidong Zhou , and Yafei Dai . 2019 . NeuGraph: Parallel deep neural network computation on large graphs . In 2019 USENIX Annual Technical Conference (USENIX ATC 19) . 443--458. Lingxiao Ma, Zhi Yang, Youshan Miao, Jilong Xue, Ming Wu, Lidong Zhou, and Yafei Dai. 2019. NeuGraph: Parallel deep neural network computation on large graphs. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 443--458."},{"key":"e_1_3_2_1_46_1","volume-title":"2021 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). IEEE, 48--58","author":"Mehrabi Atefeh","year":"2021","unstructured":"Atefeh Mehrabi , Donghyuk Lee , Niladrish Chatterjee , Daniel J Sorin , Benjamin C Lee , and Mike O'Connor . 2021 . Learning sparse matrix row permutations for efficient SpMM on GPU architectures . In 2021 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). IEEE, 48--58 . Atefeh Mehrabi, Donghyuk Lee, Niladrish Chatterjee, Daniel J Sorin, Benjamin C Lee, and Mike O'Connor. 2021. Learning sparse matrix row permutations for efficient SpMM on GPU architectures. In 2021 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS). IEEE, 48--58."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582069"},{"key":"e_1_3_2_1_48_1","volume-title":"GPU Technology Conference.","author":"Naumov Maxim","year":"2010","unstructured":"Maxim Naumov , L Chien , Philippe Vandermersch , and Ujval Kapasi . 2010 . cuS-PARSE library . In GPU Technology Conference. Maxim Naumov, L Chien, Philippe Vandermersch, and Ujval Kapasi. 2010. cuS-PARSE library. In GPU Technology Conference."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11075-018-0502-6"},{"key":"e_1_3_2_1_50_1","volume-title":"2018 IEEE 25th International Conference on High Performance Computing (HiPC). IEEE, 32--41","author":"Nisa Israt","year":"2018","unstructured":"Israt Nisa , Aravind Sukumaran-Rajam , Sureyya Emre Kurt , Changwan Hong , and P Sadayappan . 2018 . Sampled dense matrix multiplication for high-performance machine learning . In 2018 IEEE 25th International Conference on High Performance Computing (HiPC). IEEE, 32--41 . Israt Nisa, Aravind Sukumaran-Rajam, Sureyya Emre Kurt, Changwan Hong, and P Sadayappan. 2018. Sampled dense matrix multiplication for high-performance machine learning. In 2018 IEEE 25th International Conference on High Performance Computing (HiPC). IEEE, 32--41."},{"key":"e_1_3_2_1_51_1","unstructured":"NVIDIA. 2017. NVIDIA Tesla V100 whitepaper. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf  NVIDIA. 2017. NVIDIA Tesla V100 whitepaper. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf"},{"key":"e_1_3_2_1_52_1","unstructured":"NVIDIA. 2022. NVIDIA Grace Hopper superchip architecture whitepaper. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/nvidia-grace-hopper  NVIDIA. 2022. NVIDIA Grace Hopper superchip architecture whitepaper. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/nvidia-grace-hopper"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414627"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.2514\/1.J052113"},{"key":"e_1_3_2_1_56_1","volume-title":"Enabling Flexibility for Sparse Tensor Acceleration via Heterogeneity. arXiv preprint arXiv:2201.08916","author":"Qin Eric","year":"2022","unstructured":"Eric Qin , Raveesh Garg , Abhimanyu Bambhaniya , Michael Pellauer , Angshuman Parashar , Sivasankaran Rajamanickam , Cong Hao , and Tushar Krishna . 2022. Enabling Flexibility for Sparse Tensor Acceleration via Heterogeneity. arXiv preprint arXiv:2201.08916 ( 2022 ). Eric Qin, Raveesh Garg, Abhimanyu Bambhaniya, Michael Pellauer, Angshuman Parashar, Sivasankaran Rajamanickam, Cong Hao, and Tushar Krishna. 2022. Enabling Flexibility for Sparse Tensor Acceleration via Heterogeneity. arXiv preprint arXiv:2201.08916 (2022)."},{"key":"e_1_3_2_1_57_1","volume-title":"2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1014--1024","author":"Qin Eric","year":"2021","unstructured":"Eric Qin , Geonhwa Jeong , William Won , Sheng-Chun Kao , Hyoukjun Kwon , Sudarshan Srinivasan , Dipankar Das , Gordon E Moon , Sivasankaran Rajamanickam , and Tushar Krishna . 2021 . Extending sparse tensor accelerators to support multiple compression formats . In 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1014--1024 . Eric Qin, Geonhwa Jeong, William Won, Sheng-Chun Kao, Hyoukjun Kwon, Sudarshan Srinivasan, Dipankar Das, Gordon E Moon, Sivasankaran Rajamanickam, and Tushar Krishna. 2021. Extending sparse tensor accelerators to support multiple compression formats. In 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1014--1024."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_2_1_59_1","volume-title":"2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 256--266","author":"Rahman Md Khaledur","year":"2021","unstructured":"Md Khaledur Rahman , Majedul Haque Sujon , and Ariful Azad . 2021 . FusedMM: A unified SDDMM-SpMM kernel for graph embedding and graph neural networks . In 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 256--266 . Md Khaledur Rahman, Majedul Haque Sujon, and Ariful Azad. 2021. FusedMM: A unified SDDMM-SpMM kernel for graph embedding and graph neural networks. In 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 256--266."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/1964218.1964225"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2014.07.006"},{"key":"e_1_3_2_1_62_1","volume-title":"Intel launches 3rd Gen Ice Lake Xeon Scalable. https:\/\/fuse.wikichip.org\/news\/4734\/intel-launches-3rd-gen-ice-lake-xeon-scalable\/","author":"Schor David","unstructured":"David Schor . 2021. Intel launches 3rd Gen Ice Lake Xeon Scalable. https:\/\/fuse.wikichip.org\/news\/4734\/intel-launches-3rd-gen-ice-lake-xeon-scalable\/ David Schor. 2021. Intel launches 3rd Gen Ice Lake Xeon Scalable. https:\/\/fuse.wikichip.org\/news\/4734\/intel-launches-3rd-gen-ice-lake-xeon-scalable\/"},{"key":"e_1_3_2_1_63_1","volume-title":"Intel showcases Sapphire Rapids Plus HBM Xeon Performance at ISC","author":"Smith Ryan","year":"2022","unstructured":"Ryan Smith . 2022. Intel showcases Sapphire Rapids Plus HBM Xeon Performance at ISC 2022 . https:\/\/www.anandtech.com\/show\/17422\/intel-showcases-sapphire-rapids-plus-hbm-xeon-performance-isc-2022 Ryan Smith. 2022. Intel showcases Sapphire Rapids Plus HBM Xeon Performance at ISC 2022. https:\/\/www.anandtech.com\/show\/17422\/intel-showcases-sapphire-rapids-plus-hbm-xeon-performance-isc-2022"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3490422.3502357"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00062"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.vlsi.2017.02.002"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2011.6130281"},{"key":"e_1_3_2_1_68_1","volume-title":"Graph attention networks. arXiv preprint arXiv:1710.10903","author":"Veli\u010dkovi\u0107 Petar","year":"2017","unstructured":"Petar Veli\u010dkovi\u0107 , Guillem Cucurull , Arantxa Casanova , Adriana Romero , Pietro Lio , and Yoshua Bengio . 2017. Graph attention networks. arXiv preprint arXiv:1710.10903 ( 2017 ). Petar Veli\u010dkovi\u0107, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Lio, and Yoshua Bengio. 2017. Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2016.7482091"},{"key":"e_1_3_2_1_70_1","volume-title":"High-Performance Computing on the Intel\u00ae Xeon Phi\u2122","author":"Wang Endong","unstructured":"Endong Wang , Qing Zhang , Bo Shen , Guangyong Zhang , Xiaowei Lu , Qing Wu , and Yajuan Wang . 2014. Intel math kernel library . In High-Performance Computing on the Intel\u00ae Xeon Phi\u2122 . Springer , 167--188. Endong Wang, Qing Zhang, Bo Shen, Guangyong Zhang, Xiaowei Lu, Qing Wu, and Yajuan Wang. 2014. Intel math kernel library. In High-Performance Computing on the Intel\u00ae Xeon Phi\u2122. Springer, 167--188."},{"key":"e_1_3_2_1_71_1","volume-title":"ICLR Workshop on Representation Learning on Graphs and Manifolds","author":"Wang Minjie Yu","year":"2019","unstructured":"Minjie Yu Wang . 2019 . Deep Graph Library: towards efficient and scalable deep learning on graphs . ICLR Workshop on Representation Learning on Graphs and Manifolds (2019). https:\/\/par.nsf.gov\/biblio\/10311680 Minjie Yu Wang. 2019. Deep Graph Library: towards efficient and scalable deep learning on graphs. ICLR Workshop on Representation Learning on Graphs and Manifolds (2019). https:\/\/par.nsf.gov\/biblio\/10311680"},{"key":"e_1_3_2_1_72_1","volume-title":"TC-GNN: Accelerating Sparse Graph Neural Network Computation Via Dense Tensor Core on GPUs. arXiv preprint arXiv:2112.02052","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang , Boyuan Feng , and Yufei Ding . 2021. TC-GNN: Accelerating Sparse Graph Neural Network Computation Via Dense Tensor Core on GPUs. arXiv preprint arXiv:2112.02052 ( 2021 ). Yuke Wang, Boyuan Feng, and Yufei Ding. 2021. TC-GNN: Accelerating Sparse Graph Neural Network Computation Via Dense Tensor Core on GPUs. arXiv preprint arXiv:2112.02052 (2021)."},{"key":"e_1_3_2_1_73_1","volume-title":"GNNAdvisor: An Adaptive and Efficient Runtime System for GNN Acceleration on GPUs. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang , Boyuan Feng , Gushu Li , Shuangchen Li , Lei Deng , Yuan Xie , and Yufei Ding . 2021 . GNNAdvisor: An Adaptive and Efficient Runtime System for GNN Acceleration on GPUs. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21) . USENIX Association, 515--531. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/wang-yuke Yuke Wang, Boyuan Feng, Gushu Li, Shuangchen Li, Lei Deng, Yuan Xie, and Yufei Ding. 2021. GNNAdvisor: An Adaptive and Efficient Runtime System for GNN Acceleration on GPUs. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). USENIX Association, 515--531. https:\/\/www.usenix.org\/conference\/osdi21\/presentation\/wang-yuke"},{"key":"e_1_3_2_1_74_1","unstructured":"WikiChip. 2023. Sunny Cove - Microarchitectures - Intel. https:\/\/en.wikichip.org\/wiki\/intel\/microarchitectures\/sunny_cove  WikiChip. 2023. Sunny Cove - Microarchitectures - Intel. https:\/\/en.wikichip.org\/wiki\/intel\/microarchitectures\/sunny_cove"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00012"},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. 329--341","author":"Yesil Serif","year":"2023","unstructured":"Serif Yesil , Azin Heidarshenas , Adam Morrison , and Josep Torrellas . 2023 . WISE: Predicting the Performance of Sparse Matrix Vector Multiplication with Machine Learning . In Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. 329--341 . Serif Yesil, Azin Heidarshenas, Adam Morrison, and Josep Torrellas. 2023. WISE: Predicting the Performance of Sparse Matrix Vector Multiplication with Machine Learning. In Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. 329--341."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532369"},{"key":"e_1_3_2_1_78_1","volume-title":"2021 IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 1--7.","author":"Zhang Bingyi","year":"2021","unstructured":"Bingyi Zhang , Sanmukh R Kuppannagari , Rajgopal Kannan , and Viktor Prasanna . 2021 . Efficient neighbor-sampling-based GNN training on CPU-FPGA heterogeneous platform . In 2021 IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 1--7. Bingyi Zhang, Sanmukh R Kuppannagari, Rajgopal Kannan, and Viktor Prasanna. 2021. Efficient neighbor-sampling-based GNN training on CPU-FPGA heterogeneous platform. In 2021 IEEE High Performance Extreme Computing Conference (HPEC). IEEE, 1--7."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/2684746.2689060"},{"key":"e_1_3_2_1_80_1","volume-title":"Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 1495--1502","author":"Zhao Huasha","year":"2015","unstructured":"Huasha Zhao , Biye Jiang , John F Canny , and Bobby Jaros . 2015 . SAME but Different: Fast and High Quality Gibbs Parameter Estimation . In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 1495--1502 . Huasha Zhao, Biye Jiang, John F Canny, and Bobby Jaros. 2015. SAME but Different: Fast and High Quality Gibbs Parameter Estimation. In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 1495--1502."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"crossref","DOI":"10.1145\/3534678","volume-title":"Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 4582--4591","author":"Zheng Da","year":"2022","unstructured":"Da Zheng , Xiang Song , Chengru Yang , Dominique LaSalle , and George Karypis . 2022 . Distributed hybrid CPU and GPU training for Graph Neural Networks on billion-scale heterogeneous graphs . In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 4582--4591 . Da Zheng, Xiang Song, Chengru Yang, Dominique LaSalle, and George Karypis. 2022. Distributed hybrid CPU and GPU training for Graph Neural Networks on billion-scale heterogeneous graphs. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 4582--4591."}],"event":{"name":"ISCA '23: 50th Annual International Symposium on Computer Architecture","location":"Orlando FL USA","acronym":"ISCA '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE"]},"container-title":["Proceedings of the 50th Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3579371.3589054","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:38Z","timestamp":1750178798000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3579371.3589054"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,17]]},"references-count":81,"alternative-id":["10.1145\/3579371.3589054","10.1145\/3579371"],"URL":"https:\/\/doi.org\/10.1145\/3579371.3589054","relation":{},"subject":[],"published":{"date-parts":[[2023,6,17]]},"assertion":[{"value":"2023-06-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}