{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:08:13Z","timestamp":1775912893227,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":117,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710859","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"339-354","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["BerryBees: Breadth First Search by Bit-Tensor-Cores"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7605-525X","authenticated-orcid":false,"given":"Yuyao","family":"Niu","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center, Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4564-2093","authenticated-orcid":false,"given":"Marc","family":"Casas","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"High-Performance GraphBLAS Backend Prototype for NEC SX-Aurora TSUBASA. In IPDPSW '22","author":"Afanasyev Ilya","year":"2022","unstructured":"Ilya Afanasyev, Kazuhiko Komatsu, Dmitry Lichmanov, Vadim Voevodin, and Hiroaki Kobayashi. 2022. High-Performance GraphBLAS Backend Prototype for NEC SX-Aurora TSUBASA. In IPDPSW '22. 221--229."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.6515"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3380930"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243192"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.69"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.76"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3094091"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.50"},{"key":"e_1_3_2_1_9_1","volume-title":"The GAP benchmark suite. arXiv preprint arXiv:1508.03619","author":"Beamer Scott","year":"2015","unstructured":"Scott Beamer, Krste Asanovi\u0107, and David Patterson. 2015. The GAP benchmark suite. arXiv preprint arXiv:1508.03619 (2015)."},{"key":"e_1_3_2_1_10_1","first-page":"235","article-title":"Groute","volume":"17","author":"Ben-Nun Tal","year":"2017","unstructured":"Tal Ben-Nun, Michael Sutton, Sreepathi Pai, and Keshav Pingali. 2017. Groute: An Asynchronous Multi-GPU Programming Model for Irregular Computations. In PPoPP 17. 235--248.","journal-title":"An Asynchronous Multi-GPU Programming Model for Irregular Computations. In PPoPP"},{"key":"e_1_3_2_1_11_1","volume-title":"SlimSell: A Vectorizable Graph Representation for Breadth-First Search. In IPDPS '17","author":"Besta Maciej","year":"2017","unstructured":"Maciej Besta, Florian Marending, Edgar Solomonik, and Torsten Hoefler. 2017. SlimSell: A Vectorizable Graph Representation for Breadth-First Search. In IPDPS '17. 32--41."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078597.3078616"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243198"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2015.2475270"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1137\/19M1289546"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342011403516"},{"key":"e_1_3_2_1_17_1","volume-title":"Reduced-Bandwidth Multithreaded Algorithms for Sparse Matrix-Vector Multiplication. In IPDPS '11","author":"Bulu\u00e7 Aydin","year":"2011","unstructured":"Aydin Bulu\u00e7, Samuel Williams, Leonid Oliker, and James Demmel. 2011. Reduced-Bandwidth Multithreaded Algorithms for Sparse Matrix-Vector Multiplication. In IPDPS '11. 721--733."},{"key":"e_1_3_2_1_18_1","volume-title":"SPAA '09","author":"Bulu\u00e7 Aydin","unstructured":"Aydin Bulu\u00e7, Jeremy T. Fineman, Matteo Frigo, John R. Gilbert, and Charles E. Leiserson. 2009. Parallel sparse matrix-vector and matrix-transpose-vector multiplication using compressed sparse blocks. In SPAA '09. 233--244."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063471"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3446216"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3446216"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508403"},{"key":"e_1_3_2_1_23_1","volume-title":"Bit-GraphBLAS: Bit-Level Optimizations of Matrix-Centric Graph Processing on GPU. In IPDPS '22","author":"Chen Jou-An","year":"2022","unstructured":"Jou-An Chen, Hsin-Hsuan Sung, Xipeng Shen, Nathan Tallent, Kevin Barker, and Ang Li. 2022. Bit-GraphBLAS: Bit-Level Optimizations of Matrix-Centric Graph Processing on GPU. In IPDPS '22. 515--525."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2023.02.013"},{"key":"e_1_3_2_1_25_1","volume-title":"Scalable Irregular Parallelism with GPUs: Getting CPUs Out of the Way. In SC '22","author":"Chen Yuxin","unstructured":"Yuxin Chen, Benjamin Brock, Serban Porumbescu, Aydin Bulu\u00e7, Katherine Yelick, and John D. Owens. 2022. Scalable Irregular Parallelism with GPUs: Getting CPUs Out of the Way. In SC '22. 1--16."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638476"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512770"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476182"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3350755.3400252"},{"key":"e_1_3_2_1_31_1","volume-title":"Algorithm Design for Tensor Units. In Euro-Par '21","author":"Chowdhury Rezaul","year":"2021","unstructured":"Rezaul Chowdhury, Francesco Silvestri, and Flavio Vella. 2021. Algorithm Design for Tensor Units. In Euro-Par '21. 353--367."},{"key":"e_1_3_2_1_32_1","volume-title":"An analysis of the graph processing landscape. journal of Big Data 8, 1","author":"Coimbra Miguel E","year":"2021","unstructured":"Miguel E Coimbra, Alexandre P Francisco, and Lu\u00eds Veiga. 2021. An analysis of the graph processing landscape. journal of Big Data 8, 1 (2021), 55."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3331057"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3322125"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577195"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00114"},{"key":"e_1_3_2_1_38_1","volume-title":"Mert Hidayetoglu, Rashid Tahir, Abdul Dakkak","author":"Durrani Sultan","year":"2021","unstructured":"Sultan Durrani, Muhammad Saad Chughtai, Mert Hidayetoglu, Rashid Tahir, Abdul Dakkak, Lawrence Rauchwerger, Fareed Zaffar, and Wen-mei Hwu. 2021. Accelerating Fourier and Number Theoretic Transforms using Tensor Cores and Warp Shuffles. In PACT '21. 345--355."},{"key":"e_1_3_2_1_39_1","volume-title":"DTC-SpMM: Bridging the Gap in Accelerating General Sparse Matrix Multiplication with Tensor Cores. In ASPLOS '24","author":"Fan Ruibo","year":"2024","unstructured":"Ruibo Fan, Wei Wang, and Xiaowen Chu. 2024. DTC-SpMM: Bridging the Gap in Accelerating General Sparse Matrix Multiplication with Tensor Cores. In ASPLOS '24 (La Jolla, CA, USA). 253--267."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476157"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476157"},{"key":"e_1_3_2_1_42_1","volume-title":"On the Feasibility of Using Reduced-Precision Tensor Core Operations for Graph Analytics. In HPEC '20","author":"Firoz Jesun Sahariar","year":"2020","unstructured":"Jesun Sahariar Firoz, Ang Li, Jiajia Li, and Kevin Barker. 2020. On the Feasibility of Using Reduced-Precision Tensor Core Operations for Graph Analytics. In HPEC '20. 1--7."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307681.3326606"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441592"},{"key":"e_1_3_2_1_45_1","volume-title":"SC '18","author":"Haidar Azzam","unstructured":"Azzam Haidar, Stanimire Tomov, Jack Dongarra, and Nicholas J. Higham. 2018. Harnessing GPU Tensor Cores for Fast FP16 Arithmetic to Speed up Mixed-Precision Iterative Refinement Solvers. In SC '18. 603--613."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2017.41"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI54635.2022.00051"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2017.48"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/1941553.1941590"},{"key":"e_1_3_2_1_50_1","volume-title":"Efficient Parallel Graph Exploration on Multi-Core CPU and GPU. In PATC '11","author":"Hong Sungpack","year":"2011","unstructured":"Sungpack Hong, Tayo Oguntebi, and Kunle Olukotun. 2011. Efficient Parallel Graph Exploration on Multi-Core CPU and GPU. In PATC '11. 78--88."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530588"},{"key":"e_1_3_2_1_52_1","volume-title":"RM-STC: Row-Merge Dataflow Inspired GPU Sparse Tensor Core for Energy-Efficient Sparse Acceleration. In MICRO '23","author":"Huang Guyue","year":"2023","unstructured":"Guyue Huang, Zhengyang Wang, Po-An Tsai, Chen Zhang, Yufei Ding, and Yuan Xie. 2023. RM-STC: Row-Merge Dataflow Inspired GPU Sparse Tensor Core for Energy-Efficient Sparse Acceleration. In MICRO '23. 338--352."},{"key":"e_1_3_2_1_53_1","volume-title":"TileSpMSpV: A Tiled Algorithm for Sparse Matrix-Sparse Vector Multiplication on GPUs. In ICPP '23","author":"Ji Haonan","year":"2023","unstructured":"Haonan Ji, Huimin Song, Shibo Lu, Zhou Jin, Guangming Tan, and Weifeng Liu. 2023. TileSpMSpV: A Tiled Algorithm for Sparse Matrix-Sparse Vector Multiplication on GPUs. In ICPP '23. 1--11."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532368"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2016.7761646"},{"key":"e_1_3_2_1_56_1","volume-title":"PACT '18","author":"Khorasani Farzad","unstructured":"Farzad Khorasani, Keval Vora, Rajiv Gupta, and Laxmi N. Bhuyan. 2014. CuSha: vertex-centric graph processing on GPUs. In PACT '18. 239--252."},{"key":"e_1_3_2_1_57_1","volume-title":"TC-GVF: Tensor Core GPU based Vector Fitting via Accelerated Tall-Skinny QR Solvers","author":"Kukutla Vinay","year":"2024","unstructured":"Vinay Kukutla, Ramachandra Achar, and Wai Kong Lee. 2024. TC-GVF: Tensor Core GPU based Vector Fitting via Accelerated Tall-Skinny QR Solvers. IEEE Transactions on Components, Packaging and Manufacturing Technology (2024), 1--1."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3380942"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356169"},{"key":"e_1_3_2_1_60_1","first-page":"1878","article-title":"Accelerating Binarized Neural Networks via Bit-Tensor-Cores in Turing GPUs","volume":"32","author":"Li Ang","year":"2021","unstructured":"Ang Li and Simon Su. 2021. Accelerating Binarized Neural Networks via Bit-Tensor-Cores in Turing GPUs. IEEE Transactions on Parallel and Distributed Systems 32, 7 (2021), 1878--1891.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/Cluster48925.2021.00035"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2968219.2968297"},{"key":"e_1_3_2_1_63_1","volume-title":"Unleashing the Low-Precision Computation Potential of Tensor Cores on GPUs. In CGO '21","author":"Li Guangli","year":"2021","unstructured":"Guangli Li, Jingling Xue, Lei Liu, Xueying Wang, Xiu Ma, Xiao Dong, Jiansong Li, and Xiaobing Feng. 2021. Unleashing the Low-Precision Computation Potential of Tensor Cores on GPUs. In CGO '21. 90--102."},{"key":"e_1_3_2_1_64_1","first-page":"1842","article-title":"Adaptive SpMV\/SpMSpV on GPUs for Input Vectors of Varied Sparsity","volume":"32","author":"Li Min","year":"2021","unstructured":"Min Li, Yulong Ao, and Chao Yang. 2021. Adaptive SpMV\/SpMSpV on GPUs for Input Vectors of Varied Sparsity. IEEE Transactions on Parallel and Distributed Systems 32, 7 (2021), 1842--1853.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_1_65_1","volume-title":"Efficient Quantized Sparse Matrix Operations on Tensor Cores. In SC '22","author":"Li Shigang","year":"2022","unstructured":"Shigang Li, Kazuki Osawa, and Torsten Hoefler. 2022. Efficient Quantized Sparse Matrix Operations on Tensor Cores. In SC '22. 1--15."},{"key":"e_1_3_2_1_66_1","volume-title":"HASpMV: Heterogeneity-Aware Sparse Matrix-Vector Multiplication on Modern Asymmetric Multicore Processors. In CLUSTER '23","author":"Li Wenxuan","year":"2023","unstructured":"Wenxuan Li, Helin Cheng, Zhengyang Lu, Yuechen Lu, and Weifeng Liu. 2023. HASpMV: Heterogeneity-Aware Sparse Matrix-Vector Multiplication on Modern Asymmetric Multicore Processors. In CLUSTER '23. 209--220."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.53"},{"key":"e_1_3_2_1_68_1","volume-title":"Scalable Graph Traversal on Sunway TaihuLight with Ten Million Cores. In IPDPS '17","author":"Lin Heng","year":"2017","unstructured":"Heng Lin, Xiongchao Tang, Bowen Yu, Youwei Zhuo, Wenguang Chen, Jidong Zhai, Wanwang Yin, and Weimin Zheng. 2017. Scalable Graph Traversal on Sunway TaihuLight with Ten Million Cores. In IPDPS '17. 635--645."},{"key":"e_1_3_2_1_69_1","volume-title":"SC '15","author":"Liu Hang","unstructured":"Hang Liu and H. Howie Huang. 2015. Enterprise: breadth-first graph traversal on GPUs. In SC '15. 1--12."},{"key":"e_1_3_2_1_70_1","volume-title":"USENIX ATC '19","author":"Liu Hang","year":"2019","unstructured":"Hang Liu and H Howie Huang. 2019. {SIMD-X}: Programming and processing of graph algorithms on {GPUs}. In USENIX ATC '19. 411--428."},{"key":"e_1_3_2_1_71_1","volume-title":"ICS '15","author":"Liu W.","unstructured":"W. Liu and B. Vinter. 2015. CSR5: An Efficient Storage Format for Cross-Platform Sparse Matrix-Vector Multiplication. In ICS '15. 339--350."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_2_1_73_1","volume-title":"DASP: Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication. In SC '23","author":"Lu Yuechen","year":"2023","unstructured":"Yuechen Lu and Weifeng Liu. 2023. DASP: Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication. In SC '23. 1--14."},{"key":"e_1_3_2_1_74_1","volume-title":"AmgT: Algebraic Multigrid Solver on Tensor Cores. In SC '24","author":"Lu Yuechen","year":"2024","unstructured":"Yuechen Lu, Lijie Zeng, Tengcheng Wang, Xu Fu, Wenxuan Li, Helin Cheng, Dechuang Yang, Zhou Jin, Marc Casas, and Weifeng Liu. 2024. AmgT: Algebraic Multigrid Solver on Tensor Cores. In SC '24. 1--16."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/1837274.1837289"},{"key":"e_1_3_2_1_76_1","volume-title":"Erwin Laure, Ivy Bo Peng, and Jeffrey S. Vetter.","author":"Markidis Stefano","year":"2018","unstructured":"Stefano Markidis, Steven Wei Der Chien, Erwin Laure, Ivy Bo Peng, and Jeffrey S. Vetter. 2018. NVIDIA Tensor Core Programmability, Performance & Precision. In IPDPSW '18. 522--531."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-10549-5_35"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3561652"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2013.05.007"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2013.6670338"},{"key":"e_1_3_2_1_81_1","volume-title":"Moreira","author":"Mattson Timothy G.","year":"2017","unstructured":"Timothy G. Mattson, Carl Yang, Scott McMillan, Aydin Bulu\u00e7, and Jos\u00e9 E. Moreira. 2017. GraphBLAS C API: Ideas for future versions of the specification. In HPEC '17. 1--6."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295716"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145832"},{"key":"e_1_3_2_1_84_1","volume-title":"Balancing Computation and Communication in Distributed Sparse Matrix-Vector Multiplication. In CCGRID '23","author":"Mi Hongli","year":"2023","unstructured":"Hongli Mi, Xiangrui Yu, Xiaosong Yu, Shuangyuan Wu, and Weifeng Liu. 2023. Balancing Computation and Communication in Distributed Sparse Matrix-Vector Multiplication. In CCGRID '23. 535--544."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"crossref","unstructured":"Sushruta Mishra Chandrakanta Mahanty Shreela Dash and Brojo Kishore Mishra. 2019. Implementation of BFS-NB Hybrid Model in Intrusion Detection System. In Recent Developments in Machine Learning and Data Analytics. 167--175.","DOI":"10.1007\/978-981-13-1280-9_17"},{"key":"e_1_3_2_1_86_1","volume-title":"TileSpMV: A Tiled Algorithm for Sparse Matrix-Vector Multiplication on GPUs. In IPDPS '21","author":"Niu Yuyao","year":"2021","unstructured":"Yuyao Niu, Zhengyang Lu, Meichen Dong, Zhou Jin, Weifeng Liu, and Guangming Tan. 2021. TileSpMV: A Tiled Algorithm for Sparse Matrix-Vector Multiplication on GPUs. In IPDPS '21. 68--78."},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173180"},{"key":"e_1_3_2_1_88_1","volume-title":"IPDPS '18","author":"Pan Yuechao","unstructured":"Yuechao Pan, Roger Pearce, and John D. Owens. 2018. Scalable Breadth-First Search on a GPU Cluster. In IPDPS '18. 1090--1101."},{"key":"e_1_3_2_1_89_1","volume-title":"Multi-GPU Graph Analytics. In IPDPS '17","author":"Pan Yuechao","unstructured":"Yuechao Pan, Yangzihao Wang, Yuduo Wu, Carl Yang, and John D. Owens. 2017. Multi-GPU Graph Analytics. In IPDPS '17. 479--490."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243205"},{"key":"e_1_3_2_1_91_1","volume-title":"IPDPS '21","author":"Louis","unstructured":"Louis Pisha and \u0141ukasz Ligowski. 2021. Accelerating non-power-of-2 size Fourier transforms with GPU Tensor Cores. In IPDPS '21. 507--516."},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.70"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2019.8916413"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/3128571"},{"key":"e_1_3_2_1_95_1","volume-title":"Blelloch","author":"Shun Julian","year":"2013","unstructured":"Julian Shun and Guy E. Blelloch. 2013. Ligra: a lightweight graph processing framework for shared memory. In PPoPP'13. 135--146."},{"key":"e_1_3_2_1_96_1","volume-title":"GPNPU: Enabling Efficient Hardware-Based Direct Convolution with Multi-Precision Support in GPU Tensor Cores. In DAC '20","author":"Song Zhuoran","year":"2020","unstructured":"Zhuoran Song, Jianfei Wang, Tianjian Li, Li Jiang, Jing Ke, Xiaoyao Liang, and Naifeng Jing. 2020. GPNPU: Enabling Efficient Hardware-Based Direct Convolution with Multi-Precision Support in GPU Tensor Cores. In DAC '20. 1--6."},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3217824"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE51398.2021.9473984"},{"key":"e_1_3_2_1_99_1","volume-title":"Accelerating Sparse Deep Neural Network Inference Using GPU Tensor Cores. In HPEC '22","author":"Sun Yufei","year":"2022","unstructured":"Yufei Sun, Long Zheng, Qinggang Wang, Xiangyu Ye, Yu Huang, Pengcheng Yao, Xiaofei Liao, and Hai Jin. 2022. Accelerating Sparse Deep Neural Network Inference Using GPU Tensor Cores. In HPEC '22. 1--7."},{"key":"e_1_3_2_1_100_1","volume-title":"Md. Mostofa Ali Patwary, Subramanya Dulloor, Satya Gautam Vadlamudi, Dipankar Das, and Pradeep Dubey.","author":"Sundaram Narayanan","year":"2015","unstructured":"Narayanan Sundaram, Nadathur Rajagopalan Satish, Md. Mostofa Ali Patwary, Subramanya Dulloor, Satya Gautam Vadlamudi, Dipankar Das, and Pradeep Dubey. 2015. GraphMat: High performance graph analytics made productive. CoRR abs\/1503.07241 (2015). arXiv:1503.07241"},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.compeleceng.2021.107327"},{"key":"e_1_3_2_1_102_1","volume-title":"PASC '21","author":"Tu Jiqun","unstructured":"Jiqun Tu, M. A. Clark, Chulwoo Jung, and Robert D. Mawhinney. 2021. Solving DWF dirac equation using multi-splitting preconditioned conjugate gradient with tensor cores on NVIDIA GPUs. In PASC '21. 1--11."},{"key":"e_1_3_2_1_103_1","volume-title":"Modeling Matrix Engines for Portability and Performance. In IPDPS '22","author":"Tukanov Nicholai","year":"2022","unstructured":"Nicholai Tukanov, Rajalakshmi Srinivasaraghavan, Jos\u00e9 E. Moreira, and Tze Meng Low. 2022. Modeling Matrix Engines for Portability and Performance. In IPDPS '22. 1173--1183."},{"key":"e_1_3_2_1_104_1","volume-title":"Understanding Graph Sampling Algorithms for Social Network Analysis. In ICDCSW '11","author":"Wang Tianyi","year":"2011","unstructured":"Tianyi Wang, Yang Chen, Zengbin Zhang, Tianyin Xu, Long Jin, Pan Hui, Beixing Deng, and Xing Li. 2011. Understanding Graph Sampling Algorithms for Social Network Analysis. In ICDCSW '11. 123--128."},{"key":"e_1_3_2_1_105_1","volume-title":"PPoPP '16","author":"Wang Yangzihao","unstructured":"Yangzihao Wang, Andrew Davidson, Yuechao Pan, Yuduo Wu, Andy Riffel, and John D. Owens. 2016. Gunrock: a high-performance graph processing library on the GPU. In PPoPP '16. 1--12."},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"crossref","unstructured":"Yuke Wang Boyuan Feng and Yufei Ding. 2022. QGTC: accelerating quantized graph neural networks via GPU tensor core. In PPoPP'22. 107--119.","DOI":"10.1145\/3503221.3508408"},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.1145\/3108140"},{"key":"e_1_3_2_1_108_1","volume-title":"Improving Parallelism of Breadth First Search (BFS) Algorithm for Accelerated Performance on GPUs. In HPEC '19","author":"Wen Hao","year":"2019","unstructured":"Hao Wen and Wei Zhang. 2019. Improving Parallelism of Breadth First Search (BFS) Algorithm for Accelerated Performance on GPUs. In HPEC '19. 1--7."},{"key":"e_1_3_2_1_109_1","volume-title":"Implementing Push-Pull Efficiently in GraphBLAS. In ICPP '18","author":"Yang Carl","unstructured":"Carl Yang, Aydin Bulu\u00e7, and John D. Owens. 2018. Implementing Push-Pull Efficiently in GraphBLAS. In ICPP '18. New York, NY, USA."},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466795"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2005.4"},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-022-04934-1"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2011.08.004"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2013.31"},{"key":"e_1_3_2_1_115_1","doi-asserted-by":"publisher","DOI":"10.1137\/080733243"},{"key":"e_1_3_2_1_116_1","volume-title":"LoRAStencil: Low-Rank Adaptation of Stencil Computation on Tensor Cores. In SC '24","author":"Zhang Yiwei","year":"2024","unstructured":"Yiwei Zhang, Kun Li, Liang Yuan, Jiawen Cheng, Yunquan Zhang, Ting Cao, and Mao Yang. 2024. LoRAStencil: Low-Rank Adaptation of Stencil Computation on Tensor Cores. In SC '24. 1--17."},{"key":"e_1_3_2_1_117_1","doi-asserted-by":"publisher","DOI":"10.1145\/3276491"}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas NV USA","acronym":"PPoPP '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710859","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710859","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:15:51Z","timestamp":1755875751000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710859"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":117,"alternative-id":["10.1145\/3710848.3710859","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710859","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}