{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:12Z","timestamp":1755870012720,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFB3002902"],"award-info":[{"award-number":["2023YFB3002902"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China","award":["No. 62322201&U23B2020"],"award-info":[{"award-number":["No. 62322201&U23B2020"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725746","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"149-160","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CB-SpMV:A Data Aggregating and Balance Algorithm for for Cache-Friendly Block-Based SpMV on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0405-5008","authenticated-orcid":false,"given":"Xing","family":"Cong","sequence":"first","affiliation":[{"name":"Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4067-1219","authenticated-orcid":false,"given":"FuKai","family":"Sun","sequence":"additional","affiliation":[{"name":"Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3065-9035","authenticated-orcid":false,"given":"YiFan","family":"Chen","sequence":"additional","affiliation":[{"name":"Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1399-0352","authenticated-orcid":false,"given":"Chenhao","family":"Xie","sequence":"additional","affiliation":[{"name":"Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1829-2817","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, BeiJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5382-1473","authenticated-orcid":false,"given":"Depei","family":"Qian","sequence":"additional","affiliation":[{"name":"Beihang University, BeiJing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Hartwig Anzt Terry Cojean Chen Yen-Chen Jack Dongarra Goran Flegar Pratik Nayak Stanimire Tomov Yuhsiang\u00a0M Tsai and Weichung Wang. 2020. Load-balancing sparse matrix vector product kernels on gpus. ACM Transactions on Parallel Computing (TOPC) 7 1 (2020) 1\u201326.","DOI":"10.1145\/3380930"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.69"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/2597652.2597678"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/1654059.1654078"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS60453.2023.00262"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Haodong Bian Jianqiang Huang Lingbin Liu Dongqiang Huang and Xiaoying Wang. 2021. Albus: A method for efficiently processing spmv using simd and load balancing. Future Generation Computer Systems 116 (2021) 371\u2013392.","DOI":"10.1016\/j.future.2020.10.036"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Urban Bor\u0161tnik Joost VandeVondele Val\u00e9ry Weber and J\u00fcrg Hutter. 2014. Sparse matrix multiplication: The distributed block-compressed sparse row library. Parallel Comput. 40 5-6 (2014) 47\u201358.","DOI":"10.1016\/j.parco.2014.03.012"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/1583991.1584053"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2008.4536313"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Aydin Bulu\u00e7 and John\u00a0R Gilbert. 2012. Parallel sparse matrix-matrix multiplication and indexing: Implementation and experiments. SIAM Journal on Scientific Computing 34 4 (2012) C170\u2013C191.","DOI":"10.1137\/110848244"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.73"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3588195.3593002"},{"key":"e_1_3_3_1_14_2","volume-title":"CUDA programming: a developer\u2019s guide to parallel computing with GPUs","author":"Cook Shane","year":"2012","unstructured":"Shane Cook. 2012. CUDA programming: a developer\u2019s guide to parallel computing with GPUs. Newnes."},{"key":"e_1_3_3_1_15_2","unstructured":"NVIDIA Corporation. 2024. cuSPARSE: GPU-Accelerated Sparse Matrix Library. https:\/\/developer.nvidia.com\/cusparse. Version 12.4."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0A Davis and Yifan Hu. 2011. The University of Florida sparse matrix collection. ACM Transactions on Mathematical Software (TOMS) 38 1 (2011) 1\u201325.","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00071"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2016.42"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651378"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Salvatore Filippone Valeria Cardellini Davide Barbieri and Alessandro Fanfarillo. 2017. Sparse matrix-vector multiplication on GPGPUs. ACM Transactions on Mathematical Software (TOMS) 43 4 (2017) 1\u201349.","DOI":"10.1145\/3017994"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","unstructured":"Salvatore Filippone Valeria Cardellini Davide Barbieri and Alessandro Fanfarillo. 2017. Sparse Matrix-Vector Multiplication on GPGPUs. ACM Trans. Math. Softw. 43 4 Article 30 (Jan. 2017) 49\u00a0pages. 10.1145\/3017994","DOI":"10.1145\/3017994"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Jianhua Gao Weixing Ji Zhaonian Tan Yizhuo Wang and Feng Shi. 2022. Taichi: A hybrid compression format for binary sparse matrix-vector multiplication on gpu. IEEE Transactions on Parallel and Distributed Systems 33 12 (2022) 3732\u20133745.","DOI":"10.1109\/TPDS.2022.3170501"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.68"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673042"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545028"},{"key":"e_1_3_3_1_26_2","unstructured":"Kwangrae Kim and Ki-Seok Chung. 2024. CAMPuS: Concurrent Acceleration of Memory Access and Parallel Processing in Near-Memory SpMV Architecture. IEEE Access (2024)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Kornilios Kourtis Vasileios Karakasis Georgios Goumas and Nectarios Koziris. 2011. CSX: an extended compression format for spmv on shared memory systems. ACM SIGPLAN Notices 46 8 (2011) 247\u2013256.","DOI":"10.1145\/2038037.1941587"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Moritz Kreutzer Georg Hager Gerhard Wellein Holger Fehske and Alan\u00a0R Bishop. 2014. A unified sparse matrix data format for efficient general sparse matrix-vector multiplication on modern processors with wide SIMD units. SIAM Journal on Scientific Computing 36 5 (2014) C401\u2013C423.","DOI":"10.1137\/130930352"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/IA356718.2022.00009"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00022"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER52292.2023.00025"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Yun Liang Wai\u00a0Teng Tang Ruizhe Zhao Mian Lu Huynh\u00a0Phung Huynh and Rick Siow\u00a0Mong Goh. 2017. Scale-free sparse matrix-vector multiplication on many-core architectures. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 36 12 (2017) 2106\u20132119.","DOI":"10.1109\/TCAD.2017.2681072"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","first-page":"733","DOI":"10.1109\/ICPADS.2015.97","volume-title":"2015 IEEE 21st International Conference on Parallel and Distributed Systems (ICPADS)","author":"Liu Lifeng","year":"2015","unstructured":"Lifeng Liu, Meilin Liu, Chongjun Wang, and Jun Wang. 2015. LSRB-CSR: A low overhead storage format for SpMV on the GPU systems. In 2015 IEEE 21st International Conference on Parallel and Distributed Systems (ICPADS). IEEE, 733\u2013741."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751209"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607051"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404397.3404413"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.57"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid57682.2023.00056"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/237578.237624"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00016"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508431"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577434"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"James O\u2019Neil and Daniel\u00a0B Szyld. 1990. A block ordering method for sparse matrices. SIAM J. Sci. Statist. Comput. 11 5 (1990) 811\u2013823.","DOI":"10.1137\/0911048"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079079.3079086"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM60383.2024.00014"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1007\/11557654_91"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472478"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Shengen Yan Chao Li Yunquan Zhang and Huiyang Zhou. 2014. yaSpMV: Yet another SpMV framework on GPUs. Acm Sigplan Notices 49 8 (2014) 107\u2013118.","DOI":"10.1145\/2692916.2555255"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Wangdong Yang Kenli Li and Keqin Li. 2018. A parallel computing method using blocked format with optimal partitioning for SpMV on GPU. Journal of computer and system sciences 92 (2018) 152\u2013170.","DOI":"10.1016\/j.jcss.2017.09.010"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433815"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00046"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725746","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:03:47Z","timestamp":1755867827000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725746"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":50,"alternative-id":["10.1145\/3721145.3725746","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725746","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}