{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:39:28Z","timestamp":1766219968221,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFA1011704"],"award-info":[{"award-number":["2023YFA1011704"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFB0300101"],"award-info":[{"award-number":["2021YFB0300101"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754613","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"135-145","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["VES: Vectorized Sparse General Matrix-Matrix Multiplication on Multi-Core DSPs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-5746-164X","authenticated-orcid":false,"given":"Chuhe","family":"Hong","sequence":"first","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8286-6566","authenticated-orcid":false,"given":"Qinglin","family":"Wang","sequence":"additional","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2113-301X","authenticated-orcid":false,"given":"Xing","family":"Peng","sequence":"additional","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5015-4736","authenticated-orcid":false,"given":"Gencheng","family":"Liu","sequence":"additional","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7396-874X","authenticated-orcid":false,"given":"Qingyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2931-4893","authenticated-orcid":false,"given":"Xinhai","family":"Chen","sequence":"additional","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3745-7541","authenticated-orcid":false,"given":"Jie","family":"Liu","sequence":"additional","affiliation":[{"name":"Laboratory of Digitizing Software for Frontier Equipment, National University of Defense Technology, Changsha, China; National Key Laboratory of Parallel and Distributed Computing, National University of Defense Technology, Changsha, China and College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Ariful Azad Grey Ballard Aydin Buluc James Demmel Laura Grigori Oded Schwartz and Sivan Toledo. 2016. Exploiting Multiple Levels of Parallelism in Sparse Matrix-Matrix Multiplication. SIAM Journal on Matrix Analysis and Applications SIAM Journal on Matrix Analysis and Applications (Nov 2016).","DOI":"10.1137\/15M104253X"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","unstructured":"Ariful Azad Georgios\u00a0A Pavlopoulos Christos\u00a0A Ouzounis Nikos\u00a0C Kyrpides and Aydin Bulu\u00e7. 2018. HipMCL: a high-performance parallel implementation of the Markov clustering algorithm for large-scale networks. Nucleic Acids Research 46 6 (Apr 2018) e33\u2013e33. 10.1093\/nar\/gkx1313","DOI":"10.1093\/nar\/gkx1313"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","unstructured":"Nathan Bell Steven Dalton and Luke\u00a0N. Olson. 2012. Exposing Fine-Grained Parallelism in Algebraic Multigrid Methods. SIAM Journal on Scientific Computing (Jan 2012) C123\u2013C152. 10.1137\/110838844","DOI":"10.1137\/110838844"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673061"},{"key":"e_1_3_3_1_6_2","first-page":"201","volume-title":"International Conference on Algorithms and Architectures for Parallel Processing","author":"Bi Deshun","year":"2023","unstructured":"Deshun Bi, Shengguo Li, Yichen Zhang, Xiaojian Yang, and Dezun Dong. 2023. Efficiently Running SpMV on Multi-core DSPs for Banded Matrix. In International Conference on Algorithms and Architectures for Parallel Processing. Springer, 201\u2013220."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS60453.2023.00262"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"Yuedan Chen Kenli Li Wangdong Yang Guoqing Xiao Xianghui Xie and Tao Li. 2019. Performance-Aware Model for Sparse Matrix-Matrix Multiplication on the Sunway TaihuLight Supercomputer. IEEE Transactions on Parallel and Distributed Systems 30 4 (2019) 923\u2013938. 10.1109\/TPDS.2018.2871189","DOI":"10.1109\/TPDS.2018.2871189"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605611"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"Timothy\u00a0A. Davis and Yifan Hu. 2011. The university of Florida sparse matrix collection. ACM Trans. Math. Software (Nov 2011) 1\u201325. 10.1145\/2049662.2049663","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Zhaoyang Du Yijin Guan Tianchan Guan Dimin Niu Hongzhong Zheng and Yuan Xie. 2022. Accelerating CPU-Based Sparse General Matrix Multiplication With Binary Row Merging. IEEE Access 10 (2022) 79237\u201379248. 10.1109\/ACCESS.2022.3193937","DOI":"10.1109\/ACCESS.2022.3193937"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Jianbin Fang Peng Zhang Chun Huang Tao Tang Kai Lu Ruibo Wang and Zheng Wang. 2023. Programming bare-metal accelerators with heterogeneous threading models: a case study of Matrix-3000. Frontiers of Information Technology & Electronic Engineering 24 4 (2023) 509\u2013520.","DOI":"10.1631\/FITEE.2200359"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2013.6567571"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/hpec.2014.7040985"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"John\u00a0R. Gilbert Cleve Moler and Robert Schreiber. 1992. Sparse Matrices in MATLAB: Design and Implementation. SIAM J. Matrix Anal. Appl. (Jan 1992) 333\u2013356. 10.1137\/0613024","DOI":"10.1137\/0613024"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Felix Gremse Andreas Hofter Lars\u00a0Ole Schwen Fabian Kiessling and Uwe Naumann. 2015. GPU-accelerated sparse matrix-matrix multiplication by iterative row merging. SIAM Journal on Scientific Computing 37 1 (2015) C54\u2013C71.","DOI":"10.1137\/130948811"},{"key":"e_1_3_3_1_17_2","unstructured":"Zhixiang Gu Jos\u00e9E. Moreira David Edelsohn and Ariful Azad. 2020. Bandwidth-Optimized Parallel Algorithms for Sparse Matrix-Matrix Multiplication using Propagation Blocking. Proceedings of the 32nd ACM Symposium on Parallelism in Algorithms and Architectures (Feb 2020)."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"Fred\u00a0G. Gustavson. 1978. Two Fast Algorithms for Sparse Matrices: Multiplication and Permuted Transposition. ACM Trans. Math. Software (Sep 1978) 250\u2013269. 10.1145\/355791.355796","DOI":"10.1145\/355791.355796"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Charles\u00a0AR Hoare. 1962. Quicksort. The computer journal 5 1 (1962) 10\u201316.","DOI":"10.1093\/comjnl\/5.1.10"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3208040.3208062"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673054"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/sc41405.2020.00076"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Jianjiang Li Hongyaoxing Gu Jing Zhao Lin Qiao Chunye Gong and Gang Zheng. 2024. Transplantation and optimization of molecular dynamics simulation on MT-3000. Future Generation Computer Systems 153 (2024) 262\u2013275.","DOI":"10.1016\/j.future.2023.11.035"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Kenli Li Wangdong Yang and Keqin Li. 2014. Performance analysis and optimization for SpMV on GPU using probabilistic modeling. IEEE Transactions on Parallel and Distributed Systems 26 1 (2014) 196\u2013205.","DOI":"10.1109\/TPDS.2014.2308221"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ipdps.2014.47"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Weifeng Liu and Brian Vinter. 2015. A framework for general sparse matrix\u2013matrix multiplication on GPUs and heterogeneous processors. J. Parallel and Distrib. Comput. 85 (2015) 47\u201361.","DOI":"10.1016\/j.jpdc.2015.06.010"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICESC48915.2020.9155623"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Kai Lu Yaohua Wang Yang Guo Chun Huang Sheng Liu Ruibo Wang Jianbin Fang Tao Tang Zhaoyun Chen Biwei Liu et\u00a0al. 2022. MT-3000: a heterogeneous multi-zone processor for HPC. CCF Transactions on High Performance Computing 4 2 (2022) 150\u2013164.","DOI":"10.1007\/s42514-022-00095-y"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545048"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Yusuke Nagasaka Satoshi Matsuoka Ariful Azad and Ayd\u0131n Bulu\u00e7. 2019. Performance optimization modeling and analysis of sparse matrix-matrix products on multi-core and many-core processors. Parallel Comput. 90 (2019) 102545.","DOI":"10.1016\/j.parco.2019.102545"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/icpp.2017.19"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508431"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374521"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-57675-2_16"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","unstructured":"Yaohua Wang Chen Li Chang Liu Sheng Liu Yuanwu Lei Jian Zhang Yang Zhang and Yang Guo. 2021. Advancing DSP into HPC AI and beyond: challenges mechanisms and future directions. CCF Transactions on High Performance Computing (Mar 2021) 114\u2013125. 10.1007\/s42514-020-00057-2","DOI":"10.1007\/s42514-020-00057-2"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Yang Wang Jie Liu Xiaoxiong Zhu Qingyang Zhang Shengguo Li and Qinglin Wang. 2023. Improving Structured Grid-Based Sparse Matrix-Vector Multiplication and Gauss\u2013Seidel Iteration on GPDSP. Applied Sciences 13 15 (2023) 8952.","DOI":"10.3390\/app13158952"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Yang Wang Qinglin Wang Xiangdong Pei Songzhu Mei Rongchun Li and Jie Liu. 2024. High performance dilated convolutions on multi-core DSPs. CCF Transactions on High Performance Computing 6 1 (2024) 78\u201393.","DOI":"10.1007\/s42514-023-00166-8"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Rui Xia Xiao-Wei Guo Chao Li and Jie Liu. 2023. Direct numerical simulation of acoustic wave propagation in ocean waveguides using a parallel finite volume solver. Ocean Engineering 281 (2023) 114894.","DOI":"10.1016\/j.oceaneng.2023.114894"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Guoqing Xiao Kenli Li Yuedan Chen Wangquan He Albert\u00a0Y Zomaya and Tao Li. 2019. Caspmv: A customized and accelerative spmv framework for the sunway taihulight. IEEE Transactions on Parallel and Distributed Systems 32 1 (2019) 131\u2013146.","DOI":"10.1109\/TPDS.2019.2907537"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3168818"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-DSS-SmartCity-DependSys57074.2022.00035"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Shengen Yan Chao Li Yunquan Zhang and Huiyang Zhou. 2014. yaSpMV: Yet another SpMV framework on GPUs. Acm Sigplan Notices 49 8 (2014) 107\u2013118.","DOI":"10.1145\/2692916.2555255"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Carl Yang Aydin Buluc and JohnD. Owens. 2018. Design Principles for Sparse Matrix Multiplication on the GPU. Cornell University - arXiv Cornell University - arXiv (Mar 2018).","DOI":"10.1007\/978-3-319-96983-1_48"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","unstructured":"Chao Yang Shuming Chen Jian Zhang Zhao Lv and Zhi Wang. 2019. A Novel DSP Architecture for Scientific Computing and Deep Learning. IEEE Access (Jan 2019) 36413\u201336425. 10.1109\/access.2019.2905302","DOI":"10.1109\/access.2019.2905302"},{"key":"e_1_3_3_1_45_2","unstructured":"Mouzhi Yang Peng Zhang Jianbin Fang Weifeng Liu and Chun Huang. 2024. thSORT: an efficient parallel sorting algorithm on multi-core DSPs. CCF Transactions on High Performance Computing (2024) 1\u201316."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Wangdong Yang Kenli Li Zeyao Mo and Keqin Li. 2014. Performance optimization using partitioned SpMV on GPUs and multicore CPUs. IEEE Trans. Comput. 64 9 (2014) 2623\u20132636.","DOI":"10.1109\/TC.2014.2366731"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER51413.2022.00055"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00090"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673062"}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754613","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:56Z","timestamp":1766219696000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754613"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":48,"alternative-id":["10.1145\/3754598.3754613","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754613","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}