{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:29:36Z","timestamp":1766068176688,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":31,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819615506"},{"type":"electronic","value":"9789819615513"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-1551-3_7","type":"book-chapter","created":{"date-parts":[[2025,2,16]],"date-time":"2025-02-16T09:09:29Z","timestamp":1739696969000},"page":"78-97","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["LSSM-SpMM: A Long-Row Splitting and\u00a0Short-Row Merging Approach for\u00a0Parallel SpMM on\u00a0PEZY-SC3s"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6311-3385","authenticated-orcid":false,"given":"Ligang","family":"Cao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8286-6566","authenticated-orcid":false,"given":"Qinglin","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5688-6506","authenticated-orcid":false,"given":"Shun","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Xia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weihao","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,17]]},"reference":[{"key":"7_CR1","unstructured":"Anzt, H., Tomov, S., Dongarra, J.J.: Accelerating the LOBPCG method on GPUs using a blocked sparse matrix vector product. In: SpringSim (HPS), pp. 75\u201382 (2015)"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Chen, Z., Qu, Z., Liu, L., Ding, Y., Xie, Y.: Efficient tensor core-based GPU kernels for structured sparsity under reduced precision. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201314 (2021)","DOI":"10.1145\/3458817.3476182"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Duff, I.S., Erisman, A.M., Reid, J.K.: Direct Methods for Sparse Matrices. Oxford University Press (2017)","DOI":"10.1093\/acprof:oso\/9780198508380.001.0001"},{"key":"7_CR4","unstructured":"Fout, A., Byrd, J., Shariat, B., Ben-Hur, A.: Protein interface prediction using graph convolutional networks. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Gale, T., Zaharia, M., Young, C., Elsen, E.: Sparse GPU kernels for deep learning. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201314. IEEE (2020)","DOI":"10.1109\/SC41405.2020.00021"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Guo, J., Liu, J., Wang, Q., Zhu, X.: Optimizing CSR-based SpMV on a new MIMD architecture Pezy-SC3s. In: International Conference on Algorithms and Architectures for Parallel Processing, pp. 22\u201339. Springer, Cham (2023)","DOI":"10.1007\/978-981-97-0801-7_2"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Guo, M., et al.: BS-spmm: accelerate sparse matrix-matrix multiplication by balanced split strategy on the GPU. In: IEEE INFOCOM 2023-IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS), pp.\u00a01\u20136. IEEE (2023)","DOI":"10.1109\/INFOCOMWKSHPS57453.2023.10226061"},{"key":"7_CR8","unstructured":"Hamilton, W., Ying, Z., Leskovec, J.: Inductive representation learning on large graphs. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Hong, C., et al.: Efficient sparse-matrix multi-vector product on GPUs. In: Proceedings of the 27th International Symposium on High-Performance Parallel and Distributed Computing, pp. 66\u201379 (2018)","DOI":"10.1145\/3208040.3208062"},{"key":"7_CR10","series-title":"Mechanisms and Machine Science","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1007\/978-3-030-27053-7_60","volume-title":"Computational and Experimental Simulations in Engineering","author":"N Hosono","year":"2020","unstructured":"Hosono, N., Furuichi, M.: Implementation of SPH and DEM for a PEZY-SC heterogeneous many-core system. In: Okada, H., Atluri, S.N. (eds.) ICCES 2019. MMS, vol. 75, pp. 709\u2013715. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-27053-7_60"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Featgraph: a flexible and efficient backend for graph neural network systems. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201313. IEEE (2020)","DOI":"10.1109\/SC41405.2020.00075"},{"key":"7_CR12","unstructured":"Jia, Z., Maggioni, M., Staiger, B., Scarpazza, D.P.: Dissecting the nvidia volta GPU architecture via microbenchmarking. arXiv preprint arXiv:1804.06826 (2018)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, P., Hong, C., Agrawal, G.: A novel data transformation and execution strategy for accelerating sparse matrix multiplication on GPUs. In: Proceedings of the 25th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 376\u2013388 (2020)","DOI":"10.1145\/3332466.3374546"},{"key":"7_CR14","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)"},{"key":"7_CR15","unstructured":"Lenadora, D., Sathia, V., Gerogiannis, G., Yesil, S., Torrellas, J., Mendis, C.: Input-sensitive dense-sparse primitive compositions for gnn acceleration. arXiv preprint arXiv:2306.15155 (2023)"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Li, S., Osawa, K., Hoefler, T.: Efficient quantized sparse matrix operations on tensor cores. In: SC22: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315. IEEE (2022)","DOI":"10.1109\/SC41404.2022.00042"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Matsumoto, K., Nakasato, N., Hishinuma, T.: Effectiveness of performance tuning techniques for general matrix multiplication on the pezy-sc2. In: Proceedings of the 10th International Symposium on Highly-Efficient Accelerators and Reconfigurable Technologies, pp.\u00a01\u20136 (2019)","DOI":"10.1145\/3337801.3337817"},{"key":"7_CR18","unstructured":"Naumov, M., Chien, L., Vandermersch, P., Kapasi, U.: Cusparse library. In: GPU Technology Conference (2010)"},{"issue":"7","key":"7_CR19","doi-asserted-by":"publisher","first-page":"968","DOI":"10.1093\/comjnl\/bxt038","volume":"57","author":"G Ortega","year":"2014","unstructured":"Ortega, G., V\u00e1zquez, F., Garc\u00eda, I., Garz\u00f3n, E.M.: Fastspmm: an efficient library for sparse matrix matrix product on GPUs. Comput. J. 57(7), 968\u2013979 (2014)","journal-title":"Comput. J."},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Pang, M., Fei, X., Qu, P., Zhang, Y., Li, Z.: A row decomposition-based approach for sparse matrix multiplication on GPUs. In: Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, pp. 377\u2013389 (2024)","DOI":"10.1145\/3627535.3638470"},{"key":"7_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"593","DOI":"10.1007\/978-3-319-93417-4_38","volume-title":"The Semantic Web","author":"M Schlichtkrull","year":"2018","unstructured":"Schlichtkrull, M., Kipf, T.N., Bloem, P., van\u00a0den Berg, R., Titov, I., Welling, M.: Modeling relational data with graph convolutional networks. In: Gangemi, A., et al. (eds.) ESWC 2018. LNCS, vol. 10843, pp. 593\u2013607. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-93417-4_38"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Steinberger, M., Zayer, R., Seidel, H.P.: Globally homogeneous, locally adaptive sparse matrix-vector multiplication on the GPU. In: Proceedings of the International Conference on Supercomputing, pp. 1\u201311 (2017)","DOI":"10.1145\/3079079.3079086"},{"key":"7_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"807","DOI":"10.1007\/11557654_91","volume-title":"High Performance Computing and Communications","author":"RW Vuduc","year":"2005","unstructured":"Vuduc, R.W., Moon, H.-J.: Fast sparse matrix-vector multiplication by exploiting variable block structure. In: Yang, L.T., Rana, O.F., Di Martino, B., Dongarra, J. (eds.) HPCC 2005. LNCS, vol. 3726, pp. 807\u2013816. Springer, Heidelberg (2005). https:\/\/doi.org\/10.1007\/11557654_91"},{"key":"7_CR24","unstructured":"Wang, M., et al.: Deep graph library: a graph-centric, highly-performant package for graph neural networks. arXiv preprint arXiv:1909.01315 (2019)"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Wang, Z.: Sparsert: accelerating unstructured sparsity on GPUs for deep learning inference. In: Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques, pp. 31\u201342 (2020)","DOI":"10.1145\/3410463.3414654"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Wilkinson, L., Cheshmi, K., Dehnavi, M.M.: Register tiling for unstructured sparsity in neural network inference. Proc. ACM Program. Lang. 7(PLDI), 1995\u20132020 (2023)","DOI":"10.1145\/3591302"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Williams, S., Oliker, L., Vuduc, R., Shalf, J., Yelick, K., Demmel, J.: Optimization of sparse matrix-vector multiplication on emerging multicore platforms. In: Proceedings of the 2007 ACM\/IEEE Conference on Supercomputing, pp. 1\u201312 (2007)","DOI":"10.1145\/1362622.1362674"},{"key":"7_CR28","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/j.jpdc.2016.12.023","volume":"104","author":"W Yang","year":"2017","unstructured":"Yang, W., Li, K., Li, K.: A hybrid computing method of SPMV on CPU-GPU heterogeneous computing systems. J. Parallel Distrib. Comput. 104, 49\u201360 (2017)","journal-title":"J. Parallel Distrib. Comput."},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Yesil, S., Moreira, J.E., Torrellas, J.: Dense dynamic blocks: optimizing SPMM for processors with vector and matrix units using machine learning techniques. In: Proceedings of the 36th ACM International Conference on Supercomputing, pp. 1\u201314 (2022)","DOI":"10.1145\/3524059.3532369"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Ying, R., He, R., Chen, K., Eksombatchai, P., Hamilton, W.L., Leskovec, J.: Graph convolutional neural networks for web-scale recommender systems. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 974\u2013983 (2018)","DOI":"10.1145\/3219819.3219890"},{"key":"7_CR31","doi-asserted-by":"crossref","unstructured":"Yoshifuji, N., Sakamoto, R., Nitadori, K., Makino, J.: Implementation and evaluation of data-compression algorithms for irregular-grid iterative methods on the pezy-sc processor. In: 2016 6th Workshop on Irregular Applications: Architecture and Algorithms (IA3), pp. 58\u201361. IEEE (2016)","DOI":"10.1109\/IA3.2016.015"}],"container-title":["Lecture Notes in Computer Science","Algorithms and Architectures for Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-1551-3_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,16]],"date-time":"2025-02-16T09:09:48Z","timestamp":1739696988000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-1551-3_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819615506","9789819615513"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-1551-3_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"17 February 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICA3PP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Algorithms and Architectures for Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Macau","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ica3pp2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ica3pp2024.scimeeting.cn\/en\/web\/index\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}