{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:16:08Z","timestamp":1769501768527,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["92373103, 62204271, 62334014"],"award-info":[{"award-number":["92373103, 62204271, 62334014"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756077","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1026-1039","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["C3ache: Towards Hierarchical Cache-Centric Computing for Sparse Matrix Multiplication on GPGPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3903-9686","authenticated-orcid":false,"given":"Xiaojie","family":"Li","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4006-8870","authenticated-orcid":false,"given":"Mingyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7455-815X","authenticated-orcid":false,"given":"Baiqing","family":"Zhong","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6366-7386","authenticated-orcid":false,"given":"Haiqiu","family":"Huang","sequence":"additional","affiliation":[{"name":"Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2387-4802","authenticated-orcid":false,"given":"Guangjie","family":"Cao","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8802-0457","authenticated-orcid":false,"given":"Zhiyi","family":"Yu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.5555\/3236002"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.21"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071074"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTR.2009.5289124"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Nathan Bell Steven Dalton and Luke\u00a0N Olson. 2012. Exposing Fine-Grained Parallelism in Algebraic Multigrid Methods. SIAM Journal on Scientific Computing 34 4 (2012) C123\u2013C152.","DOI":"10.1137\/110838844"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Nathan Bell Steven Dalton and Luke\u00a0N Olson. 2012. Exposing Fine-Grained Parallelism in Algebraic Multigrid Methods. SIAM Journal on Scientific Computing 34 4 (2012) C123\u2013C152.","DOI":"10.1137\/110838844"},{"key":"e_1_3_3_1_8_2","volume-title":"Efficient Sparse Matrix-Vector Multiplication on CUDA","author":"Bell Nathan","year":"2008","unstructured":"Nathan Bell and Michael Garland. 2008. Efficient Sparse Matrix-Vector Multiplication on CUDA. Technical Report. Nvidia Technical Report NVR-2008-004, Nvidia Corporation."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898719505"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/1250790.1250877"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0M. Chan. 2010. More Algorithms for All-Pairs Shortest Paths in Weighted Graphs. Proceedings of the Annual ACM Symposium on Theory of Computing 39 (2010) 2075\u20132089.","DOI":"10.1137\/08071990X"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. Nvidia A100 Tensor Core GPU: Performance and Innovation. IEEE Micro 41 2 (2021) 29\u201335.","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2018.00023"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Paolo D\u2019alberto and Alexandru Nicolau. 2007. R-Kleene: A High-Performance Divide-and-Conquer Algorithm for the All-Pair Shortest Path for Densely Connected Networks. Algorithmica 47 (2007) 203\u2013213.","DOI":"10.1007\/s00453-006-1224-z"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Timothy\u00a0A Davis and Yifan Hu. 2011. The University of Florida Sparse Matrix Collection. ACM Transactions on Mathematical Software (TOMS) 38 1 (2011) 1\u201325.","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_3_1_16_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-Training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00040"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614268"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"John\u00a0Russell Gilbert Steven\u00a0K. Reinhardt and Viral\u00a0B Shah. 2008. A Unified Framework for Numerical and Combinatorial Computing. Computing in Science and Engineering (2008).","DOI":"10.1109\/MCSE.2008.45"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/PDP.2008.41"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/2909437.2909442"},{"key":"e_1_3_3_1_22_2","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et\u00a0al. 2025. Deepseek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.12948 (2025)."},{"key":"e_1_3_3_1_23_2","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep Residual Learning for Image Recognition. IEEE (2016)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00017"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Satoshi Itoh Pablo Ordej\u00f3n and Richard\u00a0M Martin. 1995. Order-N Tight-Binding Molecular Dynamics on Parallel Computers. Computer physics communications 88 2-3 (1995) 173\u2013185.","DOI":"10.1016\/0010-4655(95)00031-A"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ESSCIRC55480.2022.9911450"},{"key":"e_1_3_3_1_27_2","unstructured":"Zhe Jia Marco Maggioni Benjamin Staiger and Daniele\u00a0P Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.06826 (2018)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.5555\/602770.602808"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Hyunjoon Kim Junjie Mu Chengshuo Yu Tony Tae-Hyoung Kim and Bongjin Kim. 2023. A 1-16b Reconfigurable 80Kb 7T SRAM-Based Digital Near-Memory Computing Macro for Processing Neural Networks. IEEE Transactions on Circuits and Systems I: Regular Papers 70 4 (2023) 1580\u20131590.","DOI":"10.1109\/TCSI.2022.3232648"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623790"},{"key":"e_1_3_3_1_31_2","unstructured":"Thomas\u00a0N Kipf and Max Welling. 2016. Semi-supervised Classification with Graph Convolutional Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.02907 (2016)."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.23919\/VLSICircuits52068.2021.9492476"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/1669112.1669172"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2012.6507483"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247896"},{"key":"e_1_3_3_1_36_2","volume-title":"GPU technology conference","author":"Naumov Maxim","year":"2010","unstructured":"Maxim Naumov, L Chien, Philippe Vandermersch, and Ujval Kapasi. 2010. Cusparse Library. In GPU technology conference , Vol.\u00a012."},{"key":"e_1_3_3_1_37_2","first-page":"264","volume-title":"2023 36th International Conference on VLSI Design and 2023 22nd International Conference on Embedded Systems (VLSID)","author":"Noble G","year":"2023","unstructured":"G Noble, S Nalesh, and S Kala. 2023. MOSCON: Modified Outer Product based Sparse Matrix-Matrix Multiplication Accelerator with Configurable Tiles. In 2023 36th International Conference on VLSI Design and 2023 22nd International Conference on Embedded Systems (VLSID). IEEE, 264\u2013269."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00067"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Gerald Penn. 2006. Efficient Transitive Closure of Sparse Matrices over Closed Semirings. Theoretical Computer Science 354 1 (2006) 72\u201381.","DOI":"10.1016\/j.tcs.2005.11.008"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Gerald Penn. 2006. Efficient Transitive Closure of Sparse Matrices over Closed Semirings. Theoretical Computer Science 354 1 (2006) 72\u201381.","DOI":"10.1016\/j.tcs.2005.11.008"},{"key":"e_1_3_3_1_41_2","unstructured":"Jeff Pool Abhishek Sawarkar and Jay Rodge. 2021. Accelerating Inference with Sparsity using the Nvidia Ampere Architecture and Nvidia Tensorrt. NVIDIA Developer Technical Blog https:\/\/developer. nvidia. com\/blog\/accelerating-inference-with-sparsityusing-ampere-and-tensorrt (2021)."},{"key":"e_1_3_3_1_42_2","volume-title":"Maximum Matchings in General Graphs through Randomization","author":"Rabin Michael\u00a0O","year":"1984","unstructured":"Michael\u00a0O Rabin and Vijay\u00a0V Vazirani. 1984. Maximum Matchings in General Graphs through Randomization. Center for Research in Computing Techn., Aiken Computation Laboratory, Univ."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Michael\u00a0O Rabin and Vijay\u00a0V Vazirani. 1989. Maximum Matchings in General Graphs through Randomization. Journal of Algorithms 10 4 (1989) 557\u2013567.","DOI":"10.1016\/0196-6774(89)90005-9"},{"key":"e_1_3_3_1_44_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language Models Are Unsupervised Multitask Learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00016"},{"key":"e_1_3_3_1_46_2","volume-title":"An Interactive System for Combinatorial Scientific Computing with an Emphasis on Programmer Productivity","author":"Shah Viral\u00a0B","year":"2007","unstructured":"Viral\u00a0B Shah. 2007. An Interactive System for Combinatorial Scientific Computing with an Emphasis on Programmer Productivity. University of California, Santa Barbara."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"William\u00a0Andrew Simon Yasir\u00a0Mahmood Qureshi Marco Rios Alexandre Levisse Marina Zapater and David Atienza. 2020. BLADE: An In-Cache Computing Architecture for Edge Devices. IEEE Trans. Comput. 69 9 (2020) 1349\u20131363.","DOI":"10.1109\/TC.2020.2972528"},{"key":"e_1_3_3_1_48_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. Computer Science (2014)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Nitish Srivastava Hanchen Jin Jie Liu David Albonesi and Zhiru Zhang. 2020. MatRaptor: A Sparse-Sparse Matrix Multiplication Accelerator Based on Row-Wise Product. (2020).","DOI":"10.1109\/MICRO50266.2020.00068"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063431"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480128"},{"key":"e_1_3_3_1_52_2","unstructured":"Stijn Van\u00a0Dongen. 2000. Graph Clustering by Flow Simulation. PhD thesis University of Utrecht (2000)."},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00088"},{"key":"e_1_3_3_1_54_2","first-page":"421","volume-title":"International Conference on High Performance Computing for Computational Science","author":"Yamazaki Ichitaro","year":"2010","unstructured":"Ichitaro Yamazaki and Xiaoye\u00a0S Li. 2010. On Techniques to Improve Robustness and Scalability of a Parallel Hybrid Linear Solver. In International Conference on High Performance Computing for Computational Science. Springer, 421\u2013434."},{"key":"e_1_3_3_1_55_2","first-page":"421","volume-title":"International Conference on High Performance Computing for Computational Science","author":"Yamazaki Ichitaro","year":"2010","unstructured":"Ichitaro Yamazaki and Xiaoye\u00a0S Li. 2010. On Techniques to Improve Robustness and Scalability of a Parallel Hybrid Linear Solver. In International Conference on High Performance Computing for Computational Science. Springer, 421\u2013434."},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00072"},{"key":"e_1_3_3_1_57_2","first-page":"254","volume-title":"SODA","author":"Yuster Raphael","year":"2004","unstructured":"Raphael Yuster and Uri Zwick. 2004. Detecting Short Directed Cycles using Rectangular Matrix Multiplication and Dynamic Programming.. In SODA , Vol.\u00a04. 254\u2013260."},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"crossref","unstructured":"Xiaoyu Zhang Zerun Li Rui Liu Xiaoming Chen and Yinhe Han. 2024. GAS: General-Purpose In-Memory-Computing Accelerator for Sparse Matrix Multiplication. IEEE Trans. Comput. (2024).","DOI":"10.1109\/TC.2024.3371790"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"crossref","unstructured":"Yicong Zhang Mingyu Wang Yangzhan Mai and Zhiyi Yu. 2023. TensorCache: Reconstructing Memory Architecture With SRAM-Based In-Cache Computing for Efficient Tensor Computations in GPGPUs. IEEE Transactions on Very Large Scale Integration (VLSI) Systems (2023).","DOI":"10.1109\/TVLSI.2023.3326741"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00056"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00030"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358269"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756077","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:42:34Z","timestamp":1769463754000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756077"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":61,"alternative-id":["10.1145\/3725843.3756077","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756077","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}