{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T01:11:00Z","timestamp":1771549860013,"version":"3.50.1"},"reference-count":40,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100005311","name":"China Southern Power Grid Co Ltd","doi-asserted-by":"publisher","award":["ZBKJXM20232483"],"award-info":[{"award-number":["ZBKJXM20232483"]}],"id":[{"id":"10.13039\/501100005311","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100028466","name":"Power","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100028466","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Guangdong Provincial Natural Science Foundation","doi-asserted-by":"publisher","award":["2024A1515010204"],"award-info":[{"award-number":["2024A1515010204"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Systems Architecture"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.sysarc.2026.103728","type":"journal-article","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T17:22:13Z","timestamp":1770657733000},"page":"103728","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["An architecture-adaptive optimization strategy for high-performance SYMV on a heterogeneous AI accelerator"],"prefix":"10.1016","volume":"175","author":[{"given":"Hao","family":"Jiang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6372-7088","authenticated-orcid":false,"given":"Lu","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Zhihong","family":"Liang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.sysarc.2026.103728_b1","series-title":"IEEE High Performance Extreme Computing Conference","first-page":"1","article-title":"Ai accelerator survey and trends","volume":"vol. 2021","author":"Reuther","year":"2021"},{"key":"10.1016\/j.sysarc.2026.103728_b2","series-title":"IEEE High Performance Extreme Computing Conference","first-page":"1","article-title":"Survey of machine learning accelerators","volume":"vol. 2020","author":"Reuther","year":"2020"},{"issue":"3","key":"10.1016\/j.sysarc.2026.103728_b3","doi-asserted-by":"crossref","first-page":"264","DOI":"10.1016\/j.eng.2020.01.007","article-title":"A survey of accelerator architectures for deep neural networks","volume":"6","author":"Chen","year":"2020","journal-title":"Engineering"},{"issue":"02","key":"10.1016\/j.sysarc.2026.103728_b4","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1109\/MC.2019.2954056","article-title":"Accelerators for artificial intelligence and high-performance computing","volume":"53","author":"Milojicic","year":"2020","journal-title":"Computer"},{"issue":"7","key":"10.1016\/j.sysarc.2026.103728_b5","doi-asserted-by":"crossref","first-page":"48","DOI":"10.1145\/3361682","article-title":"Domain-specific hardware accelerators","volume":"63","author":"Dally","year":"2020","journal-title":"Commun. ACM"},{"key":"10.1016\/j.sysarc.2026.103728_b6","first-page":"336","article-title":"Mlperf training benchmark","volume":"2","author":"Mattson","year":"2020","journal-title":"Proc. Mach. Learn. Syst."},{"issue":"2","key":"10.1016\/j.sysarc.2026.103728_b7","doi-asserted-by":"crossref","first-page":"285","DOI":"10.4208\/cicp.110113.010813a","article-title":"A survey on parallel computing and its applications in data-parallel problems using gpu architectures","volume":"15","author":"Navarro","year":"2014","journal-title":"Commun. Comput. Phys."},{"issue":"12","key":"10.1016\/j.sysarc.2026.103728_b8","doi-asserted-by":"crossref","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","article-title":"Efficient processing of deep neural networks: A tutorial and survey","volume":"105","author":"Sze","year":"2017","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.sysarc.2026.103728_b9","series-title":"57th International Scientific Conference on Information, Communication and Energy Systems and Technologies","first-page":"1","article-title":"A survey of three types of processing units","volume":"vol. 2022","author":"Nikoli\u0107","year":"2022"},{"issue":"11","key":"10.1016\/j.sysarc.2026.103728_b10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3729215","article-title":"A survey on deep learning hardware accelerators for heterogeneous hpc platforms","volume":"57","author":"Silvano","year":"2025","journal-title":"ACM Comput. Surv."},{"issue":"1","key":"10.1016\/j.sysarc.2026.103728_b11","doi-asserted-by":"crossref","first-page":"51","DOI":"10.5753\/jbcs.2023.2219","article-title":"Challenges in high-performance computing","volume":"29","author":"Navaux","year":"2023","journal-title":"J. Braz. Comput. Soc."},{"issue":"1","key":"10.1016\/j.sysarc.2026.103728_b12","article-title":"Survey of methodologies, approaches, and challenges in parallel programming using high-performance computing systems","volume":"2020","author":"Czarnul","year":"2020","journal-title":"Sci. Program."},{"key":"10.1016\/j.sysarc.2026.103728_b13","unstructured":"Z. Jia, M. Maggioni, B. Staiger, D. Scarpino, G. Alonso, Dissecting the nvidia turing t4 gpu via microbenchmarking, in: Proceedings of the 10th ACM\/SPEC International Conference on Performance Engineering, 2019, pp. 205\u2013216."},{"key":"10.1016\/j.sysarc.2026.103728_b14","series-title":"IEEE Hot Chips 31 Symposium","first-page":"1","article-title":"Davinci","volume":"vol. 2019","author":"Liao","year":"2019"},{"issue":"1","key":"10.1016\/j.sysarc.2026.103728_b15","article-title":"A survey on algorithm and architecture for ai accelerators","volume":"42","author":"Chen","year":"2021","journal-title":"J. Semicond."},{"issue":"2","key":"10.1016\/j.sysarc.2026.103728_b16","doi-asserted-by":"crossref","first-page":"519","DOI":"10.1109\/TPDS.2022.3225230","article-title":"A comprehensive performance model of sparse matrix\u2013vector multiplication to guide kernel optimization","volume":"34","author":"Xia","year":"2022","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"10","key":"10.1016\/j.sysarc.2026.103728_b17","doi-asserted-by":"crossref","first-page":"1675","DOI":"10.3390\/electronics9101675","article-title":"Performance analysis of sparse matrix\u2013vector multiplication (spmv) on graphics processing units (gpus)","volume":"9","author":"AlAhmadi","year":"2020","journal-title":"Electronics"},{"issue":"3","key":"10.1016\/j.sysarc.2026.103728_b18","doi-asserted-by":"crossref","first-page":"855","DOI":"10.1007\/s11390-024-3673-8","article-title":"Optimization of generalized eigensolver for dense symmetric matrices on amd gpu","volume":"40","author":"Zhang","year":"2025","journal-title":"J. Comput. Sci. Tech."},{"issue":"4","key":"10.1016\/j.sysarc.2026.103728_b19","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1145\/1498765.1498785","article-title":"Roofline: an insightful visual performance model for multicore architectures","volume":"52","author":"Williams","year":"2009","journal-title":"Commun. ACM"},{"issue":"3","key":"10.1016\/j.sysarc.2026.103728_b20","doi-asserted-by":"crossref","first-page":"308","DOI":"10.1145\/355841.355847","article-title":"Basic linear algebra subprograms for fortran usage","volume":"5","author":"Lawson","year":"1979","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"key":"10.1016\/j.sysarc.2026.103728_b21","doi-asserted-by":"crossref","unstructured":"L. Wang, W. Wu, Z. Xu, J. Xiao, Y. Yang, Blasx: A high performance level-3 blas library for heterogeneous multi-gpu computing, in: Proceedings of the 2016 International Conference on Supercomputing, 2016, 2016, pp. 1\u201311.","DOI":"10.1145\/2925426.2926256"},{"issue":"2","key":"10.1016\/j.sysarc.2026.103728_b22","first-page":"101","article-title":"Minimizing development and maintenance costs in supporting persistently optimized blas","volume":"35","author":"Whaley","year":"2005","journal-title":"Softw.: Pr. Exp."},{"issue":"3","key":"10.1016\/j.sysarc.2026.103728_b23","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2764454","article-title":"Blis: A framework for rapidly instantiating blas functionality","volume":"41","author":"Van Zee","year":"2015","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"key":"10.1016\/j.sysarc.2026.103728_b24","series-title":"IEEE International Parallel and Distributed Processing Symposium","first-page":"976","article-title":"Machine-learning-driven runtime optimization of blas level 3 on modern multi-core systems","volume":"vol. 2024","author":"Xia","year":"2024"},{"issue":"1\u20132","key":"10.1016\/j.sysarc.2026.103728_b25","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1016\/S0167-8191(00)00087-9","article-title":"Automated empirical optimizations of software and the atlas project","volume":"27","author":"Whaley","year":"2001","journal-title":"Parallel Comput."},{"issue":"3","key":"10.1016\/j.sysarc.2026.103728_b26","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1356052.1356053","article-title":"Anatomy of high-performance matrix multiplication","volume":"34","author":"Goto","year":"2008","journal-title":"ACM Trans. Math. Softw. (TOMS)"},{"issue":"8","key":"10.1016\/j.sysarc.2026.103728_b27","doi-asserted-by":"crossref","first-page":"1183","DOI":"10.1002\/cpe.2916","article-title":"Auto-tuning of level 1 and level 2 blas for gpus","volume":"25","author":"S\u00f8rensen","year":"2013","journal-title":"Concurr. Comput.: Pr. Exp."},{"key":"10.1016\/j.sysarc.2026.103728_b28","series-title":"Blas for gpus","author":"Nath","year":"2010"},{"key":"10.1016\/j.sysarc.2026.103728_b29","series-title":"A systematic literature survey of sparse matrix\u2013vector multiplication","author":"Gao","year":"2024"},{"issue":"11","key":"10.1016\/j.sysarc.2026.103728_b30","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3570638","article-title":"Optimization techniques for gpu programming","volume":"55","author":"Hijma","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.sysarc.2026.103728_b31","first-page":"1","article-title":"Memory hierarchy optimization strategies for highperformance computing architectures","author":"Vaithianathan","year":"2025","journal-title":"Int. J. Emerg. Trends & Technol. Comput. Sci."},{"key":"10.1016\/j.sysarc.2026.103728_b32","series-title":"International Conference on High Performance Computing for Computational Science","first-page":"72","article-title":"Optimizing memory-bound symv kernel on gpu hardware accelerators","author":"Abdelfattah","year":"2012"},{"issue":"2","key":"10.1016\/j.sysarc.2026.103728_b33","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1109\/MM.2008.31","article-title":"Nvidia tesla: A unified graphics and computing architecture","volume":"28","author":"Lindholm","year":"2008","journal-title":"IEEE Micro"},{"key":"10.1016\/j.sysarc.2026.103728_b34","series-title":"Symmetry discovered: Concepts and applications in nature and science","author":"Rosen","year":"2012"},{"key":"10.1016\/j.sysarc.2026.103728_b35","doi-asserted-by":"crossref","unstructured":"N. Bell, M. Garland, Implementing sparse matrix\u2013vector multiplication on throughput-oriented processors, in: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis, 2009, pp. 1\u201311.","DOI":"10.1145\/1654059.1654078"},{"key":"10.1016\/j.sysarc.2026.103728_b36","doi-asserted-by":"crossref","unstructured":"N.P. Jouppi, C. Young, N. Patil, D. Patterson, G. Agrawal, R. Bajwa, S. Bates, S. Bhatia, N. Boden, A. Borchers, et al., In-datacenter performance analysis of a tensor processing unit, in: Proceedings of the 44th Annual International Symposium on Computer Architecture, 2017, pp. 1\u201312.","DOI":"10.1145\/3079856.3080246"},{"key":"10.1016\/j.sysarc.2026.103728_b37","unstructured":"NVIDIA, NVIDIA A100 Tensor Core GPU Architecture, Whitepaper, available at https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf, 2020."},{"key":"10.1016\/j.sysarc.2026.103728_b38","unstructured":"Advanced Micro Devices, AMD Instinct MI200 Series Accelerators, Product Brief, available at https:\/\/www.amd.com\/en\/products\/accelerators\/instinct\/mi200\/mi210.html, 2021."},{"key":"10.1016\/j.sysarc.2026.103728_b39","series-title":"IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems","first-page":"7","article-title":"An instruction roofline model for gpus","volume":"vol. 2019","author":"Ding","year":"2019"},{"issue":"1","key":"10.1016\/j.sysarc.2026.103728_b40","doi-asserted-by":"crossref","first-page":"21","DOI":"10.1109\/L-CA.2013.6","article-title":"Cache-aware roofline model: Upgrading the loft","volume":"13","author":"Ilic","year":"2013","journal-title":"IEEE Comput. Archit. Lett."}],"container-title":["Journal of Systems Architecture"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1383762126000469?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1383762126000469?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T00:22:26Z","timestamp":1771546946000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1383762126000469"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":40,"alternative-id":["S1383762126000469"],"URL":"https:\/\/doi.org\/10.1016\/j.sysarc.2026.103728","relation":{},"ISSN":["1383-7621"],"issn-type":[{"value":"1383-7621","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"An architecture-adaptive optimization strategy for high-performance SYMV on a heterogeneous AI accelerator","name":"articletitle","label":"Article Title"},{"value":"Journal of Systems Architecture","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.sysarc.2026.103728","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103728"}}