{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:19:24Z","timestamp":1768029564037,"version":"3.49.0"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"the National Key R&D Program of China","award":["2017YFB0202003"],"award-info":[{"award-number":["2017YFB0202003"]}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61602501"],"award-info":[{"award-number":["61602501"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["11502296"],"award-info":[{"award-number":["11502296"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61772542"],"award-info":[{"award-number":["61772542"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61561146395"],"award-info":[{"award-number":["61561146395"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61872294"],"award-info":[{"award-number":["61872294"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the Open Research Program of China State Key Laboratory of Aerodynamics","award":["SKLA20160104"],"award-info":[{"award-number":["SKLA20160104"]}]},{"DOI":"10.13039\/501100000266","name":"the UK Engineering and Physical Sciences Research Council","doi-asserted-by":"crossref","award":["EP\/M01567X\/1 (SANDeRs)"],"award-info":[{"award-number":["EP\/M01567X\/1 (SANDeRs)"]}],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100000266","name":"the UK Engineering and Physical Sciences Research Council","doi-asserted-by":"crossref","award":["EP\/M015793\/1 (DIVIDEND)"],"award-info":[{"award-number":["EP\/M015793\/1 (DIVIDEND)"]}],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the Royal Society International Collaboration Grant","award":["IE161012"],"award-info":[{"award-number":["IE161012"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2019,6]]},"DOI":"10.1007\/s10766-018-00625-8","type":"journal-article","created":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T03:17:23Z","timestamp":1546312643000},"page":"418-432","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Optimizing Sparse Matrix\u2013Vector Multiplications on an ARMv8-based Many-Core Architecture"],"prefix":"10.1007","volume":"47","author":[{"given":"Donglin","family":"Chen","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-4869","authenticated-orcid":false,"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Shizhao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Chuanfu","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,1,1]]},"reference":[{"key":"625_CR1","unstructured":"FT-2000. Phytium Technology Co. Ltd. (2017). http:\/\/www.phytium.com.cn\/Product\/detail?language=1&product_id=7"},{"key":"625_CR2","doi-asserted-by":"crossref","unstructured":"Bell, N., Garland, M.: Implementing sparse matrix-vector multiplication on throughput-oriented processors. In: SC (2009)","DOI":"10.1145\/1654059.1654078"},{"issue":"12","key":"625_CR3","doi-asserted-by":"publisher","first-page":"3279","DOI":"10.1093\/comjnl\/bxv022","volume":"58","author":"Y Che","year":"2015","unstructured":"Che, Y., Xu, C., Fang, J., Wang, Y., Wang, Z.: Realistic performance characterization of CFD applications on intel many integrated core architecture. Comput. J. 58(12), 3279\u20133294 (2015)","journal-title":"Comput. J."},{"key":"625_CR4","doi-asserted-by":"crossref","unstructured":"Chen, J., Fang, J., Liu, W., Tang, T., Chen, X., Yang, C.: Efficient and portable ALS matrix factorization for recommender systems. In: IPDPS (2017)","DOI":"10.1109\/IPDPSW.2017.91"},{"key":"625_CR5","doi-asserted-by":"publisher","unstructured":"Chen, J., Fang, J., Liu, W., Tang, T., Yang, C.: clmf: A fine-grained and portable alternating least squares algorithm for parallel matrix factorization. FGCS (2018a). https:\/\/doi.org\/10.1016\/j.future.2018.04.071","DOI":"10.1016\/j.future.2018.04.071"},{"key":"625_CR6","doi-asserted-by":"crossref","unstructured":"Chen, S., Fang, J., Chen, D., Xu, C., Wang, Z.: Adaptive optimization of sparse matrix-vector multiplication on emerging many-core architectures. In: HPCC \u201918 (2018b)","DOI":"10.1109\/HPCC\/SmartCity\/DSS.2018.00116"},{"key":"625_CR7","doi-asserted-by":"crossref","unstructured":"Cummins, C., et\u00a0al.: End-to-end deep learning of optimization heuristics. In: PACT \u201917 (2017)","DOI":"10.1109\/PACT.2017.24"},{"key":"625_CR8","doi-asserted-by":"crossref","unstructured":"Davis, T.A., Hu, Y.: The university of florida sparse matrix collection. ACM Trans. Math. Softw. 38(1), 1:1\u20131:25 (2011)","DOI":"10.1145\/2049662.2049663"},{"key":"625_CR9","doi-asserted-by":"crossref","unstructured":"Emani, M.K., et\u00a0al.: Smart, adaptive mapping of parallelism in the presence of external workload. In: CGO \u201913 (2013)","DOI":"10.1109\/CGO.2013.6495010"},{"key":"625_CR10","doi-asserted-by":"crossref","unstructured":"Grewe, D., et\u00a0al.: A workload-aware mapping approach for data-parallel programs. In: HiPEAC \u201911 (2011)","DOI":"10.1145\/1944862.1944881"},{"key":"625_CR11","doi-asserted-by":"crossref","unstructured":"Grewe, D. et\u00a0al.: Opencl task partitioning in the presence of gpu contention. In: LCPC \u201913 (2013a)","DOI":"10.1007\/978-3-319-09967-5_5"},{"key":"625_CR12","doi-asserted-by":"crossref","unstructured":"Grewe, D. et\u00a0al.: Portable mapping of data parallel programs to opencl for heterogeneous systems. In: CGO \u201913 (2013b)","DOI":"10.1109\/CGO.2013.6494993"},{"key":"625_CR13","unstructured":"Ho, T.K.: Random decision forests. In: ICDAR, pp. 278\u2013282 (1995)"},{"issue":"092010","key":"625_CR14","first-page":"1","volume":"664","author":"C Hollowell","year":"2015","unstructured":"Hollowell, C., et al.: The effect of numa tunings on cpu performance. J. Phys. Conf. Ser. 664(092010), 1\u20137 (2015)","journal-title":"J. Phys. Conf. Ser."},{"key":"625_CR15","doi-asserted-by":"crossref","unstructured":"Im, E., Yelick, K.A., Vuduc, R.W.: Sparsity: Optimization framework for sparse matrix kernels. IJHPCA (2004)","DOI":"10.1177\/1094342004041296"},{"key":"625_CR16","doi-asserted-by":"crossref","unstructured":"Kincaid, D. et\u00a0al.: Itpackv 2d user\u2019s guide. Tech. rep., Center for Numerical Analysis, Texas Univ., Austin, TX (USA) (1989)","DOI":"10.2172\/7093021"},{"key":"625_CR17","doi-asserted-by":"publisher","unstructured":"Kreutzer, M., Hager, G., Wellein, G., Fehske, H., Bishop, A.R.: A unified sparse matrix data format for efficient general sparse matrix-vector multiplication on modern processors with wide SIMD units. SIAM J. Sci. Comput. 36(5) (2014). https:\/\/doi.org\/10.1137\/130930352","DOI":"10.1137\/130930352"},{"key":"625_CR18","doi-asserted-by":"crossref","unstructured":"Laurenzano, M.A., Tiwari, A., Cauble-Chantrenne, A., Jundt, A., Jr WAW, Campbell, R.L., Carrington, L.: Characterization and bottleneck analysis of a 64-bit armv8 platform. In: ISPASS (2016)","DOI":"10.1109\/ISPASS.2016.7482072"},{"key":"625_CR19","doi-asserted-by":"crossref","unstructured":"Li, A., Liu, W., Kristensen, M.R.B., Vinter, B., Wang, H., Hou, K., Marquez, A., Song, S.L.: Exploring and analyzing the real impact of modern on-package memory on HPC scientific kernels. In: SC (2017)","DOI":"10.1145\/3126908.3126931"},{"key":"625_CR20","doi-asserted-by":"crossref","unstructured":"Li, J., Tan, G., Chen, M., Sun, N.: SMAT: an input adaptive auto-tuner for sparse matrix-vector multiplication. In: PLDI (2013)","DOI":"10.1145\/2491956.2462181"},{"key":"625_CR21","doi-asserted-by":"crossref","unstructured":"Liu, J., He, X., Liu, W., Tan, G.: Register-based implementation of the sparse general matrix-matrix multiplication on gpus. In: PPoPP (2018)","DOI":"10.1145\/3178487.3178529"},{"key":"625_CR22","unstructured":"Liu, W.: Parallel and scalable sparse basic linear algebra subprograms. PhD thesis, University of Copenhagen (2015)"},{"key":"625_CR23","doi-asserted-by":"crossref","unstructured":"Liu, W., Vinter, B.: CSR5: an efficient storage format for cross-platform sparse matrix-vector multiplication. In: ICS (2015a)","DOI":"10.1145\/2751205.2751209"},{"key":"625_CR24","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1016\/j.parco.2015.04.004","volume":"49","author":"W Liu","year":"2015","unstructured":"Liu, W., Vinter, B.: Speculative segmented sum for sparse matrix\u2013vector multiplication on heterogeneous processors. Parallel Comput. 49, 179\u2013193 (2015b)","journal-title":"Parallel Comput."},{"key":"625_CR25","doi-asserted-by":"crossref","unstructured":"Liu, X., Smelyanskiy, M., Chow, E., Dubey, P.: Efficient sparse matrix\u2013vector multiplication on x86-based many-core processors. In: ICS (2013)","DOI":"10.1145\/2464996.2465013"},{"key":"625_CR26","doi-asserted-by":"crossref","unstructured":"Maggioni, M., Berger-Wolf, T.Y.: An architecture-aware technique for optimizing sparse matrix-vector multiplication on GPUS. In: ICCS (2013)","DOI":"10.1016\/j.procs.2013.05.196"},{"issue":"2","key":"625_CR27","first-page":"225","volume":"18","author":"JM Mellor-Crummey","year":"2004","unstructured":"Mellor-Crummey, J.M., Garvin, J.: Optimizing sparse matrix-vector product computations using unroll and jam. IJHPCA 18(2), 225\u2013236 (2004)","journal-title":"IJHPCA"},{"key":"625_CR28","doi-asserted-by":"crossref","unstructured":"Monakov, A., Lokhmotov, A., Avetisyan, A.: Automatically tuning sparse matrix\u2013vector multiplication for GPU architectures. In: HIPEAC (2010)","DOI":"10.1007\/978-3-642-11515-8_10"},{"key":"625_CR29","doi-asserted-by":"crossref","unstructured":"Ogilvie, W.F., et\u00a0al.: Fast automatic heuristic construction using active learning. In: LCPC \u201914 (2014)","DOI":"10.1007\/978-3-319-17473-0_10"},{"key":"625_CR30","doi-asserted-by":"crossref","unstructured":"Ogilvie, W.F., et\u00a0al.: Minimizing the cost of iterative compilation with active learning. In: CGO \u201917 (2017)","DOI":"10.1109\/CGO.2017.7863744"},{"key":"625_CR31","unstructured":"Pedregosa, F., et\u00a0al.: Scikit-learn: Machine learning in Python. Journal of Machine Learning Research (2011)"},{"key":"625_CR32","doi-asserted-by":"crossref","unstructured":"Pinar, A., Heath, M.T.: Improving performance of sparse matrix-vector multiplication. In: SC (1999)","DOI":"10.1145\/331532.331562"},{"key":"625_CR33","doi-asserted-by":"crossref","unstructured":"Ren, J. et\u00a0al.: Optimise web browsing on heterogeneous mobile platforms: a machine learning based approach. In: INFOCOM \u201917 (2017)","DOI":"10.1109\/INFOCOM.2017.8057087"},{"key":"625_CR34","doi-asserted-by":"crossref","unstructured":"Ren, J., et\u00a0al.: Adaptive web browsing on mobile heterogeneous multi-cores. IEEE Comput. Architect. Lett. (2018)","DOI":"10.1109\/LCA.2018.2869814"},{"key":"625_CR35","doi-asserted-by":"crossref","unstructured":"Sedaghati, N., Mu, T., Pouchet, L., Parthasarathy, S., Sadayappan, P.: Automatic selection of sparse matrix representation on gpus. In: ICS (2015)","DOI":"10.1145\/2751205.2751244"},{"key":"625_CR36","doi-asserted-by":"crossref","unstructured":"Stephens, N.: Armv8-a next-generation vector architecture for HPC. In: 2016 IEEE Hot Chips 28 Symposium (HCS), pp. 1\u201331 (2016)","DOI":"10.1109\/HOTCHIPS.2016.7936203"},{"key":"625_CR37","doi-asserted-by":"crossref","unstructured":"Taylor, B., et\u00a0al.: Adaptive optimization for opencl programs on embedded heterogeneous systems. In: LCTES \u201917 (2017)","DOI":"10.1145\/3078633.3081040"},{"key":"625_CR38","doi-asserted-by":"crossref","unstructured":"Taylor, B. et\u00a0al.: Adaptive deep learning model selection on embedded systems. In: LCTES \u201918 (2018)","DOI":"10.1145\/3211332.3211336"},{"key":"625_CR39","doi-asserted-by":"crossref","unstructured":"Tournavitis, G., et\u00a0al.: Towards a holistic approach to auto-parallelization: Integrating profile-driven parallelism detection and machine-learning based mapping. In: PLDI \u201909 (2009)","DOI":"10.1145\/1542476.1542496"},{"key":"625_CR40","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.: Machine learning in compiler optimization. In: Proceedings of the IEEE (2018)","DOI":"10.1109\/JPROC.2018.2817118"},{"key":"625_CR41","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.F.: Mapping parallelism to multi-cores: a machine learning based approach. In: PPoPP \u201909 (2009)","DOI":"10.1145\/1504176.1504189"},{"key":"625_CR42","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.F.: Partitioning streaming parallelism for multi-cores: a machine learning based approach. In: PACT \u201910 (2010)","DOI":"10.1145\/1854273.1854313"},{"key":"625_CR43","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019boyle, M.F.: Using machine learning to partition streaming programs. ACM TACO (2013)","DOI":"10.1145\/2512436"},{"key":"625_CR44","doi-asserted-by":"crossref","unstructured":"Wang, Z., et\u00a0al.: Automatic and portable mapping of data parallel programs to opencl for gpu-based heterogeneous systems. ACM TACO (2014a)","DOI":"10.1145\/2677036"},{"key":"625_CR45","doi-asserted-by":"crossref","unstructured":"Wang, Z. et\u00a0al.: Exploitation of gpus for the parallelisation of probably parallel legacy code. In: CC \u201914 (2014b)","DOI":"10.1007\/978-3-642-54807-9_9"},{"key":"625_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Z., et\u00a0al.: Integrating profile-driven parallelism detection and machine-learning-based mapping. ACM TACO (2014c)","DOI":"10.1145\/2579561"},{"key":"625_CR47","doi-asserted-by":"crossref","unstructured":"Williams, S., Oliker, L., Vuduc, R.W., Shalf, J., Yelick, K.A., Demmel, J.: Optimization of sparse matrix-vector multiplication on emerging multicore platforms. In: SC (2007)","DOI":"10.1145\/1362622.1362674"},{"key":"625_CR48","doi-asserted-by":"crossref","unstructured":"Williams, S., Oliker, L., Vuduc, R.W., Shalf, J., Yelick, K.A., Demmel, J.: Optimization of sparse matrix-vector multiplication on emerging multicore platforms. Parallel Comput. (2009)","DOI":"10.1016\/j.parco.2008.12.006"},{"key":"625_CR49","doi-asserted-by":"crossref","unstructured":"Yang, X., Fang, J., Chen, J., Wu, C., Tang, T., Lu, K.: High performance coordinate descent matrix factorization for recommender systems. In: CF (2017)","DOI":"10.1145\/3075564.3077625"},{"key":"625_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, C.: Mars: A 64-core armv8 processor. In: HotChips (2015)","DOI":"10.1109\/HOTCHIPS.2015.7477454"},{"key":"625_CR51","doi-asserted-by":"crossref","unstructured":"Zhang, P. et\u00a0al.: Auto-tuning streamed applications on intel xeon phi. In: IPDPS \u201918 (2018)","DOI":"10.1109\/IPDPS.2018.00061"},{"key":"625_CR52","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Li, J., Liao, C., Shen, X.: Bridging the gap between deep learning and sparse matrix format selection. In: PPoPP (2018)","DOI":"10.2172\/1426119"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-018-00625-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-018-00625-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-018-00625-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,12]],"date-time":"2023-09-12T16:04:21Z","timestamp":1694534661000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-018-00625-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,1,1]]},"references-count":52,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2019,6]]}},"alternative-id":["625"],"URL":"https:\/\/doi.org\/10.1007\/s10766-018-00625-8","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,1,1]]},"assertion":[{"value":"20 September 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 December 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 January 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}