{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T08:11:23Z","timestamp":1768032683003,"version":"3.49.0"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2019,11,15]],"date-time":"2019-11-15T00:00:00Z","timestamp":1573776000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,11,15]],"date-time":"2019-11-15T00:00:00Z","timestamp":1573776000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"the National Key R&D Program of China","award":["2017YFB0202003"],"award-info":[{"award-number":["2017YFB0202003"]}]},{"DOI":"10.13039\/501100001809","name":"the Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61602501"],"award-info":[{"award-number":["61602501"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the National Science Foundation of China","award":["61772542"],"award-info":[{"award-number":["61772542"]}]},{"name":"the National Science Foundation of China","award":["61872294"],"award-info":[{"award-number":["61872294"]}]},{"name":"the Royal Society International Collaboration Grant","award":["IE161012"],"award-info":[{"award-number":["IE161012"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Parallel Prog"],"published-print":{"date-parts":[[2020,2]]},"DOI":"10.1007\/s10766-019-00646-x","type":"journal-article","created":{"date-parts":[[2019,11,15]],"date-time":"2019-11-15T11:03:02Z","timestamp":1573815782000},"page":"80-97","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["Characterizing Scalability of Sparse Matrix\u2013Vector Multiplications on Phytium FT-2000+"],"prefix":"10.1007","volume":"48","author":[{"given":"Donglin","family":"Chen","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-4869","authenticated-orcid":false,"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Chuanfu","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Shizhao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,11,15]]},"reference":[{"key":"646_CR1","doi-asserted-by":"crossref","first-page":"685","DOI":"10.1002\/cpe.1553","volume":"22","author":"L Adhianto","year":"2010","unstructured":"Adhianto, L., Banerjee, S., Fagan, M.W., Krentel, M., Marin, G., Mellor-Crummey, J.M., Tallent, N.R.: HPCTOOLKIT: tools for performance analysis of optimized parallel programs. Concurr. Comput. Pract. Exp. 22, 685\u2013701 (2010)","journal-title":"Concurr. Comput. Pract. Exp."},{"key":"646_CR2","unstructured":"Alam, S.R., Barrett, R.F., Kuehn, J.A., Roth, P.C., Vetter, J.S.: Characterization of scientific workloads on systems with multi-core processors. In: Proceedings of the 2006 IEEE International Symposium on Workload Characterization, IISWC 2006, October 25\u201327, 2006, San Jose, California, USA, pp. 225\u2013236 (2006)"},{"key":"646_CR3","doi-asserted-by":"crossref","unstructured":"Bell, N., Garland, M.: Implementing sparse matrix\u2013vector multiplication on throughput-oriented processors. In: SC (2009)","DOI":"10.1145\/1654059.1654078"},{"key":"646_CR4","unstructured":"Benatia, A., Ji, W., Wang, Y., Shi, F.: Sparse matrix format selection with multiclass SVM for SpMV on GPU. In: 45th International Conference on Parallel Processing, ICPP 2016, Philadelphia, PA, USA, August 16\u201319, 2016, pp. 496\u2013505 (2016)"},{"key":"646_CR5","unstructured":"Bhattacharjee, A., Martonosi, M.: Thread criticality predictors for dynamic performance, power, and resource management in chip multiprocessors. In: 36th International Symposium on Computer Architecture (ISCA 2009), June 20\u201324, 2009, Austin, TX, USA, pp. 290\u2013301 (2009)"},{"issue":"3","key":"646_CR6","doi-asserted-by":"publisher","first-page":"418","DOI":"10.1007\/s10766-018-00625-8","volume":"47","author":"D Chen","year":"2019","unstructured":"Chen, D., Fang, J., Chen, S., Xu, C., Wang, Z.: Optimizing sparse matrix\u2013vector multiplications on an armv8-based many-core architecture. Int. J. Parallel Program. 47(3), 418\u2013432 (2019)","journal-title":"Int. J. Parallel Program."},{"key":"646_CR7","doi-asserted-by":"crossref","unstructured":"Chen, S., et\u00a0al.: Adaptive optimization of sparse matrix\u2013vector multiplication on emerging many-core architectures. In: HPCC\u201918 (2018)","DOI":"10.1109\/HPCC\/SmartCity\/DSS.2018.00116"},{"key":"646_CR8","doi-asserted-by":"crossref","unstructured":"Cummins, C., Petoumenos, P., Wang, Z., Leather, H.: End-to-end deep learning of optimization heuristics. In: PACT (2017)","DOI":"10.1109\/PACT.2017.24"},{"key":"646_CR9","first-page":"1","volume":"38","author":"TA Davis","year":"2011","unstructured":"Davis, T.A., Hu, Y.: The university of florida sparse matrix collection. ACM Trans. Math. Softw. 38, 1 (2011)","journal-title":"ACM Trans. Math. Softw."},{"key":"646_CR10","unstructured":"Diamond, J.R., Burtscher, M., McCalpin, J.D., Kim, B., Keckler, S.W., Browne, J.C.: Evaluation and optimization of multicore performance bottlenecks in supercomputing applications. In: IEEE International Symposium on Performance Analysis of Systems and Software, ISPASS 2011, 10\u201312 April, 2011, Austin, TX, USA, pp. 32\u201343 (2011)"},{"key":"646_CR11","doi-asserted-by":"crossref","unstructured":"Emani, M.K., Wang, Z., O\u2019Boyle, M.F.P.: Smart, adaptive mapping of parallelism in the presence of external workload. In: CGO (2013)","DOI":"10.1109\/CGO.2013.6495010"},{"key":"646_CR12","unstructured":"Eyerman, S., Bois, K.D., Eeckhout, L.: Speedup stacks: identifying scaling bottlenecks in multi-threaded applications. In: 2012 IEEE International Symposium on Performance Analysis of Systems and Software, New Brunswick, NJ, USA, April 1\u20133, 2012, pp. 145\u2013155 (2012)"},{"key":"646_CR13","unstructured":"FT-2000 Plus. Phytium Technology Co. Ltd., http:\/\/tech.sina.com.cn\/d\/2017-10-16\/doc-ifymvuyt0962449.shtml (2017)"},{"key":"646_CR14","doi-asserted-by":"crossref","unstructured":"Grewe, D., Wang, Z., O\u2019Boyle, M.F.P.: A workload-aware mapping approach for data-parallel programs. In: HiPEAC (2011)","DOI":"10.1145\/1944862.1944881"},{"key":"646_CR15","doi-asserted-by":"crossref","unstructured":"Grewe, D., Wang, Z., O\u2019Boyle, M.F.P.: Portable mapping of data parallel programs to opencl for heterogeneous systems. In: CGO (2013a)","DOI":"10.1109\/CGO.2013.6494993"},{"key":"646_CR16","doi-asserted-by":"crossref","unstructured":"Grewe, D., et\u00a0al.: Opencl task partitioning in the presence of GPU contention. In: LCPC (2013b)","DOI":"10.1007\/978-3-319-09967-5_5"},{"key":"646_CR17","volume-title":"Evaluating Scalability of Multi-threaded Applications on a Many-Core Platform","author":"V Gupta","year":"2012","unstructured":"Gupta, V., Kim, H., Schwan, K.: Evaluating Scalability of Multi-threaded Applications on a Many-Core Platform. Georgia Institute of Technology, Georgia (2012)"},{"key":"646_CR18","doi-asserted-by":"crossref","unstructured":"Kincaid, D.R., Young, T.C.: Itpackv 2d user\u2019s guide. In: Technical Report, Center for Numerical Analysis, Texas University, Austin, TX (USA) (1989)","DOI":"10.2172\/7093021"},{"key":"646_CR19","doi-asserted-by":"publisher","first-page":"C401","DOI":"10.1137\/130930352","volume":"36","author":"M Kreutzer","year":"2014","unstructured":"Kreutzer, M., Hager, G., Wellein, G., Fehske, H., Bishop, A.R.: A unified sparse matrix data format for efficient general sparse matrix\u2013vector multiplication on modern processors with wide SIMD units. SIAM J. Sci. Comput. 36, C401\u2013C423 (2014)","journal-title":"SIAM J. Sci. Comput."},{"key":"646_CR20","doi-asserted-by":"crossref","unstructured":"Laurenzano, M.A., Tiwari, A., Cauble-Chantrenne, A., Jundt, A., William W.A., Jr., Campbell, R.L., Carrington, L.: Characterization and bottleneck analysis of a 64-bit ARMv8 platform. In: ISPASS (2016)","DOI":"10.1109\/ISPASS.2016.7482072"},{"key":"646_CR21","doi-asserted-by":"crossref","unstructured":"Liu, J., He, X., Liu, W., Tan, G.: Register-based implementation of the sparse general matrix\u2013matrix multiplication on GPUS. In: PPoPP (2018)","DOI":"10.1145\/3178487.3178529"},{"key":"646_CR22","unstructured":"Liu, L., Li, Z., Sameh, A.H.: Analyzing memory access intensity in parallel programs on multicore. In: Proceedings of the 22nd Annual International Conference on Supercomputing, ICS 2008, Island of Kos, Greece, June 7\u201312, 2008, pp. 359\u2013367 (2008)"},{"key":"646_CR23","doi-asserted-by":"crossref","unstructured":"Liu, W., Vinter, B.: CSR5: an efficient storage format for cross-platform sparse matrix\u2013vector multiplication. In: ICS (2015a)","DOI":"10.1145\/2751205.2751209"},{"key":"646_CR24","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1016\/j.parco.2015.04.004","volume":"49","author":"W Liu","year":"2015","unstructured":"Liu, W., Vinter, B.: Speculative segmented sum for sparse matrix\u2013vector multiplication on heterogeneous processors. Parallel Comput. 49, 179\u2013193 (2015b)","journal-title":"Parallel Comput."},{"key":"646_CR25","unstructured":"Lv, Y., Sun, B., Luo, Q., Wang, J., Yu, Z., Qian, X.: Counterminer: Mining big performance data from hardware counters. In: 51st Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 2018, Fukuoka, Japan, October 20\u201324, 2018, pp. 613\u2013626 (2018)"},{"key":"646_CR26","doi-asserted-by":"crossref","unstructured":"Maggioni, M., Berger-Wolf, T.Y.: An architecture-aware technique for optimizing sparse matrix\u2013vector multiplication on GPUS. In: ICCS (2013)","DOI":"10.1016\/j.procs.2013.05.196"},{"key":"646_CR27","unstructured":"Magni, A., Dubach, C., O\u2019Boyle, M.F.P.: A large-scale cross-architecture evaluation of thread-coarsening. In: International Conference for High Performance Computing, Networking, Storage and Analysis, SC\u201913, Denver, CO, USA, November 17\u201321, 2013, pp. 11:1\u201311:11 (2013)"},{"key":"646_CR28","doi-asserted-by":"crossref","unstructured":"Mellor-Crummey, J.M., Garvin, J.: Optimizing sparse matrix\u2013vector product computations using unroll and jam. In: IJHPCA (2004)","DOI":"10.1177\/1094342004038951"},{"key":"646_CR29","doi-asserted-by":"crossref","unstructured":"Monakov, A., Lokhmotov, A., Avetisyan, A.: Automatically tuning sparse matrix\u2013vector multiplication for GPU architectures. In: HIPEAC (2010)","DOI":"10.1007\/978-3-642-11515-8_10"},{"key":"646_CR30","doi-asserted-by":"crossref","unstructured":"Ogilvie, W.F., Petoumenos, P., Wang, Z., Leather, H.: Fast automatic heuristic construction using active learning. In: LCPC (2014)","DOI":"10.1007\/978-3-319-17473-0_10"},{"key":"646_CR31","doi-asserted-by":"crossref","unstructured":"Ogilvie, W.F., Petoumenos, P., Wang, Z., Leather, H.: Minimizing the cost of iterative compilation with active learning. In: CGO (2017)","DOI":"10.1109\/CGO.2017.7863744"},{"key":"646_CR32","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa, F., et al.: Scikit-learn: Machine learning in python. J. Mach. Learn. Res. 12, 2825\u20132830 (2011)","journal-title":"J. Mach. Learn. Res."},{"key":"646_CR33","doi-asserted-by":"crossref","unstructured":"Pinar, A., Heath, M.T.: Improving performance of sparse matrix\u2013vector multiplication. In: SC (1999)","DOI":"10.1145\/331532.331562"},{"key":"646_CR34","doi-asserted-by":"crossref","unstructured":"Ren, J., Gao, L., Wang, H., Wang, Z.: Optimise web browsing on heterogeneous mobile platforms: a machine learning based approach. In: INFOCOM (2017)","DOI":"10.1109\/INFOCOM.2017.8057087"},{"key":"646_CR35","doi-asserted-by":"crossref","unstructured":"Ren, J., et\u00a0al.: Proteus: Network-aware web browsing on heterogeneous mobile systems. In: CoNEXT\u201918 (2018)","DOI":"10.1145\/3281411.3281422"},{"key":"646_CR36","doi-asserted-by":"crossref","unstructured":"Sedaghati, N., Mu, T., Pouchet, L., Parthasarathy, S., Sadayappan, P.: Automatic selection of sparse matrix representation on GPUS. In: ICS (2015)","DOI":"10.1145\/2751205.2751244"},{"key":"646_CR37","doi-asserted-by":"crossref","unstructured":"Stephens, N.: Armv8-a next-generation vector architecture for HPC. In: 2016 IEEE Hot Chips 28 Symposium (HCS), pp. 1\u201331 (2016)","DOI":"10.1109\/HOTCHIPS.2016.7936203"},{"key":"646_CR38","doi-asserted-by":"crossref","unstructured":"Terpstra, D., Jagode, H., You, H., Dongarra, J.J.: Collectingperformance data with PAPI-C. In: Tools for High Performance Computing 2009, pp. 157\u2013173 (2009)","DOI":"10.1007\/978-3-642-11261-4_11"},{"key":"646_CR39","doi-asserted-by":"crossref","unstructured":"Tournavitis, G., Wang, Z., Franke, B., O\u2019Boyle, M.F.P.: Towards a holistic approach to auto-parallelization: integrating profile-driven parallelism detection and machine-learning based mapping. In: PLDI (2009)","DOI":"10.1145\/1542476.1542496"},{"key":"646_CR40","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.: Machine learning in compiler optimization. In: Proceedings of IEEE (2018)","DOI":"10.1109\/JPROC.2018.2817118"},{"key":"646_CR41","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.F.: Mapping parallelism to multi-cores: a machine learning based approach. In: PPoPP\u201909 (2009)","DOI":"10.1145\/1504176.1504189"},{"key":"646_CR42","doi-asserted-by":"crossref","unstructured":"Wang, Z., O\u2019Boyle, M.F.: Partitioning streaming parallelism for multi-cores: a machine learning based approach. In: PACT\u201910 (2010)","DOI":"10.1145\/1854273.1854313"},{"key":"646_CR43","first-page":"20","volume":"10","author":"Z Wang","year":"2013","unstructured":"Wang, Z., O\u2019boyle, M.F.: Using machine learning to partition streaming programs. ACM Trans. Arch. Code Optm. 10, 20 (2013)","journal-title":"ACM Trans. Arch. Code Optm."},{"key":"646_CR44","first-page":"2","volume":"11","author":"Z Wang","year":"2014","unstructured":"Wang, Z., Tournavitis, G., Franke, B., O\u2019Boyle, M.F.P.: Integrating profile-driven parallelism detection and machine-learning-based mapping. ACM Trans. Arch. Code Optm. 11, 2 (2014a)","journal-title":"ACM Trans. Arch. Code Optm."},{"key":"646_CR45","first-page":"42","volume":"11","author":"Z Wang","year":"2014","unstructured":"Wang, Z., et al.: Automatic and portable mapping of data parallel programs to opencl for GPU-based heterogeneous systems. ACM Trans. Arch. Code Optm. 11, 42 (2014b)","journal-title":"ACM Trans. Arch. Code Optm."},{"key":"646_CR46","doi-asserted-by":"crossref","unstructured":"Wen, Y., Wang, Z., O\u2019Boyle, M.F.P.: Smart multi-task scheduling for opencl programs on CPU\/GPU heterogeneous platforms. In: HiPC\u201914 (2014)","DOI":"10.1109\/HiPC.2014.7116910"},{"key":"646_CR47","doi-asserted-by":"crossref","unstructured":"Williams, S., Oliker, L., Vuduc, R.W., Shalf, J., Yelick, K.A., Demmel, J.: Optimization of sparse matrix\u2013vector multiplication onemerging multicore platforms. In: Parallel Computing (2009)","DOI":"10.1016\/j.parco.2008.12.006"},{"key":"646_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, C.: Mars: A 64-core ARMv8 processor. In: HotChips (2015)","DOI":"10.1109\/HOTCHIPS.2015.7477454"},{"key":"646_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, P., et\u00a0al.: Auto-tuning streamed applications on intel xeon phi. In: IPDPS (2018)","DOI":"10.1109\/IPDPS.2018.00061"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-019-00646-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10766-019-00646-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10766-019-00646-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,23]],"date-time":"2023-09-23T00:26:07Z","timestamp":1695428767000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10766-019-00646-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,11,15]]},"references-count":49,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2020,2]]}},"alternative-id":["646"],"URL":"https:\/\/doi.org\/10.1007\/s10766-019-00646-x","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"value":"0885-7458","type":"print"},{"value":"1573-7640","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,11,15]]},"assertion":[{"value":"2 August 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 November 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 November 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}