{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T00:56:06Z","timestamp":1773017766305,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2018YFB0204300"],"award-info":[{"award-number":["2018YFB0204300"]}]},{"name":"Excellent Youth Foundation of Hunan Province","award":["2021JJ10050"],"award-info":[{"award-number":["2021JJ10050"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972408 and 61872294"],"award-info":[{"award-number":["61972408 and 61872294"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3476217","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T05:10:34Z","timestamp":1634793034000},"page":"1-14","source":"Crossref","is-referenced-by-count":35,"title":["LIBSHALOM"],"prefix":"10.1145","author":[{"given":"Weiling","family":"Yang","sequence":"first","affiliation":[{"name":"National University of Defense Technology, China"}]},{"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}]},{"given":"Dezun","family":"Dong","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}]},{"given":"Xing","family":"Su","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}]},{"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Leeds, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"[n. d.]. ARM PERFORMANCE LIBRARIES. ([n. d.]). https:\/\/www.arm.com\/products\/development-tools\/server-and-hpc\/allinea-studio\/performance-libraries.  [n. d.]. ARM PERFORMANCE LIBRARIES. ([n. d.]). https:\/\/www.arm.com\/products\/development-tools\/server-and-hpc\/allinea-studio\/performance-libraries."},{"key":"e_1_3_2_2_2_1","unstructured":"[n. d.]. ARMv9. ([n. d.]). https:\/\/www.arm.com\/company\/news\/2021\/03\/arms-answer-to-the-future-of-ai-armv9-architecture.  [n. d.]. ARMv9. ([n. d.]). https:\/\/www.arm.com\/company\/news\/2021\/03\/arms-answer-to-the-future-of-ai-armv9-architecture."},{"key":"e_1_3_2_2_3_1","unstructured":"[n. d.]. Intel MKL. ([n. d.]). https:\/\/software.intel.com\/en-us\/mkl.  [n. d.]. Intel MKL. ([n. d.]). https:\/\/software.intel.com\/en-us\/mkl."},{"key":"e_1_3_2_2_4_1","unstructured":"[n. d.]. Kunpeng 920. ([n. d.]). https:\/\/www.hisilicon.com\/en\/products\/Kunpeng\/Huawei%20Kunpeng%20920.  [n. d.]. Kunpeng 920. ([n. d.]). https:\/\/www.hisilicon.com\/en\/products\/Kunpeng\/Huawei%20Kunpeng%20920."},{"key":"e_1_3_2_2_5_1","unstructured":"[n. d.]. LibShalom. ([n. d.]). https:\/\/github.com\/AnonymousYWL\/MYLIB.  [n. d.]. LibShalom. ([n. d.]). https:\/\/github.com\/AnonymousYWL\/MYLIB."},{"key":"e_1_3_2_2_6_1","unstructured":"[n. d.]. Nek5000\/NekBox. ([n. d.]). https:\/\/github.com\/NekBox\/NekBox.  [n. d.]. Nek5000\/NekBox. ([n. d.]). https:\/\/github.com\/NekBox\/NekBox."},{"key":"e_1_3_2_2_7_1","unstructured":"[n. d.]. OpenCL BLAS. ([n. d.]). https:\/\/github.com\/clMathLibraries\/clBLAS.  [n. d.]. OpenCL BLAS. ([n. d.]). https:\/\/github.com\/clMathLibraries\/clBLAS."},{"key":"e_1_3_2_2_8_1","unstructured":"[n. d.]. A scientific software for the numerical simulation of seismic wave phenomena and earthquake dynamics. ([n. d.]). http:\/\/www.seissol.org\/.  [n. d.]. A scientific software for the numerical simulation of seismic wave phenomena and earthquake dynamics. ([n. d.]). http:\/\/www.seissol.org\/."},{"key":"e_1_3_2_2_9_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015","author":"Calderara Mauro","year":"2015"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","first-page":"418","DOI":"10.1007\/s10766-018-00625-8","article-title":"Optimizing sparse matrix-vector multiplications on an armv8-based many-core architecture","volume":"47","author":"Chen Donglin","year":"2019","journal-title":"International Journal of Parallel Programming"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","first-page":"80","DOI":"10.1007\/s10766-019-00646-x","article-title":"Characterizing Scalability of Sparse Matrix-Vector Multiplications on Phytium FT-2000+","volume":"48","author":"Chen Donglin","year":"2020","journal-title":"Int. J. Parallel Program."},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the ACM International Conference on Supercomputing, ICS 2019","author":"Chen Jieyang","year":"2019"},{"key":"e_1_3_2_2_13_1","volume-title":"2017 26th International Conference on Parallel Architectures and Compilation Techniques (PACT). IEEE, 219--232","author":"Cummins Chris","year":"2017"},{"key":"e_1_3_2_2_14_1","volume-title":"The Indirect Convolution Algorithm. CoRR abs\/1907.02129","author":"Dukhan Marat","year":"2019"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1177\/1094342020965661","article-title":"Performance engineering for real and complex tall & skinny matrix multiplication kernels on GPUs","volume":"35","author":"Ernst Dominik","year":"2021","journal-title":"The International Journal of High Performance Computing Applications"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","first-page":"33","DOI":"10.1007\/s11390-020-0741-6","article-title":"Performance Evaluation of Memory-Centric ARMv8 Many-Core Architectures: A Case Study with Phytium 2000+","volume":"36","author":"Fang Jianbin","year":"2021","journal-title":"J. Comput. Sci. Technol."},{"key":"e_1_3_2_2_17_1","article-title":"BLASFEO: Basic Linear Algebra Subroutines for Embedded Optimization","volume":"44","author":"Frison Gianluca","year":"2018","journal-title":"ACM Trans. Math. Softw."},{"key":"e_1_3_2_2_18_1","article-title":"The BLAS API of BLASFEO: Optimizing Performance for Small Matrices","volume":"46","author":"Frison Gianluca","year":"2020","journal-title":"ACM Trans. Math. Softw."},{"key":"e_1_3_2_2_19_1","volume-title":"2021 IEEE International Conference on Cluster Computing (Cluster).","author":"Gao Wanrong","year":"2021"},{"key":"e_1_3_2_2_20_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, SC 2018","author":"Georganas Evangelos","year":"2018"},{"key":"e_1_3_2_2_21_1","volume-title":"2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 222--233","author":"Georganas Evangelos","year":"2020"},{"key":"e_1_3_2_2_22_1","article-title":"Anatomy of high-performance matrix multiplication","volume":"34","author":"Goto Kazushige","year":"2008","journal-title":"ACM Trans. Math. Softw."},{"key":"e_1_3_2_2_23_1","article-title":"High-performance implementation of the level-3 BLAS","volume":"35","author":"Goto Kazushige","year":"2008","journal-title":"ACM Trans. Math. Softw."},{"key":"e_1_3_2_2_24_1","volume-title":"Proceedings of the 2013 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO). IEEE, 1--10","author":"Grewe Dominik","year":"2013"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1007\/s10107-019-01454-4","article-title":"On the behavior of Lagrange multipliers in convex and nonconvex infeasible interior point methods","volume":"186","author":"Haeser Gabriel","year":"2021","journal-title":"Math. Program."},{"key":"e_1_3_2_2_26_1","volume-title":"ICPP 2020: 49th International Conference on Parallel Processing, Edmonton, AB, Canada, August 17--20","author":"Han Qingchang","year":"2020"},{"key":"e_1_3_2_2_27_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 770--778","author":"He Kaiming","year":"2016"},{"key":"e_1_3_2_2_28_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2016","author":"Heinecke Alexander","year":"2016"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings of the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2018","author":"Jia Zhen","year":"2018"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","first-page":"580","DOI":"10.1109\/TPDS.2019.2939785","article-title":"FeatherCNN: Fast Inference Computation with TensorGEMM on ARM Architectures","volume":"31","author":"Lan Haidong","year":"2020","journal-title":"IEEE Trans. Parallel Distributed Syst."},{"key":"e_1_3_2_2_31_1","volume-title":"Evaluation criteria for sparse matrix storage formats","author":"Langr Daniel","year":"2015"},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the 24th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2019","author":"Li Xiuhong","year":"2019"},{"key":"e_1_3_2_2_33_1","article-title":"Analytical Modeling Is Enough for High-Performance BLIS","volume":"43","author":"Low Tze Meng","year":"2016","journal-title":"ACM Trans. Math. Softw."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","first-page":"800","DOI":"10.1016\/j.future.2020.06.033","article-title":"Performance and energy consumption of HPC workloads on a cluster based on Arm ThunderX2 CPU","volume":"112","author":"Mantovani Filippo","year":"2020","journal-title":"Future Gener. Comput. Syst."},{"key":"e_1_3_2_2_35_1","volume-title":"Proceedings of the 23rd international conference on Parallel architectures and compilation. 481--482","author":"Ogilvie William F","year":"2014"},{"key":"e_1_3_2_2_36_1","volume-title":"SIGMA: A Sparse and Irregular GEMM Accelerator with Flexible Interconnects for DNN Training. In IEEE International Symposium on High Performance Computer Architecture, HPCA 2020","author":"Qin Eric","year":"2020"},{"key":"e_1_3_2_2_37_1","volume-title":"FusionNet: A deep fully residual convolutional neural network for image segmentation in connectomics. CoRR abs\/1612.05360","author":"Quan Tran Minh","year":"2016"},{"key":"e_1_3_2_2_38_1","volume-title":"SC'16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 444--455","author":"Rajovic Nikola","year":"2016"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","first-page":"70","DOI":"10.1016\/j.jpdc.2021.02.013","article-title":"TSM2X: High-performance tall-and-skinny matrix-matrix multiplication on GPUs","volume":"151","author":"Rivera Cody","year":"2021","journal-title":"J. Parallel Distributed Comput."},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2020","author":"Sato Mitsuhisa","year":"2020"},{"key":"e_1_3_2_2_41_1","volume-title":"3rd International Conference on Learning Representations, ICLR","author":"Simonyan Karen","year":"2015"},{"key":"e_1_3_2_2_42_1","volume-title":"van de Geijn","author":"Smith Tyler M.","year":"2019"},{"key":"e_1_3_2_2_43_1","volume-title":"Anatomy of High-Performance Many-Threaded Matrix Multiplication. In 2014 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 1049--1059","author":"Smith Tyler M."},{"key":"e_1_3_2_2_44_1","volume-title":"2016 IEEE Hot Chips 28 Symposium (HCS). IEEE, 1--31","author":"Stephens Nigel","year":"2016"},{"key":"e_1_3_2_2_45_1","volume-title":"SCP: Shared Cache Partitioning for High-Performance GEMM. TACO 15, 4","author":"Su Xing","year":"2019"},{"key":"e_1_3_2_2_46_1","volume-title":"Proceedings of the 2009 ACM SIGPLAN conference on Programming language design and implementation. ACM, 177--187","author":"Tournavitis Georgios","year":"2009"},{"key":"e_1_3_2_2_47_1","volume-title":"44th International Conference on Parallel Processing, ICPP 2015","author":"Wang Feng","year":"2015"},{"key":"e_1_3_2_2_48_1","first-page":"1","article-title":"Automatic and portable mapping of data parallel programs to opencl for gpu-based heterogeneous systems","volume":"11","author":"Wang Zheng","year":"2014","journal-title":"ACM Transactions on Architecture and Code Optimization (TACO)"},{"key":"e_1_3_2_2_49_1","volume-title":"Proceedings of the 19th international conference on Parallel architectures and compilation techniques. 307--318","author":"Wang Zheng","year":"2010"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","first-page":"1879","DOI":"10.1109\/JPROC.2018.2817118","article-title":"Machine learning in compiler optimization","volume":"106","author":"Wang Zheng","year":"2018","journal-title":"Proc. IEEE"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2579561","article-title":"Integrating profile-driven parallelism detection and machine-learning-based mapping","volume":"11","author":"Wang Zheng","year":"2014","journal-title":"ACM Transactions on Architecture and Code Optimization (TACO)"},{"key":"e_1_3_2_2_52_1","volume-title":"2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 101--110","author":"Yang Weiling","year":"2021"},{"key":"e_1_3_2_2_53_1","volume-title":"Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques. 111--123","author":"Ye Guixin","year":"2020"},{"key":"e_1_3_2_2_54_1","volume-title":"SCFA 2019, Singapore, March 11--14, 2019, Proceedings (Lecture Notes in Computer Science)","volume":"11416","author":"You Xin","year":"2019"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1145\/1077464.1077466","article-title":"Fast sparse matrix multiplication","volume":"1","author":"Yuster Raphael","year":"2005","journal-title":"ACM Transactions On Algorithms (TALG)"},{"key":"e_1_3_2_2_56_1","article-title":"BLIS: A Framework for Rapidly Instantiating BLAS Functionality","volume":"41","author":"Van Zee Field G.","year":"2015","journal-title":"ACM Trans. Math. Softw."},{"key":"e_1_3_2_2_57_1","volume-title":"2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 515--525","author":"Zhang Peng","year":"2018"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"crossref","first-page":"1878","DOI":"10.1109\/TPDS.2020.2978045","article-title":"Optimizing Streaming Parallelism on Heterogeneous Many-Core Architectures","volume":"31","author":"Zhang Peng","year":"2020","journal-title":"IEEE Trans. Parallel Distributed Syst."},{"key":"e_1_3_2_2_59_1","volume-title":"18th IEEE International Conference on Parallel and Distributed Systems, ICPADS 2012","author":"Zhang Xianyi","year":"2012"}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476217","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3476217","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:22Z","timestamp":1750191142000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476217"}},"subtitle":["optimizing small and irregular-shaped matrix multiplications on ARMv8 multi-cores"],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":59,"alternative-id":["10.1145\/3458817.3476217","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3476217","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]}}}