{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T15:17:31Z","timestamp":1767971851644,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,11,11]],"date-time":"2023-11-11T00:00:00Z","timestamp":1699660800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972408"],"award-info":[{"award-number":["61972408"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFB0300101"],"award-info":[{"award-number":["2021YFB0300101"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,11,12]]},"DOI":"10.1145\/3581784.3607107","type":"proceedings-article","created":{"date-parts":[[2023,10,30]],"date-time":"2023-10-30T20:34:48Z","timestamp":1698698088000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Optimizing Direct Convolutions on ARM Multi-Cores"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2805-0862","authenticated-orcid":false,"given":"Pengyu","family":"Wang","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7167-4086","authenticated-orcid":false,"given":"Weiling","family":"Yang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3542-4869","authenticated-orcid":false,"given":"Jianbin","family":"Fang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6243-8479","authenticated-orcid":false,"given":"Dezun","family":"Dong","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0317-8192","authenticated-orcid":false,"given":"Chun","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8364-9793","authenticated-orcid":false,"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2883-6997","authenticated-orcid":false,"given":"Tao","family":"Tang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6157-0662","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computing, University of Leeds, Leeds, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2023,11,11]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Acl. https:\/\/github.com\/ARM-software\/ComputeLibrary."},{"key":"e_1_3_2_2_2_1","unstructured":"Armv9. https:\/\/www.arm.com\/company\/news\/2021\/03\/arms-answer-to-the-future-of-ai-armv9-architecture."},{"key":"e_1_3_2_2_3_1","unstructured":"cublas. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_2_2_4_1","unstructured":"Kunpeng-920. https:\/\/www.hisilicon.com\/cn\/products\/Kunpeng\/Huawei-Kunpeng\/Huawei-Kunpeng-920."},{"key":"e_1_3_2_2_5_1","unstructured":"Libxsmm. https:\/\/github.com\/libxsmm\/libxsmm."},{"key":"e_1_3_2_2_6_1","unstructured":"libxsmm-dnn. https:\/\/github.com\/libxsmm\/libxsmm-dnn."},{"key":"e_1_3_2_2_7_1","unstructured":"Mkl. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/tools\/oneapi\/components\/onemkl."},{"key":"e_1_3_2_2_8_1","unstructured":"Mxnet. https:\/\/github.com\/apache\/mxnet."},{"key":"e_1_3_2_2_9_1","unstructured":"ncnn. https:\/\/github.com\/Tencent\/ncnn."},{"key":"e_1_3_2_2_10_1","unstructured":"onednn. https:\/\/github.com\/oneapi-src\/oneDNN."},{"key":"e_1_3_2_2_11_1","unstructured":"Openblas. https:\/\/github.com\/xianyi\/OpenBLAS."},{"key":"e_1_3_2_2_12_1","unstructured":"Raspberry-pi-4-model-b. https:\/\/www.raspberrypi.com\/products\/raspberry-pi-4-model-b\/."},{"key":"e_1_3_2_2_13_1","unstructured":"Tensorflow. https:\/\/github.com\/tensorflow\/tensorflow."},{"key":"e_1_3_2_2_14_1","unstructured":"Xnnpack. https:\/\/github.com\/google\/XNNPACK."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"e_1_3_2_2_16_1","first-page":"3066","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, CVPR Workshops 2022","author":"Amir O.","year":"2022","unstructured":"Amir, O., and Gil, B. Smm-conv: Scalar matrix multiplication with zero packing for accelerated convolution. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, CVPR Workshops 2022, New Orleans, LA, USA, June 19--20, 2022 (2022), IEEE, pp. 3066--3074."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2022.102806"},{"key":"e_1_3_2_2_18_1","volume-title":"Tenth international workshop on frontiers in handwriting recognition","author":"Chellapilla K.","year":"2006","unstructured":"Chellapilla, K., Puri, S., and Simard, P. High performance convolutional neural networks for document processing. In Tenth international workshop on frontiers in handwriting recognition (2006), Suvisoft."},{"key":"e_1_3_2_2_19_1","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Chen T.","year":"2018","unstructured":"Chen, T., Moreau, T., Jiang, Z., Zheng, L., Yan, E. Q., Shen, H., Cowan, M., Wang, L., Hu, Y., Ceze, L., Guestrin, C., and Krishnamurthy, A. TVM: an automated end-to-end optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8--10, 2018 (2018), A. C. Arpaci-Dusseau and G. Voelker, Eds., USENIX Association, pp. 578--594."},{"key":"e_1_3_2_2_20_1","first-page":"3393","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Chen T.","year":"2018","unstructured":"Chen, T., Zheng, L., Yan, E. Q., Jiang, Z., Moreau, T., Ceze, L., Guestrin, C., and Krishnamurthy, A. Learning to optimize tensor programs. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3--8, 2018, Montr\u00e9al, Canada (2018), S. Bengio, H. M. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett, Eds., pp. 3393--3404."},{"key":"e_1_3_2_2_21_1","volume-title":"cudnn: Efficient primitives for deep learning. CoRR abs\/1410.0759","author":"Chetlur S.","year":"2014","unstructured":"Chetlur, S., Woolley, C., Vandermersch, P., Cohen, J., Tran, J., Catanzaro, B., and Shelhamer, E. cudnn: Efficient primitives for deep learning. CoRR abs\/1410.0759 (2014)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11179-7_36"},{"key":"e_1_3_2_2_24_1","first-page":"342","volume-title":"Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, PPoPP 2023","author":"de Limas Santana A.","year":"2023","unstructured":"de Limas Santana, A., Armejach, A., and Casas, M. Efficient direct convolution using long SIMD instructions. In Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, PPoPP 2023, Montreal, QC, Canada, 25 February 2023 - 1 March 2023 (2023), M. M. Dehnavi, M. Kulkarni, and S. Krishnamoorthy, Eds., ACM, pp. 342--353."},{"key":"e_1_3_2_2_25_1","volume-title":"Communication-optimal convolutional neural nets. CoRR abs\/1802.06905","author":"Demmel J.","year":"2018","unstructured":"Demmel, J., and Dinh, G. Communication-optimal convolutional neural nets. CoRR abs\/1802.06905 (2018)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/SBAC-PAD55451.2022.00027"},{"key":"e_1_3_2_2_27_1","volume-title":"The indirect convolution algorithm. arXiv preprint arXiv:1907.02129","author":"Dukhan M.","year":"2019","unstructured":"Dukhan, M. The indirect convolution algorithm. arXiv preprint arXiv:1907.02129 (2019)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.20"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-020-0741-6"},{"key":"e_1_3_2_2_30_1","volume-title":"Advancing direct convolution using convolution slicing optimization and isa extensions. arXiv preprint arXiv:2303.04739","author":"Ferrari V.","year":"2023","unstructured":"Ferrari, V., Sousa, R., Pereira, M., de Carvalho, J. P., Amaral, J. N., Moreira, J., and Araujo, G. Advancing direct convolution using convolution slicing optimization and isa extensions. arXiv preprint arXiv:2303.04739 (2023)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00069"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00032"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476206"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/1356052.1356053"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2799562.2799641"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-019-01454-4"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCS48598.2019.9188171"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.83"},{"key":"e_1_3_2_2_40_1","volume-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications. ArXiv abs\/1704.04861","author":"Howard A. G.","year":"2017","unstructured":"Howard, A. G., Zhu, M., Chen, B., Kalenichenko, D., Wang, W., Weyand, T., Andreetto, M., and Adam, H. Mobilenets: Efficient convolutional neural networks for mobile vision applications. ArXiv abs\/1704.04861 (2017)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3529113.3529122"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178496"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446759"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"e_1_3_2_2_47_1","first-page":"1025","volume-title":"2019 USENIX Annual Technical Conference, USENIX ATC 2019","author":"Liu Y.","year":"2019","unstructured":"Liu, Y., Wang, Y., Yu, R., Li, M., Sharma, V., and Wang, Y. Optimizing CNN model inference on cpus. In 2019 USENIX Annual Technical Conference, USENIX ATC 2019, Renton, WA, USA, July 10--12, 2019 (2019), D. Malkhi and D. Tsafrir, Eds., USENIX Association, pp. 1025--1040."},{"key":"e_1_3_2_2_48_1","first-page":"99","article-title":"A survey of techniques for optimizing deep learning on gpus","author":"Mittal S.","year":"2019","unstructured":"Mittal, S., and Vaishay, S. A survey of techniques for optimizing deep learning on gpus. J. Syst. Archit. 99 (2019).","journal-title":"J. Syst. Archit."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366428.3380771"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394885.3431534"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2968456.2968476"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538940"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037745"},{"key":"e_1_3_2_2_55_1","volume-title":"Yolov3: An incremental improvement. CoRR abs\/1804.02767","author":"Redmon J.","year":"2018","unstructured":"Redmon, J., and Farhadi, A. Yolov3: An incremental improvement. CoRR abs\/1804.02767 (2018)."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_2_57_1","volume-title":"Relay: A high-level IR for deep learning. CoRR abs\/1904.08368","author":"Roesch J.","year":"2019","unstructured":"Roesch, J., Lyubomirsky, S., Kirisame, M., Pollock, J., Weber, L., Jiang, Z., Chen, T., Moreau, T., and Tatlock, Z. Relay: A high-level IR for deep learning. CoRR abs\/1904.08368 (2019)."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2572683"},{"key":"e_1_3_2_2_59_1","volume-title":"Rigid-motion scattering for texture classification. ArXiv abs\/1403.1687","author":"Sifre L.","year":"2014","unstructured":"Sifre, L., and Mallat, S. Rigid-motion scattering for texture classification. ArXiv abs\/1403.1687 (2014)."},{"key":"e_1_3_2_2_60_1","volume-title":"3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings","author":"Simonyan K.","year":"2015","unstructured":"Simonyan, K., and Zisserman, A. Very deep convolutional networks for large-scale image recognition. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings (2015), Y. Bengio and Y. LeCun, Eds."},{"key":"e_1_3_2_2_61_1","volume-title":"Small: A software framework for portable machine learning libraries. CoRR abs\/2303.04769","author":"Sridhar U.","year":"2023","unstructured":"Sridhar, U., Tukanov, N., Binder, E., Low, T. M., McMillan, S., and Schatz, M. D. Small: A software framework for portable machine learning libraries. CoRR abs\/2303.04769 (2023)."},{"key":"e_1_3_2_2_62_1","first-page":"4","article-title":"SCP: shared cache partitioning for high-performance GEMM","volume":"15","author":"Su X.","year":"2019","unstructured":"Su, X., Liao, X., Jiang, H., Yang, C., and Xue, J. SCP: shared cache partitioning for high-performance GEMM. ACM Trans. Archit. Code Optim. 15, 4 (2019), 43:1--43:21.","journal-title":"ACM Trans. Archit. Code Optim."},{"key":"e_1_3_2_2_63_1","volume-title":"3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings","author":"Vasilache N.","year":"2015","unstructured":"Vasilache, N., Johnson, J., Mathieu, M., Chintala, S., Piantino, S., and LeCun, Y. Fast convolutional nets with fbfft: A GPU performance evaluation. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings (2015), Y. Bengio and Y. LeCun, Eds."},{"key":"e_1_3_2_2_64_1","volume-title":"Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. CoRR abs\/1802.04730","author":"Vasilache N.","year":"2018","unstructured":"Vasilache, N., Zinenko, O., Theodoridis, T., Goyal, P., DeVito, Z., Moses, W. S., Verdoolaege, S., Adams, A., and Cohen, A. Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. CoRR abs\/1802.04730 (2018)."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8852012"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00019"},{"key":"e_1_3_2_2_67_1","series-title":"Proceedings of Machine Learning Research","first-page":"5771","volume-title":"Proceedings of the 35th International Conference on Machine Learning, ICML","author":"Zhang J.","year":"2018","unstructured":"Zhang, J., Franchetti, F., and Low, T. M. High performance zero-memory overhead direct convolutions. In Proceedings of the 35th International Conference on Machine Learning, ICML 2018, Stockholmsm\u00e4ssan, Stockholm, Sweden, July 10--15, 2018 (2018), J. G. Dy and A. Krause, Eds., vol. 80 of Proceedings of Machine Learning Research, PMLR, pp. 5771--5780."},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3341841"},{"key":"e_1_3_2_2_70_1","first-page":"863","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Zheng L.","year":"2020","unstructured":"Zheng, L., Jia, C., Sun, M., Wu, Z., Yu, C. H., Haj-Ali, A., Wang, Y., Yang, J., Zhuo, D., Sen, K., Gonzalez, J. E., and Stoica, I. Ansor: Generating high-performance tensor programs for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4--6, 2020 (2020), USENIX Association, pp. 863--879."},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"e_1_3_2_2_72_1","volume-title":"Fusionstitching: Boosting memory intensive computations for deep learning workloads. CoRR abs\/2009.10924","author":"Zheng Z.","year":"2020","unstructured":"Zheng, Z., Zhao, P., Long, G., Zhu, F., Zhu, K., Zhao, W., Diao, L., Yang, J., and Lin, W. Fusionstitching: Boosting memory intensive computations for deep learning workloads. CoRR abs\/2009.10924 (2020)."}],"event":{"name":"SC '23: International Conference for High Performance Computing, Networking, Storage and Analysis","location":"Denver CO USA","acronym":"SC '23","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581784.3607107","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581784.3607107","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:36:23Z","timestamp":1750178183000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581784.3607107"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,11]]},"references-count":72,"alternative-id":["10.1145\/3581784.3607107","10.1145\/3581784"],"URL":"https:\/\/doi.org\/10.1145\/3581784.3607107","relation":{},"subject":[],"published":{"date-parts":[[2023,11,11]]},"assertion":[{"value":"2023-11-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}