{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T17:10:19Z","timestamp":1767892219126,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T00:00:00Z","timestamp":1676937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"European High-Performance Computing Joint Undertaking","award":["101034126"],"award-info":[{"award-number":["101034126"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,25]]},"DOI":"10.1145\/3572848.3577435","type":"proceedings-article","created":{"date-parts":[[2023,2,21]],"date-time":"2023-02-21T16:02:30Z","timestamp":1676995350000},"page":"342-353","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Efficient Direct Convolution Using Long SIMD Instructions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3203-3662","authenticated-orcid":false,"given":"Alexandre de Limas","family":"Santana","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Catalunya, Spain and Universitat Polit\u00e8cnica de Catalunya, Barcelona, Catalunya, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2869-668X","authenticated-orcid":false,"given":"Adri\u00e0","family":"Armejach","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Catalunya, Spain and Universitat Polit\u00e8cnica de Catalunya, Barcelona, Catalunya, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4564-2093","authenticated-orcid":false,"given":"Marc","family":"Casas","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Catalunya, Spain and Universitat Polit\u00e8cnica de Catalunya, Barcelona, Catalunya, Spain"}]}],"member":"320","published-online":{"date-parts":[[2023,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"265","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi M.","year":"2016","unstructured":"Abadi, M., Barham, P., Chen, J., Chen, Z., Davis, A., Dean, J., Devin, M., Ghemawat, S., Irving, G., Isard, M., et al. Tensorflow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) (2016), pp. 265--283."},{"key":"e_1_3_2_1_2_1","volume-title":"Low-memory gemm-based convolution algorithms for deep neural networks. arXiv preprint arXiv:1709.03395","author":"Anderson A.","year":"2017","unstructured":"Anderson, A., Vasudevan, A., Keane, C., and Gregg, D. Low-memory gemm-based convolution algorithms for deep neural networks. arXiv preprint arXiv:1709.03395 (2017)."},{"key":"e_1_3_2_1_3_1","volume-title":"Efficient 8-bit quantization of transformer neural machine language translation model. arXiv preprint arXiv:1906.00532","author":"Bhandare A.","year":"2019","unstructured":"Bhandare, A., Sripathi, V., Karkada, D., Menon, V., Choi, S., Datta, K., and Saletore, V. Efficient 8-bit quantization of transformer neural machine language translation model. arXiv preprint arXiv:1906.00532 (2019)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_5_1","volume-title":"cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759","author":"Chetlur S.","year":"2014","unstructured":"Chetlur, S., Woolley, C., Vandermersch, P., Cohen, J., Tran, J., Catanzaro, B., and Shelhamer, E. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)."},{"key":"e_1_3_2_1_6_1","volume-title":"Risc-v vector extension","author":"Community R.-V.","year":"2022","unstructured":"Community, R.-V. Risc-v vector extension, 2022. https:\/\/github.com\/riscv\/riscv-v-spec\/blob\/master\/v-spec.adoc."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/99.660313"},{"key":"e_1_3_2_1_8_1","volume-title":"Distributed deep learning using synchronous stochastic gradient descent. arXiv preprint arXiv:1602.06709","author":"Das D.","year":"2016","unstructured":"Das, D., Avancha, S., Mudigere, D., Vaidynathan, K., Sridharan, S., Kalamkar, D., Kaul, B., and Dubey, P. Distributed deep learning using synchronous stochastic gradient descent. arXiv preprint arXiv:1602.06709 (2016)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.38"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00069"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.83"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/12.40842"},{"key":"e_1_3_2_1_14_1","unstructured":"Intel. Neon 2022. https:\/\/github.com\/NervanaSystems\/neon."},{"key":"e_1_3_2_1_15_1","volume-title":"Oneapi deep neural network library","author":"Intel","year":"2022","unstructured":"Intel. Oneapi deep neural network library, 2022. https:\/\/oneapi-src.github.io\/oneDNN\/."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_17_1","volume-title":"A study of bfloat16 for deep learning training. arXiv preprint arXiv:1905.12322","author":"Kalamkar D.","year":"2019","unstructured":"Kalamkar, D., Mudigere, D., Mellempudi, N., Das, D., Banerjee, K., Avancha, S., Vooturi, D. T., Jammalamadaka, N., Huang, J., Yuen, H., et al. A study of bfloat16 for deep learning training. arXiv preprint arXiv:1905.12322 (2019)."},{"key":"e_1_3_2_1_18_1","volume-title":"Analytical modeling is enough for high-performance blis. ACM Transactions on Mathematical Software (TOMS) 43, 2","author":"Low T. M.","year":"2016","unstructured":"Low, T. M., Igual, F. D., Smith, T. M., and Quintana-Orti, E. S. Analytical modeling is enough for high-performance blis. ACM Transactions on Mathematical Software (TOMS) 43, 2 (2016), 1--18."},{"key":"e_1_3_2_1_19_1","volume-title":"Fast training of convolutional networks through ffts. arXiv preprint arXiv:1312.5851","author":"Mathieu M.","year":"2013","unstructured":"Mathieu, M., Henaff, M., and LeCun, Y. Fast training of convolutional networks through ffts. arXiv preprint arXiv:1312.5851 (2013)."},{"key":"e_1_3_2_1_20_1","unstructured":"NEC. Nec llvm compiler 2022. https:\/\/github.com\/sx-aurora-dev\/llvm-project."},{"key":"e_1_3_2_1_21_1","unstructured":"NEC. Tensorflow-ve 2022. https:\/\/github.com\/sx-aurora-dev\/tensorflow."},{"key":"e_1_3_2_1_22_1","unstructured":"NEC. Vednn 2022. https:\/\/github.com\/sx-aurora-dev\/vednn."},{"key":"e_1_3_2_1_23_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke A.","year":"2019","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019), 8026--8037."},{"key":"e_1_3_2_1_24_1","volume-title":"Co-Design for A64FX Manycore Processor and \"Fugaku\". SC '20","author":"Sato M.","unstructured":"Sato, M., Ishikawa, Y., Tomita, H., Kodama, Y., Odajima, T., Tsuji, M., Yashiro, H., Aoki, M., Shida, N., Miyoshi, I., Hirai, K., Furuya, A., Asato, A., Morita, K., and Shimizu, T. Co-Design for A64FX Manycore Processor and \"Fugaku\". SC '20, IEEE Press."},{"key":"e_1_3_2_1_25_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan K.","year":"2014","unstructured":"Simonyan, K., and Zisserman, A. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.35"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2017.7995254"},{"key":"e_1_3_2_1_30_1","first-page":"19","volume-title":"Proceedings of A Symposium on High Performance Chips, Hot Chips","volume":"30","author":"Yamada Y.","year":"2018","unstructured":"Yamada, Y., and Momose, S. Vector engine processor of NEC's brand-new supercomputer SX-Aurora TSUBASA. In Proceedings of A Symposium on High Performance Chips, Hot Chips (2018), vol. 30, pp. 19--21."},{"key":"e_1_3_2_1_31_1","first-page":"5776","volume-title":"International Conference on Machine Learning","author":"Zhang J.","year":"2018","unstructured":"Zhang, J., Franchetti, F., and Low, T. M. High performance zero-memory overhead direct convolutions. In International Conference on Machine Learning (2018), PMLR, pp. 5776--5785."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.119"}],"event":{"name":"PPoPP '23: The 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Montreal QC Canada","acronym":"PPoPP '23","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577435","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3572848.3577435","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:08:09Z","timestamp":1750183689000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3572848.3577435"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,21]]},"references-count":32,"alternative-id":["10.1145\/3572848.3577435","10.1145\/3572848"],"URL":"https:\/\/doi.org\/10.1145\/3572848.3577435","relation":{},"subject":[],"published":{"date-parts":[[2023,2,21]]},"assertion":[{"value":"2023-02-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}