{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:02:08Z","timestamp":1776931328933,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767574","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:13:44Z","timestamp":1762532024000},"page":"2129-2136","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Study of Performance Portability of Low-bit Fused Matrix-Vector Multiplication Kernels in SYCL"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7197-780X","authenticated-orcid":false,"given":"Zheming","family":"Jin","sequence":"first","affiliation":[{"name":"Oak Ridge National Lab, Oak Ridge, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_1_1_2","unstructured":"Brown T. Mann B. Ryder N. Subbiah M. Kaplan J.D. Dhariwal P. Neelakantan A. Shyam P. Sastry G. Askell A. and Agarwal S. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 pp.1877-1901."},{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2008.917757"},{"key":"e_1_3_3_1_3_2","first-page":"307","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Wang L.","unstructured":"Wang, L., Ma, L., Cao, S., Zhang, Q., Xue, J., Shi, Y., Zheng, N., Miao, Z., Yang, F., Cao, T. and Yang, Y., 2024. Ladder: Enabling Efficient {Low-Precision} Deep Learning Computing through Hardware-aware Tensor Transformation. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24) (pp. 307-323)."},{"key":"e_1_3_3_1_4_2","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323.","author":"Frantar E.","year":"2022","unstructured":"Frantar, E., Ashkboos, S., Hoefler, T. and Alistarh, D., 2022. Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323."},{"key":"e_1_3_3_1_5_2","first-page":"87","article-title":". AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration","volume":"6","author":"Lin J.","year":"2024","unstructured":"Lin, J., Tang, J., Tang, H., Yang, S., Chen, W.M., Wang, W.C., Xiao, G., Dang, X., Gan, C. and Han, S., 2024. AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration. Proceedings of Machine Learning and Systems, 6, pp.87-100.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_3_1_6_2","unstructured":"High-speed GEMV kernels. https:\/\/github.com\/wangsiping97\/FastGEMV"},{"key":"e_1_3_3_1_7_2","unstructured":"GemLite: Fast low-bit matmul kernels in Triton. https:\/\/github.com\/mobiusml\/gemlite"},{"key":"e_1_3_3_1_8_2","volume-title":"International Conference on Machine Learning (pp. 38087-38099)","author":"Xiao G.","unstructured":"Xiao, G., Lin, J., Seznec, M., Wu, H., Demouth, J. and Han, S., 2023, July. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning (pp. 38087-38099). PMLR."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Garland M. Le Grand S. Nickolls J. Anderson J. Hardwick J. Morton S. Phillips E. Zhang Y. and Volkov V. 2008. Parallel computing experiences with CUDA. IEEE micro 28(4) pp.13-27.","DOI":"10.1109\/MM.2008.57"},{"key":"e_1_3_3_1_10_2","volume-title":"SYCL: Single-source C++ accelerator programming. In Parallel Computing: On the Road to Exascale (pp. 673-682)","author":"Reyes R.","year":"2016","unstructured":"Reyes, R. and Lom\u00fcller, V., 2016. SYCL: Single-source C++ accelerator programming. In Parallel Computing: On the Road to Exascale (pp. 673-682). IOS Press."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2011.10.002"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Reinders J. Ashbaugh B. Brodman J. Kinsner M. Pennycook J. and Tian X. 2021. Data parallel C++: mastering DPC++ for programming of heterogeneous systems using C++ and SYCL (p. 548). Springer Nature.","DOI":"10.1007\/978-1-4842-5574-2"},{"key":"e_1_3_3_1_13_2","volume-title":"2022 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC) (pp. 111-122)","author":"Narasimhan K.","unstructured":"Narasimhan, K., El Farouki, O., Goli, M., Tanvir, M., Georgiev, S. and Ault, I., 2022, November. Towards performance portability of AI graphs using SYCL. In 2022 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC) (pp. 111-122). IEEE."},{"key":"e_1_3_3_1_14_2","unstructured":"Davis J.H. Sivaraman P. Minn I. Parasyris K. Menon H. Georgakoudis G. and Bhatele A. 2024. An evaluative comparison of performance portability across GPU programming models (No. LLNL-CONF-855581). Lawrence Livermore National Laboratory (LLNL) Livermore CA (United States)."},{"key":"e_1_3_3_1_15_2","volume-title":"2020 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC) (pp. 25-35)","author":"Goli M.","unstructured":"Goli, M., Narasimhan, K., Reyes, R., Tracy, B., Soutar, D., Georgiev, S., Fomenko, E.M. and Chereshnev, E., 2020, November. Towards cross-platform performance portability of DNN models using SYCL. In 2020 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC) (pp. 25-35). IEEE."},{"key":"e_1_3_3_1_16_2","volume-title":"Intel Data Parallel C++ Compiler","author":"Intel Corporation","year":"2025","unstructured":"Intel Corporation, \u201cIntel Data Parallel C++ Compiler,\u201d 2025. [Online]. Available: https:\/\/github.com\/intel\/llvm"},{"key":"e_1_3_3_1_17_2","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS'17)","author":"Ashish Vaswani","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS'17). Curran Associates Inc., Red Hook, NY, USA, 6000\u20136010."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.4218\/etrij.2024-0111"},{"key":"e_1_3_3_1_19_2","volume-title":"2009 IEEE international symposium on workload characterization (IISWC) (pp. 3-12)","author":"Kerr A.","unstructured":"Kerr, A., Diamos, G. and Yalamanchili, S., 2009, October. A characterization and analysis of PTX kernels. In 2009 IEEE international symposium on workload characterization (IISWC) (pp. 3-12). IEEE."},{"key":"e_1_3_3_1_20_2","volume-title":"Proceedings of the 2023 International Workshop on OpenCL (pp. 1-9).","author":"Pennycook S.J.","unstructured":"Pennycook, S.J., Ashbaugh, B., Brodman, J., Kinsner, M., Larsen, S., Lueck, G., Schulz, R. and Voss, M., 2023, April. Towards Alignment of Parallelism in SYCL and ISO C++. In Proceedings of the 2023 International Workshop on OpenCL (pp. 1-9)."},{"key":"e_1_3_3_1_21_2","volume-title":"Proceedings of the 12th International Workshop on OpenCL and SYCL (pp. 1-12)","author":"Crisci L.","unstructured":"Crisci, L., Carpentieri, L., Thoman, P., Alpay, A., Heuveline, V. and Cosenza, B., 2024, April. SYCL-Bench 2020: Benchmarking SYCL 2020 on AMD, Intel, and Nvidia GPUs. In Proceedings of the 12th International Workshop on OpenCL and SYCL (pp. 1-12)."},{"key":"e_1_3_3_1_22_2","volume-title":"Proceedings of the 13th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics (pp. 1-8).","author":"Jin Z.","unstructured":"Jin, Z. and Vetter, J.S., 2022, August. Performance portability study of epistasis detection using SYCL on NVIDIA GPU. In Proceedings of the 13th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics (pp. 1-8)."},{"key":"e_1_3_3_1_23_2","volume-title":"Proceedings of the SC'23 Workshops of the International Conference on High Performance Computing, Network, Storage, and Analysis (pp. 547-555)","author":"Weckert C.","unstructured":"Weckert, C., Solis-Vasquez, L., Oppermann, J., Koch, A. and Sinnen, O., 2023, November. Altis-SYCL: Migrating altis benchmarking suite from CUDA to SYCL for GPUs and FPGAs. In Proceedings of the SC'23 Workshops of the International Conference on High Performance Computing, Network, Storage, and Analysis (pp. 547-555)."}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767574","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:29:21Z","timestamp":1767986961000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767574"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":23,"alternative-id":["10.1145\/3731599.3767574","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767574","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}