{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:00:28Z","timestamp":1761897628811,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032024350","type":"print"},{"value":"9783032024367","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02436-7_3","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T06:50:27Z","timestamp":1761893427000},"page":"33-45","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Benchmarking Operators in Deep Neural Networks for Improving Performance Portability of SYCL"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7197-780X","authenticated-orcid":false,"given":"Zheming","family":"Jin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2449-6720","authenticated-orcid":false,"given":"Jeffrey 
S.","family":"Vetter","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"issue":"2","key":"3_CR1","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/MM.2008.31","volume":"28","author":"E Lindholm","year":"2008","unstructured":"Lindholm, E., Nickolls, J., Oberman, S., Montrym, J.: NVIDIA Tesla: a unified graphics and computing architecture. IEEE Micro 28(2), 39\u201355 (2008)","journal-title":"IEEE Micro"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Gutierrez, A., et al.: Lost in abstraction: pitfalls of analyzing GPUs at the intermediate language level. In: 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp. 608\u2013619. IEEE (2018)","DOI":"10.1109\/HPCA.2018.00058"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Blythe, D.: The Xe GPU architecture. In: 2020 IEEE Hot Chips 32 Symposium (HCS), pp. 1\u201327. IEEE Computer Society (2020)","DOI":"10.1109\/HCS49909.2020.9220591"},{"issue":"4","key":"3_CR4","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1109\/MM.2008.57","volume":"28","author":"M Garland","year":"2008","unstructured":"Garland, M., et al.: Parallel computing experiences with CUDA. IEEE Micro 28(4), 13\u201327 (2008)","journal-title":"IEEE Micro"},{"key":"3_CR5","unstructured":"Portability Across DOE Office of Science HPC Facilities. https:\/\/performanceportability.org\/"},{"issue":"4","key":"3_CR6","doi-asserted-by":"publisher","first-page":"805","DOI":"10.1109\/TPDS.2021.3097283","volume":"33","author":"CR Trott","year":"2021","unstructured":"Trott, C.R., et al.: Kokkos 3: programming model extensions for the exascale era. IEEE Trans. Parallel Distrib. Syst. 33(4), 805\u2013817 (2021)","journal-title":"IEEE Trans. Parallel Distrib. 
Syst."},{"issue":"1","key":"3_CR7","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1109\/99.660313","volume":"5","author":"L Dagum","year":"1998","unstructured":"Dagum, L., Menon, R.: OpenMP: an industry standard API for shared-memory programming. IEEE Comput. Sci. Eng. 5(1), 46\u201355 (1998)","journal-title":"IEEE Comput. Sci. Eng."},{"key":"3_CR8","unstructured":"SYCL 2020 Specification (revision 5). https:\/\/www.khronos.org\/registry\/SYCL\/specs\/sycl-2020\/html\/sycl-2020.html"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Homerding, B., Tramm, J.: Evaluating the performance of the hipSYCL toolchain for HPC kernels on NVIDIA V100 GPUs. In: Proceedings of the International Workshop on OpenCL, pp. 1\u20137 (2020)","DOI":"10.1145\/3388333.3388660"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Haseeb, M., Ding, N., Deslippe, J., Awan, M.: Evaluating Performance and Portability of a core bioinformatics kernel on multiple vendor GPUs. In: 2021 International Workshop on Performance, Portability and Productivity in HPC (P3HPC), pp. 68\u201378. IEEE (2021)","DOI":"10.1109\/P3HPC54578.2021.00010"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Jin, Z., Vetter, J.S.: Understanding performance portability of bioinformatics applications in SYCL on an NVIDIA GPU. In: 2022 IEEE International Conference on Bioinformatics and Biomedicine (BIBM), pp. 2190\u20132195. IEEE (2022)","DOI":"10.1109\/BIBM55620.2022.9995222"},{"key":"3_CR12","doi-asserted-by":"publisher","first-page":"120","DOI":"10.1016\/j.jpdc.2022.03.017","volume":"165","author":"G Casta\u00f1o","year":"2022","unstructured":"Casta\u00f1o, G., Faqir-Rhazoui, Y., Garc\u00eda, C., Prieto-Mat\u00edas, M.: Evaluation of Intel\u2019s DPC++ compatibility tool in heterogeneous computing. J. Parallel Distrib. Comput. 165, 120\u2013129 (2022)","journal-title":"J. Parallel Distrib. 
Comput."},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Hardy, D.J., Choi, J., Jiang, W., Tajkhorshid, E.: Experiences porting NAMD to the data parallel C++ programming model. In: International Workshop on OpenCL, pp. 1\u20135 (2022)","DOI":"10.1145\/3529538.3529560"},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"Che, S., et al.: Rodinia: a benchmark suite for heterogeneous computing. In: 2009 IEEE International Symposium on Workload Characterization (IISWC), pp. 44\u201354. IEEE (2009)","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"3_CR15","doi-asserted-by":"publisher","unstructured":"Breyer, M., Van Craen, A., Pfl\u00fcger, D.: A comparison of SYCL, OpenCL, CUDA, and OpenMP for massively parallel support vector machine classification on multi-vendor hardware. In: International Workshop on OpenCL (IWOCL'22), vol. 2, pp. 1\u201312. Association for Computing Machinery, New York (2022). https:\/\/doi.org\/10.1145\/3529538.3529980","DOI":"10.1145\/3529538.3529980"},{"key":"3_CR16","doi-asserted-by":"crossref","unstructured":"Tanvir, M., Narasimhan, K., Goli, M., El Farouki, O., Georgiev, S., Ault, I.: Towards performance portability of AI models using SYCL-DNN. In: International Workshop on OpenCL, pp. 1\u20133 (2022)","DOI":"10.1145\/3529538.3529999"},{"key":"3_CR17","doi-asserted-by":"publisher","first-page":"628","DOI":"10.1007\/s10766-021-00701-6","volume":"49","author":"J Li","year":"2021","unstructured":"Li, J., et al.: Compiler-assisted operator template library for DNN accelerators. Int. J. Parallel Prog. 49, 628\u2013645 (2021)","journal-title":"Int. J. Parallel Prog."},{"key":"3_CR18","unstructured":"Munshi, A., Gaster, B., Mattson, T.G., Ginsburg, D.: OpenCL programming guide. Pearson Education, Boston (2011)"},{"key":"3_CR19","unstructured":"Kaeli, D., Mistry, P., Schaa, D., Zhang, D.P.: Heterogeneous computing with OpenCL 2.0. 
Morgan Kaufmann (2015)"},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Li, P., Brunet, E., Trahay, F., Parrot, C., Thomas, G., Namyst, R.: Automatic OpenCL code generation for multi-device heterogeneous architectures. In: 2015 44th International Conference on Parallel Processing, pp. 959\u2013968. IEEE (2015)","DOI":"10.1109\/ICPP.2015.105"},{"issue":"1","key":"3_CR21","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1007\/s11227-014-1213-y","volume":"69","author":"M Steuwer","year":"2014","unstructured":"Steuwer, M., Gorlatch, S.: SkelCL: a high-level extension of OpenCL for multi-GPU systems. J. Supercomput. 69(1), 25\u201333 (2014)","journal-title":"J. Supercomput."},{"key":"3_CR22","unstructured":"Stroustrup, B.: The C++ Programming Language. Pearson Education (2013)"},{"key":"3_CR23","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"3_CR24","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Proceedings of 3rd International Conference on Learning Representations (ICLR) (ICLR, 2015) (2015)"},{"key":"3_CR25","unstructured":"Li, S., et al.: Colossal-AI: a unified deep learning system for large-scale parallel training (2021). arXiv preprint arXiv:2110.14883"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Ham, T.J., et al.: A^ 3: Accelerating attention mechanisms in neural networks with approximation. In: 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp. 328\u2013341. IEEE (2020)","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"3_CR27","unstructured":"A. Vaswani, et al.: Attention is all you need. 
In: International Conference on Neural Information Processing Systems, NIPS (2017)"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M., Sun, J.: Shufflenet: an extremely efficient convolutional neural network for mobile devices. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6848\u20136856 (2018)","DOI":"10.1109\/CVPR.2018.00716"},{"key":"3_CR29","unstructured":"The NVIDIA CUB library. https:\/\/docs.nvidia.com\/cuda\/cub\/index.html"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Chen, Z., Howe, A., Blair, H.T., Cong, J.: CLINK: compact LSTM inference kernel for energy efficient neurofeedback devices. In: Proceedings of the International Symposium on Low Power Electronics and Design, pp. 1\u20136 (2018)","DOI":"10.1145\/3218603.3218637"},{"issue":"8","key":"3_CR31","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Wang, X., Xiong, Y., Wei, Y., Wang, M., Li, L.: LightSeq: a high performance inference library for transformers. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers, pp. 113\u2013120 (2021)","DOI":"10.18653\/v1\/2021.naacl-industry.15"},{"key":"3_CR33","unstructured":"The Intel LLVM Github repository. https:\/\/github.com\/intel\/llvm\/issues\/5969"},{"key":"3_CR34","unstructured":"Howard, A.G., et al.: Mobilenets: efficient convolutional neural networks for mobile vision applications (2017). 
arXiv preprint arXiv:1704.04861"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Willemsen, F.J., van Nieuwpoort, R., van Werkhoven, B.: Bayesian Optimization for auto-tuning GPU kernels. In: 2021 International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS), pp. 106\u2013117. IEEE (2021)","DOI":"10.1109\/PMBS54543.2021.00017"},{"key":"3_CR36","unstructured":"A software development tool for the creation of highly-optimized and tuned GPU applications. https:\/\/github.com\/benvanwerkhoven\/kernel_tuner"},{"key":"3_CR37","unstructured":"C++ implementation of Gradient Descent, Stochastic Gradient Descent for Sparse Data. https:\/\/github.com\/CGudapati\/BinaryClassification"},{"key":"3_CR38","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (GELUs) (2016). arXiv preprint arXiv:1606.08415"},{"key":"3_CR39","unstructured":"Dauphin, Y.N., Fan, A., Auli, M., Grangier, D.: Language modeling with gated convolutional networks. In: International Conference on Machine Learning, pp. 933\u2013941. PMLR (2017)"},{"key":"3_CR40","volume-title":"Deep Learning","author":"Y Bengio","year":"2017","unstructured":"Bengio, Y., Goodfellow, I., Courville, A.: Deep Learning, vol. 1. MIT press, Cambridge (2017)"},{"key":"3_CR41","unstructured":"OpenCL Labs for PAPAA Summer School 2016 Edition. https:\/\/github.com\/nachiket\/papaa-opencl"},{"key":"3_CR42","unstructured":"Implementations of Mean Shift Clustering. https:\/\/github.com\/w00zie\/mean_shift"},{"key":"3_CR43","doi-asserted-by":"crossref","unstructured":"Reyes, R., Brown, G., Burns, R.: Bringing performant support for NVIDIA hardware to SYCL. In: Proceedings of the International Workshop on OpenCL, p. 1 (2020)","DOI":"10.1145\/3388333.3388651"},{"key":"3_CR44","unstructured":"The CUDA programming guide. 
https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.htm"},{"key":"3_CR45","unstructured":"The SYCL extensions implemented in the Intel LLVM compiler. https:\/\/github.com\/intel\/llvm\/blob\/sycl\/sycl\/doc\/extensions\/experimental\/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc"},{"key":"3_CR46","doi-asserted-by":"crossref","unstructured":"Wu, J., et al.: gpucc: an open-source GPGPU compiler. In: Proceedings of the 2016 International Symposium on Code Generation and Optimization, pp. 105\u2013116 (2016)","DOI":"10.1145\/2854038.2854041"},{"key":"3_CR47","doi-asserted-by":"crossref","unstructured":"Jin, Z., Vetter, J.S.: Performance portability study of epistasis detection using SYCL on NVIDIA GPU. In: Proceedings of the 13th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics, pp. 1\u20138 (2022)","DOI":"10.1145\/3535508.3545591"},{"key":"3_CR48","doi-asserted-by":"crossref","unstructured":"Ozturk, M.E., Asudeh, O., Sabin, G., Sadayappan, P., Sukumaran-Rajam, A.: A performance portability study using tensor contraction benchmarks. In: 2023 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), pp. 591\u2013600. IEEE (2023)","DOI":"10.1109\/IPDPSW59300.2023.00102"},{"key":"3_CR49","doi-asserted-by":"crossref","unstructured":"Solis-Vasquez, L., Mascarenhas, E., Koch, A.: Experiences migrating CUDA to SYCL: a molecular docking case study. In: Proceedings of the 2023 International Workshop on OpenCL (IWOCL \u201923), Article 15, pp. 1\u201311. Association for Computing Machinery, New York (2023)","DOI":"10.1145\/3585341.3585372"},{"key":"3_CR50","doi-asserted-by":"crossref","unstructured":"Breyer, M., Van Craen, A., Pfl\u00fcger, D.: Performance evolution of different SYCL implementations based on the parallel least squares support vector machine library. In: Proceedings of the 2023 International Workshop on OpenCL (IWOCL \u201923), Article 24, pp. 1\u201312. 
Association for Computing Machinery, New York (2023)","DOI":"10.1145\/3585341.3585369"}],"container-title":["Lecture Notes in Computer Science","Languages and Compilers for Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02436-7_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T06:50:36Z","timestamp":1761893436000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02436-7_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9783032024350","9783032024367"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02436-7_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"LCPC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on Languages and Compilers for Parallel Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lexington, KY","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 
2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"36","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"lcpc2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.lcpcworkshop.org\/LCPC23\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}