{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T22:15:55Z","timestamp":1766268955246,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,5,14]],"date-time":"2018-05-14T00:00:00Z","timestamp":1526256000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,5,14]]},"DOI":"10.1145\/3204919.3204924","type":"proceedings-article","created":{"date-parts":[[2018,5,2]],"date-time":"2018-05-02T12:21:47Z","timestamp":1525263707000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":51,"title":["CLBlast"],"prefix":"10.1145","author":[{"given":"Cedric","family":"Nugteren","sequence":"first","affiliation":[{"name":"TomTom, Amsterdam, The Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2018,5,14]]},"reference":[{"doi-asserted-by":"publisher","key":"e_1_3_2_1_1_1","DOI":"10.1007\/978-3-319-41321-1_2"},{"unstructured":"R. Ballester-Ripoll E. G. Paredes and R. Pajarola. 2017. Sobol Tensor Trains for Global Sensitivity Analysis. ArXiv e-prints (Dec. 2017). arXiv:1712.00233  R. Ballester-Ripoll E. G. Paredes and R. Pajarola. 2017. Sobol Tensor Trains for Global Sensitivity Analysis. ArXiv e-prints (Dec. 2017). arXiv:1712.00233","key":"e_1_3_2_1_2_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1145\/1941487.1941507"},{"unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. (2014).  Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. (2014).","key":"e_1_3_2_1_4_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_6_1","DOI":"10.1109\/IPDPSW.2015.85"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1109\/CGO.2013.6494986"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_8_1","DOI":"10.1007\/978-3-642-01970-8_89"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1007\/978-3-319-43659-3_48"},{"key":"e_1_3_2_1_10_1","first-page":"2014","article-title":"Implementing Level-3 BLAS Routines in OpenCL on Different Processing Units","author":"Matsumoto K.","year":"2014","unstructured":"K. Matsumoto , N. Nakasato , and S.G. Sedukhin . 2014 . Implementing Level-3 BLAS Routines in OpenCL on Different Processing Units . Technical Report TR 2014 - 2001 . The University of Aizu. K. Matsumoto, N. Nakasato, and S.G. Sedukhin. 2014. Implementing Level-3 BLAS Routines in OpenCL on Different Processing Units. Technical Report TR 2014-001. The University of Aizu.","journal-title":"Technical Report TR"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_11_1","DOI":"10.1109\/SC.Companion.2012.59"},{"key":"e_1_3_2_1_12_1","volume-title":"Mixed Precision Training. arXiv abs\/1710.03740","author":"Micikevicius Paulius","year":"2017","unstructured":"Paulius Micikevicius , Sharan Narang , Jonah Alben , Gregory F. Diamos , Erich Elsen , David Garcia , Boris Ginsburg , Michael Houston , Oleksii Kuchaiev , Ganesh Venkatesh , and Hao Wu. 2017. Mixed Precision Training. arXiv abs\/1710.03740 ( 2017 ). arXiv:1710.03740 http:\/\/arxiv.org\/abs\/1710.03740 Paulius Micikevicius, Sharan Narang, Jonah Alben, Gregory F. Diamos, Erich Elsen, David Garcia, Boris Ginsburg, Michael Houston, Oleksii Kuchaiev, Ganesh Venkatesh, and Hao Wu. 2017. Mixed Precision Training. arXiv abs\/1710.03740 (2017). arXiv:1710.03740 http:\/\/arxiv.org\/abs\/1710.03740"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1145\/3075564.3077382"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.1109\/MCSoC.2015.10"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1145\/3078155.3078156"},{"unstructured":"Samuel D Relton Pedro Valero-Lara and Mawussi Zounon. 2016. A Comparison of Potential Interfaces for Batched BLAS Computations. (2016).  Samuel D Relton Pedro Valero-Lara and Mawussi Zounon. 2016. A Comparison of Potential Interfaces for Batched BLAS Computations. (2016).","key":"e_1_3_2_1_16_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_17_1","DOI":"10.1145\/3126908.3126939"},{"key":"e_1_3_2_1_18_1","volume-title":"Efficient Convolutional Neural Networks for Pixelwise Classification on Heterogeneous Hardware Systems. arXiv abs\/1509.03371","author":"Tschopp Fabian","year":"2015","unstructured":"Fabian Tschopp . 2015. Efficient Convolutional Neural Networks for Pixelwise Classification on Heterogeneous Hardware Systems. arXiv abs\/1509.03371 ( 2015 ). arXiv:1509.03371 http:\/\/arxiv.org\/abs\/1509.03371 Fabian Tschopp. 2015. Efficient Convolutional Neural Networks for Pixelwise Classification on Heterogeneous Hardware Systems. arXiv abs\/1509.03371 (2015). arXiv:1509.03371 http:\/\/arxiv.org\/abs\/1509.03371"},{"key":"e_1_3_2_1_19_1","volume-title":"Parallel Multi Channel Convolution using General Matrix Multiplication. arXiv abs\/1704.04428","author":"Vasudevan Aravind","year":"2017","unstructured":"Aravind Vasudevan , Andrew Anderson , and David Gregg . 2017. Parallel Multi Channel Convolution using General Matrix Multiplication. arXiv abs\/1704.04428 ( 2017 ). Aravind Vasudevan, Andrew Anderson, and David Gregg. 2017. Parallel Multi Channel Convolution using General Matrix Multiplication. arXiv abs\/1704.04428 (2017)."},{"key":"e_1_3_2_1_20_1","volume-title":"Why GEMM is at the heart of deep learning. https:\/\/petewarden.com\/2015\/04\/20\/why-gemm-is-at-the-heart-of-deep-learning","author":"Warden Pete","year":"2015","unstructured":"Pete Warden . 2015. Why GEMM is at the heart of deep learning. https:\/\/petewarden.com\/2015\/04\/20\/why-gemm-is-at-the-heart-of-deep-learning ( 2015 ). Pete Warden. 2015. Why GEMM is at the heart of deep learning. https:\/\/petewarden.com\/2015\/04\/20\/why-gemm-is-at-the-heart-of-deep-learning (2015)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.1109\/SAAHPC.2012.19"}],"event":{"sponsor":["Huawei Technologies Co. Ltd. Huawei Technologies Co. Ltd.","Khronos Khronos Group","Xilinx Xilinx Inc.","Codeplay Codeplay Software Ltd.","Intel Intel","The University of Bristol The University of Bristol"],"acronym":"IWOCL '18","name":"IWOCL '18: International Workshop on OpenCL","location":"Oxford United Kingdom"},"container-title":["Proceedings of the International Workshop on OpenCL"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3204919.3204924","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3204919.3204924","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T01:08:31Z","timestamp":1750208911000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3204919.3204924"}},"subtitle":["A Tuned OpenCL BLAS Library"],"short-title":[],"issued":{"date-parts":[[2018,5,14]]},"references-count":20,"alternative-id":["10.1145\/3204919.3204924","10.1145\/3204919"],"URL":"https:\/\/doi.org\/10.1145\/3204919.3204924","relation":{},"subject":[],"published":{"date-parts":[[2018,5,14]]},"assertion":[{"value":"2018-05-14","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}