{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:14:59Z","timestamp":1766067299788,"version":"3.28.0"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,2,27]],"date-time":"2021-02-27T00:00:00Z","timestamp":1614384000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,2,27]],"date-time":"2021-02-27T00:00:00Z","timestamp":1614384000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,2,27]],"date-time":"2021-02-27T00:00:00Z","timestamp":1614384000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,2,27]]},"DOI":"10.1109\/cgo51591.2021.9370324","type":"proceedings-article","created":{"date-parts":[[2021,3,11]],"date-time":"2021-03-11T21:33:26Z","timestamp":1615498406000},"page":"289-300","source":"Crossref","is-referenced-by-count":6,"title":["C-for-Metal: High Performance Simd Programming on Intel GPUs"],"prefix":"10.1109","author":[{"given":"Guei-Yuan","family":"Lueh","sequence":"first","affiliation":[]},{"given":"Kaiyu","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Joel","family":"Fuentes","sequence":"additional","affiliation":[]},{"given":"Wei-Yu","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Fangwen","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Hong","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Hongzheng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Daniel","family":"Rhee","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"journal-title":"SGEMM for Intel&#x00AE; Processor Graphics","year":"2015","author":"kong","key":"ref39"},{"journal-title":"An efficient matrix transpose in CUDA CIC++","year":"2013","author":"harris","key":"ref38"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661189"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/1964179.1964184"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-54807-9_8"},{"journal-title":"Khronos OpenCL Working Group","year":"2018","key":"ref30"},{"journal-title":"An efficient K-means clustering algorithm","year":"1997","author":"alsabti","key":"ref37"},{"key":"ref36","first-page":"104","article-title":"A bitonic sorting network with simpler flip interconnections","author":"lee","year":"1996","journal-title":"Proceedings Second International Symposium on Parallel Architectures, Algorithms, and Networks (I-SPAN'96)"},{"journal-title":"Intel(R) SDK for OpenCL Applications","year":"2019","key":"ref35"},{"key":"ref34","article-title":"LLVM and Clang: Next generation compiler technology","volume":"5","author":"lattner","year":"2008","journal-title":"The BSD Conference"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2012.07.005"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/12.42122"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-38750-0_11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2015.85"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2011.45"},{"journal-title":"Intel intrinsics guide","year":"2020","key":"ref14"},{"journal-title":"Data-parallel vector library","year":"2020","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/2870650.2870653"},{"journal-title":"Intel oneAPI Data Parallel C++","year":"2020","key":"ref17"},{"key":"ref18","article-title":"C*: An extended c language for data parallel programming","author":"rose","year":"1987","journal-title":"Proceedings of the Second International Conference on Supercomputing"},{"key":"ref19","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1145\/2370036.2145825","article-title":"Extending a C-like language for portable SIMD programming","volume":"47","author":"leiba","year":"2012","journal-title":"ACM SIGPLAN Notices"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2011.63"},{"journal-title":"Intel(R) Graphics Compute Runtime for one API Level Zero and OpenCL(TM) Driver","year":"2020","key":"ref4"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3168806"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"journal-title":"C-for-Metal Compiler","year":"2019","key":"ref6"},{"journal-title":"Using cuda warp-level primitives","year":"2018","author":"yuan","key":"ref29"},{"journal-title":"oneAPI Level Zero Specification","year":"2020","key":"ref5"},{"key":"ref8","first-page":"3","article-title":"An experimental study on performance portability of OpenCL kernels","author":"rul","year":"2010","journal-title":"Symposium on Application Accelerators in High Performance Computing 2010"},{"journal-title":"Intel Subgroup Extension Specification","year":"2016","key":"ref7"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2009.7478342"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2011.10.002"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/1365490.1365500"},{"journal-title":"MLIR A Compiler Infrastructure for the End of Moore's Law","year":"2020","author":"lattner","key":"ref20"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2908080.2908105"},{"journal-title":"Multi-Level IR Compiler Framework - Vector Dialect","year":"2020","key":"ref21"},{"journal-title":"IS PC for Gen","year":"2020","key":"ref42"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/2254064.2254079"},{"journal-title":"Explicit SIMD Programming Extension for DPC++","year":"2020","key":"ref41"},{"journal-title":"Glow Graph lowering compiler techniques for neural networks","year":"2018","author":"rotem","key":"ref23"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-43229-4_33"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2019.00015"}],"event":{"name":"2021 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","start":{"date-parts":[[2021,2,27]]},"location":"Seoul, Korea (South)","end":{"date-parts":[[2021,3,3]]}},"container-title":["2021 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9370300\/9370301\/09370324.pdf?arnumber=9370324","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T15:42:50Z","timestamp":1652197370000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9370324\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,27]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/cgo51591.2021.9370324","relation":{},"subject":[],"published":{"date-parts":[[2021,2,27]]}}}