{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,8]],"date-time":"2026-03-08T17:37:08Z","timestamp":1772991428123,"version":"3.50.1"},"reference-count":46,"publisher":"IEEE","license":[{"start":{"date-parts":[[2019,2,1]],"date-time":"2019-02-01T00:00:00Z","timestamp":1548979200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,2,1]],"date-time":"2019-02-01T00:00:00Z","timestamp":1548979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,2,1]],"date-time":"2019-02-01T00:00:00Z","timestamp":1548979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,2]]},"DOI":"10.1109\/cgo.2019.8661187","type":"proceedings-article","created":{"date-parts":[[2019,3,8]],"date-time":"2019-03-08T00:01:46Z","timestamp":1552003306000},"page":"73-84","source":"Crossref","is-referenced-by-count":26,"title":["Automatic Generation of Warp-Level Primitives and Atomic Instructions for Fast and Portable Parallel Reduction on GPUs"],"prefix":"10.1109","author":[{"given":"Simon Garcia De","family":"Gonzalo","sequence":"first","affiliation":[]},{"given":"Sitao","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Juan","family":"Gomez-Luna","sequence":"additional","affiliation":[]},{"given":"Simon","family":"Hammond","sequence":"additional","affiliation":[]},{"given":"Onur","family":"Mutlu","sequence":"additional","affiliation":[]},{"given":"Wen-mei","family":"Hwu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155656"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/2872362.2872373"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628087"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00027"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00074"},{"key":"ref37","author":"luitjens","year":"2013","journal-title":"CUDA pro tip Increase performance with vectorized memory access"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/99.660313"},{"key":"ref35","article-title":"LLVM and Clang: Next generation compiler technology","author":"lattner","year":"2008","journal-title":"The BSD Conference"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2010.5470423"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"ref40","author":"harris","year":"0","journal-title":"Optimizing Parallel Reduction in CUDA"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2015.30"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-012-0443-3"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2012.319"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/2442516.2442539"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2851141.2851178"},{"key":"ref16","year":"2018","journal-title":"PTX Parallel Thread Execution ISA Version 2 3"},{"key":"ref17","author":"nyland","year":"2013","journal-title":"Understanding and using atomic memory operations"},{"key":"ref18","author":"demouth","year":"2013","journal-title":"Shuffle Tips and tricks"},{"key":"ref19","year":"2013","journal-title":"NVIDIA's Next Generation CUDA Compute Architecture Kepler GK110"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863730"},{"key":"ref4","year":"0","journal-title":"CUDA Zone"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.2172\/1169830"},{"key":"ref3","year":"0","journal-title":"RightScale 2018 State of the Cloud Report"},{"key":"ref6","article-title":"Thrust: A productivity-oriented library for cuda","author":"bell","year":"2011","journal-title":"GPU Computing Gems Jade Edition"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783718"},{"key":"ref5","year":"2014","journal-title":"OpenCL 2 0 API specification"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783715"},{"key":"ref7","author":"merrill","year":"2015","journal-title":"CUDA UnBound (CUB) library"},{"key":"ref2","author":"wu","year":"0","journal-title":"Green500 List June 2018"},{"key":"ref9","article-title":"Kokkos: Enabling manycore performance portability through polymorphic memory access patterns","author":"edwards","year":"2014","journal-title":"Journal of Parallel and Distributed Computing"},{"key":"ref1","author":"strohmaier","year":"0","journal-title":"Top500 List June 2018"},{"key":"ref46","doi-asserted-by":"crossref","DOI":"10.1145\/2967938.2967950","article-title":"Reduction drawing: Language constructs and polyhedral compilation for reductions on GPU","author":"reddy","year":"2016","journal-title":"PACT"},{"key":"ref20","year":"2018","journal-title":"CUDA C Programming Guide"},{"key":"ref45","article-title":"Pencil: A platform-neutral compute intermediate language for accelerator programming","author":"baghdadi","year":"2015","journal-title":"PACT"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458533"},{"key":"ref21","author":"kirk","year":"2016","journal-title":"Programming Massively Parallel Processors A Hands-on Approach"},{"key":"ref42","author":"catanzaro","year":"2010","journal-title":"OpenCL Optimization Case Study Simple Reductions"},{"key":"ref24","year":"2014","journal-title":"NVIDIA GeForce GTX 980 Featuring Maxwell The Most Advanced GPU Ever Made"},{"key":"ref41","author":"luitjens","year":"0","journal-title":"Faster parallel reductions on Kepler"},{"key":"ref23","year":"2012","journal-title":"Fermi Whitepaper"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863747"},{"key":"ref26","year":"2016","journal-title":"NVIDIA Tesla P100 GPU"},{"key":"ref43","doi-asserted-by":"crossref","DOI":"10.1145\/2784731.2784754","article-title":"Generating performance portable code using rewrite rules: from high-level functional expressions to high-performance OpenCL code","author":"steuwer","year":"2015","journal-title":"ICFP"},{"key":"ref25","author":"adinets","year":"2014","journal-title":"CUDA pro tip Optimized Filtering with Warp-Aggregated Atomics"}],"event":{"name":"2019 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","location":"Washington, DC, USA","start":{"date-parts":[[2019,2,16]]},"end":{"date-parts":[[2019,2,20]]}},"container-title":["2019 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8653576\/8661160\/08661187.pdf?arnumber=8661187","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,19]],"date-time":"2022-07-19T20:17:40Z","timestamp":1658261860000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8661187\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,2]]},"references-count":46,"URL":"https:\/\/doi.org\/10.1109\/cgo.2019.8661187","relation":{},"subject":[],"published":{"date-parts":[[2019,2]]}}}