{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:33:53Z","timestamp":1772724833527,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015,2]]},"DOI":"10.1109\/cgo.2015.7054183","type":"proceedings-article","created":{"date-parts":[[2015,3,10]],"date-time":"2015-03-10T17:47:12Z","timestamp":1426009632000},"page":"12-22","source":"Crossref","is-referenced-by-count":29,"title":["Characterizing and enhancing global memory data coalescing on GPUs"],"prefix":"10.1109","author":[{"given":"Naznin","family":"Fauzia","sequence":"first","affiliation":[]},{"given":"Louis-Noel","family":"Pouchet","sequence":"additional","affiliation":[]},{"given":"P.","family":"Sadayappan","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2038037.1941574"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/1806596.1806606"},{"key":"ref33","first-page":"1","article-title":"Languages and compilers for parallel computing","author":"ueng","year":"2008","journal-title":"Chapter CUDA-Lite Reducing GPU Programming Complexity"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/1926385.1926449"},{"key":"ref31","article-title":"NVIDIA Corporation","year":"2011","journal-title":"NVIDIA CUDA C Programming Guide"},{"key":"ref30","article-title":"NVIDIA Corporation","year":"0","journal-title":"Parallel Thread Execution ISA"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/2442516.2442523"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2581122.2544141"},{"key":"ref34","year":"0","journal-title":"University of Illinois Urbana-Champaign Clang"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"1370","DOI":"10.1016\/j.jpdc.2008.05.014","article-title":"A performance study of general-purpose applications on graphics processors using cuda","volume":"68","author":"che","year":"2008","journal-title":"J Parallel Distrib Comput"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2012.6507475"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2010.5650274"},{"key":"ref13","first-page":"13:1","article-title":"Dymaxion: optimizing memory access patterns for heterogeneous systems","author":"che","year":"2011","journal-title":"SC"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5222004"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/BF01407931"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/BF01379404"},{"key":"ref18","year":"0","journal-title":"Georgia Institute of Technology GPUOcelot"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"ref28","first-page":"6:1","article-title":"Optimizing symmetric dense matrix-vector multiplication on gpus","author":"nath","year":"2011","journal-title":"SC"},{"key":"ref4","article-title":"Par4all: From convex array regions to heterogeneous computing","author":"amini","year":"0"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503268"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2467288"},{"key":"ref6","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1345206.1345210","article-title":"Automatic data movement and computation mapping for multi-level parallel architectures with explicitly managed memories","author":"baskaran","year":"2008","journal-title":"PPoPP"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/1654059.1654090"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/1375527.1375562"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2004.1342537"},{"key":"ref7","first-page":"244","article-title":"Automatic c-to-cuda code generation for affine programs","author":"baskaran","year":"2010","journal-title":"CCC"},{"key":"ref2","author":"aho","year":"1986","journal-title":"Compilers Principles Techniques and Tools"},{"key":"ref9","article-title":"Automated Dynamic Analysis of CUDA Programs","author":"boyer","year":"2008","journal-title":"In Third Workshop on Software Tools for MultiCore Systems"},{"key":"ref1","year":"0","journal-title":"PoCC the polyhedral compiler collection"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458526"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.36"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2010.62"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"101","DOI":"10.1145\/1594835.1504194","article-title":"Openmp to gpgpu: A compiler framework for automatic translation and optimization","volume":"44","author":"lee","year":"2009","journal-title":"SIGPLAN Not"},{"key":"ref23","first-page":"116","article-title":"Openmpc: Extended openmp for efficient programming and tuning on gpus","volume":"7","author":"lee","year":"2012","journal-title":"International Journal of Computational Engineering Science"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/263699.263719"},{"key":"ref25","first-page":"884","article-title":"A note on auto-tuning gemm for gpus","author":"li","year":"2009","journal-title":"ICCS"}],"event":{"name":"2015 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","location":"San Francisco, CA, USA","start":{"date-parts":[[2015,2,7]]},"end":{"date-parts":[[2015,2,11]]}},"container-title":["2015 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7041249\/7054173\/07054183.pdf?arnumber=7054183","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,6,23]],"date-time":"2017-06-23T03:03:50Z","timestamp":1498187030000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7054183\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,2]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/cgo.2015.7054183","relation":{},"subject":[],"published":{"date-parts":[[2015,2]]}}}