{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T02:16:34Z","timestamp":1775873794653,"version":"3.50.1"},"reference-count":87,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016,10]]},"DOI":"10.1109\/micro.2016.7783718","type":"proceedings-article","created":{"date-parts":[[2016,12,19]],"date-time":"2016-12-19T22:11:05Z","timestamp":1482185465000},"page":"1-14","source":"Crossref","is-referenced-by-count":34,"title":["Zorua: A holistic approach to resource virtualization in GPUs"],"prefix":"10.1109","author":[{"given":"Nandita","family":"Vijaykumar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kevin","family":"Hsieh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gennady","family":"Pekhimenko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Samira","family":"Khan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ashish","family":"Shrestha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Saugata","family":"Ghose","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Adwait","family":"Jog","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Phillip B.","family":"Gibbons","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Onur","family":"Mutlu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/1060289.1060307"},{"key":"ref72","article-title":"Scaling the Power Wall: A Path to Exascale","author":"villa","year":"2014","journal-title":"SC"},{"key":"ref71","article-title":"A New Framework for GPU Resource Virtualization","author":"vijaykumar","year":"2016","journal-title":"CMU SAFARI Technical Report No 2016-005"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750399"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/4.509850"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835939"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750393"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750374"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400690"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056023"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830813"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/2588768.2576780"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"ref31","article-title":"Orchestrated Scheduling and Pre fetching for GPGPUs","author":"jog","year":"2013","journal-title":"ISCA"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830784"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/2967938.2967941"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.62"},{"key":"ref35","article-title":"Neither More Nor Less: Optimizing Thread-level Parallelism for GPGPUs","author":"kayiran","year":"2013","journal-title":"PACT"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/2581122.2544145"},{"key":"ref60","article-title":"Program optimization carving for GPU computing","author":"ryoo","year":"2008","journal-title":"JPDC"},{"key":"ref62","author":"sato","year":"2010","journal-title":"Automatic Tuning of CUDA Execution Parameters for Stencil Processing"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/1356058.1356084"},{"key":"ref63","article-title":"Atune-IL: An instrumentation language for autotuning parallel applications","author":"schaefer","year":"2009","journal-title":"Euro-Par"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/1950365.1950409"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15291-7_26"},{"key":"ref27","article-title":"Unified On-chip Memory Allocation for SIMT Archi-tecture","author":"hayes","year":"2014","journal-title":"ICS"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2012.194"},{"key":"ref66","article-title":"Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing","author":"stratton","year":"2012","journal-title":"UIUC Technical Report IMPACT-12-01"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/40.710872"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2014.6853208"},{"key":"ref68","author":"tarjan","year":"2011","journal-title":"On Demand Register Allocation and Deallocation for A Multithreaded Processor"},{"key":"ref69","article-title":"CUDA-Lite: Reducing GPU Programming Complexity","author":"sain-zee","year":"2008","journal-title":"LCPC"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1147\/rd.82.0087"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522337"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000093"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.18"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/2166879.2166882"},{"key":"ref24","article-title":"Fine-grained resource sharing for concurrent GPGPU kernels","author":"gregg","year":"2012","journal-title":"HotPar"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.1998.650557"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2010.62"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1147\/rd.276.0530"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155656"},{"key":"ref51","year":"0","journal-title":"Nintendo\/Creatures Inc \/GAME FREAK inc Pok&#x00E9;mon"},{"key":"ref59","doi-asserted-by":"crossref","DOI":"10.1145\/1345206.1345220","article-title":"Optimization Principles and Application Performance Evaluation of a Multithreaded GPU Using CUDA","author":"ryoo","year":"2008","journal-title":"PPoPP"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694346"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451160"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2005.21"},{"key":"ref53","article-title":"NVIDIA","year":"2014","journal-title":"CUDA Dynamic Parallelism Programming Guide"},{"key":"ref52","article-title":"NVIDIA","year":"2011","journal-title":"CUDA C\/C++ SDK CODE Samples"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.20"},{"key":"ref11","author":"chen","year":"2008","journal-title":"N-Queens Solver"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835970"},{"key":"ref12","article-title":"Virtual local stores: Enabling software-managed memory hierarchies in mainstream computing environments","author":"cook","year":"2009","journal-title":"Tech Rep UCB\/EECS-2009-131"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1147\/rd.255.0483"},{"key":"ref14","article-title":"Toward Techniques for Auto-Tuning GPU Algorithms","author":"davidson","year":"2010","journal-title":"Applied Parallel and ScientificComputing"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/356571.356573"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1145\/1806596.1806606"},{"key":"ref16","article-title":"HMPP: A hybrid multi-core parallel programming environment","author":"dolbeau","year":"2007","journal-title":"GPGPU"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/1328195.1328198"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/1941553.1941589"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/2370816.2370858"},{"key":"ref18","article-title":"Spills, fills, and kills - an architecture for reducing register-memory traffic","author":"erez","year":"2000","journal-title":"Technical Report Stanford University"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/2207222.2207225"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155675"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-69338-3_5"},{"key":"ref4","article-title":"Staged Memory Scheduling: Achieving High Prformance and Scalability in Heterogeneous Systems","author":"ausavarungnirun","year":"2012","journal-title":"ISCA"},{"key":"ref3","article-title":"Exploiting Inter-Warp Heterogeneity to Improve GPGPU Performance","author":"ausavarangnirun","year":"2015","journal-title":"PACT"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/359327.359335"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.59"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"ref86","doi-asserted-by":"crossref","DOI":"10.1145\/360128.360143","article-title":"Two-level Hierarchical Register File Organization for VLIW Processors","author":"zalamea","year":"2000","journal-title":"Micro"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2002.1176236"},{"key":"ref49","article-title":"iGPU: Exception Support and Speculative Execution on GPUs","author":"menon","year":"2012","journal-title":"ISCA"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2013.257"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref46","article-title":"A cross-input adaptive framework for GPU program optimizations","author":"liu","year":"2009","journal-title":"IPDPS"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056024"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.2307\/2683468"},{"key":"ref47","article-title":"Low GPU occupancy approach to fast arithmetic coding in JPEG 2000","author":"matela","year":"2011","journal-title":"Math Eng Methods Comput Sci"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835937"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751237"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2015.7054184"}],"event":{"name":"2016 49th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO)","location":"Taipei","start":{"date-parts":[[2016,10,15]]},"end":{"date-parts":[[2016,10,19]]}},"container-title":["2016 49th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7777315\/7783693\/07783718.pdf?arnumber=7783718","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,21]],"date-time":"2023-08-21T10:39:19Z","timestamp":1692614359000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/7783718\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10]]},"references-count":87,"URL":"https:\/\/doi.org\/10.1109\/micro.2016.7783718","relation":{},"subject":[],"published":{"date-parts":[[2016,10]]}}}