{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T02:23:16Z","timestamp":1748744596525,"version":"3.37.3"},"reference-count":45,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2018,8,1]],"date-time":"2018-08-01T00:00:00Z","timestamp":1533081600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"DOI":"10.13039\/501100001809","name":"Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61672048"],"award-info":[{"award-number":["61672048"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Comput.-Aided Des. Integr. Circuits Syst."],"published-print":{"date-parts":[[2018,8]]},"DOI":"10.1109\/tcad.2017.2764886","type":"journal-article","created":{"date-parts":[[2017,10,20]],"date-time":"2017-10-20T18:04:59Z","timestamp":1508522699000},"page":"1560-1573","source":"Crossref","is-referenced-by-count":8,"title":["Optimizing Cache Bypassing and Warp Scheduling for GPUs"],"prefix":"10.1109","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9076-7998","authenticated-orcid":false,"given":"Yun","family":"Liang","sequence":"first","affiliation":[]},{"given":"Xiaolong","family":"Xie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0597-3544","authenticated-orcid":false,"given":"Yu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Guangyu","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Wang","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2007.70816"},{"key":"ref38","first-page":"81","article-title":"Bypass and insertion algorithms for exclusive last-level caches","author":"gaur","year":"2011","journal-title":"2011 38th Annual International Symposium on Computer Architecture (ISCA) ISCA"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830813"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155675"},{"key":"ref31","first-page":"832","article-title":"Real-time implementation and performance optimization of 3D sound localization on GPUs","author":"liang","year":"2012","journal-title":"Proceedings of the Design Automation and Test in Europe (DATE)"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.3850\/9783981537079_0647"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485951"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540718"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000093"},{"key":"ref34","first-page":"85","article-title":"Performance-centric register file design for GPUs using racetrack memory","author":"wang","year":"2016","journal-title":"Proc Asia South Pacific Design Autom Conf (ASP-DAC)"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.2013.6691165"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/379240.379259"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835938"},{"key":"ref12","first-page":"134","article-title":"Compiler managed micro-cache bypassing for high performance EPIC processors","author":"wu","year":"2002","journal-title":"Proc 35th Ann Int Symp Microarch (MICRO)"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.24"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155671"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454152"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2014.2313342"},{"journal-title":"The IMPACT Research Group UIUC Parboil benchmark suite","year":"0","author":"stratton","key":"ref4"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2013.2278025"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451160"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304582"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155656"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.18"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056023"},{"key":"ref1","first-page":"157","article-title":"Neither more nor less: Optimizing thread-level parallelism for GPGPUs","author":"kay?ran","year":"2013","journal-title":"Proc Int Conf Parallel Archit Compilation Techn (PACT)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.11"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835955"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555775"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2015.2424962"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2008.4771793"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/1693453.1693470"},{"key":"ref44","first-page":"25","article-title":"Exploiting inter-warp heterogeneity to improve GPGPU performance","author":"ausavarungnirun","year":"2015","journal-title":"Proc Int Conf Parallel Archit Compilation Techn (PACT)"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2015.2501303"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751237"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815992"}],"container-title":["IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/43\/8411796\/08076886.pdf?arnumber=8076886","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,26]],"date-time":"2022-01-26T16:29:02Z","timestamp":1643214542000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8076886\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,8]]},"references-count":45,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tcad.2017.2764886","relation":{},"ISSN":["0278-0070","1937-4151"],"issn-type":[{"type":"print","value":"0278-0070"},{"type":"electronic","value":"1937-4151"}],"subject":[],"published":{"date-parts":[[2018,8]]}}}