{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:35:09Z","timestamp":1763458509616,"version":"3.45.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,9,11]],"date-time":"2017-09-11T00:00:00Z","timestamp":1505088000000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1561041","1059376","1340947","1564647"],"award-info":[{"award-number":["1561041","1059376","1340947","1564647"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,9,11]]},"DOI":"10.1145\/2967938.2967947","type":"proceedings-article","created":{"date-parts":[[2016,8,31]],"date-time":"2016-08-31T08:32:08Z","timestamp":1472632328000},"page":"45-55","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["OAWS"],"prefix":"10.1145","author":[{"given":"Bin","family":"Wang","sequence":"first","affiliation":[{"name":"Auburn University, Auburn, AL, USA"}]},{"given":"Yue","family":"Zhu","sequence":"additional","affiliation":[{"name":"Florida State University, Tallahassee, FL, USA"}]},{"given":"Weikuan","family":"Yu","sequence":"additional","affiliation":[{"name":"Florida State University, Tallahassee, FL, USA"}]}],"member":"320","published-online":{"date-parts":[[2016,9,11]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in HPCA","author":"Jia W.","year":"2014","unstructured":"W. Jia, K. A. Shaw, and M. Martonosi, \"MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in HPCA, 2014."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_2_1","DOI":"10.1145\/2613908.2613909"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1109\/MICRO.2014.11"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1145\/2751205.2751239"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1145\/2751205.2751237"},{"unstructured":"D. Li Orchestrating Thread Scheduling and Cache Management to Improve Memory System Throughput in Throughput Processor. PhD thesis University of Texas at Austin May 2014.","key":"e_1_3_2_1_6_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1109\/LCA.2014.2359882"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_8_1","DOI":"10.1145\/2716282.2716291"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/MICRO.2012.16"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_10_1","DOI":"10.1145\/2540708.2540718"},{"doi-asserted-by":"publisher","unstructured":"D. Kroft \"Lockup-free Instruction Fetch\/Prefetch Cache Organization \" in ISCA 1981.","key":"e_1_3_2_1_11_1","DOI":"10.5555\/800052.801868"},{"unstructured":"A. E. Turner On replay and hazards in graphics processing units. PhD thesis University of British Columbia Oct 2012.","key":"e_1_3_2_1_12_1"},{"key":"e_1_3_2_1_13_1","volume-title":"Mascar: Speeding up GPU Warps by Reducing Memory Pitstops,\" in HPCA","author":"Sethia A.","year":"2015","unstructured":"A. Sethia, D. A. Jamshidi, and S. A. Mahlke, \"Mascar: Speeding up GPU Warps by Reducing Memory Pitstops,\" in HPCA, 2015."},{"key":"e_1_3_2_1_14_1","volume-title":"Fermi","author":"Compute Architecture CUDA","year":"2009","unstructured":"NVIDIA, \"NVIDIA's Next Generation CUDA Compute Architecture: Fermi,\" 2009."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_2_1_16_1","volume-title":"Kepler GK110","author":"Compute Architecture CUDA","year":"2012","unstructured":"NVIDIA, \"NVIDIA's Next Generation CUDA Compute Architecture: Kepler GK110,\" 2012."},{"key":"e_1_3_2_1_17_1","volume-title":"Tracking register usage during multithreaded processing using a scoreboard having separate memory regions and storing sequenti al register size indicators","author":"Coon B.","year":"2008","unstructured":"B. Coon, P. Mills, S. Oberman, and M. Siu, \"Tracking register usage during multithreaded processing using a scoreboard having separate memory regions and storing sequenti al register size indicators,\" Oct. 7 2008. US Patent 7,434,032."},{"key":"e_1_3_2_1_18_1","volume-title":"Scheduler in multi-threaded processor prioritizing instructions passing qualification rule","author":"Mills P.","year":"2011","unstructured":"P. Mills, J. Lindholm, B. Coon, G. Tarolli, and J. Burgess, \"Scheduler in multi-threaded processor prioritizing instructions passing qualification rule,\" May 24 2011. US Patent 7,949,855."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_19_1","DOI":"10.1145\/2155620.2155656"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_20_1","DOI":"10.1145\/2451116.2451158"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.5555\/2337159.2337166"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_22_1","DOI":"10.1145\/1454115.1454152"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_23_1","DOI":"10.5555\/2755753.2755911"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1109\/MICRO.2007.12"},{"key":"e_1_3_2_1_25_1","volume-title":"A Detailed GPU Cache Model Based on Reuse Distance Theory,\" in HPCA","author":"Nugteren C.","year":"2014","unstructured":"C. Nugteren, G.-J. van den Braak, H. Corporaal, and H. Bal, \"A Detailed GPU Cache Model Based on Reuse Distance Theory,\" in HPCA, 2014."},{"key":"e_1_3_2_1_26_1","volume-title":"Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in ISPASS","author":"Bakhoda A.","year":"2009","unstructured":"A. Bakhoda, G. L. Yuan, W. W. L. Fung, H. Wong, and T. M. Aamodt, \"Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in ISPASS, 2009."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_27_1","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_28_1","volume-title":"Auto-tuning a High-Level Language Targeted to GPU Codes.,\" in Innovative Parallel Computing","author":"Grauer-Gray S.","year":"2012","unstructured":"S. Grauer-Gray, L. Xu, R. Searles, S. Ayalasomayajula, and J. Cavazos, \"Auto-tuning a High-Level Language Targeted to GPU Codes.,\" in Innovative Parallel Computing, 2012."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1145\/1735688.1735702"},{"unstructured":"J. A. Stratton C. Rodrigues I.-J. Sung N. Obeid L.-W. Changx N. Anssari G. D. Liu and W. mei W. Hwu \"Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing \" IMPACT Technical Report IMPACT-12-01 University of Illinois at Urbana-Champaign 2012.","key":"e_1_3_2_1_30_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_31_1","DOI":"10.5555\/2523721.2523745"},{"key":"e_1_3_2_1_32_1","volume-title":"Improving GPGPU Resource Utilization Through Alternative Thread Block Scheduling,\" in HPCA","author":"Lee M.","year":"2014","unstructured":"M. Lee, S. Song, J. Moon, J. Kim, W. Seo, Y.-G. Cho, and S. Ryu, \"Improving GPGPU Resource Utilization Through Alternative Thread Block Scheduling,\" in HPCA, 2014."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_33_1","DOI":"10.5555\/2014698.2014893"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_34_1","DOI":"10.1145\/2000064.2000093"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.1145\/2751205.2751234"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_36_1","DOI":"10.1145\/2304576.2304582"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_37_1","DOI":"10.5555\/2561828.2561929"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_38_1","DOI":"10.1109\/MICRO.2012.43"}],"event":{"sponsor":["IFIP WG 10.3 IFIP WG 10.3","IEEE TCCA IEEE Computer Society Technical Committee on Computer Architecture","SIGARCH ACM Special Interest Group on Computer Architecture","IEEE CS TCPP IEEE Computer Society Technical Committee on Parallel Processing"],"acronym":"PACT '16","name":"PACT '16: International Conference on Parallel Architectures and Compilation","location":"Haifa Israel"},"container-title":["Proceedings of the 2016 International Conference on Parallel Architectures and Compilation"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2967938.2967947","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2967938.2967947","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2967938.2967947","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:26:54Z","timestamp":1763458014000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2967938.2967947"}},"subtitle":["Memory Occlusion Aware Warp Scheduling"],"short-title":[],"issued":{"date-parts":[[2016,9,11]]},"references-count":38,"alternative-id":["10.1145\/2967938.2967947","10.1145\/2967938"],"URL":"https:\/\/doi.org\/10.1145\/2967938.2967947","relation":{},"subject":[],"published":{"date-parts":[[2016,9,11]]},"assertion":[{"value":"2016-09-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}