{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,8]],"date-time":"2026-07-08T15:54:43Z","timestamp":1783526083212,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,4,4]],"date-time":"2017-04-04T00:00:00Z","timestamp":1491264000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"HiPEAC Collaboration Grants","award":["H2020-ICT-2015-687689"],"award-info":[{"award-number":["H2020-ICT-2015-687689"]}]},{"name":"Department of Energy","award":["66150"],"award-info":[{"award-number":["66150"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,4,4]]},"DOI":"10.1145\/3037697.3037709","type":"proceedings-article","created":{"date-parts":[[2017,4,5]],"date-time":"2017-04-05T08:47:40Z","timestamp":1491382060000},"page":"297-311","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":46,"title":["Locality-Aware CTA Clustering for Modern GPUs"],"prefix":"10.1145","author":[{"given":"Ang","family":"Li","sequence":"first","affiliation":[{"name":"Pacific Northwest National Lab, Richland, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuaiwen Leon","family":"Song","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Lab, Richland, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weifeng","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Copenhagen, Copenhagen, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"College of William and Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Akash","family":"Kumar","sequence":"additional","affiliation":[{"name":"Technische Universit\u00e4t Dresden, Dresden, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Henk","family":"Corporaal","sequence":"additional","affiliation":[{"name":"Eindhoven University of Technology, Eindhoven , Netherlands"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2017,4,4]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Morgan Kaufmann","author":"Muchnick Steven S.","year":"1997","unstructured":"Steven S. Muchnick . Advanced compiler design implementation . Morgan Kaufmann , 1997 . Steven S. Muchnick. Advanced compiler design implementation. Morgan Kaufmann, 1997."},{"key":"e_1_3_2_1_2_1","volume-title":"Morgan Kaufmann","author":"Allen Randy","year":"2001","unstructured":"Randy Allen and Ken Kennedy . Optimizing compilers for modern architectures a dependence-based approach . Morgan Kaufmann , 2001 . Randy Allen and Ken Kennedy. Optimizing compilers for modern architectures a dependence-based approach. Morgan Kaufmann, 2001."},{"key":"e_1_3_2_1_3_1","volume-title":"Loop tiling for parallelism","author":"Xue Jingling","year":"2012","unstructured":"Jingling Xue . Loop tiling for parallelism , volume 575 . Springer Science & Business Media , 2012 . Jingling Xue. Loop tiling for parallelism, volume 575. Springer Science & Business Media, 2012."},{"key":"e_1_3_2_1_4_1","first-page":"60","volume-title":"ACM SIGOPS Operating Systems Review","author":"Philbin James","year":"1996","unstructured":"James Philbin , Jan Edler , Otto J Anshus , Craig C Douglas , and Kai Li . Thread scheduling for cache locality . In ACM SIGOPS Operating Systems Review , volume 30 , pages 60 -- 71 . ACM , 1996 . James Philbin, Jan Edler, Otto J Anshus, Craig C Douglas, and Kai Li. Thread scheduling for cache locality. In ACM SIGOPS Operating Systems Review, volume 30, pages 60--71. ACM, 1996."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1272996.1273004"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_2_1_7_1","volume-title":"CUDA Programming Guide","author":"NVIDIA.","year":"2015","unstructured":"NVIDIA. CUDA Programming Guide , 2015 . NVIDIA. CUDA Programming Guide, 2015."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897937.2898103"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000093"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807606"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751237"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.11"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.2013.6691165"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.38"},{"key":"e_1_3_2_1_17_1","first-page":"43","volume-title":"Eddy Z Zhang. Tag-Split Cache for Efficient GPGPU Cache Utilization. In Proceedings of the 2016 International Conference on Supercomputing","author":"Li Lingda","unstructured":"Lingda Li , Ari B Hayes , Shuaiwen Leon Song , and Eddy Z Zhang. Tag-Split Cache for Efficient GPGPU Cache Utilization. In Proceedings of the 2016 International Conference on Supercomputing , page 43 . ACM, 2016. Lingda Li, Ari B Hayes, Shuaiwen Leon Song, and Eddy Z Zhang. Tag-Split Cache for Efficient GPGPU Cache Utilization. In Proceedings of the 2016 International Conference on Supercomputing, page 43. ACM, 2016."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540717"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835938"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056023"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2014.2308179"},{"key":"e_1_3_2_1_23_1","first-page":"63","volume":"26","author":"A's Next NVIDIA.","year":"2009","unstructured":"NVIDIA. NVIDI A's Next Generation CUDA Compute Architecture : Fermi. Comput. Syst , 26 : 63 -- 72 , 2009 . NVIDIA. NVIDIA's Next Generation CUDA Compute Architecture: Fermi. Comput. Syst, 26:63--72, 2009.","journal-title":"Fermi. Comput. Syst"},{"key":"e_1_3_2_1_24_1","unstructured":"Nikolaj Leischner Vitaly Osipov and Peter Sanders. Fermi architecture white paper.  Nikolaj Leischner Vitaly Osipov and Peter Sanders. Fermi architecture white paper."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751213"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339596"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835937"},{"key":"e_1_3_2_1_28_1","volume-title":"GTX980 Whitepaper: Featuring Maxwell, the Most Advanced GPU Ever Made","author":"NVIDIA.","year":"2014","unstructured":"NVIDIA. GTX980 Whitepaper: Featuring Maxwell, the Most Advanced GPU Ever Made , 2014 . NVIDIA. GTX980 Whitepaper: Featuring Maxwell, the Most Advanced GPU Ever Made, 2014."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522351"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145820"},{"key":"e_1_3_2_1_31_1","first-page":"157","volume-title":"Proceedings of the 22nd international conference on Parallel architectures and compilation techniques (PACT)","author":"Kay\u0131ran Onur","year":"2013","unstructured":"Onur Kay\u0131ran , Adwait Jog , Mahmut Taylan Kandemir , and Chita Ranjan Das . Neither more nor less: optimizing thread-level parallelism for GPGPUs . In Proceedings of the 22nd international conference on Parallel architectures and compilation techniques (PACT) , pages 157 -- 166 . IEEE Press , 2013 . Onur Kay\u0131ran, Adwait Jog, Mahmut Taylan Kandemir, and Chita Ranjan Das. Neither more nor less: optimizing thread-level parallelism for GPGPUs. In Proceedings of the 22nd international conference on Parallel architectures and compilation techniques (PACT), pages 157--166. IEEE Press, 2013."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001199"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155656"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056031"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485951"},{"key":"e_1_3_2_1_37_1","volume-title":"A Graph-based Model for GPU Caching Problems. arXiv preprint arXiv:1605.02043","author":"Li Lingda","year":"2016","unstructured":"Lingda Li , Ari B Hayes , Stephen A Hackler , Eddy Z Zhang , Mario Szegedy , and Shuaiwen Leon Song . A Graph-based Model for GPU Caching Problems. arXiv preprint arXiv:1605.02043 , 2016 . Lingda Li, Ari B Hayes, Stephen A Hackler, Eddy Z Zhang, Mario Szegedy, and Shuaiwen Leon Song. A Graph-based Model for GPU Caching Problems. arXiv preprint arXiv:1605.02043, 2016."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1950365.1950408"},{"key":"e_1_3_2_1_39_1","first-page":"2","volume-title":"Proceedings of the 2016 International Conference on Supercomputing (ICS)","author":"Liu Jianqiao","unstructured":"Jianqiao Liu , Nikhil Hegde , and Milind Kulkarni . Hybrid CPU-GPU scheduling and execution of tree traversals . In Proceedings of the 2016 International Conference on Supercomputing (ICS) , page 2 . ACM, 2016. Jianqiao Liu, Nikhil Hegde, and Milind Kulkarni. Hybrid CPU-GPU scheduling and execution of tree traversals. In Proceedings of the 2016 International Conference on Supercomputing (ICS), page 2. ACM, 2016."},{"key":"e_1_3_2_1_40_1","volume-title":"CUDA SDK Code Samples","author":"NVIDIA.","year":"2015","unstructured":"NVIDIA. CUDA SDK Code Samples , 2015 . NVIDIA. CUDA SDK Code Samples, 2015."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00224-006-1350-7"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926255"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451160"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.44"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835970"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_47_1","volume-title":"Geng Daniel Liu, and Wen-Mei W Hwu. Parboil: A revised benchmark suite for scientific and commercial throughput computing","author":"Stratton John A","year":"2012","unstructured":"John A Stratton , Christopher Rodrigues , I- Jui Sung , Nady Obeid , Li-Wen Chang , Nasser Anssari , Geng Daniel Liu, and Wen-Mei W Hwu. Parboil: A revised benchmark suite for scientific and commercial throughput computing . Center for Reliable and High-Performance Computing , 2012 . John A Stratton, Christopher Rodrigues, I-Jui Sung, Nady Obeid, Li-Wen Chang, Nasser Anssari, Geng Daniel Liu, and Wen-Mei W Hwu. Parboil: A revised benchmark suite for scientific and commercial throughput computing. Center for Reliable and High-Performance Computing, 2012."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"e_1_3_2_1_49_1","volume-title":"CUDA Profiler User's Guide","author":"NVIDIA.","year":"2015","unstructured":"NVIDIA. CUDA Profiler User's Guide , 2015 . NVIDIA. CUDA Profiler User's Guide, 2015."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304582"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540718"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540742"}],"event":{"name":"ASPLOS '17: Architectural Support for Programming Languages and Operating Systems","location":"Xi'an China","acronym":"ASPLOS '17","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3037697.3037709","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3037697.3037709","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:50:27Z","timestamp":1750204227000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3037697.3037709"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,4,4]]},"references-count":52,"alternative-id":["10.1145\/3037697.3037709","10.1145\/3037697"],"URL":"https:\/\/doi.org\/10.1145\/3037697.3037709","relation":{"is-identical-to":[{"id-type":"doi","id":"10.1145\/3093337.3037709","asserted-by":"object"},{"id-type":"doi","id":"10.1145\/3093336.3037709","asserted-by":"object"}]},"subject":[],"published":{"date-parts":[[2017,4,4]]},"assertion":[{"value":"2017-04-04","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}