{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:36:22Z","timestamp":1780673782858,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,9,30]],"date-time":"2020-09-30T00:00:00Z","timestamp":1601424000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science Foundation","award":["1750667"],"award-info":[{"award-number":["1750667"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,9,30]]},"DOI":"10.1145\/3410463.3414623","type":"proceedings-article","created":{"date-parts":[[2020,9,30]],"date-time":"2020-09-30T10:43:04Z","timestamp":1601462584000},"page":"161-173","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Analyzing and Leveraging Shared L1 Caches in GPUs"],"prefix":"10.1145","author":[{"given":"Mohamed Assem","family":"Ibrahim","sequence":"first","affiliation":[{"name":"William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Onur","family":"Kayiran","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Fairport, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yasuko","family":"Eckert","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Bellevue, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gabriel H.","family":"Loh","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Bellevue, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Adwait","family":"Jog","sequence":"additional","affiliation":[{"name":"William &amp; Mary, Williamsburg, VA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2020,9,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2016604.2016637"},{"key":"e_1_3_2_1_2_1","volume-title":"AMD RDNA Architecture White Paper. (August","author":"AMD.","year":"2019"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080231"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00028"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.50"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS).","author":"Bakhoda A."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.10"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00030"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Chang Jichuan"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Symposium on Workload Characterization (IISWC).","author":"Che Shuai"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2005.39"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.31"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the Workshop on General Purpose Processing Using GPU (GPGPU).","author":"Danalis Anthony"},{"key":"e_1_3_2_1_14_1","volume-title":"Near-Threshold System. In Proceedings of the Symposium on High Performance Chips (Hot Chips).","author":"Dreslinski R. G."},{"key":"e_1_3_2_1_15_1","volume-title":"Cooperative Caching for GPUs. ACM Transactions on Architecture and Code Optimization (TACO)","author":"Dublish Saumay","year":"2016"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/344779.344981"},{"key":"e_1_3_2_1_17_1","unstructured":"GPGPU-Sim v3.x. 2017. Address Mapping. (June 2017). http:\/\/gpgpu-sim.org\/manual\/index.php5\/GPGPU-Sim_3.x_Manual#Memory_Partition  GPGPU-Sim v3.x. 2017. Address Mapping. (June 2017). http:\/\/gpgpu-sim.org\/manual\/index.php5\/GPGPU-Sim_3.x_Manual#Memory_Partition"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Ayub"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555779"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the International Conference on Parallel Architecture and Compilation Techniques (PACT).","author":"Hossain H."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"J. Huh C. Kim H. Shafi L. Zhang D. Burger and S. W. Keckler. 2007. A NUCA Substrate for Flexible CMP Cache Sharing. IEEE Transactions on Parallel and Distributed Systems (TPDS) (2007).  J. Huh C. Kim H. Shafi L. Zhang D. Burger and S. W. Keckler. 2007. A NUCA Substrate for Flexible CMP Cache Sharing. IEEE Transactions on Parallel and Distributed Systems (TPDS) (2007).","DOI":"10.1109\/TPDS.2007.1091"},{"key":"e_1_3_2_1_22_1","unstructured":"Hynix. 2009. Hynix GDDR5 SGRAM Part H5GQ1H24AFR Revision 1.0. (2009). http:\/\/www.hynix.com\/datasheet\/pdf\/graphics\/H5GQ1H24AFR(Rev1.0).pdf  Hynix. 2009. Hynix GDDR5 SGRAM Part H5GQ1H24AFR Revision 1.0. (2009). http:\/\/www.hynix.com\/datasheet\/pdf\/graphics\/H5GQ1H24AFR(Rev1.0).pdf"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00028"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Jia W."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the International Symposium on Memory Systems (MEMSYS).","author":"Jog Adwait"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Jog Adwait"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Jog Adwait"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3300053.3319418"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the International Conference on Parallel Architecture and Compilation Techniques (PACT).","author":"Kayiran Onur"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO).","author":"Kayiran Onur"},{"key":"e_1_3_2_1_31_1","volume-title":"Rogers","author":"Khairy Mahmoud","year":"2018"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Komuravelli Rakesh"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the International Symposium on Workload Characterization (IISWC).","author":"Koo G."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080239"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835921"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485967"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541976"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Lee M."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037709"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"D. Li and T. M. Aamodt. 2016. Inter-Core Locality Aware Memory Scheduling. IEEE Computer Architecture Letters (CAL) (2016).  D. Li and T. M. Aamodt. 2016. Inter-Core Locality Aware Memory Scheduling. IEEE Computer Architecture Letters (CAL) (2016).","DOI":"10.1109\/LCA.2015.2435709"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056024"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Liu Chun","year":"2004"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.33"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Nugteren C."},{"key":"e_1_3_2_1_46_1","unstructured":"NVIDIA. 2011. CUDA C\/C SDK Code Samples. (2011). http:\/\/developer.nvidia.com\/cuda-cc-sdk-code-samples  NVIDIA. 2011. CUDA C\/C SDK Code Samples. (2011). http:\/\/developer.nvidia.com\/cuda-cc-sdk-code-samples"},{"key":"e_1_3_2_1_47_1","unstructured":"NVIDIA. 2019. CUDA C++ Programming Guide. (November 2019). https: \/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html  NVIDIA. 2019. CUDA C++ Programming Guide. (November 2019). https: \/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html"},{"key":"e_1_3_2_1_48_1","volume-title":"2019 b. Parallel Thread Execution ISA Version 6.5. (November","author":"NVIDIA.","year":"2019"},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Pattnaik Ashutosh"},{"key":"e_1_3_2_1_50_1","volume-title":"Polybench: The Polyhedral Benchmark Suite.","author":"Pouchet Louis-No\u00ebl","year":"2012"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2009.4798236"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Ren X."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540717"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO).","author":"Rogers Timothy G."},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO).","author":"Rogers Timothy G."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6169028"},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Sanchez D."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.16"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00052"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522351"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the International Symposium on Networks-on-Chip (NOCS).","author":"Sun C."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.106"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the International Symposium on High-Performance Computer Architecture (HPCA).","author":"Tabbakh A."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.54"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2017.42"},{"key":"e_1_3_2_1_66_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA).","author":"Vijaykumar N."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00030"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/216585.216588"},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO).","author":"Young V."},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO).","author":"Zhan J."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2005.53"},{"key":"e_1_3_2_1_72_1","volume-title":"Towards Hybrid Last Level Caches for Chip-Multiprocessors. ACM SIGARCH Computer Architecture News","author":"Zhao Li","year":"2008"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322235"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"X. Zhao Y. Liu A. Adileh and L. Eeckhout. 2017. LA-LLC: Inter-Core Locality-Aware Last-Level Cache to Exploit Many-to-Many Traffic in GPGPUs. IEEE Computer Architecture Letters (CAL) (2017).  X. Zhao Y. Liu A. Adileh and L. Eeckhout. 2017. LA-LLC: Inter-Core Locality-Aware Last-Level Cache to Exploit Many-to-Many Traffic in GPGPUs. IEEE Computer Architecture Letters (CAL) (2017).","DOI":"10.1109\/LCA.2016.2611663"}],"event":{"name":"PACT '20: International Conference on Parallel Architectures and Compilation Techniques","location":"Virtual Event GA USA","acronym":"PACT '20","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3410463.3414623","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3410463.3414623","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3410463.3414623","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:31:51Z","timestamp":1750195911000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3410463.3414623"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,9,30]]},"references-count":74,"alternative-id":["10.1145\/3410463.3414623","10.1145\/3410463"],"URL":"https:\/\/doi.org\/10.1145\/3410463.3414623","relation":{},"subject":[],"published":{"date-parts":[[2020,9,30]]},"assertion":[{"value":"2020-09-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}