{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T09:53:34Z","timestamp":1769853214605,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":82,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,6,22]],"date-time":"2019-06-22T00:00:00Z","timestamp":1561161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100011199","name":"European Research Council","doi-asserted-by":"publisher","award":["741097"],"award-info":[{"award-number":["741097"]}],"id":[{"id":"10.13039\/100011199","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003130","name":"Fonds Wetenschappelijk Onderzoek","doi-asserted-by":"publisher","award":["G.0434.16N, G.0144.17N"],"award-info":[{"award-number":["G.0434.16N, G.0144.17N"]}],"id":[{"id":"10.13039\/501100003130","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572508, 61672526 and 61802427"],"award-info":[{"award-number":["61572508, 61672526 and 61802427"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,6,22]]},"DOI":"10.1145\/3307650.3322235","type":"proceedings-article","created":{"date-parts":[[2019,6,14]],"date-time":"2019-06-14T12:42:33Z","timestamp":1560516153000},"page":"411-423","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["Adaptive memory-side last-level GPU caching"],"prefix":"10.1145","author":[{"given":"Xia","family":"Zhao","sequence":"first","affiliation":[{"name":"Ghent University, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Almutaz","family":"Adileh","sequence":"additional","affiliation":[{"name":"Ghent University, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhibin","family":"Yu","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiying","family":"Wang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aamer","family":"Jaleel","sequence":"additional","affiliation":[{"name":"Nvidia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lieven","family":"Eeckhout","sequence":"additional","affiliation":[{"name":"Ghent University, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2019,6,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2008.917757"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_3_1","unstructured":"R. Collobert C. Farabet K. Kavukcuoglu and S. Chintala \"Torch.\" http:\/\/torch.ch\/."},{"key":"e_1_3_2_1_4_1","unstructured":"Nvidia \"NVIDIA's Next Generation CUDA Compute Architecture:Fermi.\" http:\/\/www.nvidia.com\/content\/PDF\/fermi_white_papers\/P.Glaskowsky_NVIDIA's_Fermi-The_First_Complete_GPU_Architecture.pdf 2009."},{"key":"e_1_3_2_1_5_1","volume-title":"White paper.\" http:\/\/www.nvidia.com\/object\/volta-architecture-whitepaper.html","author":"\"NVIDIA Tesla V100 GPU Architecture The World's Most Advanced Data Center GPU.","year":"2017","unstructured":"Nvidia, \"NVIDIA Tesla V100 GPU Architecture The World's Most Advanced Data Center GPU. White paper.\" http:\/\/www.nvidia.com\/object\/volta-architecture-whitepaper.html, 2017."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/605397.605420"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1088149.1088154"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2006.17"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2005.39"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2005.53"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.10"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2010.83"},{"key":"e_1_3_2_1_13_1","first-page":"237","volume-title":"Architectures and Synthesis for Embedded Systems (CASES)","author":"Yeh T. Y.","year":"2005","unstructured":"T. Y. Yeh and G. Reinman, \"Fast and Fair: Data-stream Quality of Service,\" in Proceedings of the International Conference on Compilers, Architectures and Synthesis for Embedded Systems (CASES), pp. 237--248, September 2005."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1399972.1399973"},{"key":"e_1_3_2_1_15_1","first-page":"1","volume-title":"Utilizing Shared Data in Chip Multiprocessors with the Nahalal Architecture,\" in Proceedings of the International Symposium on Parallelism in Algorithms and Architectures (SPAA)","author":"Guz Z.","year":"2008","unstructured":"Z. Guz, I. Keidar, A. Kolodny, and U. C. Weiser, \"Utilizing Shared Data in Chip Multiprocessors with the Nahalal Architecture,\" in Proceedings of the International Symposium on Parallelism in Algorithms and Architectures (SPAA), pp. 1--10, June 2008."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/1399972.1399982"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2007.346180"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555779"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.31"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1178597.1178613"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155677"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.5555\/1413370.1413399"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/1810085.1810095"},{"key":"e_1_3_2_1_24_1","first-page":"213","volume-title":"Jigsaw: Scalable software-defined caches,\" in Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT)","author":"Beckmann N.","year":"2013","unstructured":"N. Beckmann and D. Sanchez, \"Jigsaw: Scalable software-defined caches,\" in Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT), pp. 213--224, September 2013."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"H. Kasture and D. Sanchez \"Ubik: Efficient Cache Sharing with Strict Qos for Latency-critical Workloads \" in Proceedings of the International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS) pp. 729--742 March 2014.","DOI":"10.1145\/2644865.2541944"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/3134175"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS.2009.5071477"},{"key":"e_1_3_2_1_28_1","first-page":"163","volume-title":"Analyzing CUDA workloads using a detailed GPU simulator,\" in Proceeding of the International Symposium on Performance Analysis of Systems and Software (ISPASS)","author":"Bakhoda A.","year":"2009","unstructured":"A. Bakhoda, G. L. Yuan, W. W. L. Fung, H. Wong, and T. M. Aamodt, \"Analyzing CUDA workloads using a detailed GPU simulator,\" in Proceeding of the International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 163--174, April 2009."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS.2012.31"},{"key":"e_1_3_2_1_30_1","unstructured":"XILINX \"AXI High Bandwidth Memory Controller v1.0.\" https:\/\/www.xilinx.com\/support\/documentation\/ip_documentation\/hbm\/v1_0\/pg276-axi-hbm.pdf 2018."},{"key":"e_1_3_2_1_31_1","first-page":"123","volume-title":"Beyond the Socket: NUMA-aware GPUs,\" in Proceedings of the International Symposium on Microarchitecture (MICRO)","author":"Milic U.","year":"2017","unstructured":"U. Milic, O. Villa, E. Bolotin, A. Arunkumar, E. Ebrahimi, A. Jaleel, A. Ramirez, and D. Nellans, \"Beyond the Socket: NUMA-aware GPUs,\" in Proceedings of the International Symposium on Microarchitecture (MICRO), pp. 123--135, October 2017."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080231"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3236002"},{"key":"e_1_3_2_1_34_1","volume-title":"March","author":"Glasco D. B.","year":"2012","unstructured":"D. B. Glasco, P. B. Holmqvist, G. R. Lynch, P. R. Marchand, K. Mehra, and J. Roberts, \"Cache-based Control of Atomic Operations in Conjunction With an External ALU Block,\" Google Patents, March 2012."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168947"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.33"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2017.2783918"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3218603.3218635"},{"key":"e_1_3_2_1_39_1","first-page":"296","volume-title":"MP3: Minimizing performance penalty for power-gating of Clos network-on-chip,\" in Proceedings of the Symposium on High Performance Computer Architecture (HPCA)","author":"Chen L.","year":"2014","unstructured":"L. Chen, L. Zhao, R. Wang, and T. M. Pinkston, \"MP3: Minimizing performance penalty for power-gating of Clos network-on-chip,\" in Proceedings of the Symposium on High Performance Computer Architecture (HPCA), pp. 296--307, February 2014."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2006.5"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_2_1_43_1","first-page":"1","volume-title":"Auto-tuning a High-Level Language Targeted to GPU Codes,\" in Innovative Parallel Computing (InPar)","author":"Grauer-Gray S.","year":"2012","unstructured":"S. Grauer-Gray, L. Xu, R. Searles, S. Ayalasomayajula, and J. Cavazos, \"Auto-tuning a High-Level Language Targeted to GPU Codes,\" in Innovative Parallel Computing (InPar), pp. 1--10, May 2012."},{"key":"e_1_3_2_1_44_1","unstructured":"\"Tango: A Deep Neural Network Benchmark Suite for Various Accelerators.\" https:\/\/gitlab.com\/Tango-DNNbench\/Tango."},{"key":"e_1_3_2_1_45_1","unstructured":"\"NVIDIA CUDA SDK Code Samples.\" https:\/\/developer.nvidia.com\/cuda-downloads."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00024"},{"key":"e_1_3_2_1_47_1","volume-title":"White paper..\" http:\/\/www.nvidia.com\/object\/pascal-architecture-whitepaper.html","author":"Pascal Architecture NVIDIA","year":"2016","unstructured":"Nvidia, \"NVIDIA GP100 Pascal Architecture. White paper..\" http:\/\/www.nvidia.com\/object\/pascal-architecture-whitepaper.html, 2016."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS.2010.27"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_2_1_50_1","first-page":"221","volume-title":"LATTE-CC: Latency Tolerance Aware Adaptive Cache Compression Management for Energy Efficient GPUs,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Arunkumar A.","year":"2018","unstructured":"A. Arunkumar, S. Y. Lee, V. Soundararajan, and C. J. Wu, \"LATTE-CC: Latency Tolerance Aware Adaptive Cache Compression Management for Energy Efficient GPUs,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 221--234, February 2018."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080239"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.44"},{"key":"e_1_3_2_1_53_1","unstructured":"\"Hynix GDDR5 SGRAM Part H5GQ1H24AFR Revision 1.0.\" http:\/\/www.hynix.com\/datasheet\/pdf\/graphics\/H5GQ1H24AFR(Rev1.0).pdf 2009. Hynix."},{"key":"e_1_3_2_1_54_1","first-page":"260","volume-title":"Improving GPGPU Resource Utilization Through Alternative Thread Block Scheduling,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Lee M.","year":"2014","unstructured":"M. Lee, S. Song, J. Moon, J. Kim, W. Seo, Y. Cho, and S. Ryu, \"Improving GPGPU Resource Utilization Through Alternative Thread Block Scheduling,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 260--271, February 2014."},{"key":"e_1_3_2_1_55_1","first-page":"45","volume-title":"Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Qureshi M. K.","year":"2009","unstructured":"M. K. Qureshi, \"Adaptive Spill-Receive for Robust High-Performance Caching in CMPs,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 45--54, February 2009."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541976"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/1944862.1944891"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.5555\/1397757.1397999"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.40"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.5555\/1397757.1397985"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","unstructured":"H. Kwon A. Samajdar and T. Krishna \"MAERI: Enabling Flexible Dataflow Mapping over DNN Accelerators via Reconfigurable Interconnects \" in Proceedings of the International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS) pp. 461--475 March 2018. 10.1145\/3173162.3173176","DOI":"10.1145\/3173162.3173176"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130218.3130222"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2007.346209"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.50"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD.2012.6378671"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897937.2897963"},{"key":"e_1_3_2_1_67_1","first-page":"472","volume-title":"A Heterogeneous Low-cost and Low-latency Ring-Chain Network for GPGPUs,\" in Proceedings of the International Conference on Computer Design (ICCD)","author":"Zhao X.","year":"2016","unstructured":"X. Zhao, S. Ma, C. Li, L. Eeckhout, and Z. Wang, \"A Heterogeneous Low-cost and Low-latency Ring-Chain Network for GPGPUs,\" in Proceedings of the International Conference on Computer Design (ICCD), pp. 472--479, October 2016."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/2786572.2786596"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926267"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485951"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/2967938.2967947"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750418"},{"key":"e_1_3_2_1_74_1","first-page":"89","volume-title":"DaCache: Memory Divergence-Aware GPU Cache Management,\" in Proceedings of the International Conference on Supercomputing (ICS)","author":"Wang B.","year":"2015","unstructured":"B. Wang, W. Yu, X.-H. Sun, and X. Wang, \"DaCache: Memory Divergence-Aware GPU Cache Management,\" in Proceedings of the International Conference on Supercomputing (ICS), pp. 89--98, June 2015."},{"key":"e_1_3_2_1_75_1","first-page":"174","volume-title":"Mascar: Speeding up GPU Warps by Reducing Memory Pitstops,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Sethia A.","year":"2015","unstructured":"A. Sethia, D. A. Jamshidi, and S. Mahlke, \"Mascar: Speeding up GPU Warps by Reducing Memory Pitstops,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 174--185, February 2015."},{"key":"e_1_3_2_1_76_1","first-page":"76","volume-title":"Coordinated Static and Dynamic Cache Bypassing for GPUs,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Xie X.","year":"2015","unstructured":"X. Xie, Y. Liang, Y. Wang, G. Sun, and T. Wang, \"Coordinated Static and Dynamic Cache Bypassing for GPUs,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 76--88, February 2015."},{"key":"e_1_3_2_1_77_1","first-page":"272","volume-title":"MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA)","author":"Jia W.","year":"2014","unstructured":"W. Jia, K. A. Shaw, and M. Martonosi, \"MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in Proceedings of the International Symposium on High Performance Computer Architecture (HPCA), pp. 272--283, February 2014."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830784"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522337"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485952"},{"key":"e_1_3_2_1_81_1","first-page":"609","volume-title":"Virtual Thread: Maximizing Thread-Level Parallelism beyond GPU Scheduling Limit,\" in Proceedings of the International Symposium on Computer Architecture (ISCA)","author":"Yoon M. K.","year":"2016","unstructured":"M. K. Yoon, K. Kim, S. Lee, W. W. Ro, and M. Annavaram, \"Virtual Thread: Maximizing Thread-Level Parallelism beyond GPU Scheduling Limit,\" in Proceedings of the International Symposium on Computer Architecture (ISCA), pp. 609--621, June 2016."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195656"}],"event":{"name":"ISCA '19: The 46th Annual International Symposium on Computer Architecture","location":"Phoenix Arizona","acronym":"ISCA '19","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE-CS\\DATC IEEE Computer Society"]},"container-title":["Proceedings of the 46th International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3307650.3322235","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3307650.3322235","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:54:05Z","timestamp":1750204445000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3307650.3322235"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,6,22]]},"references-count":82,"alternative-id":["10.1145\/3307650.3322235","10.1145\/3307650"],"URL":"https:\/\/doi.org\/10.1145\/3307650.3322235","relation":{},"subject":[],"published":{"date-parts":[[2019,6,22]]},"assertion":[{"value":"2019-06-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}