{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T03:48:24Z","timestamp":1772164104847,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":150,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,3,19]],"date-time":"2018-03-19T00:00:00Z","timestamp":1521417600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["1409723, 1618563, 1657336"],"award-info":[{"award-number":["1409723, 1618563, 1657336"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,3,19]]},"DOI":"10.1145\/3173162.3173169","type":"proceedings-article","created":{"date-parts":[[2018,3,22]],"date-time":"2018-03-22T11:15:40Z","timestamp":1521717340000},"page":"503-518","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":53,"title":["MASK"],"prefix":"10.1145","author":[{"given":"Rachata","family":"Ausavarungnirun","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vance","family":"Miller","sequence":"additional","affiliation":[{"name":"University of Texas at Austin, Austin, TX, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joshua","family":"Landgraf","sequence":"additional","affiliation":[{"name":"University of Texas at Austin, Austin, TX, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Saugata","family":"Ghose","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jayneel","family":"Gandhi","sequence":"additional","affiliation":[{"name":"VMware Research, Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Adwait","family":"Jog","sequence":"additional","affiliation":[{"name":"College of William and Mary, Williamsburg, VA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christopher J.","family":"Rossbach","sequence":"additional","affiliation":[{"name":"University of Texas at Austin&amp;VMware Research, Austin, TX, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Onur","family":"Mutlu","sequence":"additional","affiliation":[{"name":"ETH Z\u00fcrich&amp;Carnegie Mellon University, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2018,3,19]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems,\" http:\/\/download.tensorflow.org\/paper\/whitepaper2015.pdf","author":"Abadi M.","year":"2015","unstructured":"M. Abadi , A. Agarwal , P. Barham , E. Brevdo , Z. Chen , C. Citro , G. Corrado , A. Davis , J. Dean , M. Devin , S. Ghemawat , I. Goodfellow , A. Harp , G. Irving , M. Isard , Y. Jia , R. Jozefowicz , L. Kaiser , M. Kudlur , J. Levenberg , D. Mane , R. Monga , S. Moore , D. Murray , C. Olah , M. Schuster , J. Shlens , B. Steiner , I. Sutskever , K. Talwar , P. Tucker , V. Vanhoucke , V. Vasudevan , F. Viegas , O. Vinyals , P. Warden , M. Wattenberg , M. Wicke , Y. Yu , and X. Zheng , \" TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems,\" http:\/\/download.tensorflow.org\/paper\/whitepaper2015.pdf , 2015 . M. Abadi, A. Agarwal, P. Barham, E. Brevdo, Z. Chen, C. Citro, G. Corrado, A. Davis, J. Dean, M. Devin, S. Ghemawat, I. Goodfellow, A. Harp, G. Irving, M. Isard, Y. Jia, R. Jozefowicz, L. Kaiser, M. Kudlur, J. Levenberg, D. Mane, R. Monga, S. Moore, D. Murray, C. Olah, M. Schuster, J. Shlens, B. Steiner, I. Sutskever, K. Talwar, P. Tucker, V. Vanhoucke, V. Vasudevan, F. Viegas, O. Vinyals, P. Warden, M. Wattenberg, M. Wicke, Y. Yu, and X. Zheng, \"TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems,\" http:\/\/download.tensorflow.org\/paper\/whitepaper2015.pdf, 2015."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"e_1_3_2_1_3_1","unstructured":"Advanced Micro Devices Inc. \"AMD Accelerated Processing Units \" http:\/\/www.amd.com\/us\/products\/technologies\/apu\/Pages\/apu.aspx.  Advanced Micro Devices Inc. \"AMD Accelerated Processing Units \" http:\/\/www.amd.com\/us\/products\/technologies\/apu\/Pages\/apu.aspx."},{"key":"e_1_3_2_1_4_1","unstructured":"Advanced Micro Devices Inc. \"AMD Radeon R9 290X \" http:\/\/www.amd.com\/us\/press-releases\/Pages\/amd-radeon-r9--290x-2013oct24.aspx.  Advanced Micro Devices Inc. \"AMD Radeon R9 290X \" http:\/\/www.amd.com\/us\/press-releases\/Pages\/amd-radeon-r9--290x-2013oct24.aspx."},{"key":"e_1_3_2_1_5_1","unstructured":"Advanced Micro Devices Inc. \"ATI Radeon GPGPUs \" http:\/\/www.amd.com\/us\/products\/desktop\/graphics\/amd-radeon-hd-6000\/Pages\/amd-radeon-hd-6000.aspx.  Advanced Micro Devices Inc. \"ATI Radeon GPGPUs \" http:\/\/www.amd.com\/us\/products\/desktop\/graphics\/amd-radeon-hd-6000\/Pages\/amd-radeon-hd-6000.aspx."},{"key":"e_1_3_2_1_6_1","unstructured":"Advanced Micro Devices Inc. \"OpenCL: The Future of Accelerated Application Performance Is Now \" https:\/\/www.amd.com\/Documents\/FirePro_OpenCL_Whitepaper.pdf.  Advanced Micro Devices Inc. \"OpenCL: The Future of Accelerated Application Performance Is Now \" https:\/\/www.amd.com\/Documents\/FirePro_OpenCL_Whitepaper.pdf."},{"key":"e_1_3_2_1_7_1","unstructured":"Advanced Micro Devices Inc. AMD-V Nested Paging 2010 http:\/\/developer.amd.com\/wordpress\/media\/2012\/10\/NPT-WP-1%201-final-TM.pdf.  Advanced Micro Devices Inc. AMD-V Nested Paging 2010 http:\/\/developer.amd.com\/wordpress\/media\/2012\/10\/NPT-WP-1%201-final-TM.pdf."},{"key":"e_1_3_2_1_8_1","volume-title":"System Architecture: A Technical Review,\" http:\/\/amd-dev.wpengine.netdna-cdn.com\/wordpress\/media\/2012\/10\/hsa10.pdf","year":"2012","unstructured":"Advanced Micro Devices , Inc ., \"Heterogeneous System Architecture: A Technical Review,\" http:\/\/amd-dev.wpengine.netdna-cdn.com\/wordpress\/media\/2012\/10\/hsa10.pdf , 2012 . Advanced Micro Devices, Inc., \"Heterogeneous System Architecture: A Technical Review,\" http:\/\/amd-dev.wpengine.netdna-cdn.com\/wordpress\/media\/2012\/10\/hsa10.pdf, 2012."},{"key":"e_1_3_2_1_9_1","unstructured":"Advanced Micro Devices Inc. \"AMD I\/O Virtualization Technology (IOMMU) Specification \" http:\/\/support.amd.com\/TechDocs\/48882_IOMMU.pdf 2016.  Advanced Micro Devices Inc. \"AMD I\/O Virtualization Technology (IOMMU) Specification \" http:\/\/support.amd.com\/TechDocs\/48882_IOMMU.pdf 2016."},{"key":"e_1_3_2_1_10_1","volume-title":"Unlocking Bandwidth for GPUs in CC-NUMA Systems,\" in HPCA","author":"Agarwal N.","year":"2015","unstructured":"N. Agarwal , D. Nellans , M. O'Connor , S. W. Keckler , and T. F. Wenisch , \" Unlocking Bandwidth for GPUs in CC-NUMA Systems,\" in HPCA , 2015 . N. Agarwal, D. Nellans, M. O'Connor, S. W. Keckler, and T. F. Wenisch, \"Unlocking Bandwidth for GPUs in CC-NUMA Systems,\" in HPCA, 2015."},{"key":"e_1_3_2_1_11_1","volume-title":"Distributed Neural Networks with GPUs in the AWS Cloud,\" http:\/\/techblog.netflix.com\/2014\/02\/distributed-neural-networks-with-gpus.html","author":"Alex Chen J. B.","year":"2014","unstructured":"J. B. Alex Chen and X. Amatriain , \" Distributed Neural Networks with GPUs in the AWS Cloud,\" http:\/\/techblog.netflix.com\/2014\/02\/distributed-neural-networks-with-gpus.html , 2014 . J. B. Alex Chen and X. Amatriain, \"Distributed Neural Networks with GPUs in the AWS Cloud,\" http:\/\/techblog.netflix.com\/2014\/02\/distributed-neural-networks-with-gpus.html, 2014."},{"key":"e_1_3_2_1_12_1","unstructured":"ARM Holdings PLC \"Take GPU Processing Power Beyond Graphics with Mali GPU Computing \" 2012.  ARM Holdings PLC \"Take GPU Processing Power Beyond Graphics with Mali GPU Computing \" 2012."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080231"},{"key":"e_1_3_2_1_14_1","volume-title":"Carnegie Mellon Univ.","author":"Ausavarungnirun R.","year":"2017","unstructured":"R. Ausavarungnirun , \"Techniques for Shared Resource Management in Systems with Throughput Processors,\" Ph. D. dissertation , Carnegie Mellon Univ. , 2017 . R. Ausavarungnirun, \"Techniques for Shared Resource Management in Systems with Throughput Processors,\" Ph.D. dissertation, Carnegie Mellon Univ., 2017."},{"key":"e_1_3_2_1_15_1","volume-title":"Staged Memory Scheduling: Achieving High Performance and Scalability in Heterogeneous Systems,\" in ISCA","author":"Ausavarungnirun R.","year":"2012","unstructured":"R. Ausavarungnirun , K. Chang , L. Subramanian , G. Loh , and O. Mutlu , \" Staged Memory Scheduling: Achieving High Performance and Scalability in Heterogeneous Systems,\" in ISCA , 2012 . R. Ausavarungnirun, K. Chang, L. Subramanian, G. Loh, and O. Mutlu, \"Staged Memory Scheduling: Achieving High Performance and Scalability in Heterogeneous Systems,\" in ISCA, 2012."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.38"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123975"},{"key":"e_1_3_2_1_19_1","volume-title":"Improving Multi-Application Concurrency Support Within the GPU Memory System,\" arXiv:1708.04911 {cs.AR}","author":"Ausavarungnirun R.","year":"2017","unstructured":"R. Ausavarungnirun , C. J. Rossbach , V. Miller , J. Landgraf , S. Ghose , J. Gandhi , A. Jog , and O. Mutlu , \" Improving Multi-Application Concurrency Support Within the GPU Memory System,\" arXiv:1708.04911 {cs.AR} , 2017 . R. Ausavarungnirun, C. J. Rossbach, V. Miller, J. Landgraf, S. Ghose, J. Gandhi, A. Jog, and O. Mutlu, \"Improving Multi-Application Concurrency Support Within the GPU Memory System,\" arXiv:1708.04911 {cs.AR}, 2017."},{"key":"e_1_3_2_1_20_1","volume-title":"Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in ISPASS","author":"Bakhoda A.","year":"2009","unstructured":"A. Bakhoda , G. Yuan , W. Fung , H. Wong , and T. Aamodt , \" Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in ISPASS , 2009 . A. Bakhoda, G. Yuan, W. Fung, H. Wong, and T. Aamodt, \"Analyzing CUDA Workloads Using a Detailed GPU Simulator,\" in ISPASS, 2009."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000101"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485943"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540741"},{"key":"e_1_3_2_1_24_1","volume-title":"Shared Last-Level TLBs for Chip Multiprocessors,\" in HPCA","author":"Bhattacharjee A.","year":"2011","unstructured":"A. Bhattacharjee , D. Lustig , and M. Martonosi , \" Shared Last-Level TLBs for Chip Multiprocessors,\" in HPCA , 2011 . A. Bhattacharjee, D. Lustig, and M. Martonosi, \"Shared Last-Level TLBs for Chip Multiprocessors,\" in HPCA, 2011."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/1736020.1736060"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/68182.68193"},{"key":"e_1_3_2_1_27_1","volume-title":"Applying AMD's Kaveri APU for Heterogeneous Computing,\" in Hot Chips","author":"Bouvier D.","year":"2014","unstructured":"D. Bouvier and B. Sander , \" Applying AMD's Kaveri APU for Heterogeneous Computing,\" in Hot Chips , 2014 . D. Bouvier and B. Sander, \"Applying AMD's Kaveri APU for Heterogeneous Computing,\" in Hot Chips, 2014."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_2_1_29_1","volume-title":"Architecting an Energy-Efficient DRAM System for GPUs,\" in HPCA","author":"Chatterjee N.","year":"2017","unstructured":"N. Chatterjee , M. O'Connor , D. Lee , D. R. Johnson , S. W. Keckler , M. Rhu , and W. J. Dally , \" Architecting an Energy-Efficient DRAM System for GPUs,\" in HPCA , 2017 . N. Chatterjee, M. O'Connor, D. Lee, D. R. Johnson, S. W. Keckler, M. Rhu, and W. J. Dally, \"Architecting an Energy-Efficient DRAM System for GPUs,\" in HPCA, 2017."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.16"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.11"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2613908.2613909"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"M. Clark \"A New Xc6 Core Architecture for the Next Generation of Computing \" in Hot Chips 2016.  M. Clark \"A New Xc6 Core Architecture for the Next Generation of Computing \" in Hot Chips 2016.","DOI":"10.1109\/HOTCHIPS.2016.7936224"},{"key":"e_1_3_2_1_35_1","volume-title":"Supporting Address Translation for Accelerator-Centric Architectures,\" in HPCA","author":"Cong J.","year":"2017","unstructured":"J. Cong , Z. Fang , Y. Hao , and G. Reinman , \" Supporting Address Translation for Accelerator-Centric Architectures,\" in HPCA , 2017 . J. Cong, Z. Fang, Y. Hao, and G. Reinman, \"Supporting Address Translation for Accelerator-Centric Architectures,\" in HPCA, 2017."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037704"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1669112.1669150"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815976"},{"key":"e_1_3_2_1_40_1","volume-title":"rCUDA: Reducing the Number of GPU-Based Accelerators in High Performance Clusters,\" in HPCS","author":"Duato J.","year":"2010","unstructured":"J. Duato , A. Pena , F. Silla , R. Mayo , and E. Quintana-Orti , \" rCUDA: Reducing the Number of GPU-Based Accelerators in High Performance Clusters,\" in HPCS , 2010 . J. Duato, A. Pena, F. Silla, R. Mayo, and E. Quintana-Orti, \"rCUDA: Reducing the Number of GPU-Based Accelerators in High Performance Clusters,\" in HPCS, 2010."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/1669112.1669154"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.44"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/L-CA.2013.9"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000093"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454152"},{"key":"e_1_3_2_1_46_1","volume-title":"Graphics Accelerated VDI with the Visual Performance of a Workstation,\" NVIDIA White Paper","author":"Herrera A.","year":"2014","unstructured":"A. Herrera , \"NV IDIA GRID : Graphics Accelerated VDI with the Visual Performance of a Workstation,\" NVIDIA White Paper , 2014 . A. Herrera, \"NVIDIA GRID: Graphics Accelerated VDI with the Visual Performance of a Workstation,\" NVIDIA White Paper, 2014."},{"key":"e_1_3_2_1_47_1","unstructured":"Intel Corp. \"Intel\u00ae Microarchitecture Codename Sandy Bridge \" http:\/\/www.intel.com\/technology\/architecture-silicon\/2ndgen\/.  Intel Corp. \"Intel\u00ae Microarchitecture Codename Sandy Bridge \" http:\/\/www.intel.com\/technology\/architecture-silicon\/2ndgen\/."},{"key":"e_1_3_2_1_48_1","volume-title":"Products Formerly Ivy Bridge,\" http:\/\/ark.intel.com\/products\/codename\/29902\/","author":"Intel Corp.","year":"2012","unstructured":"Intel Corp. , \"Product Speficiations : Products Formerly Ivy Bridge,\" http:\/\/ark.intel.com\/products\/codename\/29902\/ , 2012 . Intel Corp., \"Product Speficiations: Products Formerly Ivy Bridge,\" http:\/\/ark.intel.com\/products\/codename\/29902\/, 2012."},{"key":"e_1_3_2_1_49_1","unstructured":"Intel Corp. \"Introduction to Intel Architecture \" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/white-papers\/ia-introduction-basics-paper.pdf 2014.  Intel Corp. \"Introduction to Intel Architecture \" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/white-papers\/ia-introduction-basics-paper.pdf 2014."},{"key":"e_1_3_2_1_50_1","unstructured":"Intel Corp. \"Intel 64 and IA-32 Architectures Software Developers Manual \" 2016 https:\/\/www-ssl.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/manuals\/64-ia-32-architectures-software-developer-manual-325462.pdf.  Intel Corp. \"Intel 64 and IA-32 Architectures Software Developers Manual \" 2016 https:\/\/www-ssl.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/manuals\/64-ia-32-architectures-software-developer-manual-325462.pdf."},{"key":"e_1_3_2_1_51_1","unstructured":"Intel Corp. \"Intel Virtualization Technology for Directed I\/O \" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/product-specifications\/vt-directed-io-spec.pdf 2016.  Intel Corp. \"Intel Virtualization Technology for Directed I\/O \" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/product-specifications\/vt-directed-io-spec.pdf 2016."},{"key":"e_1_3_2_1_52_1","unstructured":"Intel Corp. \"Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual \" 2016.  Intel Corp. \"Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual \" 2016."},{"key":"e_1_3_2_1_53_1","unstructured":"Intel Corp. \"6th Generation Intel Core Processor Family Datasheet Vol. 1 \" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/datasheets\/desktop-6th-gen-core-family-datasheet-vol-1.pdf 2017.  Intel Corp. \"6th Generation Intel Core Processor Family Datasheet Vol. 1 \" http:\/\/www.intel.com\/content\/dam\/www\/public\/us\/en\/documents\/datasheets\/desktop-6th-gen-core-family-datasheet-vol-1.pdf 2017."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/40.710872"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454145"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815971"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228513"},{"key":"e_1_3_2_1_58_1","volume-title":"MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in HPCA","author":"Jia W.","year":"2014","unstructured":"W. Jia , K. A. Shaw , and M. Martonosi , \" MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in HPCA , 2014 . W. Jia, K. A. Shaw, and M. Martonosi, \"MRPB: Memory Request Prioritization for Massively Parallel Processors,\" in HPCA, 2014."},{"key":"e_1_3_2_1_59_1","volume-title":"Pennsylvania State Univ.","author":"Jog A.","year":"2015","unstructured":"A. Jog , \"Design and Analysis of Scheduling Techniques for Throughput Processors,\" Ph. D. dissertation , Pennsylvania State Univ. , 2015 . A. Jog, \"Design and Analysis of Scheduling Techniques for Throughput Processors,\" Ph.D. dissertation, Pennsylvania State Univ., 2015."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485951"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/2896377.2901468"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749471"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.115"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","DOI":"10.2172\/1090032","volume-title":"LULESH 2.0 Updates and Changes","author":"Karlin I.","year":"2013","unstructured":"I. Karlin , J. Keasler , and R. Neely , \" LULESH 2.0 Updates and Changes ,\" 2013 . I. Karlin, J. Keasler, and R. Neely, \"LULESH 2.0 Updates and Changes,\" 2013."},{"key":"e_1_3_2_1_67_1","volume-title":"Gdev: First-Class GPU Resource Management in the Operating System,\" in USENIX ATC","author":"Kato S.","year":"2012","unstructured":"S. Kato , M. McThrow , C. Maltzahn , and S. Brandt , \" Gdev: First-Class GPU Resource Management in the Operating System,\" in USENIX ATC , 2012 . S. Kato, M. McThrow, C. Maltzahn, and S. Brandt, \"Gdev: First-Class GPU Resource Management in the Operating System,\" in USENIX ATC, 2012."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.62"},{"key":"e_1_3_2_1_69_1","volume-title":"Neither More Nor Less: Optimizing Thread-Level Parallelism for GPGPUs,\" in PACT","author":"Kay\u0131ran O.","year":"2013","unstructured":"O. Kay\u0131ran , A. Jog , M. T. Kandemir , and C. R. Das , \" Neither More Nor Less: Optimizing Thread-Level Parallelism for GPGPUs,\" in PACT , 2013 . O. Kay\u0131ran, A. Jog, M. T. Kandemir, and C. R. Das, \"Neither More Nor Less: Optimizing Thread-Level Parallelism for GPGPUs,\" in PACT, 2013."},{"key":"e_1_3_2_1_70_1","volume-title":"Managing GPU Concurrency in Heterogeneous Architectures,\" in MICRO","author":"Kay\u0131ran O.","year":"2014","unstructured":"O. Kay\u0131ran , N. C. Nachiappan , A. Jog , R. Ausavarungnirun , M. T. Kandemir , G. H. Loh , O. Mutlu , and C. R. Das , \" Managing GPU Concurrency in Heterogeneous Architectures,\" in MICRO , 2014 . O. Kay\u0131ran, N. C. Nachiappan, A. Jog, R. Ausavarungnirun, M. T. Kandemir, G. H. Loh, O. Mutlu, and C. R. Das, \"Managing GPU Concurrency in Heterogeneous Architectures,\" in MICRO, 2014."},{"key":"e_1_3_2_1_71_1","volume-title":"ATLAS: A Scalable and High-Performance Scheduling Algorithm for Multiple Memory Controllers,\" in HPCA","author":"Kim Y.","year":"2010","unstructured":"Y. Kim , D. Han , O. Mutlu , and M. Harchol-Balter , \" ATLAS: A Scalable and High-Performance Scheduling Algorithm for Multiple Memory Controllers,\" in HPCA , 2010 . Y. Kim, D. Han, O. Mutlu, and M. Harchol-Balter, \"ATLAS: A Scalable and High-Performance Scheduling Algorithm for Multiple Memory Controllers,\" in HPCA, 2010."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.51"},{"key":"e_1_3_2_1_73_1","volume-title":"Fast Gapped-Read Alignment with Bowtie 2,\" Nature Methods","author":"Langmead B.","year":"2012","unstructured":"B. Langmead and S. L. Salzberg , \" Fast Gapped-Read Alignment with Bowtie 2,\" Nature Methods , 2012 . B. Langmead and S. L. Salzberg, \"Fast Gapped-Read Alignment with Bowtie 2,\" Nature Methods, 2012."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168947"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628075"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751237"},{"key":"e_1_3_2_1_77_1","volume-title":"Priority-Based Cache Allocation in Throughput Processors,\" in HPCA","author":"Li D.","year":"2015","unstructured":"D. Li , M. Rhu , D. Johnson , M. O'Connor , M. Erez , D. Burger , D. Fussell , and S. Redder , \" Priority-Based Cache Allocation in Throughput Processors,\" in HPCA , 2015 . D. Li, M. Rhu, D. Johnson, M. O'Connor, M. Erez, D. Burger, D. Fussell, and S. Redder, \"Priority-Based Cache Allocation in Throughput Processors,\" in HPCA, 2015."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2597917.2597925"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2008.05.008"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/2445572.2445574"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1021\/ct400342e"},{"key":"e_1_3_2_1_83_1","volume-title":"iGPU: Exception Support and Speculative Execution on GPUs,\" in ISCA","author":"Menon J.","year":"2012","unstructured":"J. Menon , M. de Kruijf , and K. Sankaralingam , \" iGPU: Exception Support and Speculative Execution on GPUs,\" in ISCA , 2012 . J. Menon, M. de Kruijf, and K. Sankaralingam, \"iGPU: Exception Support and Speculative Execution on GPUs,\" in ISCA, 2012."},{"key":"e_1_3_2_1_84_1","volume-title":"Memory Performance Attacks: Denial of Memory Service in Multi-Core Systems,\" in USENIX Security","author":"Moscibroda T.","year":"2007","unstructured":"T. Moscibroda and O. Mutlu , \" Memory Performance Attacks: Denial of Memory Service in Multi-Core Systems,\" in USENIX Security , 2007 . T. Moscibroda and O. Mutlu, \"Memory Performance Attacks: Denial of Memory Service in Multi-Core Systems,\" in USENIX Security, 2007."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00894-014-2067-1"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155664"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.40"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.7"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155656"},{"key":"e_1_3_2_1_91_1","volume-title":"Graphics Processing Units in Bioinformatics, Computational Biology and Systems Biology,\" Briefings in Bioinformatics","author":"Nobile M. S.","year":"2016","unstructured":"M. S. Nobile , P. Cazzaniga , A. Tangherloni , and D. Besozzi , \" Graphics Processing Units in Bioinformatics, Computational Biology and Systems Biology,\" Briefings in Bioinformatics , 2016 . M. S. Nobile, P. Cazzaniga, A. Tangherloni, and D. Besozzi, \"Graphics Processing Units in Bioinformatics, Computational Biology and Systems Biology,\" Briefings in Bioinformatics, 2016."},{"key":"e_1_3_2_1_92_1","unstructured":"NVIDIA Corp. \"NVIDIA Tegra K1 \" http:\/\/www.nvidia.com\/content\/pdf\/tegra_white_papers\/tegra-k1-whitepaper-v1.0.pdf.  NVIDIA Corp. \"NVIDIA Tegra K1 \" http:\/\/www.nvidia.com\/content\/pdf\/tegra_white_papers\/tegra-k1-whitepaper-v1.0.pdf."},{"key":"e_1_3_2_1_93_1","unstructured":"NVIDIA Corp. \"NVIDIA Tegra X1 \" https:\/\/international.download.nvidia.com\/pdf\/tegra\/Tegra-X1-whitepaper-v1.0.pdf.  NVIDIA Corp. \"NVIDIA Tegra X1 \" https:\/\/international.download.nvidia.com\/pdf\/tegra\/Tegra-X1-whitepaper-v1.0.pdf."},{"key":"e_1_3_2_1_94_1","unstructured":"NVIDIA Corp. \"CUDA C\/C+ SDK Code Samples \" http:\/\/developer.nvidia.com\/cuda-cc-sdk-code-samples 2011.  NVIDIA Corp. \"CUDA C\/C+ SDK Code Samples \" http:\/\/developer.nvidia.com\/cuda-cc-sdk-code-samples 2011."},{"key":"e_1_3_2_1_95_1","volume-title":"Fermi,\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf","author":"NVIDIA Corp.","year":"2011","unstructured":"NVIDIA Corp. , \"NVIDIA's Next Generation CUDA Compute Architecture : Fermi,\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf , 2011 . NVIDIA Corp., \"NVIDIA's Next Generation CUDA Compute Architecture: Fermi,\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf, 2011."},{"key":"e_1_3_2_1_96_1","volume-title":"Kepler GK110,\" http:\/\/www.nvidia.com\/content\/PDF\/kepler\/NVIDIA-Kepler-GK110-Architecture-Whitepaper.pdf","author":"NVIDIA Corp.","year":"2012","unstructured":"NVIDIA Corp. , \"NVIDIA's Next Generation CUDA Compute Architecture : Kepler GK110,\" http:\/\/www.nvidia.com\/content\/PDF\/kepler\/NVIDIA-Kepler-GK110-Architecture-Whitepaper.pdf , 2012 . NVIDIA Corp., \"NVIDIA's Next Generation CUDA Compute Architecture: Kepler GK110,\" http:\/\/www.nvidia.com\/content\/PDF\/kepler\/NVIDIA-Kepler-GK110-Architecture-Whitepaper.pdf, 2012."},{"key":"e_1_3_2_1_97_1","unstructured":"NVIDIA Corp. \"Tesla K40 GPU Active Accelerator \" https:\/\/www.nvidia.com\/content\/PDF\/kepler\/Tesla-K40-Active-Board-Spec-BD-06949-001_v03.pdf 2013.  NVIDIA Corp. \"Tesla K40 GPU Active Accelerator \" https:\/\/www.nvidia.com\/content\/PDF\/kepler\/Tesla-K40-Active-Board-Spec-BD-06949-001_v03.pdf 2013."},{"key":"e_1_3_2_1_98_1","unstructured":"NVIDIA Corp. \"NVIDIA GeForce GTX 750 Ti \" http:\/\/international.download.nvidia.com\/geforce-com\/international\/pdfs\/GeForce-GTX-750-Ti-Whitepaper.pdf 2014.  NVIDIA Corp. \"NVIDIA GeForce GTX 750 Ti \" http:\/\/international.download.nvidia.com\/geforce-com\/international\/pdfs\/GeForce-GTX-750-Ti-Whitepaper.pdf 2014."},{"key":"e_1_3_2_1_99_1","unstructured":"NVIDIA Corp. \"Multi-Process Service \" https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf 2015.  NVIDIA Corp. \"Multi-Process Service \" https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf 2015."},{"key":"e_1_3_2_1_100_1","unstructured":"NVIDIA Corp. \"NVIDIA GeForce GTX 1080 \" https:\/\/international.download.nvidia.com\/geforce-com\/international\/pdfs\/GeForce_GTX_1080_Whitepaper_FINAL.pdf 2016.  NVIDIA Corp. \"NVIDIA GeForce GTX 1080 \" https:\/\/international.download.nvidia.com\/geforce-com\/international\/pdfs\/GeForce_GTX_1080_Whitepaper_FINAL.pdf 2016."},{"key":"e_1_3_2_1_101_1","unstructured":"NVIDIA Corp. \"NVIDIA Tesla P100 \" https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf 2016.  NVIDIA Corp. \"NVIDIA Tesla P100 \" https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf 2016."},{"key":"e_1_3_2_1_102_1","unstructured":"NVIDIA Corp. \"CUDA Toolkit Documentation \" http:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/stream-sync-behavior.html 2017.  NVIDIA Corp. \"CUDA Toolkit Documentation \" http:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/stream-sync-behavior.html 2017."},{"key":"e_1_3_2_1_103_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451160"},{"key":"e_1_3_2_1_104_1","volume-title":"A Case for Toggle-Aware Compression for GPU Systems,\" in HPCA","author":"Pekhimenko G.","year":"2016","unstructured":"G. Pekhimenko , E. Bolotin , N. Vijaykumar , O. Mutlu , T. C. Mowry , and S. W. Keckler , \" A Case for Toggle-Aware Compression for GPU Systems,\" in HPCA , 2016 . G. Pekhimenko, E. Bolotin, N. Vijaykumar, O. Mutlu, T. C. Mowry, and S. W. Keckler, \"A Case for Toggle-Aware Compression for GPU Systems,\" in HPCA, 2016."},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541942"},{"key":"e_1_3_2_1_106_1","volume-title":"Supporting x86--64 Address Translation for 100s of GPU Lanes,\" in HPCA","author":"Power J.","year":"2014","unstructured":"J. Power , M. D. Hill , and D. A. Wood , \" Supporting x86--64 Address Translation for 100s of GPU Lanes,\" in HPCA , 2014 . J. Power, M. D. Hill, and D. A. Wood, \"Supporting x86--64 Address Translation for 100s of GPU Lanes,\" in HPCA, 2014."},{"key":"e_1_3_2_1_107_1","unstructured":"PowerVR \"PowerVR Hardware Architecture Overview for Developers \" http:\/\/cdn.imgtec.com\/sdk-documentation\/PowerVR+Hardware.Architecture+Overview+for+Developers.pdf 2016.  PowerVR \"PowerVR Hardware Architecture Overview for Developers \" http:\/\/cdn.imgtec.com\/sdk-documentation\/PowerVR+Hardware.Architecture+Overview+for+Developers.pdf 2016."},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250662.1250709"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2006.49"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.1145\/342001.339668"},{"key":"e_1_3_2_1_111_1","volume-title":"dissertation","author":"Rogers T. G.","year":"2015","unstructured":"T. G. Rogers , \"Locality and Scheduling in the Massively Multithreaded Era,\" Ph. D. dissertation , Univ. of British Columbia , 2015 . T. G. Rogers, \"Locality and Scheduling in the Massively Multithreaded Era,\" Ph.D. dissertation, Univ. of British Columbia, 2015."},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_2_1_113_1","volume-title":"UNified Instruction\/Translation\/Data (UNITD) Coherence: One Protocol to Rule them All,\" in HPCA","author":"Romanescu B. F.","year":"2010","unstructured":"B. F. Romanescu , A. R. Lebeck , D. J. Sorin , and A. Bracy , \" UNified Instruction\/Translation\/Data (UNITD) Coherence: One Protocol to Rule them All,\" in HPCA , 2010 . B. F. Romanescu, A. R. Lebeck, D. J. Sorin, and A. Bracy, \"UNified Instruction\/Translation\/Data (UNITD) Coherence: One Protocol to Rule them All,\" in HPCA, 2010."},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1145\/2043556.2043579"},{"key":"e_1_3_2_1_115_1","unstructured":"SAFARI Research Group \"Mosaic -- GitHub Repository \" https:\/\/github.com\/Carnegie Mellon University-SAFARI\/Mosaic\/.  SAFARI Research Group \"Mosaic -- GitHub Repository \" https:\/\/github.com\/Carnegie Mellon University-SAFARI\/Mosaic\/."},{"key":"e_1_3_2_1_116_1","doi-asserted-by":"publisher","DOI":"10.1145\/2370816.2370868"},{"key":"e_1_3_2_1_117_1","doi-asserted-by":"publisher","DOI":"10.1145\/2677956"},{"key":"e_1_3_2_1_118_1","unstructured":"SK Hynix Inc. \"Hynix GDDR5 SGRAM Part H5GQ1H24AFR Revision 1.0 \" http:\/\/www.hynix.com\/datasheet\/pdf\/graphics\/H5GQ1H24AFR(Rev1.0).pdf.  SK Hynix Inc. \"Hynix GDDR5 SGRAM Part H5GQ1H24AFR Revision 1.0 \" http:\/\/www.hynix.com\/datasheet\/pdf\/graphics\/H5GQ1H24AFR(Rev1.0).pdf."},{"key":"e_1_3_2_1_119_1","doi-asserted-by":"crossref","unstructured":"B. Smith \"Architecture and Applications of the HEP Multiprocessor Computer System \" SPIE 1981.  B. Smith \"Architecture and Applications of the HEP Multiprocessor Computer System \" SPIE 1981.","DOI":"10.1117\/12.932535"},{"key":"e_1_3_2_1_120_1","volume-title":"Shared Resource MIMD Computer,\" in ICPP","author":"Smith B. J.","year":"1978","unstructured":"B. J. Smith , \" A Pipelined , Shared Resource MIMD Computer,\" in ICPP , 1978 . B. J. Smith, \"A Pipelined, Shared Resource MIMD Computer,\" in ICPP, 1978."},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2526003"},{"key":"e_1_3_2_1_123_1","volume-title":"The Blacklisting Memory Scheduler: Achieving High Performance and Fairness at Low Cost,\" in ICCD","author":"Subramanian L.","year":"2014","unstructured":"L. Subramanian , D. Lee , V. Seshadri , H. Rastogi , and O. Mutlu , \" The Blacklisting Memory Scheduler: Achieving High Performance and Fairness at Low Cost,\" in ICCD , 2014 . L. Subramanian, D. Lee, V. Seshadri, H. Rastogi, and O. Mutlu, \"The Blacklisting Memory Scheduler: Achieving High Performance and Fairness at Low Cost,\" in ICCD, 2014."},{"key":"e_1_3_2_1_124_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830803"},{"key":"e_1_3_2_1_125_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522356"},{"key":"e_1_3_2_1_126_1","volume-title":"GPUvm: Why Not Virtualizing GPUs at the Hypervisor?\" in USENIX ATC","author":"Suzuki Y.","year":"2014","unstructured":"Y. Suzuki , S. Kato , H. Yamada , and K. Kono , \" GPUvm: Why Not Virtualizing GPUs at the Hypervisor?\" in USENIX ATC , 2014 . Y. Suzuki, S. Kato, H. Yamada, and K. Kono, \"GPUvm: Why Not Virtualizing GPUs at the Hypervisor?\" in USENIX ATC, 2014."},{"key":"e_1_3_2_1_127_1","volume-title":"Enabling Preemptive Multiprogramming on GPUs,\" in ISCA","author":"Tanasic I.","year":"2014","unstructured":"I. Tanasic , I. Gelado , J. Cabezas , A. Ramirez , N. Navarro , and M. Valero , \" Enabling Preemptive Multiprogramming on GPUs,\" in ISCA , 2014 . I. Tanasic, I. Gelado, J. Cabezas, A. Ramirez, N. Navarro, and M. Valero, \"Enabling Preemptive Multiprogramming on GPUs,\" in ISCA, 2014."},{"key":"e_1_3_2_1_128_1","doi-asserted-by":"publisher","DOI":"10.1145\/1464039.1464045"},{"key":"e_1_3_2_1_129_1","volume-title":"Design of a Computer: The Control Data 6600. hskip 1em plus 0.5em minus 0.4emrelax Scott Foresman & Co","author":"Thornton J. E.","year":"1970","unstructured":"J. E. Thornton , Design of a Computer: The Control Data 6600. hskip 1em plus 0.5em minus 0.4emrelax Scott Foresman & Co , 1970 . J. E. Thornton, Design of a Computer: The Control Data 6600. hskip 1em plus 0.5em minus 0.4emrelax Scott Foresman & Co, 1970."},{"key":"e_1_3_2_1_130_1","volume-title":"A Full GPU Virtualization Solution with Mediated Pass-Through,\" in USENIX ATC","author":"Tian K.","year":"2014","unstructured":"K. Tian , Y. Dong , and D. Cowperthwaite , \" A Full GPU Virtualization Solution with Mediated Pass-Through,\" in USENIX ATC , 2014 . K. Tian, Y. Dong, and D. Cowperthwaite, \"A Full GPU Virtualization Solution with Mediated Pass-Through,\" in USENIX ATC, 2014."},{"key":"e_1_3_2_1_131_1","volume-title":"SQUASH: Simple QoS-Aware High-Performance Memory Scheduler for Heterogeneous Systems with Hardware Accelerators,\" arXiv:1505.07502 {cs.AR}","author":"Usui H.","year":"2015","unstructured":"H. Usui , L. Subramanian , K. Chang , and O. Mutlu , \" SQUASH: Simple QoS-Aware High-Performance Memory Scheduler for Heterogeneous Systems with Hardware Accelerators,\" arXiv:1505.07502 {cs.AR} , 2015 . H. Usui, L. Subramanian, K. Chang, and O. Mutlu, \"SQUASH: Simple QoS-Aware High-Performance Memory Scheduler for Heterogeneous Systems with Hardware Accelerators,\" arXiv:1505.07502 {cs.AR}, 2015."},{"key":"e_1_3_2_1_132_1","doi-asserted-by":"publisher","DOI":"10.1145\/2847255"},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"publisher","DOI":"10.1109\/L-CA.2011.1"},{"key":"e_1_3_2_1_134_1","volume-title":"Observations and Opportunities in Architecting Shared Virtual Memory for Heterogeneous Systems,\" in ISPASS","author":"Vesely J.","year":"2016","unstructured":"J. Vesely , A. Basu , M. Oskin , G. H. Loh , and A. Bhattacharjee , \" Observations and Opportunities in Architecting Shared Virtual Memory for Heterogeneous Systems,\" in ISPASS , 2016 . J. Vesely, A. Basu, M. Oskin, G. H. Loh, and A. Bhattacharjee, \"Observations and Opportunities in Architecting Shared Virtual Memory for Heterogeneous Systems,\" in ISPASS, 2016."},{"key":"e_1_3_2_1_135_1","volume-title":"Design and Analysis of an APU for Exascale Computing,\" in HPCA","author":"Vijayaraghavany T.","year":"2017","unstructured":"T. Vijayaraghavany , Y. Eckert , G. H. Loh , M. J. Schulte , M. Ignatowski , B. M. Beckmann , W. C. Brantley , J. L. Greathouse , W. Huang , A. Karunanithi , O. Kayiran , M. Meswani , I. Paul , M. Poremba , S. Raasch , S. K. Reinhardt , G. Sadowski , and V. Sridharan , \" Design and Analysis of an APU for Exascale Computing,\" in HPCA , 2017 . T. Vijayaraghavany, Y. Eckert, G. H. Loh, M. J. Schulte, M. Ignatowski, B. M. Beckmann, W. C. Brantley, J. L. Greathouse, W. Huang, A. Karunanithi, O. Kayiran, M. Meswani, I. Paul, M. Poremba, S. Raasch, S. K. Reinhardt, G. Sadowski, and V. Sridharan, \"Design and Analysis of an APU for Exascale Computing,\" in HPCA, 2017."},{"key":"e_1_3_2_1_136_1","volume-title":"Zorua: A Holistic Approach to Resource Virtualization in GPUs,\" in MICRO","author":"Vijaykumar N.","year":"2016","unstructured":"N. Vijaykumar , K. Hsieh , G. Pekhimenko , S. Khan , A. Shrestha , S. Ghose , A. Jog , P. B. Gibbons , and O. Mutlu , \" Zorua: A Holistic Approach to Resource Virtualization in GPUs,\" in MICRO , 2016 . N. Vijaykumar, K. Hsieh, G. Pekhimenko, S. Khan, A. Shrestha, S. Ghose, A. Jog, P. B. Gibbons, and O. Mutlu, \"Zorua: A Holistic Approach to Resource Virtualization in GPUs,\" in MICRO, 2016."},{"key":"e_1_3_2_1_137_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750399"},{"key":"e_1_3_2_1_138_1","volume-title":"A Framework for Accelerating Bottlenecks in GPU Execution with Assist Warps,\" arXiv:1602.01348 {cs.AR}","author":"Vijaykumar N.","year":"2016","unstructured":"N. Vijaykumar , G. Pekhimenko , A. Jog , S. Ghose , A. Bhowmick , R. Ausavarungnirun , C. R. Das , M. T. Kandemir , T. C. Mowry , and O. Mutlu , \" A Framework for Accelerating Bottlenecks in GPU Execution with Assist Warps,\" arXiv:1602.01348 {cs.AR} , 2016 . N. Vijaykumar, G. Pekhimenko, A. Jog, S. Ghose, A. Bhowmick, R. Ausavarungnirun, C. R. Das, M. T. Kandemir, T. C. Mowry, and O. Mutlu, \"A Framework for Accelerating Bottlenecks in GPU Execution with Assist Warps,\" arXiv:1602.01348 {cs.AR}, 2016."},{"key":"e_1_3_2_1_139_1","unstructured":"Vivante \"Vivante Vega GPGPU Technology \" http:\/\/www.vivantecorp.com\/index.php\/en\/technology\/gpgpu.html 2016.  Vivante \"Vivante Vega GPGPU Technology \" http:\/\/www.vivantecorp.com\/index.php\/en\/technology\/gpgpu.html 2016."},{"key":"e_1_3_2_1_140_1","volume-title":"GPU Virtualization for High Performance General Purpose Computing on the ESX Hypervisor,\" in HPC","author":"Vu L.","year":"2014","unstructured":"L. Vu , H. Sivaraman , and R. Bidarkar , \" GPU Virtualization for High Performance General Purpose Computing on the ESX Hypervisor,\" in HPC , 2014 . L. Vu, H. Sivaraman, and R. Bidarkar, \"GPU Virtualization for High Performance General Purpose Computing on the ESX Hypervisor,\" in HPC, 2014."},{"key":"e_1_3_2_1_141_1","volume-title":"Simultaneous Multikernel GPU: Multi-Tasking Throughput Processors via Fine-Grained Sharing,\" in HPCA","author":"Wang Z.","year":"2016","unstructured":"Z. Wang , J. Yang , R. Melhem , B. R. Childers , Y. Zhang , and M. Guo , \" Simultaneous Multikernel GPU: Multi-Tasking Throughput Processors via Fine-Grained Sharing,\" in HPCA , 2016 . Z. Wang, J. Yang, R. Melhem, B. R. Childers, Y. Zhang, and M. Guo, \"Simultaneous Multikernel GPU: Multi-Tasking Throughput Processors via Fine-Grained Sharing,\" in HPCA, 2016."},{"key":"e_1_3_2_1_142_1","unstructured":"S. Wasson \"AMD's A8--3800 Fusion APU \" http:\/\/techreport.com\/articles.x\/21730 2011.  S. Wasson \"AMD's A8--3800 Fusion APU \" http:\/\/techreport.com\/articles.x\/21730 2011."},{"key":"e_1_3_2_1_143_1","volume-title":"Demystifying GPU Microarchitecture Through Microbenchmarking,\" in ISPASS","author":"Wong H.","year":"2010","unstructured":"H. Wong , M.-M. Papadopoulou , M. Sadooghi-Alvandi , and A. Moshovos , \" Demystifying GPU Microarchitecture Through Microbenchmarking,\" in ISPASS , 2010 . H. Wong, M.-M. Papadopoulou, M. Sadooghi-Alvandi, and A. Moshovos, \"Demystifying GPU Microarchitecture Through Microbenchmarking,\" in ISPASS, 2010."},{"key":"e_1_3_2_1_144_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2011.5762710"},{"key":"e_1_3_2_1_145_1","volume-title":"An Efficient Compiler Framework for Cache Bypassing on GPUs,\" in ICCAD","author":"Xie X.","year":"2013","unstructured":"X. Xie , Y. Liang , G. Sun , and D. Chen , \" An Efficient Compiler Framework for Cache Bypassing on GPUs,\" in ICCAD , 2013 . X. Xie, Y. Liang, G. Sun, and D. Chen, \"An Efficient Compiler Framework for Cache Bypassing on GPUs,\" in ICCAD, 2013."},{"key":"e_1_3_2_1_146_1","volume-title":"Coordinated Static and Dynamic Cache Bypassing for GPUs,\" in HPCA","author":"Xie X.","year":"2015","unstructured":"X. Xie , Y. Liang , Y. Wang , G. Sun , and T. Wang , \" Coordinated Static and Dynamic Cache Bypassing for GPUs,\" in HPCA , 2015 . X. Xie, Y. Liang, Y. Wang, G. Sun, and T. Wang, \"Coordinated Static and Dynamic Cache Bypassing for GPUs,\" in HPCA, 2015."},{"key":"e_1_3_2_1_147_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.29"},{"key":"e_1_3_2_1_148_1","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018754"},{"key":"e_1_3_2_1_149_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124555"},{"key":"e_1_3_2_1_150_1","doi-asserted-by":"publisher","DOI":"10.1145\/1669112.1669119"},{"key":"e_1_3_2_1_151_1","volume-title":"Towards High Performance Paged Memory for GPUs,\" in HPCA","author":"Zheng T.","year":"2016","unstructured":"T. Zheng , D. Nellans , A. Zulfiqar , M. Stephenson , and S. W. Keckler , \" Towards High Performance Paged Memory for GPUs,\" in HPCA , 2016 . T. Zheng, D. Nellans, A. Zulfiqar, M. Stephenson, and S. W. Keckler, \"Towards High Performance Paged Memory for GPUs,\" in HPCA, 2016."},{"key":"e_1_3_2_1_152_1","first-page":"630","article-title":"Controller for a Synchronous DRAM That Maximizes Throughput by Allowing Memory Requests and Commands to Be Issued Out of Order","volume":"5","author":"Zuravleff W. K.","year":"1997","unstructured":"W. K. Zuravleff and T. Robinson , \" Controller for a Synchronous DRAM That Maximizes Throughput by Allowing Memory Requests and Commands to Be Issued Out of Order ,\" U.S. Patent Number 5 , 630 ,096, 1997 . W. K. Zuravleff and T. Robinson, \"Controller for a Synchronous DRAM That Maximizes Throughput by Allowing Memory Requests and Commands to Be Issued Out of Order,\" U.S. Patent Number 5,630,096, 1997.","journal-title":"U.S. Patent Number"}],"event":{"name":"ASPLOS '18: Architectural Support for Programming Languages and Operating Systems","location":"Williamsburg VA USA","acronym":"ASPLOS '18","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the Twenty-Third International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3173162.3173169","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3173162.3173169","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3173162.3173169","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:02:50Z","timestamp":1750201370000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3173162.3173169"}},"subtitle":["Redesigning the GPU Memory Hierarchy to Support Multi-Application Concurrency"],"short-title":[],"issued":{"date-parts":[[2018,3,19]]},"references-count":150,"alternative-id":["10.1145\/3173162.3173169","10.1145\/3173162"],"URL":"https:\/\/doi.org\/10.1145\/3173162.3173169","relation":{"is-identical-to":[{"id-type":"doi","id":"10.1145\/3296957.3173169","asserted-by":"object"}]},"subject":[],"published":{"date-parts":[[2018,3,19]]},"assertion":[{"value":"2018-03-19","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}