{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:45:21Z","timestamp":1772725521767,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":102,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,4,4]],"date-time":"2019-04-04T00:00:00Z","timestamp":1554336000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1409723, 1422331, 1617071, 1618563, 1657336, 1718080, 1725657, 1750667"],"award-info":[{"award-number":["1409723, 1422331, 1617071, 1618563, 1657336, 1718080, 1725657, 1750667"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100011002","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61832018"],"award-info":[{"award-number":["61832018"]}],"id":[{"id":"10.13039\/501100011002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,4,4]]},"DOI":"10.1145\/3297858.3304044","type":"proceedings-article","created":{"date-parts":[[2019,4,4]],"date-time":"2019-04-04T18:38:43Z","timestamp":1554403123000},"page":"49-63","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":67,"title":["A Framework for Memory Oversubscription Management in Graphics Processing Units"],"prefix":"10.1145","author":[{"given":"Chen","family":"Li","sequence":"first","affiliation":[{"name":"National University of Defense Technology &amp; University of Pittsburgh, Changsha, China"}]},{"given":"Rachata","family":"Ausavarungnirun","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University &amp; King Mongkut's University of Technology North Bangkok, Pittsburgh, PA, USA"}]},{"given":"Christopher J.","family":"Rossbach","sequence":"additional","affiliation":[{"name":"University of Texas Austin &amp; VMware Research, Austin, TX, USA"}]},{"given":"Youtao","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}]},{"given":"Onur","family":"Mutlu","sequence":"additional","affiliation":[{"name":"ETH Z\u00fcrich &amp; Carnegie Mellon University, Zurich, Switzerland"}]},{"given":"Yang","family":"Guo","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"given":"Jun","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2019,4,4]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Advanced Micro Devices Inc. 2013. What is Heterogeneous System Architecture (HSA)? http:\/\/developer.amd.com\/resources\/heterogeneous-computing\/what-is-heterogeneous-system-architecture-hsa\/"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"N. Agarwal D. Nellans M. Stephenson M. O'Connor and S. Keckler. 2015. Page Placement Strategies for GPUs Within Heterogeneous Memory Systems. In ASPLOS . 10.1145\/2694344.2694381","DOI":"10.1145\/2694344.2694381"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/2337159.2337214"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2015.2401022"},{"key":"e_1_3_2_1_5_1","unstructured":"AMD. 2011. AMD Accelerated Processing Units. http:\/\/www.amd.com\/us\/products\/technologies\/apu\/Pages\/apu.aspx ."},{"key":"e_1_3_2_1_6_1","unstructured":"AMD. 2012. AMD Graphics Cores Next (GCN) Architecture . https:\/\/www.amd.com\/Documents\/GCN_Architecture_whitepaper.pdf ."},{"key":"e_1_3_2_1_7_1","unstructured":"AMD. 2017. Radeon`s Next-generation Vega Architecture . https:\/\/radeon.com\/_downloads\/vega-whitepaper-11.6.17.pdf ."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","unstructured":"R. Ausavarungnirun J. Landgraf V. Miller S. Ghose J. Gandhi C. Rossbach and O. Mutlu. 2017. Mosaic: a GPU memory manager with application-transparent support for multiple page sizes. In MICRO . 10.1145\/3123939.3123975","DOI":"10.1145\/3123939.3123975"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173169"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"A. Bakhoda G. Yuan W. Fung H. Wong and T. Aamodt. 2009. Analyzing CUDA Workloads Using a Detailed GPU Simulator. In ISPASS .","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815970"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","unstructured":"T. W. Barr A. L. Cox and S. Rixner. 2011. SpecTLB: A Mechanism for Speculative Address Translation. In ISCA . 10.1145\/2000064.2000101","DOI":"10.1145\/2000064.2000101"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","unstructured":"A. Basu J. Gandhi J. Chang M. Hill and M. Swift. 2013. Efficient Virtual Memory for Big Memory Servers. In ISCA . 10.1145\/2485922.2485943","DOI":"10.1145\/2485922.2485943"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1147\/sj.52.0078"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"A. Bhattacharjee. 2013. Large-reach Memory Management Unit Caches. In MICRO . 10.1145\/2540708.2540741","DOI":"10.1145\/2540708.2540741"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","unstructured":"A. Bhattacharjee D. Lustig and M. Martonosi. 2011. Shared Last-level TLBs for Chip Multiprocessors . In HPCA .","DOI":"10.5555\/2014698.2014896"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"A. Bhattacharjee and M. Martonosi. 2009. Characterizing the TLB Behavior of Emerging Parallel Workloads on Chip Multiprocessors. In PACT . 10.1109\/PACT.2009.26","DOI":"10.1109\/PACT.2009.26"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"D. L. Black R. F. Rashid D. B. Golub and C. R. Hill. 1989. Translation Lookaside Buffer Consistency: A Software Approach. In ASPLOS . 10.1145\/70082.68193","DOI":"10.1145\/70082.68193"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","unstructured":"N. Chatterjee M. O'Connor G. H. Loh N. Jayasena and R. Balasubramonian. 2014. Managing DRAM Latency Divergence in Irregular GPGPU Applications. In SC . 10.1109\/SC.2014.16","DOI":"10.1109\/SC.2014.16"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_21_1","unstructured":"J. Cong Z. Fang Y. Hao and G. Reinmana. 2017. Supporting Address Translation for Accelerator-Centric Architectures. In HPCA ."},{"key":"e_1_3_2_1_22_1","unstructured":"N. Corp. 2015. NVIDIA Tegra X1: NVIDIA`s New Mobile Superchip . http:\/\/www.nvidia.com\/object\/tegra-x1-processor.html ."},{"key":"e_1_3_2_1_23_1","unstructured":"N. Corp. 2016. NVIDIA GTX 1060 . https:\/\/www.nvidia.com\/en-us\/geforce\/products\/10series\/geforce-gtx-1060\/."},{"key":"e_1_3_2_1_24_1","unstructured":"M. Ekman and P. Stenstrom. {n. d.}. A Robust Main-Memory Compression Scheme."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"W. Fung I. Sham G. Yuan and T. Aamodt. 2007. Dynamic Warp Formation and Scheduling for Efficient GPU Control Flow. In MICRO . 10.1109\/MICRO.2007.12","DOI":"10.1109\/MICRO.2007.12"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"J. Gandhi A. Basu M. Hill and M. Swift. 2014a. Efficient Memory Virtualization: Reducing Dimensionality of Nested Page Walks. In MICRO . 10.1109\/MICRO.2014.37","DOI":"10.1109\/MICRO.2014.37"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","unstructured":"J. Gandhi A. Basu M. D. Hill and M. M. Swift. 2014b. Efficient Memory Virtualization: Reducing Dimensionality of Nested Page Walks. In MICRO . 10.1109\/MICRO.2014.37","DOI":"10.1109\/MICRO.2014.37"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.67"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"N. Gawande J. Daily C. Siegel N. Tallent and A. Vishnu. 2018. Scaling Deep Learning Workloads: NVIDIA DGX-1\/Pascal and Intel Knights Landing . Future Generation Computer Systems (2018).","DOI":"10.1109\/IPDPSW.2017.36"},{"key":"e_1_3_2_1_30_1","volume-title":"Cloud TPUs: ML Accelerators for TensorFlow. https:\/\/cloud.google.com\/tpu\/","year":"2017","unstructured":"Google. 2017. Cloud TPUs: ML Accelerators for TensorFlow. https:\/\/cloud.google.com\/tpu\/ (2017)."},{"key":"e_1_3_2_1_31_1","volume":"201","author":"Grauer-Gray S.","unstructured":"S. Grauer-Gray, L. Xu, R. Searles, S. Ayalasomayajula, and J. Cavazos. 2012. Auto-tuning a High-level Language Targeted to GPU Codes. In InPar .","journal-title":"J. Cavazos."},{"key":"e_1_3_2_1_32_1","unstructured":"M. Harris. 2013. Unified Memory in CUDA 6. https:\/\/devblogs.nvidia.com\/unified-memory-in-cuda-6\/"},{"key":"e_1_3_2_1_33_1","unstructured":"Hybrid Memoty Cube Consortium. 2013. HMC Specification 1.1 ."},{"key":"e_1_3_2_1_34_1","unstructured":"Hybrid Memoty Cube Consortium. 2014. HMC Specification 2.0 ."},{"key":"e_1_3_2_1_35_1","unstructured":"IBM. 2017. Realizing the value of Large Model Support (LMS) with PowerAI IBM Caffe . http:\/\/developer.ibm.com\/linuxonpower\/2017\/09\/22\/realizing-value-large-model-support-lms-powerai-ibm-caffe\/."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"B. Jang D. Schaa F. Mistry and D. Kaeli. 2010. Exploiting Memory Access Patterns to Improve Memory Performance in Data-parallel Architectures . IEEE TPDS (2010). 10.1109\/TPDS.2010.107","DOI":"10.1109\/TPDS.2010.107"},{"key":"e_1_3_2_1_37_1","unstructured":"JEDEC. 2018. High Bandwidth Memory (HBM) DRAM . https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a ."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","unstructured":"A. Jog O. Kayiran T. Kesten A. Pattnaik E. Bolotin N. Chatterjee S. Keckler M. Kandemir and C. Das. 2015. Anatomy of GPU Memory System for Multi-Application Execution. In MEMSYS . 10.1145\/2818950.2818979","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","unstructured":"A. Jog O. Kayiran A. K. Mishra M. T. Kandemir O. Mutlu R. Iyer and C. R. Das. 2013a. Orchestrated Scheduling and Prefetching for GPGPUs. In ISCA . 10.1145\/2485922.2485951","DOI":"10.1145\/2485922.2485951"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","unstructured":"N. Jouppi C. Young N. Patil D. Patterson G. Agrawal and et.al. 2017. In-Datacenter Performance Analysis of a Tensor Processing Unit. In ISCA . 10.1145\/3079856.3080246","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","unstructured":"G. B. Kandiraju and A. Sivasubramaniam. 2002. Going the Distance for TLB Prefetching: An Application-Driven Study. In ISCA .","DOI":"10.5555\/545215.545237"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"V. Karakostas J. Gandhi F. Ayar A. Cristal M. D. Hill K. S. McKinley M. Nemirovsky M. M. Swift and O. \u00dcnsal. 2015. Redundant Memory Mappings for Fast Access to Large Memories. In ISCA . 10.1145\/2749469.2749471","DOI":"10.1145\/2749469.2749471"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"O. Kayiran N. Chidambaram A. Jog R. Ausavarungnirun M. Kandemir G. Loh O. Mutlu and C. Das. 2014. Managing GPU Concurrency in Heterogeneous Architectures. In MICRO . 10.1109\/MICRO.2014.62","DOI":"10.1109\/MICRO.2014.62"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","unstructured":"O. Kayiran A. Jog M. Kandemir and C. Das. 2013. Neither More nor Less: optimizing Thread-level Parallelism for GPGPUs. In PACT .","DOI":"10.5555\/2523721.2523745"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","unstructured":"J. Kehne J. Metter and F. Bellosa. 2015. GPUswap: Enabling Oversubscription of GPU Memory Through Transparent Swapping. In VEE . 10.1145\/2731186.2731192","DOI":"10.1145\/2731186.2731192"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.37"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Y. Kwon and M. Rhu. 2018a. A Case for Memory-Centric HPC System Architecture for Training Deep Neural Networks . IEEE CAL (2018).","DOI":"10.1109\/LCA.2018.2823302"},{"key":"e_1_3_2_1_49_1","volume-title":"Memory Wall: A Case for Memory-Centric HPC System for Deep Learning. In MICRO .","author":"Kwon Y.","year":"2018","unstructured":"Y. Kwon and M. Rhu. 2018b. Beyond the Memory Wall: A Case for Memory-Centric HPC System for Deep Learning. In MICRO ."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"D. Lee G. Pekhimenko S. M. Khan S. Ghose and O. Mutlu. 2016. Simultaneous Multi Layer Access: A High Bandwidth and Low Cost 3D-Stacked Memory Interface. In ACM TACO .","DOI":"10.1145\/2832911"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628075"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","unstructured":"C. Li S. L. Song H. Dai A. Sidelnik S. K. S. Hari and H. Zhou. 2015. Locality-Driven Dynamic GPU Cache Bypassing . In ICS . 10.1145\/2751205.2751237","DOI":"10.1145\/2751205.2751237"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","unstructured":"X. Li and Y. Liang. 2016. Efficient Kernel Management on GPUs. In DATE .","DOI":"10.5555\/2971808.2971827"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","unstructured":"G. Loh N. Jerger A. Kannan and Y. Eckert. 2015. Interconnect-Memory Challenges for Multi-chip Silicon Interposer Systems. In MEMSYS . 10.1145\/2818950.2818951","DOI":"10.1145\/2818950.2818951"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","unstructured":"D. Lustig A. Bhattacharjee and M. Martonosi. 2013. TLB Improvements for Chip Multiprocessors: Inter-Core Cooperative Prefetchers and Shared Last-Level TLBs. In ACM TACO . 10.1145\/2445572.2445574","DOI":"10.1145\/2445572.2445574"},{"key":"e_1_3_2_1_57_1","volume-title":"Proc. of ML Systems Workshop in NIPS .","author":"Meng C.","unstructured":"C. Meng, M. Sun, J. Yang, M. Qiu, and Y. Gu. 2017. Training deeper models by GPU memory optimization on TensorFlow. In Proc. of ML Systems Workshop in NIPS ."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","unstructured":"T. Merrifield and H. Taheri. 2016. Performance Implications of Extended Page Tables on Virtualized x86 Processors. In VEE . 10.1145\/2892242.2892258","DOI":"10.1145\/2892242.2892258"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155656"},{"key":"e_1_3_2_1_60_1","unstructured":"NVIDIA Corp. 2011a. CUDA C\/C"},{"key":"e_1_3_2_1_61_1","unstructured":"SDK Code Samples . http:\/\/developer.nvidia.com\/cuda-cc-sdk-code-samples ."},{"key":"e_1_3_2_1_62_1","unstructured":"NVIDIA Corp. 2011b. CUDA Toolkit 4.0 . https:\/\/developer.nvidia.com\/cuda-toolkit-40 ."},{"key":"e_1_3_2_1_63_1","unstructured":"NVIDIA Corp. 2012. NVIDIA's Next Generation CUDA Compute Architecture: Kepler GK110 . http:\/\/www.nvidia.com\/content\/PDF\/kepler\/NVIDIA-Kepler-GK110-Architecture-Whitepaper.pdf ."},{"key":"e_1_3_2_1_64_1","unstructured":"NVIDIA Corp. 2014. NVIDIA GeForce GTX 750 Ti . http:\/\/international.download.nvidia.com\/geforce-com\/international\/pdfs\/GeForce-GTX-750-Ti-Whitepaper.pdf ."},{"key":"e_1_3_2_1_65_1","unstructured":"NVIDIA Corp. 2015. CUDA C Programming Guide . http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html ."},{"key":"e_1_3_2_1_66_1","unstructured":"NVIDIA Corp. 2016a. NVIDIA Tesla P100 P100 GPU Architecture . https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf ."},{"key":"e_1_3_2_1_67_1","unstructured":"NVIDIA Corp. 2016b. NVIDIA Tesla V100 GPU Architecture . http:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf ."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"M. M. Papadopoulou X. Tong A. Seznec and A. Moshovos. 2015. Prediction-Based Superpage-Friendly TLB Designs . In HPCA .","DOI":"10.1109\/HPCA.2015.7056034"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","unstructured":"G. Pekhimenko E. Bolotin M. O'Connor O. Mutlu T. C. Mowry and S. W. Keckler. 2015. Toggle-Aware Compression for GPUs . IEEE CAL . 10.1109\/LCA.2015.2430853","DOI":"10.1109\/LCA.2015.2430853"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"crossref","unstructured":"G. Pekhimenko E. Bolotin N. Vijaykumar O. Mutlu T. Mowry and S. Keckler. 2016. A Case for Toggle-aware Compression for GPU Systems. In HPCA .","DOI":"10.1109\/HPCA.2016.7446064"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"G. Pekhimenko V. Seshadri Y. Kim H. Xin O. Mutlu M. Kozuch P. Gibbons and T. Mowry. 2013. Linearly Compressed Pages: A Main Memory Compression Framework with Low Complexity and Low Latency. In MICRO .","DOI":"10.1145\/2540708.2540724"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/2370816.2370870"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"crossref","unstructured":"B. Pham A. Bhattacharjee Y. Eckert and G. H. Loh. 2014. Increasing TLB Reach by Exploiting Clustering in Page Translations. In HPCA .","DOI":"10.1109\/HPCA.2014.6835964"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","unstructured":"B. Pham V. Vaidyanathan A. Jaleel and A. Bhattacharjee. 2012. CoLT: Coalesced Large-Reach TLBs. In MICRO . 10.1109\/MICRO.2012.32","DOI":"10.1109\/MICRO.2012.32"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","unstructured":"B. Pichai L. Hsu and A. Bhattacharjee. 2014. Architectural Support for Address Translation on GPUs: Designing Memory Management Units for CPU\/GPUs with Unified Address Spaces. In ASPLOS . 10.1145\/2541940.2541942","DOI":"10.1145\/2541940.2541942"},{"key":"e_1_3_2_1_76_1","first-page":"x86","volume":"201","author":"Power J.","unstructured":"J. Power, M. Hill, and D. Wood. 2014. Supporting x86--64 address translation for 100s of GPU lanes. In HPCA .","journal-title":"Wood."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","unstructured":"M. Rhu N. Gimelshein J. Clemons A. Zulfiqar and S. Keckler. 2016. vDNN: Virtualized Deep Neural Networks for Scalable Memory-efficient Neural Network Design. In MICRO .","DOI":"10.5555\/3195638.3195660"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"crossref","unstructured":"M. Rhu M. O'Connor N. Chatterjee J. Pool Y. Kwon and S. Keckler. 2018. Compressing DMA Engine: Leveraging Activation Sparsity for Training Deep Neural Networks. In HPCA .","DOI":"10.1109\/HPCA.2018.00017"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/339647.339668"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","unstructured":"T. Rogers M. O'Connor and T. Aamodt. 2012. Cache-conscious Wavefront Scheduling. In MICRO . 10.1109\/MICRO.2012.16","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_2_1_81_1","unstructured":"SAFARI Research Group. 2017. Mosaic -- GitHub Repository . https:\/\/github.com\/Carnegie Mellon University-SAFARI\/Mosaic\/."},{"key":"e_1_3_2_1_82_1","unstructured":"N. Sakharnykh. 2016. Beyond GPU Memory Limits with Unified Memory on Pascal. https:\/\/devblogs.nvidia.com\/beyond-gpu-memory-limits-unified-memory-pascal\/"},{"key":"e_1_3_2_1_83_1","unstructured":"N. Sakharnykh. 2017a. Maximizing Unified Memory Performance in CUDA. https:\/\/devblogs.nvidia.com\/maximizing-unified-memory-performance-cuda\/"},{"key":"e_1_3_2_1_84_1","unstructured":"N. Sakharnykh. 2017b. Unified Memory on Pascal and Volta. In NVIDIA GTC ."},{"key":"e_1_3_2_1_85_1","unstructured":"N. Sakharnykh. 2018. Everything You Need to Know About Unified Memory. In NVIDIA GTC ."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","unstructured":"V. Sathish M. Schulte and N. Kim. 2012. Lossless and Lossy Memory I\/O Link Compression for Improving Performance of GPGPU Workloads. In PACT . 10.1145\/2370816.2370864","DOI":"10.1145\/2370816.2370864"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","unstructured":"A. Saulsbury F. Dahlgren and P. Stenstr\u00f6m. 2000. Recency-Based TLB Preloading. In ISCA . 10.1145\/339647.339666","DOI":"10.1145\/339647.339666"},{"key":"e_1_3_2_1_88_1","unstructured":"T. Schroeder. 2011. Peer-to-Peer & Unified Virtual Addressing. https:\/\/developer.download.nvidia.com\/CUDA\/training\/cuda_webinars_GPUDirect_uva.pdf"},{"key":"e_1_3_2_1_89_1","volume-title":"Mascar: Speeding Up GPU Warps by Reducing Memory Pitstops. In HPCA .","author":"Sethia A.","year":"2015","unstructured":"A. Sethia, D. Jamshidi, and S. Mahlke. 2015. Mascar: Speeding Up GPU Warps by Reducing Memory Pitstops. In HPCA ."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","unstructured":"S. Shin G. Cox M. Oskin G. Loh Y. Solihin A. Bhattacharjee and A. Basu. 2018. Scheduling Page Table Walks for Irregular GPU Applications. In ISCA . 10.1109\/ISCA.2018.00025","DOI":"10.1109\/ISCA.2018.00025"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","unstructured":"S. Srikantaiah and M. Kandemir. 2010. Synergistic TLBs for High Performance Address Translation in Chip Multiprocessors. In MICRO . 10.1109\/MICRO.2010.26","DOI":"10.1109\/MICRO.2010.26"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","unstructured":"J. Stone D. Gohara and G. Shi. 2010. OpenCL: A parallel programming standard for heterogeneous computing systems. Computing in science & engineering (2010).","DOI":"10.5555\/2220077.2220227"},{"key":"e_1_3_2_1_93_1","volume-title":"Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Technical Report IMPACT-12-01. Univ. of Illinois at Urbana-Champaign .","author":"Stratton J. A.","year":"2012","unstructured":"J. A. Stratton, C. Rodrigues, I. J. Sung, N. Obeid, L. W. Chang, N. Anssari, G. D. Liu, and W. W. Hwu. 2012. Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Technical Report IMPACT-12-01. Univ. of Illinois at Urbana-Champaign ."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","unstructured":"M. Talluri and M. D. Hill. 1994. Surpassing the TLB Performance of Superpages with Less Operating System Support. In ASPLOS . 10.1145\/195473.195531","DOI":"10.1145\/195473.195531"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"crossref","unstructured":"J. Vesely A. Basu M. Oskin G. Loh and A. Bhattacharjee. 2016. Observations and Opportunities in Architecting Shared Virtual Memory for Heterogeneous Systems. In ISPASS .","DOI":"10.1109\/ISPASS.2016.7482091"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195656"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","unstructured":"N. Vijaykumar G. Pekhimenko A. Jog A. Bhowmick R. Ausavarungnirun C. Das M. Kandemir T. Mowry and O. Mutlu. 2015a. A Case for Core-Assisted Bottleneck Acceleration in GPUs: Enabling Flexible Data Compression with Assist Warps. In ISCA . 10.1145\/2749469.2750399","DOI":"10.1145\/2749469.2750399"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750399"},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"crossref","unstructured":"H. Wang F. Luo M. Ibrahim O. Kayiran and A. Jog. 2018. Efficient and Fair Multi-programming in GPUs via Effective Bandwidth Management. In HPCA .","DOI":"10.1109\/HPCA.2018.00030"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"crossref","unstructured":"P. Xiang Y. Yang and H. Zhou. 2014. Warp-Level Divergence in GPUs: Characterization Impact and Mitigation. In HPCA .","DOI":"10.1109\/HPCA.2014.6835939"},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"crossref","unstructured":"T. Zheng D. Nellans A. Zulfiqar M. Stephenson and S. Keckler. 2016. Towards High Performance Paged Memory for GPUs . In HPCA .","DOI":"10.1109\/HPCA.2016.7446077"},{"key":"e_1_3_2_1_102_1","unstructured":"W. Zuravleff and T. Robinson. 1997. Controller for a Synchronous DRAM That Maximizes Throughput by Allowing Memory Requests and Commands to Be Issued Out of Order ."}],"event":{"name":"ASPLOS '19: Architectural Support for Programming Languages and Operating Systems","location":"Providence RI USA","acronym":"ASPLOS '19","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3297858.3304044","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3297858.3304044","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3297858.3304044","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:53:14Z","timestamp":1750204394000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3297858.3304044"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,4,4]]},"references-count":102,"alternative-id":["10.1145\/3297858.3304044","10.1145\/3297858"],"URL":"https:\/\/doi.org\/10.1145\/3297858.3304044","relation":{},"subject":[],"published":{"date-parts":[[2019,4,4]]},"assertion":[{"value":"2019-04-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}