{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T06:50:24Z","timestamp":1767855024898,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,18]]},"DOI":"10.1145\/3466752.3480088","type":"proceedings-article","created":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T19:12:05Z","timestamp":1634497925000},"page":"46-58","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["GPS: A Global Publish-Subscribe Model for Multi-GPU Memory Management"],"prefix":"10.1145","author":[{"given":"Harini","family":"Muthukrishnan","sequence":"first","affiliation":[{"name":"University of Michigan, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniel","family":"Lustig","sequence":"additional","affiliation":[{"name":"NVIDIA, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Nellans","sequence":"additional","affiliation":[{"name":"NVIDIA, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Wenisch","sequence":"additional","affiliation":[{"name":"University of Michigan, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Page Placement Strategies for GPUs Within Heterogeneous Memory Systems. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Agarwal Neha","year":"2015","unstructured":"Neha Agarwal , David Nellans , Mark Stephenson , Mike O\u2019Connor , and Stephen\u00a0 W Keckler . 2015 . Page Placement Strategies for GPUs Within Heterogeneous Memory Systems. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). Neha Agarwal, David Nellans, Mark Stephenson, Mike O\u2019Connor, and Stephen\u00a0W Keckler. 2015. Page Placement Strategies for GPUs Within Heterogeneous Memory Systems. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/301308.301326"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2009.7478337"},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2019. AMD Infinity Architecture: The Foundation of the Modern Datacenter. Product Brief. amd.com\/system\/files\/documents\/LE-70001-SB-InfinityArchitecture.pdf last accessed on 08\/17\/2020.  AMD. 2019. AMD Infinity Architecture: The Foundation of the Modern Datacenter. Product Brief. amd.com\/system\/files\/documents\/LE-70001-SB-InfinityArchitecture.pdf last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_5_1","unstructured":"AMD. 2020. AMD Crossfire\u2122 Technology. www.amd.com\/en\/technologies\/crossfire last accessed on 04\/14\/2021.  AMD. 2020. AMD Crossfire\u2122 Technology. www.amd.com\/en\/technologies\/crossfire last accessed on 04\/14\/2021."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080231"},{"key":"e_1_3_2_1_7_1","volume-title":"An Efficient Multicast Protocol for Content-Based Publish-Subscribe Systems. In International Conference on Distributed Computing Systems (ICDCS).","author":"Banavar Guruduth","year":"1999","unstructured":"Guruduth Banavar , Tushar Chandra , Bodhi Mukherjee , Jay Nagarajarao , Robert\u00a0 E Strom , and Daniel\u00a0 C Sturman . 1999 . An Efficient Multicast Protocol for Content-Based Publish-Subscribe Systems. In International Conference on Distributed Computing Systems (ICDCS). Guruduth Banavar, Tushar Chandra, Bodhi Mukherjee, Jay Nagarajarao, Robert\u00a0E Strom, and Daniel\u00a0C Sturman. 1999. An Efficient Multicast Protocol for Content-Based Publish-Subscribe Systems. In International Conference on Distributed Computing Systems (ICDCS)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48169-9_1"},{"key":"e_1_3_2_1_9_1","volume-title":"Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In International Symposium on High Performance Computer Architecture (HPCA).","author":"Baruah Trinayan","year":"2020","unstructured":"Trinayan Baruah , Yifan Sun , Ali Din\u00e7er , Md\u00a0 Saiful\u00a0Arefin Mojumder , Jos\u00e9\u00a0 L. Abell\u00e1n , Yash Ukidave , Ajay Joshi , Norman Rubin , John Kim , and David Kaeli . 2020 . Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In International Symposium on High Performance Computer Architecture (HPCA). Trinayan Baruah, Yifan Sun, Ali Din\u00e7er, Md\u00a0Saiful\u00a0Arefin Mojumder, Jos\u00e9\u00a0L. Abell\u00e1n, Yash Ukidave, Ajay Joshi, Norman Rubin, John Kim, and David Kaeli. 2020. Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_10_1","volume-title":"Software Assisted Hardware Cache Coherence for Heterogeneous Processors. In International Symposium on Memory Systems (ISMM).","author":"Basu Arkaprava","year":"2016","unstructured":"Arkaprava Basu , Sooraj Puthoor , Shuai Che , and Bradford\u00a0 M Beckmann . 2016 . Software Assisted Hardware Cache Coherence for Heterogeneous Processors. In International Symposium on Memory Systems (ISMM). Arkaprava Basu, Sooraj Puthoor, Shuai Che, and Bradford\u00a0M Beckmann. 2016. Software Assisted Hardware Cache Coherence for Heterogeneous Processors. In International Symposium on Memory Systems (ISMM)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3399730"},{"key":"e_1_3_2_1_12_1","volume-title":"CANDY: Enabling Coherent DRAM Caches for Multi-node Systems. In International Symposium on Microarchitecture (MICRO).","author":"Chou Chiachen","year":"2016","unstructured":"Chiachen Chou , Aamer Jaleel , and Moinuddin\u00a0 K Qureshi . 2016 . CANDY: Enabling Coherent DRAM Caches for Multi-node Systems. In International Symposium on Microarchitecture (MICRO). Chiachen Chou, Aamer Jaleel, and Moinuddin\u00a0K Qureshi. 2016. CANDY: Enabling Coherent DRAM Caches for Multi-node Systems. In International Symposium on Microarchitecture (MICRO)."},{"key":"e_1_3_2_1_13_1","volume-title":"Analyzing Memory Management Methods on Integrated CPU-GPU Systems. In International Symposium on Memory Management (ISMM).","author":"Dashti Mohammad","year":"2017","unstructured":"Mohammad Dashti and Alexandra Fedorova . 2017 . Analyzing Memory Management Methods on Integrated CPU-GPU Systems. In International Symposium on Memory Management (ISMM). Mohammad Dashti and Alexandra Fedorova. 2017. Analyzing Memory Management Methods on Integrated CPU-GPU Systems. In International Symposium on Memory Management (ISMM)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451157"},{"key":"e_1_3_2_1_15_1","volume-title":"Towards Expressive Publish\/Subscribe Systems. In International Conference on Extending Database Technology (EDBT).","author":"Demers Alan","year":"2006","unstructured":"Alan Demers , Johannes Gehrke , Mingsheng Hong , Mirek Riedewald , and Walker White . 2006 . Towards Expressive Publish\/Subscribe Systems. In International Conference on Extending Database Technology (EDBT). Alan Demers, Johannes Gehrke, Mingsheng Hong, Mirek Riedewald, and Walker White. 2006. Towards Expressive Publish\/Subscribe Systems. In International Conference on Extending Database Technology (EDBT)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2641764"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/857076.857078"},{"key":"e_1_3_2_1_18_1","volume-title":"Filtering Algorithms and Implementation for Very Fast Publish\/Subscribe Systems. In International Conference on Management of Data (SIGMOD).","author":"Fabret Fran\u00e7oise","year":"2001","unstructured":"Fran\u00e7oise Fabret , H\u00a0Arno Jacobsen , Fran\u00e7ois Llirbat , Jo\u0103o Pereira , Kenneth\u00a0 A Ross , and Dennis Shasha . 2001 . Filtering Algorithms and Implementation for Very Fast Publish\/Subscribe Systems. In International Conference on Management of Data (SIGMOD). Fran\u00e7oise Fabret, H\u00a0Arno Jacobsen, Fran\u00e7ois Llirbat, Jo\u0103o Pereira, Kenneth\u00a0A Ross, and Dennis Shasha. 2001. Filtering Algorithms and Implementation for Very Fast Publish\/Subscribe Systems. In International Conference on Management of Data (SIGMOD)."},{"key":"e_1_3_2_1_19_1","volume-title":"HSA Memory Model. In A Symposium on High Performance Chips (Hot Chips).","author":"Gaster R","year":"2013","unstructured":"Benedict\u00a0 R Gaster . 2013 . HSA Memory Model. In A Symposium on High Performance Chips (Hot Chips). Benedict\u00a0R Gaster. 2013. HSA Memory Model. In A Symposium on High Performance Chips (Hot Chips)."},{"key":"e_1_3_2_1_20_1","unstructured":"Tom\u2019s Hardware. 2019. AMD Big Navi and RDNA 2 GPUs. tomshardware.com\/news\/amd-big_navi-rdna2-all-we-know last accessed on 08\/17\/2020.  Tom\u2019s Hardware. 2019. AMD Big Navi and RDNA 2 GPUs. tomshardware.com\/news\/amd-big_navi-rdna2-all-we-know last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_21_1","unstructured":"Mark Harris. 2017. Unified Memory for CUDA Beginners. developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/ last accessed on 08\/17\/2020.  Mark Harris. 2017. Unified Memory for CUDA Beginners. developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/ last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835930"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/143365.143537"},{"key":"e_1_3_2_1_24_1","volume-title":"Heterogeneous-race-free Memory Models. In International conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Hower R","year":"2014","unstructured":"Derek\u00a0 R Hower , Blake\u00a0 A Hechtman , Bradford\u00a0 M Beckmann , Benedict\u00a0 R Gaster , Mark\u00a0 D Hill , Steven\u00a0 K Reinhardt , and David\u00a0 A Wood . 2014 . Heterogeneous-race-free Memory Models. In International conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). Derek\u00a0R Hower, Blake\u00a0A Hechtman, Bradford\u00a0M Beckmann, Benedict\u00a0R Gaster, Mark\u00a0D Hill, Steven\u00a0K Reinhardt, and David\u00a0A Wood. 2014. Heterogeneous-race-free Memory Models. In International conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_25_1","volume-title":"Batch-Aware Unified Memory Management in GPUs for Irregular Workloads. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Kim Hyojong","year":"2020","unstructured":"Hyojong Kim , Jaewoong Sim , Prasun Gera , Ramyad Hadidi , and Hyesoon Kim . 2020 . Batch-Aware Unified Memory Management in GPUs for Irregular Workloads. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). Hyojong Kim, Jaewoong Sim, Prasun Gera, Ramyad Hadidi, and Hyesoon Kim. 2020. Batch-Aware Unified Memory Management in GPUs for Irregular Workloads. In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_26_1","volume-title":"Spare Register Aware Prefetching for Graph Algorithms on GPUs. In International Symposium on High Performance Computer Architecture (HPCA).","author":"Lakshminarayana B","year":"2014","unstructured":"Nagesh\u00a0 B Lakshminarayana and Hyesoon Kim . 2014 . Spare Register Aware Prefetching for Graph Algorithms on GPUs. In International Symposium on High Performance Computer Architecture (HPCA). Nagesh\u00a0B Lakshminarayana and Hyesoon Kim. 2014. Spare Register Aware Prefetching for Graph Algorithms on GPUs. In International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_27_1","volume-title":"Many-Thread Aware Prefetching Mechanisms for GPGPU Applications. In International Symposium on Microarchitecture (MICRO).","author":"Lee Jaekyu","year":"2010","unstructured":"Jaekyu Lee , Nagesh\u00a0 B Lakshminarayana , Hyesoon Kim , and Richard Vuduc . 2010 . Many-Thread Aware Prefetching Mechanisms for GPGPU Applications. In International Symposium on Microarchitecture (MICRO). Jaekyu Lee, Nagesh\u00a0B Lakshminarayana, Hyesoon Kim, and Richard Vuduc. 2010. Many-Thread Aware Prefetching Mechanisms for GPGPU Applications. In International Symposium on Microarchitecture (MICRO)."},{"key":"e_1_3_2_1_28_1","volume-title":"Thread and Memory Placement on NUMA Systems: Asymmetry Matters. In USENIX Annual Technical Conference (USENIX ATC).","author":"Lepers Baptiste","year":"2015","unstructured":"Baptiste Lepers , Vivien Qu\u00e9ma , and Alexandra Fedorova . 2015 . Thread and Memory Placement on NUMA Systems: Asymmetry Matters. In USENIX Annual Technical Conference (USENIX ATC). Baptiste Lepers, Vivien Qu\u00e9ma, and Alexandra Fedorova. 2015. Thread and Memory Placement on NUMA Systems: Asymmetry Matters. In USENIX Annual Technical Conference (USENIX ATC)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2018.8573483"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522332"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304043"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124534"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00020"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2840807"},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA. 2013. CUDA Toolkit Documentation. docs.nvidia.com\/cuda\/ last accessed on 08\/17\/2020.  NVIDIA. 2013. CUDA Toolkit Documentation. docs.nvidia.com\/cuda\/ last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA. 2019. GP100 MMU Format. nvidia.github.io\/open-gpu-doc\/pascal\/gp100-mmu-format.pdf last accessed on 08\/17\/2020.  NVIDIA. 2019. GP100 MMU Format. nvidia.github.io\/open-gpu-doc\/pascal\/gp100-mmu-format.pdf last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. 2019. NVLink AND NVSwitch The Building Blocks of Advanced Multi-GPU Communication. nvidia.com\/en-us\/data-center\/nvlink\/ last accessed on 08\/17\/2020.  NVIDIA. 2019. NVLink AND NVSwitch The Building Blocks of Advanced Multi-GPU Communication. nvidia.com\/en-us\/data-center\/nvlink\/ last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA. 2020. NVIDIA Ampere Architecture. www.nvidia.com\/en-us\/data-center\/ampere-architecture\/ last accessed on 04\/14\/2021.  NVIDIA. 2020. NVIDIA Ampere Architecture. www.nvidia.com\/en-us\/data-center\/ampere-architecture\/ last accessed on 04\/14\/2021."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA. 2020. NVIDIA DGX Systems. www.nvidia.com\/en-us\/data-center\/dgx-systems\/ last accessed on 04\/14\/2021.  NVIDIA. 2020. NVIDIA DGX Systems. www.nvidia.com\/en-us\/data-center\/dgx-systems\/ last accessed on 04\/14\/2021."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA. 2020. NVIDIA NVLink High-Speed GPU Interconnect. nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/ last accessed on 08\/17\/2020.  NVIDIA. 2020. NVIDIA NVLink High-Speed GPU Interconnect. nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/ last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2020. NVIDIA TITAN V NVIDIA\u2019s Supercomputing GPU Architecture Now for Your PC. www.nvidia.com\/en-us\/titan\/titan-v\/ last accessed on 08\/17\/2020.  NVIDIA. 2020. NVIDIA TITAN V NVIDIA\u2019s Supercomputing GPU Architecture Now for Your PC. www.nvidia.com\/en-us\/titan\/titan-v\/ last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_43_1","volume-title":"PTX: Parallel Thread Execution ISA Version 7.0. docs.nvidia.com\/cuda\/pdf\/ptx_isa_7.0.pdf, last accessed on 08\/17\/2020.","author":"NVIDIA.","year":"2020","unstructured":"NVIDIA. 2020 . PTX: Parallel Thread Execution ISA Version 7.0. docs.nvidia.com\/cuda\/pdf\/ptx_isa_7.0.pdf, last accessed on 08\/17\/2020. NVIDIA. 2020. PTX: Parallel Thread Execution ISA Version 7.0. docs.nvidia.com\/cuda\/pdf\/ptx_isa_7.0.pdf, last accessed on 08\/17\/2020."},{"key":"e_1_3_2_1_44_1","volume-title":"Synchronization Using Remote-Scope Promotion. International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Orr S","year":"2015","unstructured":"Marc\u00a0 S Orr , Shuai Che , Ayse Yilmazer , Bradford\u00a0 M Beckmann , Mark\u00a0 D Hill , and David\u00a0 A Wood . 2015 . Synchronization Using Remote-Scope Promotion. International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS). Marc\u00a0S Orr, Shuai Che, Ayse Yilmazer, Bradford\u00a0M Beckmann, Mark\u00a0D Hill, and David\u00a0A Wood. 2015. Synchronization Using Remote-Scope Promotion. International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_45_1","volume-title":"Efficient Inter-node MPI Communication Using GPUDirect RDMA for InfiniBand Clusters with NVIDIA GPUs. In International Conference on Parallel Processing (ICPP).","author":"Potluri Sreeram","year":"2013","unstructured":"Sreeram Potluri , Khaled Hamidouche , Akshay Venkatesh , Devendar Bureddy , and Dhabaleswar\u00a0 K Panda . 2013 . Efficient Inter-node MPI Communication Using GPUDirect RDMA for InfiniBand Clusters with NVIDIA GPUs. In International Conference on Parallel Processing (ICPP). Sreeram Potluri, Khaled Hamidouche, Akshay Venkatesh, Devendar Bureddy, and Dhabaleswar\u00a0K Panda. 2013. Efficient Inter-node MPI Communication Using GPUDirect RDMA for InfiniBand Clusters with NVIDIA GPUs. In International Conference on Parallel Processing (ICPP)."},{"key":"e_1_3_2_1_46_1","volume-title":"Heterogeneous System Coherence for Integrated CPU-GPU Systems. In International Symposium on Microarchitecture (MICRO).","author":"Power Jason","year":"2013","unstructured":"Jason Power , Arkaprava Basu , Junli Gu , Sooraj Puthoor , Bradford\u00a0 M Beckmann , Mark\u00a0 D Hill , Steven\u00a0 K Reinhardt , and David\u00a0 A Wood . 2013 . Heterogeneous System Coherence for Integrated CPU-GPU Systems. In International Symposium on Microarchitecture (MICRO). Jason Power, Arkaprava Basu, Junli Gu, Sooraj Puthoor, Bradford\u00a0M Beckmann, Mark\u00a0D Hill, Steven\u00a0K Reinhardt, and David\u00a0A Wood. 2013. Heterogeneous System Coherence for Integrated CPU-GPU Systems. In International Symposium on Microarchitecture (MICRO)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835965"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.14778\/2824032.2824043"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.40"},{"key":"e_1_3_2_1_50_1","volume-title":"HMG: Extending Cache Coherence Protocols Across Modern Hierarchical Multi-GPU Systems. In International Symposium on High Performance Computer Architecture (HPCA).","author":"Ren Xiaowei","year":"2020","unstructured":"Xiaowei Ren , Daniel Lustig , Evgeny Bolotin , Aamer Jaleel , Oreste Villa , and David Nellans . 2020 . HMG: Extending Cache Coherence Protocols Across Modern Hierarchical Multi-GPU Systems. In International Symposium on High Performance Computer Architecture (HPCA). Xiaowei Ren, Daniel Lustig, Evgeny Bolotin, Aamer Jaleel, Oreste Villa, and David Nellans. 2020. HMG: Extending Cache Coherence Protocols Across Modern Hierarchical Multi-GPU Systems. In International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_51_1","volume-title":"Peer-to-peer & Unified Virtual Addressing. In GPU Technology Conference (GTC).","author":"Schroeder C","year":"2011","unstructured":"Tim\u00a0 C Schroeder . 2011 . Peer-to-peer & Unified Virtual Addressing. In GPU Technology Conference (GTC). Tim\u00a0C Schroeder. 2011. Peer-to-peer & Unified Virtual Addressing. In GPU Technology Conference (GTC)."},{"key":"e_1_3_2_1_52_1","volume-title":"APOGEE: Adaptive Prefetching on GPUs for Energy Efficiency. In International Conference on Parallel Architectures and Compilation Techniques (PACT).","author":"Sethia Ankit","year":"2013","unstructured":"Ankit Sethia , Ganesh Dasika , Mehrzad Samadi , and Scott Mahlke . 2013 . APOGEE: Adaptive Prefetching on GPUs for Energy Efficiency. In International Conference on Parallel Architectures and Compilation Techniques (PACT). Ankit Sethia, Ganesh Dasika, Mehrzad Samadi, and Scott Mahlke. 2013. APOGEE: Adaptive Prefetching on GPUs for Energy Efficiency. In International Conference on Parallel Architectures and Compilation Techniques (PACT)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830821"},{"key":"e_1_3_2_1_54_1","volume-title":"Cache Coherence for GPU Architectures. In International Symposium on High Performance Computer Architecture (HPCA).","author":"Singh Inderpreet","year":"2013","unstructured":"Inderpreet Singh , Arrvindh Shriraman , Wilson\u00a0 WL Fung , Mike O\u2019Connor , and Tor\u00a0 M Aamodt . 2013 . Cache Coherence for GPU Architectures. In International Symposium on High Performance Computer Architecture (HPCA). Inderpreet Singh, Arrvindh Shriraman, Wilson\u00a0WL Fung, Mike O\u2019Connor, and Tor\u00a0M Aamodt. 2013. Cache Coherence for GPU Architectures. In International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_55_1","volume-title":"Effective Multi-GPU Communication Using Multiple CUDA Streams and Threads. In International Conference on Parallel and Distributed Systems (ICPADS).","author":"Sourouri Mohammed","year":"2014","unstructured":"Mohammed Sourouri , Tor Gillberg , Scott\u00a0 B Baden , and Xing Cai . 2014 . Effective Multi-GPU Communication Using Multiple CUDA Streams and Threads. In International Conference on Parallel and Distributed Systems (ICPADS). Mohammed Sourouri, Tor Gillberg, Scott\u00a0B Baden, and Xing Cai. 2014. Effective Multi-GPU Communication Using Multiple CUDA Streams and Threads. In International Conference on Parallel and Distributed Systems (ICPADS)."},{"key":"e_1_3_2_1_56_1","volume-title":"G-TSC: Timestamp Based Coherence for GPUs. In International Symposium on High Performance Computer Architecture (HPCA).","author":"Tabbakh Abdulaziz","year":"2018","unstructured":"Abdulaziz Tabbakh , Xuehai Qian , and Murali Annavaram . 2018 . G-TSC: Timestamp Based Coherence for GPUs. In International Symposium on High Performance Computer Architecture (HPCA). Abdulaziz Tabbakh, Xuehai Qian, and Murali Annavaram. 2018. G-TSC: Timestamp Based Coherence for GPUs. In International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00077"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_2_1_59_1","unstructured":"Peng Wang. 2017. UNIFIED MEMORY ON P100. olcf.ornl.gov\/wp-content\/uploads\/2018\/02\/SummitDev_Unified-Memory.pdf last accessed on 02\/14\/2021.  Peng Wang. 2017. UNIFIED MEMORY ON P100. olcf.ornl.gov\/wp-content\/uploads\/2018\/02\/SummitDev_Unified-Memory.pdf last accessed on 02\/14\/2021."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/2851141.2851145"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/2814270.2814283"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00035"},{"key":"e_1_3_2_1_63_1","volume-title":"NUMA-aware Graph-structured Analytics. In Symposium on Principles and Practice of Parallel Programming (PPoPP).","author":"Zhang Kaiyuan","year":"2015","unstructured":"Kaiyuan Zhang , Rong Chen , and Haibo Chen . 2015 . NUMA-aware Graph-structured Analytics. In Symposium on Principles and Practice of Parallel Programming (PPoPP). Kaiyuan Zhang, Rong Chen, and Haibo Chen. 2015. NUMA-aware Graph-structured Analytics. In Symposium on Principles and Practice of Parallel Programming (PPoPP)."},{"key":"e_1_3_2_1_64_1","volume-title":"Towards High Performance Paged Memory for GPUs. In International Symposium on High Performance Computer Architecture (HPCA).","author":"Zheng Tianhao","year":"2016","unstructured":"Tianhao Zheng , David Nellans , Arslan Zulfiqar , Mark Stephenson , and Stephen\u00a0 W Keckler . 2016 . Towards High Performance Paged Memory for GPUs. In International Symposium on High Performance Computer Architecture (HPCA). Tianhao Zheng, David Nellans, Arslan Zulfiqar, Mark Stephenson, and Stephen\u00a0W Keckler. 2016. Towards High Performance Paged Memory for GPUs. In International Symposium on High Performance Computer Architecture (HPCA)."}],"event":{"name":"MICRO '21: 54th Annual IEEE\/ACM International Symposium on Microarchitecture","location":"Virtual Event Greece","acronym":"MICRO '21","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["MICRO-54: 54th Annual IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3466752.3480088","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3466752.3480088","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:56Z","timestamp":1750191536000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3466752.3480088"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":63,"alternative-id":["10.1145\/3466752.3480088","10.1145\/3466752"],"URL":"https:\/\/doi.org\/10.1145\/3466752.3480088","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}