{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T16:15:54Z","timestamp":1772727354737,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,18]]},"DOI":"10.1145\/3466752.3480083","type":"proceedings-article","created":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T19:12:05Z","timestamp":1634497925000},"page":"1154-1168","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":28,"title":["Improving Address Translation in Multi-GPUs via Sharing and Spilling aware TLB Design"],"prefix":"10.1145","author":[{"given":"Bingyao","family":"Li","sequence":"first","affiliation":[{"name":"University of Pittsburgh, United States of America"}]},{"given":"Jieming","family":"Yin","sequence":"additional","affiliation":[{"name":"Lehigh University, United States of America"}]},{"given":"Youtao","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh"}]},{"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2012.6237041"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2015.2401022"},{"key":"e_1_3_2_1_3_1","unstructured":"AMD. 2015. AMD APP SDK OpenCL Optimization Guide.  AMD. 2015. AMD APP SDK OpenCL Optimization Guide."},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2015. AMD Radeon R9 Series Gaming Graphics Cards with High- Bandwidth Memory.  AMD. 2015. AMD Radeon R9 Series Gaming Graphics Cards with High- Bandwidth Memory."},{"key":"e_1_3_2_1_5_1","unstructured":"AMD. 2016. Graphics Core Next Architecture Generation 3 Reference Guide.  AMD. 2016. Graphics Core Next Architecture Generation 3 Reference Guide."},{"key":"e_1_3_2_1_6_1","unstructured":"AMD Corp. 2016. I\/O Virtualization Technology(IOMMU) Specification. http:\/\/developer.amd.com\/wordpress\/media\/2013\/12\/48882_IOMMU.pdf  AMD Corp. 2016. I\/O Virtualization Technology(IOMMU) Specification. http:\/\/developer.amd.com\/wordpress\/media\/2013\/12\/48882_IOMMU.pdf"},{"key":"e_1_3_2_1_7_1","volume-title":"Optimizing the TLB Shootdown Algorithm with Page Access Tracking. In 2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Amit Nadav","year":"2017","unstructured":"Nadav Amit . 2017 . Optimizing the TLB Shootdown Algorithm with Page Access Tracking. In 2017 USENIX Annual Technical Conference (USENIX ATC 17) . USENIX Association, Santa Clara, CA, 27\u201339. https:\/\/www.usenix.org\/conference\/atc17\/technical-sessions\/presentation\/amit Nadav Amit. 2017. Optimizing the TLB Shootdown Algorithm with Page Access Tracking. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). USENIX Association, Santa Clara, CA, 27\u201339. https:\/\/www.usenix.org\/conference\/atc17\/technical-sessions\/presentation\/amit"},{"key":"e_1_3_2_1_8_1","volume-title":"Mosaic: A GPU Memory Manager with Application-Transparent Support for Multiple Page Sizes. In 2017 50th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 136\u2013150","author":"Ausavarungnirun R.","unstructured":"R. Ausavarungnirun , J. Landgraf , V. Miller , S. Ghose , J. Gandhi , C.\u00a0 J. Rossbach , and O. Mutlu . 2017 . Mosaic: A GPU Memory Manager with Application-Transparent Support for Multiple Page Sizes. In 2017 50th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 136\u2013150 . R. Ausavarungnirun, J. Landgraf, V. Miller, S. Ghose, J. Gandhi, C.\u00a0J. Rossbach, and O. Mutlu. 2017. Mosaic: A GPU Memory Manager with Application-Transparent Support for Multiple Page Sizes. In 2017 50th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 136\u2013150."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173169"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815970"},{"key":"e_1_3_2_1_11_1","volume-title":"2011 38th Annual International Symposium on Computer Architecture (ISCA). 307\u2013317","author":"Barr W.","unstructured":"T.\u00a0 W. Barr , A.\u00a0 L. Cox , and S. Rixner . 2011. SpecTLB: A mechanism for speculative address translation . In 2011 38th Annual International Symposium on Computer Architecture (ISCA). 307\u2013317 . T.\u00a0W. Barr, A.\u00a0L. Cox, and S. Rixner. 2011. SpecTLB: A mechanism for speculative address translation. In 2011 38th Annual International Symposium on Computer Architecture (ISCA). 307\u2013317."},{"key":"e_1_3_2_1_12_1","volume-title":"Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA). 596\u2013609","author":"Baruah T.","year":"2020","unstructured":"T. Baruah , Y. Sun , A.\u00a0 T. Din\u00e7er , S.\u00a0 A. Mojumder , J.\u00a0 L. Abell\u00e1n , Y. Ukidave , A. Joshi , N. Rubin , J. Kim , and D. Kaeli . 2020 . Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA). 596\u2013609 . https:\/\/doi.org\/10.1109\/HPCA47549. 2020 .00055 10.1109\/HPCA47549.2020.00055 T. Baruah, Y. Sun, A.\u00a0T. Din\u00e7er, S.\u00a0A. Mojumder, J.\u00a0L. Abell\u00e1n, Y. Ukidave, A. Joshi, N. Rubin, J. Kim, and D. Kaeli. 2020. Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA). 596\u2013609. https:\/\/doi.org\/10.1109\/HPCA47549.2020.00055"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414639"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485943"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3399730"},{"key":"e_1_3_2_1_16_1","volume-title":"Scalable Distributed Last-Level TLBs Using Low-Latency Interconnects. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 271\u2013284","author":"Bharadwaj S.","year":"2018","unstructured":"S. Bharadwaj , G. Cox , T. Krishna , and A. Bhattacharjee . 2018 . Scalable Distributed Last-Level TLBs Using Low-Latency Interconnects. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 271\u2013284 . https:\/\/doi.org\/10.1109\/MICRO. 2018 .00030 10.1109\/MICRO.2018.00030 S. Bharadwaj, G. Cox, T. Krishna, and A. Bhattacharjee. 2018. Scalable Distributed Last-Level TLBs Using Low-Latency Interconnects. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 271\u2013284. https:\/\/doi.org\/10.1109\/MICRO.2018.00030"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540741"},{"key":"e_1_3_2_1_18_1","unstructured":"Abhishek Bhattacharjee. 2019. Appendix L:Advanced Concepts on Address Translation. Elsevier.  Abhishek Bhattacharjee. 2019. Appendix L:Advanced Concepts on Address Translation. Elsevier."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749717"},{"key":"#cr-split#-e_1_3_2_1_20_1.1","doi-asserted-by":"crossref","unstructured":"A. Bhattacharjee D. Lustig and M. Martonosi. 2017. Architectural and Operating System Support for Virtual Memory. Morgan & Claypool Publishers. https:\/\/doi.org\/10.2200\/S00795ED1V01Y201708CAC042 10.2200\/S00795ED1V01Y201708CAC042","DOI":"10.2200\/S00795ED1V01Y201708CAC042"},{"key":"#cr-split#-e_1_3_2_1_20_1.2","doi-asserted-by":"crossref","unstructured":"A. Bhattacharjee D. Lustig and M. Martonosi. 2017. Architectural and Operating System Support for Virtual Memory. Morgan & Claypool Publishers. https:\/\/doi.org\/10.2200\/S00795ED1V01Y201708CAC042","DOI":"10.1007\/978-3-031-01757-5"},{"key":"e_1_3_2_1_21_1","volume-title":"Characterizing the TLB Behavior of Emerging Parallel Workloads on Chip Multiprocessors. In 2009 18th International Conference on Parallel Architectures and Compilation Techniques. 29\u201340","author":"Bhattacharjee A.","year":"2009","unstructured":"A. Bhattacharjee and M. Martonosi . 2009 . Characterizing the TLB Behavior of Emerging Parallel Workloads on Chip Multiprocessors. In 2009 18th International Conference on Parallel Architectures and Compilation Techniques. 29\u201340 . https:\/\/doi.org\/10.1109\/PACT. 2009 .26 10.1109\/PACT.2009.26 A. Bhattacharjee and M. Martonosi. 2009. Characterizing the TLB Behavior of Emerging Parallel Workloads on Chip Multiprocessors. In 2009 18th International Conference on Parallel Architectures and Compilation Techniques. 29\u201340. https:\/\/doi.org\/10.1109\/PACT.2009.26"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/1735971.1736060"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the fifteenth annual ACM-SIAM symposium on Discrete algorithms. Citeseer, 30\u201339","author":"Chazelle Bernard","year":"2004","unstructured":"Bernard Chazelle , Joe Kilian , Ronitt Rubinfeld , and Ayellet Tal . 2004 . The bloomier filter: an efficient data structure for static support lookup tables . In Proceedings of the fifteenth annual ACM-SIAM symposium on Discrete algorithms. Citeseer, 30\u201339 . Bernard Chazelle, Joe Kilian, Ronitt Rubinfeld, and Ayellet Tal. 2004. The bloomier filter: an efficient data structure for static support lookup tables. In Proceedings of the fifteenth annual ACM-SIAM symposium on Discrete algorithms. Citeseer, 30\u201339."},{"key":"e_1_3_2_1_24_1","volume-title":"Buddy Compression: Enabling Larger Memory for Deep Learning and HPC Workloads on GPUs. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 926\u2013939","author":"Choukse E.","year":"2020","unstructured":"E. Choukse , M.\u00a0 B. Sullivan , M. O\u2019Connor , M. Erez , J. Pool , D. Nellans , and S.\u00a0 W. Keckler . 2020 . Buddy Compression: Enabling Larger Memory for Deep Learning and HPC Workloads on GPUs. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 926\u2013939 . https:\/\/doi.org\/10.1109\/ISCA45697.2020.00080 10.1109\/ISCA45697.2020.00080 E. Choukse, M.\u00a0B. Sullivan, M. O\u2019Connor, M. Erez, J. Pool, D. Nellans, and S.\u00a0W. Keckler. 2020. Buddy Compression: Enabling Larger Memory for Deep Learning and HPC Workloads on GPUs. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 926\u2013939. https:\/\/doi.org\/10.1109\/ISCA45697.2020.00080"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037704"},{"key":"e_1_3_2_1_26_1","volume-title":"GPGPU-3: Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units(GPGPU-3)","author":"Danalis Anthony","unstructured":"Anthony Danalis , Gabriel Marin , Collin McCurdy , Jeremy\u00a0 S. Meredith , Philip\u00a0 C. Roth , Kyle Spafford , Vinod Tipparaju , and Jeffrey\u00a0 S. Vetter . 2010. The Scalable Heterogeneous Computing (SHOC) Benchmark Suite . In GPGPU-3: Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units(GPGPU-3) . Association for Computing Machinery , New York, NY, USA , 63\u201374. https:\/\/doi.org\/10.1145\/1735688.1735702 10.1145\/1735688.1735702 Anthony Danalis, Gabriel Marin, Collin McCurdy, Jeremy\u00a0S. Meredith, Philip\u00a0C. Roth, Kyle Spafford, Vinod Tipparaju, and Jeffrey\u00a0S. Vetter. 2010. The Scalable Heterogeneous Computing (SHOC) Benchmark Suite. In GPGPU-3: Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units(GPGPU-3). Association for Computing Machinery, New York, NY, USA, 63\u201374. https:\/\/doi.org\/10.1145\/1735688.1735702"},{"key":"e_1_3_2_1_27_1","volume-title":"2015 ACM\/IEEE 42nd Annual International Symposium on Computer Architecture (ISCA). 92\u2013104","author":"Du Z.","unstructured":"Z. Du , R. Fasthuber , T. Chen , P. Ienne , L. Li , T. Luo , X. Feng , Y. Chen , and O. Temam . 2015. ShiDianNao: Shifting vision processing closer to the sensor . In 2015 ACM\/IEEE 42nd Annual International Symposium on Computer Architecture (ISCA). 92\u2013104 . https:\/\/doi.org\/10.1145\/2749460779.2750389 10.1145\/2749460779.2750389 Z. Du, R. Fasthuber, T. Chen, P. Ienne, L. Li, T. Luo, X. Feng, Y. Chen, and O. Temam. 2015. ShiDianNao: Shifting vision processing closer to the sensor. In 2015 ACM\/IEEE 42nd Annual International Symposium on Computer Architecture (ISCA). 92\u2013104. https:\/\/doi.org\/10.1145\/2749460779.2750389"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2674005.2674994"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173194"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2591635.2667189"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378494"},{"key":"e_1_3_2_1_32_1","unstructured":"Intel. 2018. The Future of Core Intel GPUs 10nm and Hybrid x86. https:\/\/www.anandtech.com\/show\/13699\/intel-architecture-day-2018-core-future-hybrid-x86\/5  Intel. 2018. The Future of Core Intel GPUs 10nm and Hybrid x86. https:\/\/www.anandtech.com\/show\/13699\/intel-architecture-day-2018-core-future-hybrid-x86\/5"},{"key":"e_1_3_2_1_33_1","volume-title":"Inclusive Caches: Temporal Locality Aware (TLA) Cache Management Policies. In 2010 43rd Annual IEEE\/ACM International Symposium on Microarchitecture. 151\u2013162. https:\/\/doi.org\/10.1109\/MICRO.2010.52","author":"Jaleel A.","year":"2010","unstructured":"A. Jaleel , E. Borch , M. Bhandaru , S.\u00a0 C. Steely Jr ., and J. Emer . 2010 . Achieving Non-Inclusive Cache Performance with Inclusive Caches: Temporal Locality Aware (TLA) Cache Management Policies. In 2010 43rd Annual IEEE\/ACM International Symposium on Microarchitecture. 151\u2013162. https:\/\/doi.org\/10.1109\/MICRO.2010.52 10.1109\/MICRO.2010.52 A. Jaleel, E. Borch, M. Bhandaru, S.\u00a0C. Steely Jr., and J. Emer. 2010. Achieving Non-Inclusive Cache Performance with Inclusive Caches: Temporal Locality Aware (TLA) Cache Management Policies. In 2010 43rd Annual IEEE\/ACM International Symposium on Microarchitecture. 151\u2013162. https:\/\/doi.org\/10.1109\/MICRO.2010.52"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3309710"},{"key":"e_1_3_2_1_35_1","volume-title":"2008 International Conference on Parallel Architectures and Compilation Techniques (PACT). 208\u2013219","author":"Jaleel A.","unstructured":"A. Jaleel , W. Hasenplaugh , M. Qureshi , J. Sebot , S. Steely , and J. Emer . 2008. Adaptive insertion policies for managing shared caches . In 2008 International Conference on Parallel Architectures and Compilation Techniques (PACT). 208\u2013219 . A. Jaleel, W. Hasenplaugh, M. Qureshi, J. Sebot, S. Steely, and J. Emer. 2008. Adaptive insertion policies for managing shared caches. In 2008 International Conference on Parallel Architectures and Compilation Techniques (PACT). 208\u2013219."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056045"},{"key":"e_1_3_2_1_37_1","volume-title":"High Bandwidth Memory (HBM) DRAM 2. Jesd235","author":"JEDEC.","year":"2020","unstructured":"JEDEC. 2020. High Bandwidth Memory (HBM) DRAM 2. Jesd235 ( 2020 ). https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a JEDEC. 2020. High Bandwidth Memory (HBM) DRAM 2. Jesd235 (2020). https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_1_39_1","unstructured":"Jog Adwait. 2015. Design and Analysis of Scheduling Techniques for Throughput Processors. https:\/\/etda.libraries.psu.edu\/catalog\/26480  Jog Adwait. 2015. Design and Analysis of Scheduling Techniques for Throughput Processors. https:\/\/etda.libraries.psu.edu\/catalog\/26480"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings 29th Annual International Symposium on Computer Architecture. 195\u2013206","author":"Kandiraju B.","year":"2002","unstructured":"G.\u00a0 B. Kandiraju and A. Sivasubramaniam . 2002. Going the distance for TLB prefetching: an application-driven study . In Proceedings 29th Annual International Symposium on Computer Architecture. 195\u2013206 . https:\/\/doi.org\/10.1109\/ISCA. 2002 .1003578 10.1109\/ISCA.2002.1003578 G.\u00a0B. Kandiraju and A. Sivasubramaniam. 2002. Going the distance for TLB prefetching: an application-driven study. In Proceedings 29th Annual International Symposium on Computer Architecture. 195\u2013206. https:\/\/doi.org\/10.1109\/ISCA.2002.1003578"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749471"},{"key":"e_1_3_2_1_42_1","volume-title":"Managing GPU Concurrency in Heterogeneous Architectures. In 2014 47th Annual IEEE\/ACM International Symposium on Microarchitecture. 114\u2013126","author":"Kayiran O.","year":"2014","unstructured":"O. Kayiran , N.\u00a0 C. Nachiappan , A. Jog , R. Ausavarungnirun , M.\u00a0 T. Kandemir , G.\u00a0 H. Loh , O. Mutlu , and C.\u00a0 R. Das . 2014 . Managing GPU Concurrency in Heterogeneous Architectures. In 2014 47th Annual IEEE\/ACM International Symposium on Microarchitecture. 114\u2013126 . https:\/\/doi.org\/10.1109\/MICRO.2014.62 10.1109\/MICRO.2014.62 O. Kayiran, N.\u00a0C. Nachiappan, A. Jog, R. Ausavarungnirun, M.\u00a0T. Kandemir, G.\u00a0H. Loh, O. Mutlu, and C.\u00a0R. Das. 2014. Managing GPU Concurrency in Heterogeneous Architectures. In 2014 47th Annual IEEE\/ACM International Symposium on Microarchitecture. 114\u2013126. https:\/\/doi.org\/10.1109\/MICRO.2014.62"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173198"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026931"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2445572.2445574"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358294"},{"key":"e_1_3_2_1_47_1","volume-title":"Article 69 (July","author":"Mittal Sparsh","year":"2015","unstructured":"Sparsh Mittal and Jeffrey\u00a0 S. Vetter . 2015. A Survey of CPU-GPU Heterogeneous Computing Techniques. ACM Comput. Surv. 47, 4 , Article 69 (July 2015 ), 35\u00a0pages. https:\/\/doi.org\/10.1145\/2788396 10.1145\/2788396 Sparsh Mittal and Jeffrey\u00a0S. Vetter. 2015. A Survey of CPU-GPU Heterogeneous Computing Techniques. ACM Comput. Surv. 47, 4, Article 69 (July 2015), 35\u00a0pages. https:\/\/doi.org\/10.1145\/2788396"},{"key":"e_1_3_2_1_48_1","unstructured":"NVIDIA. 2018. DB2 Launch Datasheet Deep Learning Letter WEB. https:\/\/www.scribd.com\/document\/336084072\/61681-DB2-Launch-Datasheet-Deep-Learning-Letter-WEB-NVidia-Deep-Learning-Box#  NVIDIA. 2018. DB2 Launch Datasheet Deep Learning Letter WEB. https:\/\/www.scribd.com\/document\/336084072\/61681-DB2-Launch-Datasheet-Deep-Learning-Letter-WEB-NVidia-Deep-Learning-Box#"},{"key":"e_1_3_2_1_49_1","unstructured":"NVIDIA. 2019. Memory Management on Modern GPU Architectures. https:\/\/developer.download.nvidia.com\/video\/gputechconf\/gtc\/2019\/presentation\/s9727-memory-management-on-modern-gpu-architectures.pdf  NVIDIA. 2019. Memory Management on Modern GPU Architectures. https:\/\/developer.download.nvidia.com\/video\/gputechconf\/gtc\/2019\/presentation\/s9727-memory-management-on-modern-gpu-architectures.pdf"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830819"},{"key":"e_1_3_2_1_51_1","volume-title":"2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 193\u2013206","author":"Parasar M.","unstructured":"M. Parasar , A. Bhattacharjee , and T. Krishna . 2018. SEESAW: Using Superpages to Improve VIPT Caches . In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 193\u2013206 . M. Parasar, A. Bhattacharjee, and T. Krishna. 2018. SEESAW: Using Superpages to Improve VIPT Caches. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 193\u2013206."},{"key":"e_1_3_2_1_52_1","volume-title":"2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA). 444\u2013456","author":"Park H.","unstructured":"C.\u00a0 H. Park , T. Heo , J. Jeong , and J. Huh . 2017. Hybrid TLB coalescing: Improving TLB translation coverage under diverse fragmented memory allocations . In 2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA). 444\u2013456 . https:\/\/doi.org\/10.1145\/3079856.3080217 10.1145\/3079856.3080217 C.\u00a0H. Park, T. Heo, J. Jeong, and J. Huh. 2017. Hybrid TLB coalescing: Improving TLB translation coverage under diverse fragmented memory allocations. In 2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA). 444\u2013456. https:\/\/doi.org\/10.1145\/3079856.3080217"},{"key":"e_1_3_2_1_53_1","volume-title":"An introduction to the infiniband architecture. High performance mass storage and parallel I\/O 42, 617-632","author":"Pfister F","year":"2001","unstructured":"Gregory\u00a0 F Pfister . 2001. An introduction to the infiniband architecture. High performance mass storage and parallel I\/O 42, 617-632 ( 2001 ), 102. Gregory\u00a0F Pfister. 2001. An introduction to the infiniband architecture. High performance mass storage and parallel I\/O 42, 617-632 (2001), 102."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835964"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.32"},{"key":"e_1_3_2_1_56_1","volume-title":"2015 48th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 1\u201312","author":"Pham B.","unstructured":"B. Pham , J. Vesel\u00fd , G.\u00a0 H. Loh , and A. Bhattacharjee . 2015. Large pages and lightweight memory management in virtualized environments: Can you have it both ways? . In 2015 48th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 1\u201312 . https:\/\/doi.org\/10.1145\/2830772.2830773 10.1145\/2830772.2830773 B. Pham, J. Vesel\u00fd, G.\u00a0H. Loh, and A. Bhattacharjee. 2015. Large pages and lightweight memory management in virtualized environments: Can you have it both ways?. In 2015 48th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 1\u201312. https:\/\/doi.org\/10.1145\/2830772.2830773"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541942"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835965"},{"key":"e_1_3_2_1_59_1","volume-title":"Improving GPU Multi-tenancy with Page Walk Stealing. In 2021 IEEE 27th International Symposium on High Performance Computer Architecture (HPCA).","author":"Pratheek B","year":"2021","unstructured":"B Pratheek , Neha Jawalkar , and Arkaprava Basu . 2021 . Improving GPU Multi-tenancy with Page Walk Stealing. In 2021 IEEE 27th International Symposium on High Performance Computer Architecture (HPCA). B Pratheek, Neha Jawalkar, and Arkaprava Basu. 2021. Improving GPU Multi-tenancy with Page Walk Stealing. In 2021 IEEE 27th International Symposium on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_60_1","volume-title":"Architecture-Centric Bottleneck Analysis for Deep Neural Network Applications. In 2019 IEEE 26th International Conference on High Performance Computing, Data, and Analytics (HiPC). IEEE, 205\u2013214","author":"Ryoo Jihyun","year":"2019","unstructured":"Jihyun Ryoo , Mengran Fan , Xulong Tang , Huaipan Jiang , Meena Arunachalam , Sharada Naveen , and Mahmut\u00a0 T Kandemir . 2019 . Architecture-Centric Bottleneck Analysis for Deep Neural Network Applications. In 2019 IEEE 26th International Conference on High Performance Computing, Data, and Analytics (HiPC). IEEE, 205\u2013214 . Jihyun Ryoo, Mengran Fan, Xulong Tang, Huaipan Jiang, Meena Arunachalam, Sharada Naveen, and Mahmut\u00a0T Kandemir. 2019. Architecture-Centric Bottleneck Analysis for Deep Neural Network Applications. In 2019 IEEE 26th International Conference on High Performance Computing, Data, and Analytics (HiPC). IEEE, 205\u2013214."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322254"},{"key":"e_1_3_2_1_62_1","volume-title":"ActivePointers: A Case for Software Address Translation on GPUs. In 2016 ACM\/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA). 596\u2013608","author":"Shahar S.","year":"2016","unstructured":"S. Shahar , S. Bergman , and M. Silberstein . 2016 . ActivePointers: A Case for Software Address Translation on GPUs. In 2016 ACM\/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA). 596\u2013608 . https:\/\/doi.org\/10.1109\/ISCA. 2016 .58 10.1109\/ISCA.2016.58 S. Shahar, S. Bergman, and M. Silberstein. 2016. ActivePointers: A Case for Software Address Translation on GPUs. In 2016 ACM\/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA). 596\u2013608. https:\/\/doi.org\/10.1109\/ISCA.2016.58"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3128571"},{"key":"e_1_3_2_1_64_1","volume-title":"Scheduling Page Table Walks for Irregular GPU Applications. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 180\u2013192","author":"Shin S.","year":"2018","unstructured":"S. Shin , G. Cox , M. Oskin , G.\u00a0 H. Loh , Y. Solihin , A. Bhattacharjee , and A. Basu . 2018 . Scheduling Page Table Walks for Irregular GPU Applications. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 180\u2013192 . https:\/\/doi.org\/10.1109\/ISCA. 2018 .00025 10.1109\/ISCA.2018.00025 S. Shin, G. Cox, M. Oskin, G.\u00a0H. Loh, Y. Solihin, A. Bhattacharjee, and A. Basu. 2018. Scheduling Page Table Walks for Irregular GPU Applications. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 180\u2013192. https:\/\/doi.org\/10.1109\/ISCA.2018.00025"},{"key":"e_1_3_2_1_65_1","volume-title":"Neighborhood-Aware Address Translation for Irregular GPU Applications. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 352\u2013363","author":"Shin S.","year":"2018","unstructured":"S. Shin , M. LeBeane , Y. Solihin , and A. Basu . 2018 . Neighborhood-Aware Address Translation for Irregular GPU Applications. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 352\u2013363 . https:\/\/doi.org\/10.1109\/MICRO. 2018 .00036 10.1109\/MICRO.2018.00036 S. Shin, M. LeBeane, Y. Solihin, and A. Basu. 2018. Neighborhood-Aware Address Translation for Irregular GPU Applications. In 2018 51st Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 352\u2013363. https:\/\/doi.org\/10.1109\/MICRO.2018.00036"},{"key":"e_1_3_2_1_66_1","volume-title":"Synergistic TLBs for High Performance Address Translation in Chip Multiprocessors. In 2010 43rd Annual IEEE\/ACM International Symposium on Microarchitecture. 313\u2013324","author":"Srikantaiah S.","year":"2010","unstructured":"S. Srikantaiah and M. Kandemir . 2010 . Synergistic TLBs for High Performance Address Translation in Chip Multiprocessors. In 2010 43rd Annual IEEE\/ACM International Symposium on Microarchitecture. 313\u2013324 . https:\/\/doi.org\/10.1109\/MICRO. 2010 .26 10.1109\/MICRO.2010.26 S. Srikantaiah and M. Kandemir. 2010. Synergistic TLBs for High Performance Address Translation in Chip Multiprocessors. In 2010 43rd Annual IEEE\/ACM International Symposium on Microarchitecture. 313\u2013324. https:\/\/doi.org\/10.1109\/MICRO.2010.26"},{"key":"e_1_3_2_1_67_1","volume-title":"High Bandwidth Memory (HBM) DRAM. Jesd235","author":"Standard JEDEC","year":"2013","unstructured":"JEDEC Standard . 2013. High Bandwidth Memory (HBM) DRAM. Jesd235 ( 2013 ). JEDEC Standard. 2013. High Bandwidth Memory (HBM) DRAM. Jesd235 (2013)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322230"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2016.7581262"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195708"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3309697.3331487"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123954"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.14"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3314221.3314599"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414633"},{"key":"e_1_3_2_1_76_1","unstructured":"Scott Thornton. 2021. Low cost low latency PCIe ideal for sharing resources. Website. https:\/\/www.microcontrollertips.com\/pcie-sharing-resources-faq\/.  Scott Thornton. 2021. Low cost low latency PCIe ideal for sharing resources. Website. https:\/\/www.microcontrollertips.com\/pcie-sharing-resources-faq\/."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.16"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2016.7482091"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Zeke Wang Hongjing Huang Jie Zhang and Gustavo Alonso. 2020. Benchmarking High Bandwidth Memory on FPGAs. arXiv preprint arXiv:2005.04324(2020).  Zeke Wang Hongjing Huang Jie Zhang and Gustavo Alonso. 2020. Benchmarking High Bandwidth Memory on FPGAs. arXiv preprint arXiv:2005.04324(2020).","DOI":"10.1109\/FCCM48280.2020.00024"},{"key":"e_1_3_2_1_81_1","unstructured":"Jinhui Wei Jianzhuang Lu Qi Yu Chen Li and Yunping Zhao. EasyChair 2020. Dynamic GMMU Bypass for Address Translation in Multi-GPU Systems. EasyChair Preprint no. 4179.  Jinhui Wei Jianzhuang Lu Qi Yu Chen Li and Yunping Zhao. EasyChair 2020. Dynamic GMMU Bypass for Address Translation in Multi-GPU Systems. EasyChair Preprint no. 4179."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322247"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555778"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322223"},{"key":"e_1_3_2_1_85_1","volume-title":"2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA). 430\u2013443","author":"Yan Z.","unstructured":"Z. Yan , J. Vesel\u00fd , G. Cox , and A. Bhattacharjee . 2017. Hardware translation coherence for virtualized systems . In 2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA). 430\u2013443 . https:\/\/doi.org\/10.1145\/3079856.3080211 10.1145\/3079856.3080211 Z. Yan, J. Vesel\u00fd, G. Cox, and A. Bhattacharjee. 2017. Hardware translation coherence for virtualized systems. In 2017 ACM\/IEEE 44th Annual International Symposium on Computer Architecture (ISCA). 430\u2013443. https:\/\/doi.org\/10.1145\/3079856.3080211"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3296957.3173195"}],"event":{"name":"MICRO '21: 54th Annual IEEE\/ACM International Symposium on Microarchitecture","location":"Virtual Event Greece","acronym":"MICRO '21","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["MICRO-54: 54th Annual IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3466752.3480083","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3466752.3480083","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:56Z","timestamp":1750191536000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3466752.3480083"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":87,"alternative-id":["10.1145\/3466752.3480083","10.1145\/3466752"],"URL":"https:\/\/doi.org\/10.1145\/3466752.3480083","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}