{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:33:44Z","timestamp":1772724824143,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,17]],"date-time":"2023-06-17T00:00:00Z","timestamp":1686960000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"CSC scholarship","award":["Grant No. 201903170128"],"award-info":[{"award-number":["Grant No. 201903170128"]}]},{"name":"Research Council of Norway","award":["Grant No. 286596"],"award-info":[{"award-number":["Grant No. 286596"]}]},{"name":"UGent-BOF-GOA","award":["Grant No. 01G01421"],"award-info":[{"award-number":["Grant No. 01G01421"]}]},{"name":"European Research Council (ERC)","award":["Advanced Grant No. 741097"],"award-info":[{"award-number":["Advanced Grant No. 741097"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,17]]},"DOI":"10.1145\/3579371.3589078","type":"proceedings-article","created":{"date-parts":[[2023,6,16]],"date-time":"2023-06-16T20:25:28Z","timestamp":1686947128000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["SAC: Sharing-Aware Caching in Multi-Chip GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6690-3718","authenticated-orcid":false,"given":"Shiqing","family":"Zhang","sequence":"first","affiliation":[{"name":"Ghent University, Ghent, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7762-2878","authenticated-orcid":false,"given":"Mahmood","family":"Naderan-Tahan","sequence":"additional","affiliation":[{"name":"Ghent University, Ghent, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9147-5228","authenticated-orcid":false,"given":"Magnus","family":"Jahre","sequence":"additional","affiliation":[{"name":"Norwegian University of Science and Technology (NTNU), Trondheim, Norway"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8792-4473","authenticated-orcid":false,"given":"Lieven","family":"Eeckhout","sequence":"additional","affiliation":[{"name":"Ghent University, Ghent, Belgium"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,6,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 1--12","author":"Alwani Manoj","year":"2016","unstructured":"Manoj Alwani , Han Chen , Michael Ferdman , and Peter Milder . 2016 . Fused-Layer CNN Accelerators . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 1--12 . Manoj Alwani, Han Chen, Michael Ferdman, and Peter Milder. 2016. Fused-Layer CNN Accelerators. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 1--12."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 320--332","author":"Arunkumar Akhil","year":"2017","unstructured":"Akhil Arunkumar , Evgeny Bolotin , Benjamin Cho , Ugljesa Milic , Eiman Ebrahimi , Oreste Villa , Aamer Jaleel , Carole-Jean Wu , and David Nellans . 2017 . MCM-GPU: Multi-Chip-Module GPUs for Continued Performance Scalability . In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 320--332 . Akhil Arunkumar, Evgeny Bolotin, Benjamin Cho, Ugljesa Milic, Eiman Ebrahimi, Oreste Villa, Aamer Jaleel, Carole-Jean Wu, and David Nellans. 2017. MCM-GPU: Multi-Chip-Module GPUs for Continued Performance Scalability. In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 320--332."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 596--609","author":"Baruah Trinayan","year":"2020","unstructured":"Trinayan Baruah , Yifan Sun , Ali Tolga Din\u00e7er , Saiful A Mojumder , Jos\u00e9 L Abell\u00e1n , Yash Ukidave , Ajay Joshi , Norman Rubin , John Kim , and David Kaeli . 2020 . Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems . In Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 596--609 . Trinayan Baruah, Yifan Sun, Ali Tolga Din\u00e7er, Saiful A Mojumder, Jos\u00e9 L Abell\u00e1n, Yash Ukidave, Ajay Joshi, Norman Rubin, John Kim, and David Kaeli. 2020. Griffin: Hardware-Software Support for Efficient Page Migration in Multi-GPU Systems. In Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 596--609."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 443--454","author":"Beckmann Bradford M","year":"2006","unstructured":"Bradford M Beckmann , Michael R Marty , and David A Wood . 2006 . ASR: Adaptive Selective Replication for CMP Caches . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 443--454 . Bradford M Beckmann, Michael R Marty, and David A Wood. 2006. ASR: Adaptive Selective Replication for CMP Caches. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 443--454."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the International Symposium on Workload Characterization (IISWC). IEEE, 44--54","author":"Che Shuai","year":"2009","unstructured":"Shuai Che , Michael Boyer , Jiayuan Meng , David Tarjan , Jeremy W Sheaffer , Sang-Ha Lee , and Kevin Skadron . 2009 . Rodinia: A Benchmark Suite for Heterogeneous Computing . In Proceedings of the International Symposium on Workload Characterization (IISWC). IEEE, 44--54 . Shuai Che, Michael Boyer, Jiayuan Meng, David Tarjan, Jeremy W Sheaffer, Sang-Ha Lee, and Kevin Skadron. 2009. Rodinia: A Benchmark Suite for Heterogeneous Computing. In Proceedings of the International Symposium on Workload Characterization (IISWC). IEEE, 44--54."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2654822.2541967"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the International Symposium on VLSI Technology and Circuits (VLSI). IEEE, 3--6.","author":"Dally William J","year":"2018","unstructured":"William J Dally , C Thomas Gray , John Poulton , Brucek Khailany , John Wilson , and Larry Dennison . 2018 . Hardware-Enabled Artificial Intelligence . In Proceedings of the International Symposium on VLSI Technology and Circuits (VLSI). IEEE, 3--6. William J Dally, C Thomas Gray, John Poulton, Brucek Khailany, John Wilson, and Larry Dennison. 2018. Hardware-Enabled Artificial Intelligence. In Proceedings of the International Symposium on VLSI Technology and Circuits (VLSI). IEEE, 3--6."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499368.2451157"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 229--240","author":"Falsafi Babak","year":"1997","unstructured":"Babak Falsafi and David A Wood . 1997 . Reactive NUMA: A Design for Unifying S-COMA and CC-NUMA . In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 229--240 . Babak Falsafi and David A Wood. 1997. Reactive NUMA: A Design for Unifying S-COMA and CC-NUMA. In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 229--240."},{"key":"e_1_3_2_1_12_1","volume-title":"https:\/\/web.cse.ohiostate.edu\/~pouchet.2\/software\/polybench\/. [Online","author":"Scott Grauer-Gray John Cavazos","year":"2022","unstructured":"John Cavazos Scott Grauer-Gray . 2015. PolyBench\/ GPU 1.0. https:\/\/web.cse.ohiostate.edu\/~pouchet.2\/software\/polybench\/. [Online ; accessed 2022 -04-16]. John Cavazos Scott Grauer-Gray. 2015. PolyBench\/GPU 1.0. https:\/\/web.cse.ohiostate.edu\/~pouchet.2\/software\/polybench\/. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_13_1","volume-title":"Computer Architecture: A Quantitative Approach","author":"Hennessy John L","year":"2017","unstructured":"John L Hennessy and David A Patterson . 2017 . Computer Architecture: A Quantitative Approach . Morgan Kaufmann Publishers . John L Hennessy and David A Patterson. 2017. Computer Architecture: A Quantitative Approach. Morgan Kaufmann Publishers."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT). 161--173","author":"Ibrahim Mohamed Assem","year":"2020","unstructured":"Mohamed Assem Ibrahim , Onur Kayiran , Yasuko Eckert , Gabriel H Loh , and Adwait Jog . 2020 . Analyzing and Leveraging Shared L1 Caches in GPUs . In Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT). 161--173 . Mohamed Assem Ibrahim, Onur Kayiran, Yasuko Eckert, Gabriel H Loh, and Adwait Jog. 2020. Analyzing and Leveraging Shared L1 Caches in GPUs. In Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT). 161--173."},{"key":"e_1_3_2_1_15_1","volume-title":"High Bandwidth Memory (HBM) DRAM. https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a. [Online","author":"JEDEC.","year":"2022","unstructured":"JEDEC. 2021. High Bandwidth Memory (HBM) DRAM. https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a. [Online ; accessed 2022 -~04-16]. JEDEC. 2021. High Bandwidth Memory (HBM) DRAM. https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a. [Online; accessed 2022-~04-16]."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the Workshop on General Purpose Processing Using GPUs (GPGPU). 12--21","author":"Karki Aajna","year":"2019","unstructured":"Aajna Karki , Chethan Palangotu Keshava , Spoorthi Mysore Shivakumar , Joshua Skow , Goutam Madhukeshwar Hegde , and Hyeran Jeon . 2019 . Detailed Characterization of Deep Neural Networks on GPUs and FPGAs . In Proceedings of the Workshop on General Purpose Processing Using GPUs (GPGPU). 12--21 . Aajna Karki, Chethan Palangotu Keshava, Spoorthi Mysore Shivakumar, Joshua Skow, Goutam Madhukeshwar Hegde, and Hyeran Jeon. 2019. Detailed Characterization of Deep Neural Networks on GPUs and FPGAs. In Proceedings of the Workshop on General Purpose Processing Using GPUs (GPGPU). 12--21."},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 1022--1036","author":"Khairy Mahmoud","year":"2020","unstructured":"Mahmoud Khairy , Vadim Nikiforov , David Nellans , and Timothy G Rogers . 2020 . Locality-Centric Data and Threadblock Management for Massive GPUs . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 1022--1036 . Mahmoud Khairy, Vadim Nikiforov, David Nellans, and Timothy G Rogers. 2020. Locality-Centric Data and Threadblock Management for Massive GPUs. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 1022--1036."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 473--486","author":"Khairy Mahmoud","year":"2020","unstructured":"Mahmoud Khairy , Zhesheng Shen , Tor M Aamodt , and Timothy G Rogers . 2020 . Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling . In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 473--486 . Mahmoud Khairy, Zhesheng Shen, Tor M Aamodt, and Timothy G Rogers. 2020. Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 473--486."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 1--12","author":"Kurian George","year":"2014","unstructured":"George Kurian , Srinivas Devadas , and Omer Khan . 2014 . Locality-Aware Data Replication in the Last-Level Cache . In Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 1--12 . George Kurian, Srinivas Devadas, and Omer Khan. 2014. Locality-Aware Data Replication in the Last-Level Cache. In Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 1--12."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.31"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1147\/sj.71.0015"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 166--179","author":"Liu Yuxi","year":"2018","unstructured":"Yuxi Liu , Xia Zhao , Magnus Jahre , Zhenlin Wang , Xiaolin Wang , Yingwei Luo , and Lieven Eeckhout . 2018 . Get Out of the Valley: Power-Efficient Address Mapping for GPUs . In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 166--179 . Yuxi Liu, Xia Zhao, Magnus Jahre, Zhenlin Wang, Xiaolin Wang, Yingwei Luo, and Lieven Eeckhout. 2018. Get Out of the Valley: Power-Efficient Address Mapping for GPUs. In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 166--179."},{"key":"e_1_3_2_1_23_1","volume-title":"High Performance","year":"2022","unstructured":"Micron. 2014. High Performance , High Bandwidth : GDDR5 for Networking . https:\/\/media-www.micron.com\/-\/media\/client\/global\/documents\/products\/product-flyer\/flyer_gddr5_networking.pdf. [Online; accessed 2022 -04-16]. Micron. 2014. High Performance, High Bandwidth: GDDR5 for Networking. https:\/\/media-www.micron.com\/-\/media\/client\/global\/documents\/products\/product-flyer\/flyer_gddr5_networking.pdf. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_24_1","volume-title":"Technical Note: GDDR6 Design Guide. https:\/\/media-www.micron.com\/-\/media\/client\/global\/documents\/products\/technical-note\/dram\/tn-ed-04_gddr6_design_guide.pdf. [Online","year":"2021","unstructured":"Micron. 2021 . Technical Note: GDDR6 Design Guide. https:\/\/media-www.micron.com\/-\/media\/client\/global\/documents\/products\/technical-note\/dram\/tn-ed-04_gddr6_design_guide.pdf. [Online ; accessed 2022-04-16]. Micron. 2021. Technical Note: GDDR6 Design Guide. https:\/\/media-www.micron.com\/-\/media\/client\/global\/documents\/products\/technical-note\/dram\/tn-ed-04_gddr6_design_guide.pdf. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 123--135","author":"Milic Ugljesa","year":"2017","unstructured":"Ugljesa Milic , Oreste Villa , Evgeny Bolotin , Akhil Arunkumar , Eiman Ebrahimi , Aamer Jaleel , Alex Ramirez , and David Nellans . 2017 . Beyond the Socket: NUMA-Aware GPUs . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 123--135 . Ugljesa Milic, Oreste Villa, Evgeny Bolotin, Akhil Arunkumar, Eiman Ebrahimi, Aamer Jaleel, Alex Ramirez, and David Nellans. 2017. Beyond the Socket: NUMA-Aware GPUs. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 123--135."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 3--14","author":"Muralimanohar Naveen","year":"2007","unstructured":"Naveen Muralimanohar , Rajeev Balasubramonian , and Norm Jouppi . 2007 . Optimizing NUCA Organizations and Wiring Alternatives for Large Caches with CACTI 6.0 . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 3--14 . Naveen Muralimanohar, Rajeev Balasubramonian, and Norm Jouppi. 2007. Optimizing NUCA Organizations and Wiring Alternatives for Large Caches with CACTI 6.0. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 3--14."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 46--58","author":"Muthukrishnan Harini","year":"2021","unstructured":"Harini Muthukrishnan , Daniel Lustig , David Nellans , and Thomas Wenisch . 2021 . GPS: A Global Publish-Subscribe Model for Multi-GPU Memory Management . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 46--58 . Harini Muthukrishnan, Daniel Lustig, David Nellans, and Thomas Wenisch. 2021. GPS: A Global Publish-Subscribe Model for Multi-GPU Memory Management. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 46--58."},{"key":"e_1_3_2_1_28_1","volume-title":"NVIDIA DGX-1: Essential Instrument of AI Research. https:\/\/www.nvidia.com\/en-gb\/data-center\/dgx-systems\/dgx-1\/. [Online","year":"2022","unstructured":"Nvidia. 2016. NVIDIA DGX-1: Essential Instrument of AI Research. https:\/\/www.nvidia.com\/en-gb\/data-center\/dgx-systems\/dgx-1\/. [Online ; accessed 2022 -04-16]. Nvidia. 2016. NVIDIA DGX-1: Essential Instrument of AI Research. https:\/\/www.nvidia.com\/en-gb\/data-center\/dgx-systems\/dgx-1\/. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_29_1","volume-title":"NVIDIA Tesla P100: The Most Advanced Datacenter Accelerator Ever Built. https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf. [Online","year":"2022","unstructured":"Nvidia. 2016. NVIDIA Tesla P100: The Most Advanced Datacenter Accelerator Ever Built. https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf. [Online ; accessed 2022 -04-16]. Nvidia. 2016. NVIDIA Tesla P100: The Most Advanced Datacenter Accelerator Ever Built. https:\/\/images.nvidia.com\/content\/pdf\/tesla\/whitepaper\/pascal-architecture-whitepaper.pdf. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_30_1","volume-title":"NVIDIA DGX-2: Break Through the Barriers to AI Speed and Scale. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/. [Online","year":"2022","unstructured":"Nvidia. 2018. NVIDIA DGX-2: Break Through the Barriers to AI Speed and Scale. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/. [Online ; accessed 2022 -04-16]. Nvidia. 2018. NVIDIA DGX-2: Break Through the Barriers to AI Speed and Scale. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_31_1","volume-title":"NVIDIA A100 Tensor Core GPU Architecture: The World's Most Advanced Data Center GPU. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf. [Online","year":"2022","unstructured":"Nvidia. 2020. NVIDIA A100 Tensor Core GPU Architecture: The World's Most Advanced Data Center GPU. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf. [Online ; accessed 2022 -04-16]. Nvidia. 2020. NVIDIA A100 Tensor Core GPU Architecture: The World's Most Advanced Data Center GPU. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_32_1","volume-title":"NVIDIA cuDNN. https:\/\/developer.nvidia.com\/cudnn. [Online","year":"2022","unstructured":"Nvidia. 2020. NVIDIA cuDNN. https:\/\/developer.nvidia.com\/cudnn. [Online ; accessed 2022 -04-16]. Nvidia. 2020. NVIDIA cuDNN. https:\/\/developer.nvidia.com\/cudnn. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_33_1","volume-title":"NVIDIA CUDA SDK Code Samples. https:\/\/docs.nvidia.com\/cuda\/cuda-samples\/index.html. [Online","year":"2022","unstructured":"Nvidia. 2022. NVIDIA CUDA SDK Code Samples. https:\/\/docs.nvidia.com\/cuda\/cuda-samples\/index.html. [Online ; accessed 2022 -04-16]. Nvidia. 2022. NVIDIA CUDA SDK Code Samples. https:\/\/docs.nvidia.com\/cuda\/cuda-samples\/index.html. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_34_1","volume-title":"NVIDIA NVLINK: High-speed GPU Interconnect. https:\/\/www.nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/. [Online","year":"2022","unstructured":"Nvidia. 2022 . NVIDIA NVLINK: High-speed GPU Interconnect. https:\/\/www.nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/. [Online ; accessed 2022-04-16]. Nvidia. 2022. NVIDIA NVLINK: High-speed GPU Interconnect. https:\/\/www.nvidia.com\/en-us\/design-visualization\/nvlink-bridges\/. [Online; accessed 2022-04-16]."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 167--178","author":"Qureshi Moinuddin K","year":"2006","unstructured":"Moinuddin K Qureshi , Daniel N Lynch , Onur Mutlu , and Yale N Patt . 2006 . A Case for MLP-Aware Cache Replacement . In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 167--178 . Moinuddin K Qureshi, Daniel N Lynch, Onur Mutlu, and Yale N Patt. 2006. A Case for MLP-Aware Cache Replacement. In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 167--178."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 582--595","author":"Ren Xiaowei","year":"2020","unstructured":"Xiaowei Ren , Daniel Lustig , Evgeny Bolotin , Aamer Jaleel , Oreste Villa , and David Nellans . 2020 . HMG: Extending Cache Coherence Protocols Across Modern Hierarchical Multi-GPU Systems . In Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 582--595 . Xiaowei Ren, Daniel Lustig, Evgeny Bolotin, Aamer Jaleel, Oreste Villa, and David Nellans. 2020. HMG: Extending Cache Coherence Protocols Across Modern Hierarchical Multi-GPU Systems. In Proceedings of the International Symposium on High Performance Computer Architecture (HPCA). IEEE, 582--595."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 72--83","author":"Rogers Timothy G","year":"2012","unstructured":"Timothy G Rogers , Mike O'Connor , and Tor M Aamodt . 2012 . Cache-Conscious Wavefront Scheduling . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 72--83 . Timothy G Rogers, Mike O'Connor, and Tor M Aamodt. 2012. Cache-Conscious Wavefront Scheduling. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 72--83."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the Annual International Symposium on Computer Architecture (ISCA). 384--393","author":"Seznec A.","year":"1994","unstructured":"A. Seznec . 1994 . Decoupled Sectored Caches: Conciliating Low Tag Implementation Cost and Low Miss Ratio . In Proceedings of the Annual International Symposium on Computer Architecture (ISCA). 384--393 . A. Seznec. 1994. Decoupled Sectored Caches: Conciliating Low Tag Implementation Cost and Low Miss Ratio. In Proceedings of the Annual International Symposium on Computer Architecture (ISCA). 384--393."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 14--27","author":"Shao Yakun Sophia","unstructured":"Yakun Sophia Shao , Jason Clemons , Rangharajan Venkatesan , Brian Zimmer , Matthew Fojtik , Nan Jiang , Ben Keller , Alicia Klinefelter , Nathaniel Pinckney , Priyanka Raina , Stephen G. Tell , Yanqing Zhang , William J. Dally , Joel Emer , C. Thomas Gray , Brucek Khailany , and Stephen W. Keckler . 2019. Simba: Scaling Deep-Learning Inference with Multi-Chip-Module-Based Architecture . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 14--27 . Yakun Sophia Shao, Jason Clemons, Rangharajan Venkatesan, Brian Zimmer, Matthew Fojtik, Nan Jiang, Ben Keller, Alicia Klinefelter, Nathaniel Pinckney, Priyanka Raina, Stephen G. Tell, Yanqing Zhang, William J. Dally, Joel Emer, C. Thomas Gray, Brucek Khailany, and Stephen W. Keckler. 2019. Simba: Scaling Deep-Learning Inference with Multi-Chip-Module-Based Architecture. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 14--27."},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the Symposium on High-Performance Interconnects (HOTI). IEEE, 1--8.","author":"Sharma Debendra Das","year":"2020","unstructured":"Debendra Das Sharma . 2020 . PCI Express\u00ae 6.0 Specification: A Low-Latency, High-Bandwidth, High-Reliability, and Cost-Effective Interconnect With 64.0 GT\/s PAM-4 Signaling . In Proceedings of the Symposium on High-Performance Interconnects (HOTI). IEEE, 1--8. Debendra Das Sharma. 2020. PCI Express\u00ae 6.0 Specification: A Low-Latency, High-Bandwidth, High-Reliability, and Cost-Effective Interconnect With 64.0 GT\/s PAM-4 Signaling. In Proceedings of the Symposium on High-Performance Interconnects (HOTI). IEEE, 1--8."},{"key":"e_1_3_2_1_42_1","volume-title":"Geng Daniel Liu, and Wen-mei W Hwu","author":"Stratton John A","year":"2012","unstructured":"John A Stratton , Christopher Rodrigues , I- Jui Sung , Nady Obeid , Li-Wen Chang , Nasser Anssari , Geng Daniel Liu, and Wen-mei W Hwu . 2012 . Parboil : A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Technical Report. IMPACT-12-01, Center for Reliable and High-Performance Computing, University of Illinois at Urbana-Champaign. 29 pages. John A Stratton, Christopher Rodrigues, I-Jui Sung, Nady Obeid, Li-Wen Chang, Nasser Anssari, Geng Daniel Liu, and Wen-mei W Hwu. 2012. Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. Technical Report. IMPACT-12-01, Center for Reliable and High-Performance Computing, University of Illinois at Urbana-Champaign. 29 pages."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the International Symposium on Networks-on-Chip (NOCS). IEEE, 201--210","author":"Sun Chen","year":"2012","unstructured":"Chen Sun , Chia-Hsin Owen Chen , George Kurian , Lan Wei , Jason Miller , Anant Agarwal , Li-Shiuan Peh , and Vladimir Stojanovic . 2012 . DSENT - A Tool Connecting Emerging Photonics with Electronics for Opto-Electronic Networks-on-Chip Modeling . In Proceedings of the International Symposium on Networks-on-Chip (NOCS). IEEE, 201--210 . Chen Sun, Chia-Hsin Owen Chen, George Kurian, Lan Wei, Jason Miller, Anant Agarwal, Li-Shiuan Peh, and Vladimir Stojanovic. 2012. DSENT - A Tool Connecting Emerging Photonics with Electronics for Opto-Electronic Networks-on-Chip Modeling. In Proceedings of the International Symposium on Networks-on-Chip (NOCS). IEEE, 201--210."},{"key":"e_1_3_2_1_44_1","volume-title":"DesignWare Library - Datapath and Building Block IP. https:\/\/www.synopsys.com\/dw\/buildingblock.php. [Online","year":"2023","unstructured":"Synopsys. 2022. DesignWare Library - Datapath and Building Block IP. https:\/\/www.synopsys.com\/dw\/buildingblock.php. [Online ; accessed 2023 -02-15]. Synopsys. 2022. DesignWare Library - Datapath and Building Block IP. https:\/\/www.synopsys.com\/dw\/buildingblock.php. [Online; accessed 2023-02-15]."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 698--707","author":"Tabbakh Abdulaziz","year":"2017","unstructured":"Abdulaziz Tabbakh , Murali Annavaram , and Xuehai Qian . 2017 . Power Efficient Sharing-Aware GPU Data Management . In Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 698--707 . Abdulaziz Tabbakh, Murali Annavaram, and Xuehai Qian. 2017. Power Efficient Sharing-Aware GPU Data Management. In Proceedings of the International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 698--707."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT). IEEE, 166--179","author":"Tsai Po-An","year":"2017","unstructured":"Po-An Tsai , Nathan Beckmann , and Daniel Sanchez . 2017 . Nexus: A New Approach to Replication in Distributed Shared Caches . In Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT). IEEE, 166--179 . Po-An Tsai, Nathan Beckmann, and Daniel Sanchez. 2017. Nexus: A New Approach to Replication in Distributed Shared Caches. In Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT). IEEE, 166--179."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 339--351","author":"Young Vinson","year":"2018","unstructured":"Vinson Young , Aamer Jaleel , Evgeny Bolotin , Eiman Ebrahimi , David Nellans , and Oreste Villa . 2018 . Combining HW\/SW Mechanisms to Improve NUMA Performance of Multi-GPU Systems . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 339--351 . Vinson Young, Aamer Jaleel, Evgeny Bolotin, Eiman Ebrahimi, David Nellans, and Oreste Villa. 2018. Combining HW\/SW Mechanisms to Improve NUMA Performance of Multi-GPU Systems. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 339--351."},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 411--423","author":"Zhao Xia","year":"2019","unstructured":"Xia Zhao , Almutaz Adileh , Zhibin Yu , Zhiying Wang , Aamer Jaleel , and Lieven Eeckhout . 2019 . Adaptive Memory-Side Last-Level GPU Caching . In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 411--423 . Xia Zhao, Almutaz Adileh, Zhibin Yu, Zhiying Wang, Aamer Jaleel, and Lieven Eeckhout. 2019. Adaptive Memory-Side Last-Level GPU Caching. In Proceedings of the International Symposium on Computer Architecture (ISCA). IEEE, 411--423."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 967--980","author":"Zhao Xia","year":"2020","unstructured":"Xia Zhao , Magnus Jahre , and Lieven Eeckhout . 2020 . Selective Replication in Memory-Side GPU Caches . In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 967--980 . Xia Zhao, Magnus Jahre, and Lieven Eeckhout. 2020. Selective Replication in Memory-Side GPU Caches. In Proceedings of the International Symposium on Microarchitecture (MICRO). IEEE, 967--980."}],"event":{"name":"ISCA '23: 50th Annual International Symposium on Computer Architecture","location":"Orlando FL USA","acronym":"ISCA '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE"]},"container-title":["Proceedings of the 50th Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3579371.3589078","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:39Z","timestamp":1750178799000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3579371.3589078"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,17]]},"references-count":48,"alternative-id":["10.1145\/3579371.3589078","10.1145\/3579371"],"URL":"https:\/\/doi.org\/10.1145\/3579371.3589078","relation":{},"subject":[],"published":{"date-parts":[[2023,6,17]]},"assertion":[{"value":"2023-06-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}