{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:34:53Z","timestamp":1773192893860,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T00:00:00Z","timestamp":1687305600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"JST, PRESTO","award":["JPMJPR20MA"],"award-info":[{"award-number":["JPMJPR20MA"]}]},{"name":"JSPS KAKENHI","award":["JP22H03600"],"award-info":[{"award-number":["JP22H03600"]}]},{"name":"JSPS KAKENHI","award":["JP21K17750"],"award-info":[{"award-number":["JP21K17750"]}]},{"name":"New Energy and Industrial Technology Development Organization (NEDO)","award":["JPNP20006"],"award-info":[{"award-number":["JPNP20006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,21]]},"DOI":"10.1145\/3577193.3593705","type":"proceedings-article","created":{"date-parts":[[2023,6,20]],"date-time":"2023-06-20T18:47:05Z","timestamp":1687286825000},"page":"167-179","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["PERKS: a Locality-Optimized Execution Model for Iterative Memory-bound GPU Applications"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2452-1551","authenticated-orcid":false,"given":"Lingqi","family":"Zhang","sequence":"first","affiliation":[{"name":"Tokyo Institute of Technology, Tokyo, Japan"},{"name":"National Institute of Advanced Industrial Science and Technology, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7165-2095","authenticated-orcid":false,"given":"Mohamed","family":"Wahib","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1244-3151","authenticated-orcid":false,"given":"Peng","family":"Chen","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology, Tokyo, Japan"},{"name":"RIKEN Center for Computational Science, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6208-4102","authenticated-orcid":false,"given":"Jintao","family":"Meng","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6545-1943","authenticated-orcid":false,"given":"Xiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Oak Ridge National Laboratory, Knoxville, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7297-6211","authenticated-orcid":false,"given":"Toshio","family":"Endo","sequence":"additional","affiliation":[{"name":"Tokyo Institute of Technology, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1910-8532","authenticated-orcid":false,"given":"Satoshi","family":"Matsuoka","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan"},{"name":"Tokyo Institute of Technology, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2023,6,21]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/www.top500.org\/lists\/top500\/2022\/06\/highs\/ [Online","year":"2021","unstructured":"2022. TOP500. https:\/\/www.top500.org\/lists\/top500\/2022\/06\/highs\/ [Online ; accessed 27- Mar- 2021 ]. 2022. TOP500. https:\/\/www.top500.org\/lists\/top500\/2022\/06\/highs\/ [Online; accessed 27-Mar-2021]."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.5555\/1753228.1753233"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-016-1701-3"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1572769.1572792"},{"key":"e_1_3_2_1_5_1","volume-title":"Quintana-Ort\u00ed","author":"Aliaga Jos\u00e9 I.","year":"2015","unstructured":"Jos\u00e9 I. Aliaga , Joaqu\u00edn P\u00e9rez , and Enrique S . Quintana-Ort\u00ed . 2015 . Systematic Fusion of CUDA Kernels for Iterative Sparse Linear System Solvers. In Euro-Par 2015: Parallel Processing, Jesper Larsson Tr\u00e4ff, Sascha Hunold, and Francesco Versaci (Eds.). Springer Berlin Heidelberg , Berlin, Heidelberg, 675--686. Jos\u00e9 I. Aliaga, Joaqu\u00edn P\u00e9rez, and Enrique S. Quintana-Ort\u00ed. 2015. Systematic Fusion of CUDA Kernels for Iterative Sparse Linear System Solvers. In Euro-Par 2015: Parallel Processing, Jesper Larsson Tr\u00e4ff, Sascha Hunold, and Francesco Versaci (Eds.). Springer Berlin Heidelberg, Berlin, Heidelberg, 675--686."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3480935"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2017.05.006"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS51919.2020.00009"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ScalA51936.2020.00012"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 29th ACM on International Conference on Supercomputing. 25--35","author":"Belviranli Mehmet E","year":"2015","unstructured":"Mehmet E Belviranli , Peng Deng , Laxmi N Bhuyan , Rajiv Gupta , and Qi Zhu . 2015 . Peerwave: Exploiting wavefront parallelism on gpus with peer-sm synchronization . In Proceedings of the 29th ACM on International Conference on Supercomputing. 25--35 . Mehmet E Belviranli, Peng Deng, Laxmi N Bhuyan, Rajiv Gupta, and Qi Zhu. 2015. Peerwave: Exploiting wavefront parallelism on gpus with peer-sm synchronization. In Proceedings of the 29th ACM on International Conference on Supercomputing. 25--35."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2615094"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2615094"},{"key":"e_1_3_2_1_13_1","volume-title":"2010 IEEE International Symposium on Parallel & Distributed Processing (IPDPS). IEEE, 1--12","author":"Chen Long","year":"2010","unstructured":"Long Chen , Oreste Villa , Sriram Krishnamoorthy , and Guang R Gao . 2010 . Dynamic load balancing on single-and multi-GPU systems . In 2010 IEEE International Symposium on Parallel & Distributed Processing (IPDPS). IEEE, 1--12 . Long Chen, Oreste Villa, Sriram Krishnamoorthy, and Guang R Gao. 2010. Dynamic load balancing on single-and multi-GPU systems. In 2010 IEEE International Symposium on Parallel & Distributed Processing (IPDPS). IEEE, 1--12."},{"key":"e_1_3_2_1_14_1","volume-title":"Efficient Algorithms for the Summed Area Tables Primitive on GPUs. In 2018 IEEE International Conference on Cluster Computing (CLUSTER). IEEE, 482--493","author":"Chen Peng","year":"2018","unstructured":"Peng Chen , Mohamed Wahib , Shinichiro Takizawa , Ryousei Takano , and Satoshi Matsuoka . 2018 . Efficient Algorithms for the Summed Area Tables Primitive on GPUs. In 2018 IEEE International Conference on Cluster Computing (CLUSTER). IEEE, 482--493 . Peng Chen, Mohamed Wahib, Shinichiro Takizawa, Ryousei Takano, and Satoshi Matsuoka. 2018. Efficient Algorithms for the Summed Area Tables Primitive on GPUs. In 2018 IEEE International Conference on Cluster Computing (CLUSTER). IEEE, 482--493."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356163"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356162"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476139"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC.2009.51"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-011-0562-z"},{"key":"e_1_3_2_1_20_1","volume-title":"CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide [Online","author":"Nvidia CUDA.","year":"2023","unstructured":"Nvidia CUDA. 2021. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide [Online ; accessed 3- Jan- 2023 ]. Nvidia CUDA. 2021. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide [Online; accessed 3-Jan-2023]."},{"key":"e_1_3_2_1_21_1","volume-title":"NVIDIA A100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-genomics-ep\/ampere-architecture-white-paper [Online","author":"Nvidia CUDA.","year":"2021","unstructured":"Nvidia CUDA. 2021. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-genomics-ep\/ampere-architecture-white-paper [Online ; accessed 20- July - 2021 ]. Nvidia CUDA. 2021. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-genomics-ep\/ampere-architecture-white-paper [Online; accessed 20-July-2021]."},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA CUDA. 2023. CUDA Toolkit Documentation. NVIDIA Developer Zone. http:\/\/docs.nvidia.com\/cuda\/index.html (2023). NVIDIA CUDA. 2023. CUDA Toolkit Documentation. NVIDIA Developer Zone. http:\/\/docs.nvidia.com\/cuda\/index.html (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2931088.2931091"},{"key":"e_1_3_2_1_24_1","unstructured":"Philip Roth David Eberius David Rogers. 2022. Understanding Strong Scaling on GPUs Using Empirical Performance Saturation Size. In The International Conference for High Performance Computing Networking Storage and Analysis (International Workshop on Performance Portability and Productivity (P3HPC)). Philip Roth David Eberius David Rogers. 2022. Understanding Strong Scaling on GPUs Using Empirical Performance Saturation Size. In The International Conference for High Performance Computing Networking Storage and Analysis (International Workshop on Performance Portability and Productivity (P3HPC) )."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2049662.2049670"},{"key":"e_1_3_2_1_26_1","volume-title":"2018 IEEE 7th Non-Volatile Memory Systems and Applications Symposium (NVMSA). IEEE, 19--24","author":"Endo Toshio","year":"2018","unstructured":"Toshio Endo . 2018 . Applying recursive temporal blocking for stencil computations to deeper memory hierarchy . In 2018 IEEE 7th Non-Volatile Memory Systems and Applications Symposium (NVMSA). IEEE, 19--24 . Toshio Endo. 2018. Applying recursive temporal blocking for stencil computations to deeper memory hierarchy. In 2018 IEEE 7th Non-Volatile Memory Systems and Applications Symposium (NVMSA). IEEE, 19--24."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPADS.2015.84"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2980179.2982437"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Andreas Frommer Kathryn Lund and Daniel B Szyld. 2017. Block Krylov subspace methods for functions of matrices. (2017). Andreas Frommer Kathryn Lund and Daniel B Szyld. 2017. Block Krylov subspace methods for functions of matrices. (2017).","DOI":"10.1553\/etna_vol47s100"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458526"},{"key":"e_1_3_2_1_31_1","volume-title":"A study of persistent threads style GPU programming for GPGPU workloads","author":"Gupta Kshitij","unstructured":"Kshitij Gupta , Jeff A Stuart , and John D Owens . 2012. A study of persistent threads style GPU programming for GPGPU workloads . IEEE. Kshitij Gupta, Jeff A Stuart, and John D Owens. 2012. A study of persistent threads style GPU programming for GPGPU workloads. IEEE."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751223"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3168824"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00097"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/73560.73588"},{"key":"e_1_3_2_1_38_1","unstructured":"Zhe Jia Marco Maggioni Jeffrey Smith and Daniele Paolo Scarpazza. 2019. Dissecting the NVidia Turing T4 GPU via Microbenchmarking. arXiv:1903.07486 [cs.DC] Zhe Jia Marco Maggioni Jeffrey Smith and Daniele Paolo Scarpazza. 2019. Dissecting the NVidia Turing T4 GPU via Microbenchmarking. arXiv:1903.07486 [cs.DC]"},{"key":"e_1_3_2_1_39_1","volume-title":"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. CoRR abs\/1804.06826","author":"Jia Zhe","year":"2018","unstructured":"Zhe Jia , Marco Maggioni , Benjamin Staiger , and Daniele Paolo Scarpazza . 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. CoRR abs\/1804.06826 ( 2018 ). arXiv:1804.06826 http:\/\/arxiv.org\/abs\/1804.06826 Zhe Jia, Marco Maggioni, Benjamin Staiger, and Daniele Paolo Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. CoRR abs\/1804.06826 (2018). arXiv:1804.06826 http:\/\/arxiv.org\/abs\/1804.06826"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488699"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00038"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2011.01.025"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10852-015-9272-5"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250734.1250761"},{"key":"e_1_3_2_1_45_1","volume-title":"2015 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO). IEEE, 23--33","author":"Li Chao","year":"2015","unstructured":"Chao Li , Yi Yang , Zhen Lin , and Huiyang Zhou . 2015 . Automatic data placement into GPU on-chip memory resources . In 2015 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO). IEEE, 23--33 . Chao Li, Yi Yang, Zhen Lin, and Huiyang Zhou. 2015. Automatic data placement into GPU on-chip memory resources. In 2015 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO). IEEE, 23--33."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1287\/opre.9.3.383"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1137\/140991133"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 1st International Workshop on High-Performance Stencil Computations, Armin Gr\u00f6\u00dflinger and Harald K\u00f6stler (Eds.)","author":"Maruyama Naoya","year":"2014","unstructured":"Naoya Maruyama and Takayuki Aoki . 2014 . Optimizing Stencil Computations for NVIDIA Kepler GPUs . In Proceedings of the 1st International Workshop on High-Performance Stencil Computations, Armin Gr\u00f6\u00dflinger and Harald K\u00f6stler (Eds.) . Vienna, Austria, 89--95. Naoya Maruyama and Takayuki Aoki. 2014. Optimizing Stencil Computations for NVIDIA Kepler GPUs. In Proceedings of the 1st International Workshop on High-Performance Stencil Computations, Armin Gr\u00f6\u00dflinger and Harald K\u00f6stler (Eds.). Vienna, Austria, 89--95."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368826.3377904"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-44917-2_13"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-010-0142-5"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.5555\/3014904.3014982"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/1513895.1513905"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2015.05.315"},{"key":"e_1_3_2_1_57_1","unstructured":"Nvidia. 2021. CUB Library. https:\/\/nvlabs.github.io\/cub Nvidia. 2021. CUB Library. https:\/\/nvlabs.github.io\/cub"},{"key":"e_1_3_2_1_58_1","unstructured":"Nvidia. 2021. NVIDIA CUDA Sample. https:\/\/docs.nvidia.com\/cuda\/cuda-samples\/index.html Nvidia. 2021. NVIDIA CUDA Sample. https:\/\/docs.nvidia.com\/cuda\/cuda-samples\/index.html"},{"key":"e_1_3_2_1_59_1","unstructured":"Nvidia. 2023. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf Nvidia. 2023. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf"},{"key":"e_1_3_2_1_60_1","unstructured":"Nvidia. 2023. NVIDIA CUDA Runtime API. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/index.html Nvidia. 2023. NVIDIA CUDA Runtime API. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/index.html"},{"key":"e_1_3_2_1_61_1","unstructured":"Nvidia. 2023. Programming guide. https:\/\/docs.nvidia.com\/cuda\/cuda-cprogramming-guide\/index.html Nvidia. 2023. Programming guide. https:\/\/docs.nvidia.com\/cuda\/cuda-cprogramming-guide\/index.html"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2014.6844463"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW50202.2020.00136"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","first-page":"e202000015","DOI":"10.1002\/gamm.202000015","article-title":"Preconditioners for Krylov subspace methods: An overview","volume":"43","author":"Pearson John W","year":"2020","unstructured":"John W Pearson and Jennifer Pestana . 2020 . Preconditioners for Krylov subspace methods: An overview . GAMM-Mitteilungen 43 , 4 (2020), e202000015 . John W Pearson and Jennifer Pestana. 2020. Preconditioners for Krylov subspace methods: An overview. GAMM-Mitteilungen 43, 4 (2020), e202000015.","journal-title":"GAMM-Mitteilungen"},{"key":"e_1_3_2_1_65_1","volume-title":"International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems. Springer, 68--84","author":"Phillips Everett","year":"2014","unstructured":"Everett Phillips and Massimiliano Fatica . 2014 . A CUDA implementation of the High Performance Conjugate Gradient benchmark . In International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems. Springer, 68--84 . Everett Phillips and Massimiliano Fatica. 2014. A CUDA implementation of the High Performance Conjugate Gradient benchmark. In International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems. Springer, 68--84."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830018.2830025"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/2884045.2884047"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2862896"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00073"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387537"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2010.5470941"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2018.08.004"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783718"},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings of the GPU technology conference, GTC","volume":"10","author":"Volkov Vasily","year":"2010","unstructured":"Vasily Volkov . 2010 . Better performance at lower occupancy . In Proceedings of the GPU technology conference, GTC , Vol. 10 . San Jose, CA, 16. Vasily Volkov. 2010. Better performance at lower occupancy. In Proceedings of the GPU technology conference, GTC, Vol. 10. San Jose, CA, 16."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.21"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433726"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMPSAC.2009.82"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/76263.76337"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00057"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356210"},{"key":"e_1_3_2_1_84_1","volume-title":"Sparse Persistent RNNs: Squeezing Large Recurrent Networks On-Chip. CoRR abs\/1804.10223","author":"Zhu Feiwen","year":"2018","unstructured":"Feiwen Zhu , Jeff Pool , Michael Andersch , Jeremy Appleyard , and Fung Xie . 2018. Sparse Persistent RNNs: Squeezing Large Recurrent Networks On-Chip. CoRR abs\/1804.10223 ( 2018 ). arXiv:1804.10223 http:\/\/arxiv.org\/abs\/1804.10223 Feiwen Zhu, Jeff Pool, Michael Andersch, Jeremy Appleyard, and Fung Xie. 2018. Sparse Persistent RNNs: Squeezing Large Recurrent Networks On-Chip. CoRR abs\/1804.10223 (2018). arXiv:1804.10223 http:\/\/arxiv.org\/abs\/1804.10223"}],"event":{"name":"ICS '23: 37th International Conference on Supercomputing","location":"Orlando FL USA","acronym":"ICS '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 37th International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3577193.3593705","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:31Z","timestamp":1750178851000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3577193.3593705"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,21]]},"references-count":84,"alternative-id":["10.1145\/3577193.3593705","10.1145\/3577193"],"URL":"https:\/\/doi.org\/10.1145\/3577193.3593705","relation":{},"subject":[],"published":{"date-parts":[[2023,6,21]]},"assertion":[{"value":"2023-06-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}