{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T09:58:40Z","timestamp":1740131920020,"version":"3.37.3"},"reference-count":45,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Samsung Advanced Institute of Technology, Samsung Electronics Co., Ltd."},{"name":"Engineering Research Center Program through the NRF of Korea funded by the Korean Government MSIT","award":["NRF-2018R1A5A1059921"],"award-info":[{"award-number":["NRF-2018R1A5A1059921"]}]},{"name":"IC Design Education Center (IDEC), Korea"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Comput."],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/tc.2022.3207134","type":"journal-article","created":{"date-parts":[[2022,9,15]],"date-time":"2022-09-15T19:34:45Z","timestamp":1663270485000},"page":"1-12","source":"Crossref","is-referenced-by-count":2,"title":["Future Scaling of Memory Hierarchy for Tensor Cores and Eliminating Redundant Shared Memory Traffic Using Inter-Warp Multicasting"],"prefix":"10.1109","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5177-0916","authenticated-orcid":false,"given":"Sunjung","family":"Lee","sequence":"first","affiliation":[{"name":"Department of Intelligence and Information, Seoul National University (SNU), South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1733-1394","authenticated-orcid":false,"given":"Seunghwan","family":"Hwang","sequence":"additional","affiliation":[{"name":"Department of Intelligence and Information, Seoul National University (SNU), South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7146-1926","authenticated-orcid":false,"given":"Michael Jaemin","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Intelligence and Information, Seoul National University (SNU), South Korea"}]},{"given":"Jaewan","family":"Choi","sequence":"additional","affiliation":[{"name":"Department of Intelligence and Information, Seoul National University (SNU), South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2447-4369","authenticated-orcid":false,"given":"Jung Ho","family":"Ahn","sequence":"additional","affiliation":[{"name":"Department of Intelligence and Information, Seoul National University (SNU), South Korea"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI.2014.94"},{"year":"2012","key":"ref38","article-title":"GPU performance analysis and optimization"},{"year":"2020","key":"ref33","article-title":"DEVELOPING CUDA KERNELS TO PUSH TENSOR CORES TO THE ABSOLUTE LIMIT ON NVIDIA A100"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378520"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080205"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3287624.3287633"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00041"},{"year":"0","key":"ref36"},{"year":"0","key":"ref35"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.022071134"},{"key":"ref40","first-page":"1","article-title":"Design flows and collateral for the ASAP7 7 nm FinFET predictive process design kit","author":"clark","year":"2017","journal-title":"Proc IEEE Int Conf Microelectronic Syst Educ"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2971677"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"article-title":"Dissecting the NVIDIA volta GPU architecture via microbenchmarking","year":"2018","author":"jia","key":"ref13"},{"year":"0","key":"ref14","article-title":"gpumembench"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/232973.232983"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3484505"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00047"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835937"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414623"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01070"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3001589"},{"key":"ref3","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"article-title":"Improving language understanding by generative pre-training","year":"2018","author":"radford","key":"ref6"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080231"},{"article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"devlin","key":"ref5"},{"article-title":"Rethinking attention with performers","year":"2020","author":"choromanski","key":"ref8"},{"article-title":"Longformer: The long-document transformer","year":"2020","author":"beltagy","key":"ref7"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref9","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","author":"zaheer","year":"2020","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"article-title":"cuDNN: Efficient primitives for deep learning","year":"2014","author":"chetlur","key":"ref20"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2015.2435709"},{"year":"0","key":"ref22","article-title":"CUTLASS"},{"year":"0","key":"ref21","article-title":"cuBLAS"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2017.7975270"},{"article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","year":"2019","author":"shoeybi","key":"ref24"},{"key":"ref41","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc 25th Int Conf Neural Inf Process Syst"},{"article-title":"Automatic kernel generation for volta tensor cores","year":"2020","author":"bhaskaracharya","key":"ref23"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830830"},{"article-title":"Dissecting the graphcore IPU architecture via microbenchmarking","year":"2019","author":"jia","key":"ref26"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2918851"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"}],"container-title":["IEEE Transactions on Computers"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/12\/4358213\/09893362.pdf?arnumber=9893362","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,17]],"date-time":"2022-11-17T16:54:51Z","timestamp":1668704091000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9893362\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":45,"URL":"https:\/\/doi.org\/10.1109\/tc.2022.3207134","relation":{},"ISSN":["0018-9340","1557-9956","2326-3814"],"issn-type":[{"type":"print","value":"0018-9340"},{"type":"electronic","value":"1557-9956"},{"type":"electronic","value":"2326-3814"}],"subject":[],"published":{"date-parts":[[2022]]}}}