{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:50:33Z","timestamp":1768031433887,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2000722,2046102"],"award-info":[{"award-number":["2000722,2046102"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3476138","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T05:10:34Z","timestamp":1634793034000},"page":"1-18","source":"Crossref","is-referenced-by-count":13,"title":["E.T."],"prefix":"10.1145","author":[{"given":"Shiyang","family":"Chen","sequence":"first","affiliation":[{"name":"Stevens Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shaoyi","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Connecticut"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Santosh","family":"Pandey","sequence":"additional","affiliation":[{"name":"Stevens Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bingbing","family":"Li","sequence":"additional","affiliation":[{"name":"University of Connecticut"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guang R.","family":"Gao","sequence":"additional","affiliation":[{"name":"University of Delaware"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Long","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Delaware"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Caiwen","family":"Ding","sequence":"additional","affiliation":[{"name":"University of Connecticut"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hang","family":"Liu","sequence":"additional","affiliation":[{"name":"Stevens Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation. USENIX Association, USA, 265--283","author":"Abadi Mart\u00edn","year":"2016"},{"key":"e_1_3_2_2_2_1","unstructured":"AMD. 2021. INTRODUCING AMD CDNA ARCHITECTURE. https:\/\/www.amd.com\/system\/files\/documents\/amd-cdna-whitepaper.pdf.  AMD. 2021. INTRODUCING AMD CDNA ARCHITECTURE. https:\/\/www.amd.com\/system\/files\/documents\/amd-cdna-whitepaper.pdf."},{"key":"e_1_3_2_2_3_1","volume-title":"Advances in Neural Information Processing Systems","author":"Brown Tom","year":"1877"},{"key":"e_1_3_2_2_4_1","first-page":"5","article-title":"Enhancing sparsity by reweighted l1 minimization","volume":"14","author":"Candes Emmanuel J","year":"2008","journal-title":"Journal of Fourier analysis and applications"},{"key":"e_1_3_2_2_5_1","volume-title":"Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)","author":"Cer Daniel","year":"2017"},{"key":"e_1_3_2_2_6_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Chen Mark","year":"2020"},{"key":"e_1_3_2_2_7_1","volume-title":"Advances in Neural Information Processing Systems. Curran Associates","author":"Chen Tianlong"},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation. USENIX Association, USA, 579--594","author":"Chen Tianqi","year":"2018"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Jianpeng Cheng Li Dong and Mirella Lapata. 2016. Long Short-Term Memory-Networks for Machine Reading. (2016) 551--561.  Jianpeng Cheng Li Dong and Mirella Lapata. 2016. Long Short-Term Memory-Networks for Machine Reading. (2016) 551--561.","DOI":"10.18653\/v1\/D16-1053"},{"key":"e_1_3_2_2_10_1","volume-title":"14th Symposium on Networked Systems Design and Implementation. USENIX Association","author":"Crankshaw Daniel","year":"2017"},{"key":"e_1_3_2_2_11_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_2_12_1","volume-title":"Third International Workshop on Paraphrasing (third international workshop on paraphrasing (IWP2005)","author":"Dolan Bill","year":"2005"},{"key":"e_1_3_2_2_13_1","volume-title":"International Conference on Acoustics, Speech and Signal Processing. IEEE","author":"Dong Linhao","year":"2018"},{"key":"e_1_3_2_2_14_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. (2020). arXiv:arXiv preprint arXiv:2010.11929  Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. (2020). arXiv:arXiv preprint arXiv:2010.11929"},{"key":"e_1_3_2_2_15_1","unstructured":"Angela Fan Edouard Grave and Armand Joulin. 2019. Reducing transformer depth on demand with structured dropout. (2019). arXiv:arXiv:1909.11556  Angela Fan Edouard Grave and Armand Joulin. 2019. Reducing transformer depth on demand with structured dropout. (2019). arXiv:arXiv:1909.11556"},{"key":"e_1_3_2_2_16_1","volume-title":"Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming. ACM","author":"Fang Jiarui","year":"2021"},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE Press, Virtual, 1--14","author":"Gale Trevor","year":"2020"},{"key":"e_1_3_2_2_18_1","volume-title":"Proceedings of the Thirteenth EuroSys Conference. ACM","author":"Gao Pin","year":"2018"},{"key":"e_1_3_2_2_19_1","volume-title":"XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla","year":"2021"},{"key":"e_1_3_2_2_20_1","volume-title":"Proceedings of the 5th Workshop on Representation Learning for NLP. Association for Computational Linguistics, Virtual, 143--155","author":"Gordon Mitchell","year":"2020"},{"key":"e_1_3_2_2_21_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, Virtual, 1--15","author":"Guo Cong","year":"2020"},{"key":"e_1_3_2_2_22_1","unstructured":"Kai Han Yunhe Wang Hanting Chen Xinghao Chen Jianyuan Guo Zhenhua Liu Yehui Tang An Xiao Chunjing Xu Yixing Xu Zhaohui Yang Yiman Zhang and Dacheng Tao. 2020. A Survey on Visual Transformer. (2020). arXiv:arXiv:2012.12556  Kai Han Yunhe Wang Hanting Chen Xinghao Chen Jianyuan Guo Zhenhua Liu Yehui Tang An Xiao Chunjing Xu Yixing Xu Zhaohui Yang Yiman Zhang and Dacheng Tao. 2020. A Survey on Visual Transformer. (2020). arXiv:arXiv:2012.12556"},{"key":"e_1_3_2_2_23_1","volume-title":"Advances in Neural Information Processing Systems","author":"Han Song"},{"key":"e_1_3_2_2_24_1","volume-title":"Proceedings of the Fourteenth EuroSys Conference","author":"Holmes Connor","year":"2019"},{"key":"e_1_3_2_2_25_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations.","author":"Lan Zhenzhong","year":"2019"},{"key":"e_1_3_2_2_26_1","volume-title":"Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning. AAAI Press","author":"Levesque Hector","year":"2012"},{"key":"e_1_3_2_2_27_1","volume-title":"Partial Order Pruning: For Best Speed\/Accuracy Trade-Off in Neural Architecture Search. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE","author":"Li Xin","year":"2019"},{"key":"e_1_3_2_2_28_1","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics","author":"Luong Thang"},{"key":"e_1_3_2_2_29_1","unstructured":"JS McCarley Rishav Chakravarti and Avirup Sil. 2019. Structured pruning of a bert-based question answering model. (2019). arXiv:arXiv:1910.06360  JS McCarley Rishav Chakravarti and Avirup Sil. 2019. Structured pruning of a bert-based question answering model. (2019). arXiv:arXiv:1910.06360"},{"key":"e_1_3_2_2_30_1","unstructured":"Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2017. Pointer Sentinel Mixture Models. (2017). arXiv:arXiv:1609.07843  Stephen Merity Caiming Xiong James Bradbury and Richard Socher. 2017. Pointer Sentinel Mixture Models. (2017). arXiv:arXiv:1609.07843"},{"key":"e_1_3_2_2_31_1","volume-title":"Block-sparse recurrent neural networks. arXiv preprint arXiv:1711.02782","author":"Narang Sharan","year":"2017"},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems-Volume 1. MIT Press","author":"Novikov Alexander","year":"2015"},{"key":"e_1_3_2_2_33_1","unstructured":"NVIDIA. 2007. cuBLAS. https:\/\/developer.nvidia.com\/cublas.  NVIDIA. 2007. cuBLAS. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_2_2_34_1","unstructured":"NVIDIA. 2017. NVIDIA TESLA V100 GPU ARCHITECTURE. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf.  NVIDIA. 2017. NVIDIA TESLA V100 GPU ARCHITECTURE. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf."},{"key":"e_1_3_2_2_35_1","unstructured":"NVIDIA. 2020. CUTLASS. https:\/\/github.com\/NVIDIA\/cutlass.  NVIDIA. 2020. CUTLASS. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_2_36_1","unstructured":"NVIDIA. 2020. Matrix Multiplication Background User Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html#math-mem.  NVIDIA. 2020. Matrix Multiplication Background User Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-matrix-multiplication\/index.html#math-mem."},{"key":"e_1_3_2_2_37_1","unstructured":"NVIDIA. 2020. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf.  NVIDIA. 2020. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_2_2_38_1","unstructured":"NVIDIA. 2021. cuBLAS: cublasgemmalgo_t. https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html#cublasgemmalgo_t.  NVIDIA. 2021. cuBLAS: cublasgemmalgo_t. https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html#cublasgemmalgo_t."},{"key":"e_1_3_2_2_39_1","unstructured":"NVIDIA. 2021. FasterTransformer. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/FasterTransformer.  NVIDIA. 2021. FasterTransformer. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/FasterTransformer."},{"key":"e_1_3_2_2_40_1","unstructured":"NVIDIA. 2021. TensorRT. https:\/\/developer.nvidia.com\/tensorrt.  NVIDIA. 2021. TensorRT. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_2_41_1","volume-title":"Proceedings of the Conference on Artificial Intelligence. AAAI Press, Virtual, 13657--13665","author":"Passban Peyman","year":"2020"},{"key":"e_1_3_2_2_42_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. Curran Associates","author":"Paszke Adam","year":"2019"},{"key":"e_1_3_2_2_43_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Virtual, 3208--3229","author":"Prasanna Sai","year":"2020"},{"key":"e_1_3_2_2_44_1","volume-title":"International Symposium on High Performance Computer Architecture. IEEE","author":"Qin Eric","year":"2020"},{"key":"e_1_3_2_2_45_1","volume-title":"Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics","author":"Rajpurkar Pranav","year":"2016"},{"key":"e_1_3_2_2_46_1","volume-title":"Proceedings of the 1st International Workshop on Software Engineering for AI in Autonomous Systems. IEEE","author":"Rao Qing","year":"2018"},{"key":"e_1_3_2_2_47_1","volume-title":"Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques. ACM","author":"Rumi Masuma Akter","year":"2020"},{"key":"e_1_3_2_2_48_1","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. (2019). arXiv:arXiv:1910.01108  Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. (2019). arXiv:arXiv:1910.01108"},{"key":"e_1_3_2_2_49_1","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics","author":"Socher Richard","year":"2013"},{"key":"e_1_3_2_2_50_1","volume-title":"Proceedings of the 28th ACM International Conference on Information and Knowledge Management. ACM","author":"van Aken Betty","year":"2019"},{"key":"e_1_3_2_2_51_1","volume-title":"Advances in neural information processing systems","author":"Vaswani Ashish"},{"key":"e_1_3_2_2_52_1","volume-title":"Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP. Association for Computational Linguistics","author":"Wang Alex","year":"2018"},{"key":"e_1_3_2_2_53_1","volume-title":"Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques. Association for Computing Machinery","author":"Wang Ziheng","year":"2020"},{"key":"e_1_3_2_2_54_1","volume-title":"Advances in Neural Information Processing Systems. Curran Associates Inc.","author":"Wen Wei","year":"2016"},{"key":"e_1_3_2_2_55_1","volume-title":"Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Williams Adina","year":"2018"},{"key":"e_1_3_2_2_56_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush.","author":"Wolf Thomas","year":"2020"},{"key":"e_1_3_2_2_57_1","unstructured":"Xilinx. 2017. SDAccel Environment Profiling and Optimization Guide. https:\/\/www.xilinx.com\/html_docs\/xilinx2017_4\/sdaccel_doc\/jbt1504034294480.html.  Xilinx. 2017. SDAccel Environment Profiling and Optimization Guide. https:\/\/www.xilinx.com\/html_docs\/xilinx2017_4\/sdaccel_doc\/jbt1504034294480.html."},{"key":"e_1_3_2_2_58_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Virtual, 7859--7869","author":"Xu Canwen","year":"2020"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"crossref","first-page":"106848","DOI":"10.1016\/j.compeleceng.2020.106848","article-title":"Accelerating sparse matrix-matrix multiplication with GPU Tensor Cores","volume":"88","author":"Zachariadis Orestis","year":"2020","journal-title":"Computers & Electrical Engineering"},{"key":"e_1_3_2_2_60_1","volume-title":"53rd Annual International Symposium on Microarchitecture. IEEE","author":"Zadeh Ali Hadi","year":"2020"},{"key":"e_1_3_2_2_61_1","volume-title":"Proceedings of the European Conference on Computer Vision. Springer","author":"Zhang Tianyun","year":"2018"},{"key":"e_1_3_2_2_62_1","volume-title":"Proceedings of the Conference on Artificial Intelligence","volume":"32","author":"Zhang Xiaodong","year":"2018"},{"key":"e_1_3_2_2_63_1","volume-title":"Proceedings of the 57th Annual Meeting. Association for Computational Linguistics","author":"Zhang Xingxing","year":"2019"}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476138","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3476138","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3476138","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:49:06Z","timestamp":1750268946000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476138"}},"subtitle":["re-thinking self-attention for transformer models on GPUs"],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":63,"alternative-id":["10.1145\/3458817.3476138","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3476138","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]}}}