{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T22:43:42Z","timestamp":1774219422752,"version":"3.50.1"},"reference-count":55,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,2,1]],"date-time":"2023-02-01T00:00:00Z","timestamp":1675209600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,2]]},"DOI":"10.1109\/hpca56546.2023.10071058","type":"proceedings-article","created":{"date-parts":[[2023,3,24]],"date-time":"2023-03-24T17:42:55Z","timestamp":1679679775000},"page":"259-272","source":"Crossref","is-referenced-by-count":27,"title":["VEGETA: Vertically-Integrated Extensions for Sparse\/Dense GEMM Tile Acceleration on CPUs"],"prefix":"10.1109","author":[{"given":"Geonhwa","family":"Jeong","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Sana","family":"Damani","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Abhimanyu Rajeshkumar","family":"Bambhaniya","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Eric","family":"Qin","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Christopher J.","family":"Hughes","sequence":"additional","affiliation":[{"name":"Intel Labs"}]},{"given":"Sreenivas","family":"Subramoney","sequence":"additional","affiliation":[{"name":"Intel Labs"}]},{"given":"Hyesoon","family":"Kim","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Tushar","family":"Krishna","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}]}],"member":"263","reference":[{"key":"ref2","article-title":"Nvidia tesla v100 gpu architecture","year":"2017"},{"key":"ref3","article-title":"Tensorfloat-32 in the a100 gpu accelerates ai training, hpc up to 20x","year":"2019"},{"key":"ref4","article-title":"Nvidia ampere ga102 gpu architecture","year":"2021"},{"key":"ref5","article-title":"oneapi deep neural network library (onednn)","year":"2021"},{"key":"ref6","article-title":"Sambanova whitepaper","year":"2021"},{"key":"ref7","article-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"Abadi","year":"2015"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358305"},{"key":"ref9","article-title":"Powering the edge: Driving optimal performance with the ethos-n77 npu","year":"2019","journal-title":"Whitepaper"},{"key":"ref10","article-title":"Language models are few-shot learners","author":"Brown","year":"2020"},{"key":"ref11","first-page":"579","article-title":"Tvm: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation","author":"Chen"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"ref14","author":"Chetlur","year":"2014","journal-title":"cudnn: Efficient primitives for deep learning"},{"key":"ref15","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00079"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00032"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00070"},{"key":"ref19","article-title":"Deep compression: Compressing deep neural network with pruning, trained quantization and huffman coding","volume-title":"4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings","author":"Han"},{"key":"ref20","first-page":"1135","article-title":"Learning both weights and connections for efficient neural networks","volume-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 1","author":"Han"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392751"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"ref24","article-title":"Power isa version 3.1","year":"2020","journal-title":"Whitepaper"},{"key":"ref25","article-title":"Theoretical maximum memory bandwidth for intel\u00ae core\u2122 x-series processors"},{"key":"ref26","article-title":"Intel 64 and ia-32 architectures software developer\u2019s manual","year":"2020"},{"key":"ref27","article-title":"Presentation deck: Intel architecture day 2021","year":"2021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00011"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586257"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref31","article-title":"Macsim: A cpu-gpu heterogeneous simulation framework user guide","author":"Kim","year":"2012"},{"key":"ref32","article-title":"Intel\u2019s mlperf results show robust cpu-based training performance for a range of workloads","author":"Koichi Yamada","year":"2020"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/MC.1982.1653825"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304028"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.2979965"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00049"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/1064978.1065034"},{"key":"ref39","article-title":"Accelerating sparse deep neural networks","author":"Mishra","year":"2021"},{"key":"ref40","article-title":"Deep learning recommendation model for personalization and recommendation systems","volume-title":"CoRR","volume":"abs\/1906.00091","author":"Naumov","year":"2019"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00067"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00062"},{"key":"ref45","article-title":"Lower numerical precision deep learning inference and training","author":"Rodrigues","year":"2018","journal-title":"Whitepaper"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/tc.2018.2879434"},{"key":"ref47","article-title":"Bfloat16: The secret to high performance on cloud tpus","author":"Shibo Wang","year":"2019"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378450"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00068"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00062"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3058632"},{"key":"ref52","article-title":"Dominosearch: Find layer-wise fine-grained n:m sparse schemes from dense neural networks","volume":"34","author":"Sun","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00048"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00030"},{"key":"ref55","article-title":"Learning n:m fine-grained structured sparse neural networks from scratch","volume-title":"International Conference on Learning Representations","author":"Zhou"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358269"}],"event":{"name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","location":"Montreal, QC, Canada","start":{"date-parts":[[2023,2,25]]},"end":{"date-parts":[[2023,3,1]]}},"container-title":["2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10070856\/10070923\/10071058.pdf?arnumber=10071058","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T13:13:33Z","timestamp":1707830013000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10071058\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/hpca56546.2023.10071058","relation":{},"subject":[],"published":{"date-parts":[[2023,2]]}}}