{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:48:08Z","timestamp":1775069288361,"version":"3.50.1"},"reference-count":274,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF\u2014NSF\/Intel Joint Research Center for Computer Assisted Programming for Heterogeneous Architectures","doi-asserted-by":"publisher","award":["CCF 1723476"],"award-info":[{"award-number":["CCF 1723476"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Proc. IEEE"],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1109\/jproc.2021.3098483","type":"journal-article","created":{"date-parts":[[2021,8,5]],"date-time":"2021-08-05T20:00:55Z","timestamp":1628193655000},"page":"1706-1752","source":"Crossref","is-referenced-by-count":83,"title":["Hardware Acceleration of Sparse and Irregular Tensor Computations of ML Models: A Survey and Insights"],"prefix":"10.1109","volume":"109","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4262-3938","authenticated-orcid":false,"given":"Shail","family":"Dave","sequence":"first","affiliation":[]},{"given":"Riyadh","family":"Baghdadi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8483-3824","authenticated-orcid":false,"given":"Tony","family":"Nowatzki","sequence":"additional","affiliation":[]},{"given":"Sasikanth","family":"Avancha","sequence":"additional","affiliation":[]},{"given":"Aviral","family":"Shrivastava","sequence":"additional","affiliation":[]},{"given":"Baoxin","family":"Li","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref274","first-page":"1","article-title":"Sparse GPU kernels for deep learning","author":"gale","year":"2020","journal-title":"Proc Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref271","first-page":"1","article-title":"A survey of FPGA-based accelerators for convolutional neural networks","volume":"32","author":"mittal","year":"2018","journal-title":"Neural Comput Appl"},{"key":"ref270","article-title":"The deep learning compiler: A comprehensive survey","author":"li","year":"2020","journal-title":"arXiv 2002 03794"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00447"},{"key":"ref170","author":"weste","year":"2015","journal-title":"CMOS VLSI Design A Circuits and Systems Perspective"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.2977722"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00030"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080221"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS.2010.23"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783725"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1145\/3373087.3375306"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1145\/3130218.3130230"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1145\/3296957.3173176"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00080"},{"key":"ref169","doi-asserted-by":"crossref","first-page":"1599","DOI":"10.1109\/TCAD.2011.2161217","article-title":"High throughput data mapping for coarse-grained reconfigurable architectures","volume":"30","author":"kim","year":"2011","journal-title":"IEEE Trans Comput -Aided Design Integr Circuits Syst"},{"key":"ref39","article-title":"Exploring sparsity in recurrent neural networks","author":"narang","year":"2017","journal-title":"arXiv 1704 05119"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304076"},{"key":"ref33","article-title":"SqueezeNet: AlexNet-level accuracy with 50&#x00D7; fewer parameters and <0.5 MB model size","author":"iandola","year":"2016","journal-title":"arXiv 1602 07360"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.2976475"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ASPDAC.2017.7858395"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2890150"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00011"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1145\/3186332"},{"key":"ref36","article-title":"Quantizing deep convolutional networks for efficient inference: A whitepaper","author":"krishnamoorthi","year":"2018","journal-title":"arXiv 1806 08342"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2019.8916327"},{"key":"ref35","author":"moroney","year":"2018","journal-title":"Introducing Ragged Tensors"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2013.2297439"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062259"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2017.2682138"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304041"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322255"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00069"},{"key":"ref189","first-page":"67","article-title":"?NN: Power-efficient neural network acceleration using differential weights","volume":"40","author":"mahdiani","year":"2019","journal-title":"IEEE Micro"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1145\/3358174"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00027"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"ref28","first-page":"2074","article-title":"Learning structured sparsity in deep neural networks","author":"wen","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref27","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding","author":"han","year":"2015","journal-title":"arXiv 1510 00149 [cs]"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054275"},{"key":"ref29","first-page":"1","article-title":"The lottery ticket hypothesis: Finding sparse, trainable neural networks","author":"frankle","year":"2018","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIC.2018.8502276"},{"key":"ref24","author":"amodei","year":"2018","journal-title":"AI and Compute"},{"key":"ref23","article-title":"DNN dataflow choice is overrated","author":"yang","year":"2018","journal-title":"arXiv 1809 04070"},{"key":"ref26","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"J Mach Learn Res"},{"key":"ref25","first-page":"1135","article-title":"Learning both weights and connections for efficient neural network","author":"han","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-06486-4_7"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-100560-3.50010-X"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1145\/1961189.1961199"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-41321-1_4"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898718003"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.39"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1145\/1583991.1584053"},{"key":"ref151","article-title":"Non-structured DNN weight pruning&#x2014;Is it beneficial in any platform?","author":"ma","year":"2019","journal-title":"arXiv 1907 02124"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1145\/2833179.2833183"},{"key":"ref147","article-title":"An investigation of sparse tensor formats for tensor libraries","author":"tew","year":"2016"},{"key":"ref148","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014","journal-title":"arXiv 1409 1556"},{"key":"ref149","first-page":"533","article-title":"EVA&#x00B2;: Exploiting temporal redundancy in live computer vision","author":"buckler","year":"2018","journal-title":"Proc ACM\/IEEE 45th Annu Int Symp Comput Archit (ISCA)"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080215"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682791"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358252"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3358198"},{"key":"ref55","article-title":"Apparatus, methods, and systems with a configurable spatial accelerator","author":"fleming","year":"2019"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/2847263.2847276"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00012"},{"key":"ref52","article-title":"Xilinx DNN processor (xDNN), accelerating AI in datacenters","author":"khan","year":"2018"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.643"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358318"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1145\/3174243.3174253"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124552"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2019.2946771"},{"key":"ref163","year":"0","journal-title":"CUDA Sparse Matrix Library (CuSPARSE)"},{"key":"ref162","author":"bader","year":"2015","journal-title":"MATLAB Tensor Toolbox Version 2 6"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295712"},{"key":"ref160","volume":"1","author":"vuduc","year":"2003","journal-title":"Automatic performance tuning of sparse matrix kernels"},{"key":"ref4","article-title":"YOLOv3: An incremental improvement","author":"redmon","year":"2018","journal-title":"arXiv 1804 02767"},{"key":"ref3","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref5","article-title":"Rethinking atrous convolution for semantic image segmentation","author":"chen","year":"2017","journal-title":"arXiv 1706 05587"},{"key":"ref8","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume":"1","author":"devlin","year":"2019","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1137\/060676489"},{"key":"ref7","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref49","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc of USENIX Symp on Operating Systems Design and Implementation (OSDI)"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1145\/1183401.1183444"},{"key":"ref9","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"arXiv 2005 14165"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2012.6408676"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"292","DOI":"10.3390\/electronics8030292","article-title":"A state-of-the-art survey on deep learning theory and architectures","volume":"8","author":"alom","year":"2019","journal-title":"Electronics"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3234150"},{"key":"ref48","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref47","article-title":"Representation learning on graphs: Methods and applications","author":"hamilton","year":"2017","journal-title":"arXiv 1709 05584"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001163"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783723"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICTAI.2019.00197"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001138"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref70","article-title":"The state of sparsity in deep neural networks","author":"gale","year":"2019","journal-title":"arXiv 1902 09574"},{"key":"ref76","article-title":"Speeding up convolutional neural networks by exploiting the sparsity of rectifier units","author":"shi","year":"2017","journal-title":"arXiv 1704 07724"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00009"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001165"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00725"},{"key":"ref78","article-title":"SpAtten: Efficient sparse attention architecture with cascade token and head pruning","author":"wang","year":"2020","journal-title":"arXiv 2012 09852"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00060"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2911674"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.2979965"},{"key":"ref61","author":"krashinsky","year":"2020","journal-title":"NVIDIA Ampere Architecture In-Depth"},{"key":"ref63","article-title":"Hierarchical block sparse neural networks","author":"vooturi","year":"2018","journal-title":"arXiv 1808 03420"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01147"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2019.2924215"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2019.8662302"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01464"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021745"},{"key":"ref69","article-title":"To prune, or not to prune: Exploring the efficacy of pruning for model compression","author":"zhu","year":"2017","journal-title":"arXiv 1710 01878"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404790"},{"key":"ref198","article-title":"End to end learning for self-driving cars","author":"bojarski","year":"2016","journal-title":"arXiv 1604 07316 [cs]"},{"key":"ref199","first-page":"173","article-title":"Deep speech 2: End-to-end speech recognition in English and Mandarin","author":"amodei","year":"2016","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00052"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00061"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2018.8342010"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1145\/3205289.3205295"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref94","first-page":"10","article-title":"1.1 Computing&#x2019;s energy problem (and what we can do about it)","author":"horowitz","year":"2014","journal-title":"IEEE Int Solid-State Circuits Conf (ISSCC) Dig Tech Papers"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00020"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358309"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/PROC.1977.10514"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317749"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.3014632"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1145\/3373087.3375312"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00062"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00016"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00908"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00022"},{"key":"ref84","first-page":"1","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2018","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989161"},{"key":"ref80","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","author":"radford","year":"2015","journal-title":"arXiv 1511 06434"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00079"},{"key":"ref85","article-title":"Training recommender systems at scale: Communication-efficient model and data parallelism","author":"gupta","year":"2020","journal-title":"arXiv 2010 08899"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"ref87","article-title":"Understanding training efficiency of deep learning recommendation models at scale","author":"acun","year":"2020","journal-title":"arXiv 2011 05497"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00012"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.789"},{"key":"ref101","article-title":"A study of BFLOAT16 for deep learning training","author":"kalamkar","year":"2019","journal-title":"arXiv 1905 12322"},{"key":"ref100","author":"warden","year":"2015","journal-title":"Why are 8 bits enough for deep neural networks"},{"key":"ref209","first-page":"1581","article-title":"The polyhedron model","author":"paul","year":"2011","journal-title":"Encyclopedia of Parallel Computing"},{"key":"ref203","year":"2020","journal-title":"Understanding Memory Formats Intel MKL-DNN"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"ref201","article-title":"One weird trick for parallelizing convolutional neural networks","author":"krizhevsky","year":"2014","journal-title":"arXiv 1404 5997"},{"key":"ref202","article-title":"Neural network distiller: A Python package for DNN compression research","author":"zmora","year":"2019","journal-title":"arXiv 1910 12232"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1145\/2185520.2185528"},{"key":"ref208","article-title":"MLIR: A compiler infrastructure for the end of Moore&#x2019;s law","author":"lattner","year":"2020","journal-title":"arXiv 2002 11054"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"ref206","first-page":"1","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","author":"chen","year":"2018","journal-title":"Proc of USENIX Symp on Operating Systems Design and Implementation (OSDI)"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.17"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15582-6_49"},{"key":"ref212","article-title":"Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions","author":"vasilache","year":"2018","journal-title":"arXiv 1802 04730"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1145\/3211346.3211354"},{"key":"ref214","article-title":"XLA: TensorFlow, compiled","author":"leary","year":"2017"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1142\/S0129626412500107"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694364"},{"key":"ref218","first-page":"17","article-title":"AlphaZ: A system for design space exploration in the polyhedral model","author":"yuki","year":"2012","journal-title":"Proc Int Workshop Lang Compil Parallel Comput"},{"key":"ref219","article-title":"Chill: A framework for composing high-level loop transformations","author":"chen","year":"2008"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-11970-5_16"},{"key":"ref222","article-title":"PENCIL language specification","author":"baghdadi","year":"2015"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542301"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356506"},{"key":"ref227","article-title":"Improving locality and parallelism in nested loops","author":"wolf","year":"1992"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1007\/BF01407931"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1145\/2908080.2908105"},{"key":"ref224","article-title":"DLVM: A modern compiler infrastructure for deep learning systems","author":"wei","year":"2017","journal-title":"arXiv 1711 03016"},{"key":"ref223","first-page":"1","article-title":"Scalable polyhedral compilation, syntax vs. semantics: 1&#x2013;0 in the first round","author":"baghdadi","year":"2020","journal-title":"Proc IMPACT Workshop Associated With HIPEAC"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545462"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2019.2930057"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1145\/3174243.3174261"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00013"},{"key":"ref129","first-page":"1","article-title":"Stitch-X: An accelerator architecture for exploiting unstructured sparsity in deep neural networks","author":"lee","year":"2018","journal-title":"Proc SysML Conf"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2017.2778281"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/MDAT.2017.2741463"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2018.2841824"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952679"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322214"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330385"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2006.37"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3322967"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1145\/1356052.1356053"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2009.18"},{"key":"ref239","article-title":"A hardware-software blueprint for flexible deep learning specialization","author":"moreau","year":"2018","journal-title":"arXiv 1807 04188"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1145\/3314221.3314597"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2004.1281665"},{"key":"ref234","first-page":"4505","article-title":"Ithemal: Accurate, portable and fast basic block throughput estimation using deep neural networks","author":"mendis","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1145\/3331469"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080244"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392751"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392749"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1145\/3276493"},{"key":"ref140","author":"smith","year":"2017","journal-title":"Frostt File Format"},{"key":"ref141","year":"2013","journal-title":"Matrix Market Exchange Formats"},{"key":"ref142","article-title":"SciPy: Open source scientific tools for Python","author":"jones","year":"2001"},{"key":"ref143","article-title":"SPARSKIT: A basic tool kit for sparse matrix computations","author":"saad","year":"1990"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2008.4536313"},{"key":"ref1","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref145","first-page":"1","article-title":"Model-based memory hierarchy optimizations for sparse matrices","volume":"139","author":"im","year":"1998","journal-title":"Proc Workshop Profile Feedback-Directed Compilation"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00821"},{"key":"ref242","first-page":"2178","article-title":"Automatic neural network compression by sparsity-quantization joint learning: A constrained optimization-based approach","author":"yang","year":"2019","journal-title":"Proc IEEE Conf Comput Vis and Pattern Recog"},{"key":"ref243","first-page":"1","article-title":"Co-design of deep neural nets and neural net accelerators for embedded vision applications","author":"kwon","year":"2018","journal-title":"Proc 55th ACM\/ESDA\/IEEE Design Automat Conf (DAC)"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3243479"},{"key":"ref240","article-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems","author":"chen","year":"2015","journal-title":"arXiv 1512 01274"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2018.8342009"},{"key":"ref247","first-page":"784","article-title":"AMC: AutoML for model compression and acceleration on mobile devices","author":"he","year":"2018","journal-title":"Proc Eur Conf Comput Vis (ECCV)"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218596"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI.2019.00014"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1145\/3195970.3196116"},{"key":"ref109","first-page":"246","article-title":"14.5 Envision: A 0.26-to-10 TOPS\/W subword-parallel dynamic-voltage-accuracy-frequency-scalable convolutional neural network processor in 28 nm FDSOI","author":"moons","year":"2017","journal-title":"IEEE Int Solid-State Circuits Conf (ISSCC) Dig Tech Papers"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123982"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.23919\/VLSIC.2019.8778193"},{"key":"ref106","article-title":"Faster neural network training with approximate tensor operations","author":"adelman","year":"2018","journal-title":"arXiv 1805 08079"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/2644865.2541967"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953288"},{"key":"ref102","first-page":"6869","article-title":"Quantized neural networks: Training neural networks with low precision weights and activations","volume":"18","author":"hubara","year":"2017","journal-title":"J Mach Learn Res"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1145\/3358178"},{"key":"ref112","article-title":"Cnvlutin2: Ineffectual-activation-and-weight-free deep neural network computing","author":"judd","year":"2017","journal-title":"arXiv 1705 00125"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358276"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240801"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00033"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228584"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358292"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1145\/3289602.3293910"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD45719.2019.8942127"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750386"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783759"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173193"},{"key":"ref10","first-page":"1","article-title":"Generative adversarial nets","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541961"},{"key":"ref11","article-title":"Deep learning inference in Facebook data centers: Characterization, performance optimizations and hardware implications","author":"park","year":"2018","journal-title":"arXiv 1811 09886"},{"key":"ref12","article-title":"Deep learning recommendation model for personalization and recommendation systems","author":"naumov","year":"2019","journal-title":"arXiv 1906 00091"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2017.07.005"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00054"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2982416"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2019.2904897"},{"key":"ref118","first-page":"1","article-title":"SparseCore: An accelerator for structurally sparse CNNs","author":"chole","year":"2018","journal-title":"Proc SysML Conf"},{"key":"ref17","article-title":"Benchmarking TinyML systems: Challenges and direction","author":"banbury","year":"2020","journal-title":"arXiv 2003 04821"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIC.2018.8502404"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC19947.2020.9063049"},{"key":"ref19","article-title":"Designing computer systems for software 2.0","author":"olukotun","year":"0"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2017.2714667"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/DSD.2018.00070"},{"key":"ref113","first-page":"137","article-title":"An efficient kernel transformation architecture for binary- and ternary-weight neural network inference","author":"zheng","year":"2018","journal-title":"Proc 55th Annu Design Automat Conf"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358291"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"ref120","year":"2018","journal-title":"NVIDIA Deep Learning Accelerator"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2852335"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1145\/3005448"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/DAC.2018.8465842"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00017"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2015.46"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.10.013"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1109\/TBCAS.2020.2974154"},{"key":"ref264","first-page":"1997","article-title":"Neural architecture search: A survey","volume":"20","author":"elsken","year":"2019","journal-title":"J Mach Learn Res"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2765695"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.1145\/3309551"}],"container-title":["Proceedings of the IEEE"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/5\/9540802\/9507542-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5\/9540802\/09507542.pdf?arnumber=9507542","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,8]],"date-time":"2022-04-08T18:55:54Z","timestamp":1649444154000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9507542\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10]]},"references-count":274,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/jproc.2021.3098483","relation":{},"ISSN":["0018-9219","1558-2256"],"issn-type":[{"value":"0018-9219","type":"print"},{"value":"1558-2256","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,10]]}}}