{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,27]],"date-time":"2026-06-27T19:43:37Z","timestamp":1782589417183,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,12]],"date-time":"2019-10-12T00:00:00Z","timestamp":1570838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,12]]},"DOI":"10.1145\/3352460.3358275","type":"proceedings-article","created":{"date-parts":[[2019,10,11]],"date-time":"2019-10-11T11:16:45Z","timestamp":1570792605000},"page":"319-333","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":212,"title":["ExTensor"],"prefix":"10.1145","author":[{"given":"Kartik","family":"Hegde","sequence":"first","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hadi","family":"Asghari-Moghaddam","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Michael","family":"Pellauer","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Neal","family":"Crago","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Aamer","family":"Jaleel","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Edgar","family":"Solomonik","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joel","family":"Emer","sequence":"additional","affiliation":[{"name":"NVIDIA\/MIT"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Christopher W.","family":"Fletcher","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2019,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"OSDI'16","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. 2016. Tensorflow: a system for large-scale machine learning. In OSDI'16."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Seher Acer Oguz Selvitopi and Cevdet Aykanat. 2016. Improving performance of sparse matrix dense matrix multiplication on large-scale parallel systems. Parallel Comput. (2016).","DOI":"10.1016\/j.parco.2016.10.001"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.125"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001138"},{"key":"e_1_3_2_1_5_1","volume-title":"Tensor decompositions for learning latent variable models. The Journal of Machine Learning Research","author":"Anandkumar Animashree","year":"2014","unstructured":"Animashree Anandkumar, Rong Ge, Daniel Hsu, Sham M Kakade, and Matus Telgarsky. 2014. Tensor decompositions for learning latent variable models. The Journal of Machine Learning Research (2014)."},{"key":"e_1_3_2_1_6_1","volume":"201","author":"Azad A.","unstructured":"A. Azad, A. Buluc, and J. Gilbert. 2015. Parallel Triangle Counting and Enumeration Using Matrix Algebra. In IPDPS'15 Workshop.","journal-title":"J. Gilbert."},{"key":"e_1_3_2_1_7_1","volume-title":"Discussion tracking in Enron email using PARAFAC","author":"Bader Brett W","unstructured":"Brett W Bader, Michael W Berry, and Murray Browne. 2008. Discussion tracking in Enron email using PARAFAC. In Survey of Text Mining II. Springer, 147--163."},{"key":"e_1_3_2_1_8_1","volume-title":"Rolf Fagerberg, Riko Jacob, and Elias Vicari.","author":"Bender Michael A","year":"2010","unstructured":"Michael A Bender, Gerth St\u00f8lting Brodal, Rolf Fagerberg, Riko Jacob, and Elias Vicari. 2010. Optimal sparse matrix dense vector multiplication in the I\/O-model. Theory of Computing Systems (2010)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of KDD cup and workshop","author":"Bennett James","year":"2007","unstructured":"James Bennett, Stan Lanning, et al. 2007. The netflix prize. In Proceedings of KDD cup and workshop (2007)."},{"key":"e_1_3_2_1_10_1","unstructured":"L Susan Blackford Antoine Petitet Roldan Pozo Karin Remington R Clint Whaley James Demmel Jack Dongarra Iain Duff Sven Hammarling Greg Henry et al. 2002. An updated set of basic linear algebra subprograms (BLAS). ACM Trans. Math. Software (2002)."},{"key":"e_1_3_2_1_11_1","volume-title":"ISSCC'00","author":"Bohr Mark","year":"2010","unstructured":"Mark Bohr. 2010. The new era of scaling in an SoC world. In ISSCC'00."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Yu-Hsin Chen Joel Emer and Vivienne Sze. 2018. Eyeriss v2: A Flexible and High-Performance Accelerator for Emerging Deep Neural Networks. arXiv:1807.07928","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3276493"},{"key":"e_1_3_2_1_15_1","volume-title":"Mandic","author":"Cichocki Andrzej","year":"2016","unstructured":"Andrzej Cichocki, Namgil Lee, Ivan Oseledets, Anh-Huy Phan, Qibin Zhao, and Danilo P. Mandic. 2016. Tensor Networks for Dimensionality Reduction and Large-scale Optimization: Part 1 Low-Rank Tensor Decompositions. Foundations and Trends in Machine Learning (2016)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Timothy A Davis and Yifan Hu. 2011. The University of Florida sparse matrix collection. (2011). https:\/\/sparse.tamu.edu","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_2_1_17_1","volume-title":"Sparse Matrix-Matrix Multiplication on Multilevel Memory Architectures: Algorithms and Experiments. arXiv preprint arXiv:1804.00695","author":"Deveci Mehmet","year":"2018","unstructured":"Mehmet Deveci, Simon D Hammond, Michael M Wolf, and Sivasankaran Rajamanickam. 2018. Sparse Matrix-Matrix Multiplication on Multilevel Memory Architectures: Algorithms and Experiments. arXiv preprint arXiv:1804.00695 (2018)."},{"key":"e_1_3_2_1_18_1","volume-title":"The new millennium edition: mainly mechanics, radiation, and heat. Basic books.","author":"Feynman Richard P","unstructured":"Richard P Feynman, Robert B Leighton, and Matthew Sands. 2011. The Feynman lectures on physics, Vol. I: The new millennium edition: mainly mechanics, radiation, and heat. Basic books."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2010.121"},{"key":"e_1_3_2_1_20_1","volume-title":"A literature survey of low-rank tensor approximation techniques. GAMM-Mitteilungen","author":"Grasedyck Lars","year":"2013","unstructured":"Lars Grasedyck, Daniel Kressner, and Christine Tobler. 2013. A literature survey of low-rank tensor approximation techniques. GAMM-Mitteilungen (2013)."},{"key":"e_1_3_2_1_21_1","volume":"201","author":"Han Song","unstructured":"Song Han, Xingyu Liu, Huizi Mao, Jing Pu, Ardavan Pedram, Mark A Horowitz, and William J Dally. 2016. EIE: efficient inference engine on compressed deep neural network. In ISCA'16.","journal-title":"William J Dally."},{"key":"e_1_3_2_1_22_1","volume-title":"Deep Residual Learning for Image Recognition. In CVPR'16","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In CVPR'16."},{"key":"e_1_3_2_1_23_1","volume-title":"MICRO'18","author":"Hegde K.","unstructured":"K. Hegde, R. Agrawal, Y. Yao, and C. Fletcher. 2018. Morph: Flexible Acceleration for 3D CNN-basedVideo Understanding. In MICRO'18."},{"key":"e_1_3_2_1_24_1","volume-title":"Technische Universit\u00e4t M\u00fcnchen","author":"Heinecke Alexander Friedrich","year":"2008","unstructured":"Alexander Friedrich Heinecke. 2008. Cache Optimised Data Structures and Algorithms for Sparse Matrices. Bachelorarbeit in Informatik, Technische Universit\u00e4t M\u00fcnchen (2008)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"T. Kolda and B. Bader. 2009. Tensor Decompositions and Applications. SIAM Rev. (2009).","DOI":"10.1137\/07070111X"},{"key":"e_1_3_2_1_28_1","unstructured":"Joseph C Kolecki. 2002. An introduction to tensors for students of physics and engineering. (2002)."},{"key":"e_1_3_2_1_29_1","volume-title":"NIPS'12","author":"Krizhevsky Alex","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. Imagenet classification with deep convolutional neural networks. In NIPS'12."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173176"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"C. Y. Lin Z. Zhang N. Wong and H. K. So. 2010. Design space exploration for sparse matrix-matrix multiplication on FPGAs. In FPT'10.","DOI":"10.1109\/FPT.2010.5681425"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Daofu Liu Tianshi Chen Shaoli Liu Jinhong Zhou Shengyuan Zhou Olivier Teman Xiaobing Feng Xuehai Zhou and Yunji Chen. 2015. PuDianNao: A Polyvalent Machine Learning Accelerator. SIGPLAN Not. (2015).","DOI":"10.1145\/2694344.2694358"},{"key":"e_1_3_2_1_33_1","unstructured":"Kiran Matam Siva Rama Krishna Bharadwaj Indarapu and Kishore Kothapalli. [n. d.]. Sparse matrix-matrix multiplication on modern architectures. In HiPC'12."},{"key":"e_1_3_2_1_34_1","volume-title":"HPEC'13","author":"Mattson T.","unstructured":"T. Mattson, D. Bader, J. Berry, A. Buluc, J. Dongarra, C. Faloutsos, J. Feo, J. Gilbert, J. Gonzalez, B. Hendrickson, J. Kepner, C. Leiserson, A. Lumsdaine, D. Padua, S. Poole, S. Reinhardt, M. Stonebraker, S. Wallach, and A. Yoo. 2013. Standards for graph algorithm primitives. In HPEC'13."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2507157.2507163"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Asit K Mishra Eriko Nurvitadhi Ganesh Venkatesh Jonathan Pearce and Debbie Marr. 2017. Fine-grained accelerators for sparse machine learning workloads. In ASP-DAC'17.","DOI":"10.1109\/ASPDAC.2017.7858395"},{"key":"e_1_3_2_1_37_1","unstructured":"Naveen Muralimanohar and Rajeev Balasubramonian. 2009. CACTI 6.0: A Tool to Understand Large Caches."},{"key":"e_1_3_2_1_38_1","volume-title":"High-performance sparse matrix-matrix products on Intel KNL and multicore architectures. arXiv preprint arXiv:1804.01698","author":"Nagasaka Yusuke","year":"2018","unstructured":"Yusuke Nagasaka, Satoshi Matsuoka, Ariful Azad, and Aydin Buluc. 2018. High-performance sparse matrix-matrix products on Intel KNL and multicore architectures. arXiv preprint arXiv:1804.01698 (2018)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.5555\/2830689.2830704"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.3850\/9783981537079_0766"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00067"},{"key":"e_1_3_2_1_42_1","volume":"201","author":"Parashar Angshuman","unstructured":"Angshuman Parashar, Minsoo Rhu, Anurag Mukkara, Antonio Puglielli, Rangharajan Venkatesan, Brucek Khailany, Joel Emer, Stephen W Keckler, and William J Dally. 2017. Scnn: An accelerator for compressed-sparse convolutional neural networks. In ISCA'17.","journal-title":"William J Dally."},{"key":"e_1_3_2_1_43_1","volume":"201","author":"Pellauer M.","unstructured":"M. Pellauer, Y. Shao, J. Clemons, N. Crago, K. Hegde, R. Venkatesan, S. Keckler, C. Fletcher, and J. Emer. 2019. Buffets: An Efficient and Composable Storage Idiom for Explicit Decoupled Data Orchestration. In ASPLOS'19.","journal-title":"J. Emer."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080256"},{"key":"e_1_3_2_1_45_1","volume-title":"FROSTT: The Formidable Repository of Open Sparse Tensors and Tools","author":"Smith Shaden","year":"2017","unstructured":"Shaden Smith, Jee W. Choi, Jiajia Li, Richard Vuduc, Jongsoo Park, Xing Liu, and George Karypis. 2017. FROSTT: The Formidable Repository of Open Sparse Tensors and Tools. http:\/\/frostt.io\/"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2833179.2833183"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126971"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/1592665.1592675"},{"key":"e_1_3_2_1_49_1","volume-title":"High-Performance Computing on the Intel\u00ae Xeon Phi","author":"Wang Endong","unstructured":"Endong Wang, Qing Zhang, Bo Shen, Guangyong Zhang, Xiaowei Lu, Qing Wu, and Yajuan Wang. 2014. Intel math kernel library. In High-Performance Computing on the Intel\u00ae Xeon Phi. Springer."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"L. Yavits A. Morad and R. Ginosar. 2015. Sparse Matrix Multiplication On An Associative Processor. IEEE Transactions on Parallel and Distributed Systems (2015).","DOI":"10.1109\/TPDS.2014.2370055"},{"key":"e_1_3_2_1_51_1","volume-title":"MICRO'16","author":"Zhang S.","unstructured":"S. Zhang, Z. Du, L. Zhang, H. Lan, S. Liu, L. Li, Q. Guo, T. Chen, and Y. Chen. 2016. Cambricon-X: An accelerator for sparse neural networks. In MICRO'16."},{"key":"e_1_3_2_1_52_1","unstructured":"Huasha Zhao. 2014. High Performance Machine Learning through Codesign and Rooflining. PhD Thesis (2014)."}],"event":{"name":"MICRO '52: The 52nd Annual IEEE\/ACM International Symposium on Microarchitecture","location":"Columbus OH USA","acronym":"MICRO '52","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing","IEEE CS"]},"container-title":["Proceedings of the 52nd Annual IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3352460.3358275","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3352460.3358275","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,29]],"date-time":"2025-07-29T22:24:24Z","timestamp":1753827864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3352460.3358275"}},"subtitle":["An Accelerator for Sparse Tensor Algebra"],"short-title":[],"issued":{"date-parts":[[2019,10,12]]},"references-count":52,"alternative-id":["10.1145\/3352460.3358275","10.1145\/3352460"],"URL":"https:\/\/doi.org\/10.1145\/3352460.3358275","relation":{},"subject":[],"published":{"date-parts":[[2019,10,12]]},"assertion":[{"value":"2019-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}