{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T15:02:38Z","timestamp":1773414158393,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,28]],"date-time":"2023-10-28T00:00:00Z","timestamp":1698451200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,28]]},"DOI":"10.1145\/3613424.3614284","type":"proceedings-article","created":{"date-parts":[[2023,12,8]],"date-time":"2023-12-08T12:22:15Z","timestamp":1702038135000},"page":"1332-1346","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["A Tensor Marshaling Unit for Sparse Tensor Algebra on General-Purpose Processors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2782-837X","authenticated-orcid":false,"given":"Marco","family":"Siracusa","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center, Spain and Universitat Polit\u00e8cnica de Catalunya, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8337-6326","authenticated-orcid":false,"given":"V\u00edctor","family":"Soria-Pardos","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Spain and Universitat Polit\u00e8cnica de Catalunya, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8319-7491","authenticated-orcid":false,"given":"Francesco","family":"Sgherzi","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Spain and Universitat Polit\u00e8cnica de Catalunya, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5154-8688","authenticated-orcid":false,"given":"Joshua","family":"Randall","sequence":"additional","affiliation":[{"name":"Arm, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7701-015X","authenticated-orcid":false,"given":"Douglas J.","family":"Joseph","sequence":"additional","affiliation":[{"name":"Samsung, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9848-8758","authenticated-orcid":false,"given":"Miquel","family":"Moret\u00f3 Planas","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Spain and Universitat Polit\u00e8cnica de Catalunya, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2869-668X","authenticated-orcid":false,"given":"Adri\u00e0","family":"Armejach","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Spain and Universitat Polit\u00e8cnica de Catalunya, Spain"}]}],"member":"320","published-online":{"date-parts":[[2023,12,8]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg\u00a0S. Corrado Andy Davis Jeffrey Dean Matthieu Devin Sanjay Ghemawat Ian Goodfellow Andrew Harp Geoffrey Irving Michael Isard Yangqing Jia Rafal Jozefowicz Lukasz Kaiser Manjunath Kudlur Josh Levenberg Dandelion Man\u00e9 Rajat Monga Sherry Moore Derek Murray Chris Olah Mike Schuster Jonathon Shlens Benoit Steiner Ilya Sutskever Kunal Talwar Paul Tucker Vincent Vanhoucke Vijay Vasudevan Fernanda Vi\u00e9gas Oriol Vinyals Pete Warden Martin Wattenberg Martin Wicke Yuan Yu and Xiaoqiang Zheng. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. https:\/\/www.tensorflow.org\/ Software available from tensorflow.org."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926254"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3296957.3173189"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00012"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447818.3460368"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611971538"},{"key":"e_1_3_2_1_7_1","unstructured":"Scott Beamer Krste Asanovi\u0107 and David Patterson. 2015. The GAP Benchmark Suite."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00118"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00014"},{"key":"e_1_3_2_1_10_1","unstructured":"M. Brandt J. Brooks M. Cahir T. Hewitt E. LopezPineda and D. Sandness. [n. d.]. The Benchmarker\u2019s Guide for CRAY SV1 Systems. https:\/\/parallel.ru\/sites\/default\/files\/ftp\/computers\/cray\/sv1_bmguide.pdf."},{"key":"e_1_3_2_1_11_1","unstructured":"Benjamin Brock Scott McMillan Ayd\u0131n Bulu\u00e7 Timothy Mattson and Jos\u00e9 Moreira. 2022. GraphBLAS C++ Specification. https:\/\/github.com\/GraphBLAS\/graphblas-api-cpp."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2008.4536313"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3276493"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358276"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2021.3098483"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2049662.2049663"},{"key":"e_1_3_2_1_17_1","volume-title":"SAND2013-4744 312","author":"Dongarra Jack","year":"2013","unstructured":"Jack Dongarra and Michael\u00a0A Heroux. 2013. Toward a new metric for ranking high performance computing systems. Sandia Report, SAND2013-4744 312 (2013), 150."},{"key":"e_1_3_2_1_18_1","unstructured":"Jack Dongarra Piotr Luszczek and M Heroux. 2013. HPCG technical specification. Sandia National Laboratories Sandia Report SAND2013-8752 (2013)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2002.1003586"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571157"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1101\/464420"},{"key":"e_1_3_2_1_22_1","volume-title":"Parallel String Graph Construction and Transitive Reduction for De Novo Genome Assembly. In 2020 IEEE International Parallel and Distributed Processing Symposium.","author":"Guidi Giulia","year":"2020","unstructured":"Giulia Guidi, Oguz Selvitopi, Marquita Ellis, Leonid Oliker, Katherine Yelick, and Aydin Buluc. 2020. Parallel String Graph Construction and Transitive Reduction for De Novo Genome Assembly. In 2020 IEEE International Parallel and Distributed Processing Symposium."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"e_1_3_2_1_25_1","article-title":"Sparsity in Deep Learning: Pruning and Growth for Efficient Inference and Training in Neural Networks","volume":"22","author":"Hoefler Torsten","year":"2021","unstructured":"Torsten Hoefler, Dan Alistarh, Tal Ben-Nun, Nikoli Dryden, and Alexandra Peste. 2021. Sparsity in Deep Learning: Pruning and Growth for Efficient Inference and Training in Neural Networks. J. Mach. Learn. Res. 22, 1, Article 241 (jan 2021), 124\u00a0pages.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582051"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW55747.2022.00058"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358286"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898719918"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3205289.3205296"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_32_1","volume-title":"Meet the walkers accelerating index traversals for in-memory databases. In 2013 46th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 468\u2013479","author":"Kocberber Onur","year":"2013","unstructured":"Onur Kocberber, Boris Grot, Javier Picorel, Babak Falsafi, Kevin Lim, and Parthasarathy Ranganathan. 2013. Meet the walkers accelerating index traversals for in-memory databases. In 2013 46th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 468\u2013479."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/977395.977673"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.aam9744"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441581"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2015.230"},{"key":"e_1_3_2_1_37_1","unstructured":"J. Lowe-Power A.\u00a0Mutaal Ahmad A. Akram M. Alian R. Amslinger M. Andreozzi A. Armejach N. Asmussen B. Beckmann S. Bharadwaj G. Black G. Bloom B.\u00a0R. Bruce D.\u00a0Rodrigues Carvalho J. Castrillon L. Chen N. Derumigny S. Diestelhorst W. Elsasser C. Escuin M. Fariborz A. Farmahini-Farahani P. Fotouhi R. Gambord J. Gandhi D. Gope T. Grass A. Gutierrez B. Hanindhito A. Hansson S. Haria A. Harris T. Hayes A. Herrera M. Horsnell S.\u00a0A.\u00a0R. Jafri R. Jagtap H. Jang R. Jeyapaul T.\u00a0M. Jones M. Jung S. Kannoth H. Khaleghzadeh Y. Kodama T. Krishna T. Marinelli C. Menard A. Mondelli M. Moreto T. M\u00fcck O. Naji K. Nathella H. Nguyen N. Nikoleris L.\u00a0E. Olson M. Orr B. Pham P. Prieto T. Reddy A. Roelke M. Samani A. Sandberg J. Setoain B. Shingarov M.\u00a0D. Sinclair T. Ta R. Thakur G. Travaglini M. Upton N. Vaish I. Vougioukas W. Wang Z. Wang N. Wehn C. Weis D.\u00a0A. Wood H. Yoon and \u00c9.\u00a0F. Zulian. 2020. The gem5 Simulator: Version 20.0+. arxiv:2007.03152\u00a0[cs.AR]"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508430"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00010"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3229710.3229720"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.6570"},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2023. Best Practices for Building and Deploying Recommender Systems. https:\/\/docs.nvidia.com\/deeplearning\/performance\/recsys-best-practices\/index.html"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00067"},{"key":"e_1_3_2_1_44_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32. Curran Associates, Inc., 8024\u20138035. http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2972222"},{"key":"e_1_3_2_1_46_1","unstructured":"et\u00a0al. Phipps Eric. [n. d.]. Genten: Software for Generalized Tensor Decompositions by Sandia National Laboratories. https:\/\/gitlab.com\/tensors\/genten\/."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1137\/18M1210691"},{"key":"e_1_3_2_1_48_1","volume-title":"Introduction to Tensor Decompositions and their Applications in Machine Learning. ArXiv abs\/1711.10781","author":"Rabanser Stephan","year":"2017","unstructured":"Stephan Rabanser, Oleksandr Shchur, and Stephan G\u00fcnnemann. 2017. Introduction to Tensor Decompositions and their Applications in Machine Learning. ArXiv abs\/1711.10781 (2017)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480047"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433763"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00079"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3428226"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2017.2690524"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3111761"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2020.3012318"},{"key":"e_1_3_2_1_56_1","volume-title":"FROSTT: The Formidable Repository of Open Sparse Tensors and Tools","author":"Smith Shaden","year":"2017","unstructured":"Shaden Smith, Jee\u00a0W. Choi, Jiajia Li, Richard Vuduc, Jongsoo Park, Xing Liu, and George Karypis. 2017. FROSTT: The Formidable Repository of Open Sparse Tensors and Tools. http:\/\/frostt.io\/"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2833179.2833183"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00068"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.vlsi.2017.02.002"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00061"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527437"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3430936"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.485896"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/1362622.1362674"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00087"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830807"},{"key":"e_1_3_2_1_68_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, and Amr Ahmed. 2020. Big Bird: Transformers for Longer Sequences. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS\u201920). Curran Associates Inc., Red Hook, NY, USA, Article 1450, 15\u00a0pages."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446702"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/12.966490"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00030"}],"event":{"name":"MICRO '23: 56th Annual IEEE\/ACM International Symposium on Microarchitecture","location":"Toronto ON Canada","acronym":"MICRO '23","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["56th Annual IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613424.3614284","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613424.3614284","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:19:25Z","timestamp":1755890365000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613424.3614284"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,28]]},"references-count":71,"alternative-id":["10.1145\/3613424.3614284","10.1145\/3613424"],"URL":"https:\/\/doi.org\/10.1145\/3613424.3614284","relation":{},"subject":[],"published":{"date-parts":[[2023,10,28]]},"assertion":[{"value":"2023-12-08","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}