{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:27:32Z","timestamp":1768030052609,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T00:00:00Z","timestamp":1708387200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3,2]]},"DOI":"10.1145\/3627535.3638489","type":"proceedings-article","created":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T14:22:41Z","timestamp":1708438961000},"page":"390-403","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Fast Kronecker Matrix-Matrix Multiplication on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4849-6776","authenticated-orcid":false,"given":"Abhinav","family":"Jangda","sequence":"first","affiliation":[{"name":"Microsoft Research, Redmond, Washington, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7487-6811","authenticated-orcid":false,"given":"Mohit","family":"Yadav","sequence":"additional","affiliation":[{"name":"Manning College of Information &amp; Computer Sciences, University of Massachusetts Amherst, Amherst, Massachusetts, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,2,20]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Accessed: 2022-07-30. NVIDIA cuBLAS. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_2_1_2_1","unstructured":"Accessed: 2023-07-30. NVIDIA cuTLASS: CUDA Templates for Linear Algebra Subroutines. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_1_3_1","unstructured":"Accessed: 2023-07-30. NVIDIA NCCL: Optimized primitives for collective multi-GPU communication. https:\/\/github.com\/NVIDIA\/nccl."},{"key":"e_1_3_2_1_4_1","unstructured":"Accessed: 2023-07-30. UCI ML Dataset. https:\/\/archive.ics.uci.edu\/datasets."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818311"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.04900"},{"key":"e_1_3_2_1_7_1","volume-title":"Advances in Neural Information Processing Systems","author":"Bonilla Edwin V","year":"2007","unstructured":"Edwin V Bonilla, Kian Chai, and Christopher Williams. 2007. Multitask Gaussian Process Prediction. In Advances in Neural Information Processing Systems, J. Platt, D. Koller, Y. Singer, and S. Roweis (Eds.). Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2007\/file\/66368270ffd51418ec58bd793f2d9b1b-Paper.pdf"},{"key":"e_1_3_2_1_8_1","unstructured":"Lynn Elliot Cannon. 1969. A Cellular Computer to Implement the Kalman Filter Algorithm. Ph. D. Dissertation. USA."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1080\/10618600.2022.2134873"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1981.6312174"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1137\/140980326"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.80"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2851141.2851157"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (Proceedings of Machine Learning Research","author":"Gardner Jacob","year":"2018","unstructured":"Jacob Gardner, Geoff Pleiss, Ruihan Wu, Kilian Weinberger, and Andrew Wilson. 2018. Product Kernel Interpolation for Scalable Gaussian Processes. In Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (Proceedings of Machine Learning Research, Vol. 84). PMLR. https:\/\/proceedings.mlr.press\/v84\/gardner18a.html"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems","author":"Gardner Jacob R.","year":"2018","unstructured":"Jacob R. Gardner, Geoff Pleiss, David Bindel, Kilian Q. Weinberger, and Andrew Gordon Wilson. 2018. GPyTorch: Blackbox Matrix-Matrix Gaussian Process Inference with GPU Acceleration. In Proceedings of the 32nd International Conference on Neural Information Processing Systems (Montr\u00e9al, Canada) (NIPS'18). 11 pages."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2016.51"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pcbi.1008970"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.4705"},{"key":"e_1_3_2_1_19_1","volume-title":"Lyakh","author":"Hynninen Antti-Pekka","year":"2017","unstructured":"Antti-Pekka Hynninen and Dmitry I. Lyakh. 2017. cuTT: A High-Performance Tensor Transpose Library for CUDA Compatible GPUs. CoRR abs\/1705.01598 (2017). arXiv:1705.01598"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","unstructured":"Abhinav Jangda. 2023. (Artifact) Fast Kronecker Matrix Multiplication on GPUs. (12 2023). 10.6084\/m9.figshare.24803229.v1","DOI":"10.6084\/m9.figshare.24803229.v1"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_22_1","volume-title":"Kronecker Recurrent Units. CoRR abs\/1705.10142","author":"Jose Cijo","year":"2017","unstructured":"Cijo Jose, Moustapha Ciss\u00e9, and Fran\u00e7ois Fleuret. 2017. Kronecker Recurrent Units. CoRR abs\/1705.10142 (2017). http:\/\/arxiv.org\/abs\/1705.10142"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3205289.3205296"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661182"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356181"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cam.2003.10.010"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/263580.263591"},{"key":"e_1_3_2_1_28_1","volume-title":"Kronecker Graphs: An Approach to Modeling Networks. J. Mach. Learn. Res. 11","author":"Leskovec Jure","year":"2010","unstructured":"Jure Leskovec, Deepayan Chakrabarti, Jon Kleinberg, Christos Faloutsos, and Zoubin Ghahramani. 2010. Kronecker Graphs: An Approach to Modeling Networks. J. Mach. Learn. Res. 11 (2010)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2014.12.013"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-011-0179-2"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1137\/16M108968X"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2015.106"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460945.3464955"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 35th International Conference on Machine Learning (Proceedings of Machine Learning Research). PMLR. https:\/\/proceedings.mlr.press\/v80\/pleiss18a.html","author":"Pleiss Geoff","year":"2018","unstructured":"Geoff Pleiss, Jacob Gardner, Kilian Weinberger, and Andrew Gordon Wilson. 2018. Constant-Time Predictive Distributions for Gaussian Processes. In Proceedings of the 35th International Conference on Machine Learning (Proceedings of Machine Learning Research). PMLR. https:\/\/proceedings.mlr.press\/v80\/pleiss18a.html"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.36"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/3206.001.0001"},{"key":"e_1_3_2_1_37_1","volume-title":"Williams","author":"Rasmussen Carl Edward","year":"2006","unstructured":"Carl Edward Rasmussen and Christopher K. I. Williams. 2006. Gaussian processes for machine learning. MIT Press."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2021.02.013"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2016.031"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","unstructured":"Edgar Solomonik and James Demmel. 2011. Communication-Optimal Parallel 2.5D Matrix Multiplication and LU Factorization Algorithms. In Euro-Par 2011 Parallel Processing. 10.1007\/978-3-642-23397-5_10","DOI":"10.1007\/978-3-642-23397-5_10"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2014.06.002"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3157733"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2935323.2935328"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3091966.3091968"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of Machine Learning and Systems 2021","author":"Thakker Urmish","year":"2021","unstructured":"Urmish Thakker, Paul N. Whatmough, Zhi Gang Liu, Matthew Mattina, and Jesse G. Beu. 2021. Doping: A technique for Extreme Compression of LSTM Models using Sparse Structured Additive Matrices. In Proceedings of Machine Learning and Systems 2021, MLSys 2021, Alex Smola, Alex Dimakis, and Ion Stoica (Eds.). mlsys.org."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1002\/(SICI)1096-9128(199704)9:4"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2764454"},{"key":"e_1_3_2_1_48_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. arXiv:1802.04730 [cs.PL]","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zachary DeVito, William S. Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. arXiv:1802.04730 [cs.PL]"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2018.00067"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-021-06127-y"},{"key":"e_1_3_2_1_51_1","unstructured":"Andrew Gordon Wilson Christoph Dann and Hannes Nickisch. 2015. Thoughts on Massively Scalable Gaussian Processes. arXiv:1511.01870 [cs.LG]"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 32nd International Conference on International Conference on Machine Learning -","volume":"37","author":"Wilson Andrew Gordon","year":"2015","unstructured":"Andrew Gordon Wilson and Hannes Nickisch. 2015. Kernel Interpolation for Scalable Structured Gaussian Processes (KISS-GP). In Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37 (Lille, France) (ICML'15)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523437"}],"event":{"name":"PPoPP '24: 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Edinburgh United Kingdom","acronym":"PPoPP '24","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGPLAN ACM Special Interest Group on Programming Languages"]},"container-title":["Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627535.3638489","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627535.3638489","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:27Z","timestamp":1750182567000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627535.3638489"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,20]]},"references-count":53,"alternative-id":["10.1145\/3627535.3638489","10.1145\/3627535"],"URL":"https:\/\/doi.org\/10.1145\/3627535.3638489","relation":{},"subject":[],"published":{"date-parts":[[2024,2,20]]},"assertion":[{"value":"2024-02-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}