{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T11:59:11Z","timestamp":1774353551457,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","funder":[{"name":"US Department of Energy","award":["DE-AC05-00OR22725"],"award-info":[{"award-number":["DE-AC05-00OR22725"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759877","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"281-297","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Scaling the memory wall using mixed-precision - HPG-MxP on an exascale machine"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2589-3792","authenticated-orcid":false,"given":"Aditya","family":"Kashi","sequence":"first","affiliation":[{"name":"National Center for Computational Sciences, Oak Ridge National Laboratory (ORNL), Oak Ridge, TN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5384-9741","authenticated-orcid":false,"given":"Nicholson","family":"Koukpaizan","sequence":"additional","affiliation":[{"name":"National Center for Computational Sciences, Oak Ridge National Laboratory (ORNL), Oak Ridge, TN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8941-870X","authenticated-orcid":false,"given":"Hao","family":"Lu","sequence":"additional","affiliation":[{"name":"National Center for Computational Sciences, Oak Ridge National Laboratory (ORNL), Oak Ridge, TN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1512-5255","authenticated-orcid":false,"given":"Michael","family":"Matheson","sequence":"additional","affiliation":[{"name":"National Center for Computational Sciences, Oak Ridge National Laboratory (ORNL), Oak Ridge, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8745-7078","authenticated-orcid":false,"given":"Sarp","family":"Oral","sequence":"additional","affiliation":[{"name":"National Center for Computational Sciences, Oak Ridge National Laboratory (ORNL), Oak Ridge, TN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0099-1559","authenticated-orcid":false,"given":"Feiyi","family":"Wang","sequence":"additional","affiliation":[{"name":"National Center for Computational Sciences, Oak Ridge National Laboratory (ORNL), Oak Ridge, TN, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_3_2_2","volume-title":"NVIDIA Blackwell architecture technical brief","year":"2024","unstructured":"2024. NVIDIA Blackwell architecture technical brief. Technical Report. NVIDIA. version 1.1."},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","DOI":"10.2172\/1814677"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.2172\/2318788"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"publisher","unstructured":"Hartwig Anzt Terry Cojean Goran Flegar Fritz G\u00f6bel Thomas Gr\u00fctzmacher Pratik Nayak Tobias Ribizel Yuhsiang Tsai and Enrique\u00a0S. Quintana-Ort\u00ed. 2022. Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing. ACM Trans. Math. Software 48 1 (March 2022) 1\u201333. 10.1145\/3480935","DOI":"10.1145\/3480935"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","unstructured":"Hartwig Anzt Bj\u00f6rn Rocker and Vincent Heuveline. 2010. Energy efficiency of mixed precision iterative refinement methods using hybrid hardware platforms. Computer Science - Research and Development 25 3 (2010). 10.1007\/s00450-010-0124-2","DOI":"10.1007\/s00450-010-0124-2"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24025-618"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.2172\/2476320"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/1654059.1654078"},{"key":"e_1_3_3_3_10_2","unstructured":"Michele Benzi Wayne Joubert and Gabriel Mateescu. 1999. Numerical experiments with parallel orderings for ILU preconditioners. Electron. Trans. Numer. Anal. 8 (1999) 88\u2013114. https:\/\/etna.math.kent.edu\/volumes\/1993-2000\/vol8\/abstract.php?vol=8%20&pages=88-114"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"crossref","unstructured":"Achi Brandt. 1977. Multi-level adaptive solutions to boundary value problems. Math. Comp. 31 138 (1977) 333\u2013390. https:\/\/www.jstor.org\/stable\/2006422","DOI":"10.1090\/S0025-5718-1977-0431719-X"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","unstructured":"Alfredo Buttari Jack Dongarra Jakub Kurzak Piotr Luszczek and Stanimir Tomov. 2008. Using mixed precision for sparse matrix computations to enhance the performance while achieving 64-bit accuracy. ACM Trans. Math. Software 34 4 (2008). 10.1145\/1377596.1377597","DOI":"10.1145\/1377596.1377597"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","unstructured":"Todd\u00a0T. Chisholm and David\u00a0W. Zingg. 2009. A Jacobian-free Newton\u2013Krylov algorithm for compressible turbulent fluid flows. J. Comput. Phys. 228 9 (2009) 3490\u20133507. 10.1016\/j.jcp.2009.02.004","DOI":"10.1016\/j.jcp.2009.02.004"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","unstructured":"Ewa Deelman Jack Dongarra Bruce Hendrickson Amanda Randles Daniel Reed Edward Seidel and Katherine Yelick. 2025. High-performance computing at a crossroads. Science 387 6736 (2025) 829\u2013831. 10.1126\/science.adu0801","DOI":"10.1126\/science.adu0801"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","unstructured":"Jack Dongarra Michael\u00a0A Heroux and Piotr Luszczek. 2016. High-performance conjugate-gradient benchmark: A new metric for ranking high-performance computing systems. The International Journal of High Performance Computing Applications 30 1 (Feb. 2016) 3\u201310. 10.1177\/1094342015593158","DOI":"10.1177\/1094342015593158"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","unstructured":"L. Giraud J. Langou and M. Rozloznik. 2005. The loss of orthogonality in the Gram-Schmidt orthogonalization process. Computers & Mathematics with Applications 50 7 (2005) 1069\u20131075. 10.1016\/j.camwa.2005.08.009Numerical Methods and Computational Mechanics.","DOI":"10.1016\/j.camwa.2005.08.009"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","unstructured":"William\u00a0D Gropp Dinesh\u00a0K Kaushik David\u00a0E Keyes and Barry\u00a0F Smith. 2001. High-performance parallel implicit CFD. Parallel Comput. 27 4 (2001) 337\u2013362. 10.1016\/S0167-8191(00)00075-2Parallel computing in aerospace.","DOI":"10.1016\/S0167-8191(00)00075-2"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93698-745"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","unstructured":"Mark\u00a0T. Jones and Paul\u00a0E. Plassmann. 1993. A Parallel Graph Coloring Heuristic. SIAM Journal on Scientific Computing 14 3 (1993) 654\u2013669. 10.1137\/0914041 arXiv:https:\/\/doi.org\/10.1137\/0914041","DOI":"10.1137\/0914041"},{"key":"e_1_3_3_3_20_2","unstructured":"Aditya Kashi. 2020. Asynchronous fine-grain parallel iterative solvers for computational fluid dynamics. phdthesis. https:\/\/escholarship.mcgill.ca\/downloads\/2f75rd57s"},{"key":"e_1_3_3_3_21_2","unstructured":"Aditya Kashi Hao Lu Wesley Brewer David Rogers Michael Matheson Mallikarjun Shankar and Feiyi Wang. 2025. Mixed-precision numerics in scientific applications: survey and perspectives. arxiv:https:\/\/arXiv.org\/abs\/2412.19322\u00a0[cs.CE]"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW52791.2021.00078"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","unstructured":"Michael Luby. 1986. A Simple Parallel Algorithm for the Maximal Independent Set Problem. SIAM J. Comput. 15 4 (1986) 1036\u20131053. 10.1137\/0215074","DOI":"10.1137\/0215074"},{"key":"e_1_3_3_3_24_2","volume-title":"Parallel Solution of Sparse Triangular Linear Systems in the Preconditioned Iterative Methods on the GPU","author":"Naumov Maxim","year":"2011","unstructured":"Maxim Naumov. 2011. Parallel Solution of Sparse Triangular Linear Systems in the Preconditioned Iterative Methods on the GPU. Technical Report NVR-2011-001. NVIDIA."},{"key":"e_1_3_3_3_25_2","volume-title":"Parallel graph coloring with applications to the incomplete LU factorization on the GPU","author":"Naumov Maxim","year":"2015","unstructured":"Maxim Naumov, Patrice Castonguay, and J. Cohen. 2015. Parallel graph coloring with applications to the incomplete LU factorization on the GPU. Technical Report NVR-2015-001. NVIDIA."},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898718003"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"crossref","unstructured":"Youcef Saad and Martin\u00a0H. Schulz. 1986. GMRES - A generalized minimal residual algorithm for solving nonsymmetric linear systems. SIAM J. Sci. Statist. Comput. 7 3 (1986).","DOI":"10.1137\/0907058"},{"key":"e_1_3_3_3_28_2","volume-title":"Domain decomposition - parallel multilevel methods for elliptic partial differential equations","author":"Smith B.F.","year":"1996","unstructured":"B.F. Smith, P.E. Bj\u00f8rstad, and W.D. Gropp. 1996. Domain decomposition - parallel multilevel methods for elliptic partial differential equations. Cambridge University Press."},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2012.23"},{"key":"e_1_3_3_3_30_2","unstructured":"Nico Trost. 2023. rocHPCG. https:\/\/github.com\/ROCm\/rocHPCG Advanced Micro Devices inc.."},{"key":"e_1_3_3_3_31_2","unstructured":"T. Washio and C.W. Oosterlee. 1997. Krylov subspace acceleration for nonlinear mulrigrid schemes. Electronic Transactions on Numerical Analysis 6 (Dec. 1997) 271\u2013290."},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS56514.2022.00015"}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759877","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:49:59Z","timestamp":1773254999000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759877"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":31,"alternative-id":["10.1145\/3712285.3759877","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759877","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}