{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T02:37:08Z","timestamp":1768012628759,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767703","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:20:02Z","timestamp":1762532402000},"page":"1534-1543","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A GPU FFT Wrapper to Co-optimize Floating-Point Precision and Library Selection via Predictive Error Modeling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9032-107X","authenticated-orcid":false,"given":"Julius","family":"Lehner","sequence":"first","affiliation":[{"name":"Technical University of Munich, Munich, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7043-4288","authenticated-orcid":false,"given":"Eishi","family":"Arima","sequence":"additional","affiliation":[{"name":"Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9013-435X","authenticated-orcid":false,"given":"Martin","family":"Schulz","sequence":"additional","affiliation":[{"name":"Technical University of Munich, Garching, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"1985. IEEE standard for binary floating-point arithmetic. ANSI\/IEEE Std 754-1985 (1985) 1\u201320."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","unstructured":"2008. IEEE standard for floating-point arithmetic. IEEE Std 754-2008 (2008) 1\u201370. 10.1109\/IEEESTD.2008.4610935","DOI":"10.1109\/IEEESTD.2008.4610935"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Tahmid Abtahi Colin Shea Amey Kulkarni and Tinoosh Mohsenin. 2018. Accelerating Convolutional Neural Network With FFT on Embedded Hardware. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 26 9 (2018) 1737\u20131749.","DOI":"10.1109\/TVLSI.2018.2825145"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Andrey Alekseenko Szil\u00e1rd P\u00e1ll and Erik Lindahl. 2024. GROMACS on AMD GPU-Based HPC Platforms: Using SYCL for Performance and Portability. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.01420 (2024).","DOI":"10.1145\/3725789.3725797"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"James\u00a0W. Cooley and John\u00a0W. Tukey. 1965. An algorithm for the machine calculation of complex Fourier series. Math. Comp. 19 (1965) 297\u2013301.","DOI":"10.1090\/S0025-5718-1965-0178586-1"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"William\u00a0J. Dally Stephen\u00a0W. Keckler and David\u00a0B. Kirk. 2021. Evolution of the Graphics Processing Unit (GPU). IEEE Micro 41 6 (2021) 42\u201351.","DOI":"10.1109\/MM.2021.3113475"},{"key":"e_1_3_3_1_8_2","unstructured":"ONNX\u00a0Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. Version: 1.21.0."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00019"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"J. Dongarra and F. Sullivan. 2000. Guest Editors Introduction to the top 10 algorithms. Computing in Science & Engineering 2 01 (2000) 22\u201323.","DOI":"10.1109\/MCISE.2000.814652"},{"key":"e_1_3_3_1_11_2","unstructured":"Jack\u00a0J Dongarra Piotr Luszczek and Yaohung\u00a0M Tsai. 2019. HPL-MxP mixed-precision benchmark. https:\/\/hpl-mxp.org\/ Accessed: 05.05.2025."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"M. Frigo and S.G. Johnson. 2005. The Design and Implementation of FFTW3. Proc. IEEE 93 2 (2005) 216\u2013231.","DOI":"10.1109\/JPROC.2004.840301"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/1810085.1810127"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3213846.3213862"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"John\u00a0L Gustafson and Isaac\u00a0T Yonemoto. 2017. Beating floating point at its own game: Posit arithmetic. Supercomputing frontiers and innovations 4 2 (2017) 71\u201386.","DOI":"10.14529\/jsfi170206"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ETS.2013.6569370"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611973099.93"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Wei Hu Xinming Qin Qingcai Jiang Junshi Chen Hong An Weile Jia Fang Li Xin Liu Dexun Chen Fangfang Liu Yuwen Zhao and Jinlong Yang. 2021. High performance computing of DGDFT for tens of thousands of atoms using millions of cores on Sunway TaihuLight. Science Bulletin 66 2 (2021) 111\u2013119.","DOI":"10.1016\/j.scib.2020.06.025"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Michael\u00a0L Katz Nikolaos Karnesis Natalia Korsakova Jonathan\u00a0R Gair and Nikolaos Stergioulas. 2025. Efficient GPU-accelerated multisource global fit pipeline for LISA data analysis. Physical Review D 111 2 (2025) 024060.","DOI":"10.1103\/PhysRevD.111.024060"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Alexander Kunkel Hei Yin\u00a0Jowett Chan Hsi-Yu Schive Hsinhao Huang and Pin-Yu Liao. 2025. A Hybrid Scheme for Fuzzy Dark Matter Simulations Combining the Schr\u00f6dinger and Hamilton\u2013Jacobi\u2013Madelung Equations. The Astrophysical Journal Supplement Series 279 2 (2025) 39.","DOI":"10.3847\/1538-4365\/addc59"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/Correctness49594.2019.00009"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/Cluster48925.2021.00035"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Stefan Mach Fabian Schuiki Florian Zaruba and Luca Benini. 2020. FPnew: An open-source multiformat floating-point unit architecture for energy-proportional transprecision computing. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 29 4 (2020) 774\u2013787.","DOI":"10.1109\/TVLSI.2020.3044752"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Chahak Mehta Amarnath Karthi Vishrut Jetly and Bhaskar Chaudhury. 2021. Parallel Fast Multipole Method accelerated FFT on HPC clusters. Parallel Comput. 104-105 (2021) 102783.","DOI":"10.1016\/j.parco.2021.102783"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00051"},{"key":"e_1_3_3_1_27_2","unstructured":"Paulius Micikevicius Sharan Narang Jonah Alben Gregory Diamos Erich Elsen David Garcia Boris Ginsburg Michael Houston Oleksii Kuchaiev Ganesh Venkatesh and Hao Wu. 2018. Mixed Precision Training. arxiv:https:\/\/arXiv.org\/abs\/1710.03740\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/1710.03740"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Sparsh Mittal. 2016. A survey of techniques for approximate computing. ACM Comput. Surv. 48 4 Article 62 (mar 2016) 33\u00a0pages.","DOI":"10.1145\/2893356"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.5555\/1413370.1413376"},{"key":"e_1_3_3_1_30_2","volume-title":"cuFFTDx Documentation","year":"2024","unstructured":"NVIDIA. 2024. cuFFTDx Documentation. https:\/\/docs.nvidia.com\/cuda\/cufftdx Accessed: 01.09.2024."},{"key":"e_1_3_3_1_31_2","unstructured":"NVIDIA Corporation. 2021. NVIDIA A100 TENSOR CORE GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/ Accessed: 07.08.2025."},{"key":"e_1_3_3_1_32_2","volume-title":"NVIDIA cuFFT Library","author":"Corporation NVIDIA","year":"2023","unstructured":"NVIDIA Corporation. 2023. NVIDIA cuFFT Library. https:\/\/docs.nvidia.com\/cuda\/cufft\/index.html."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2008.4536163"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-97557-8_6"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CPEE.2015.7333365"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356209"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/2503210.2503296"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2017.8167780"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Dmitrii Tolmachev. 2023. VkFFT-A Performant Cross-Platform and Open-Source GPU FFT Library. IEEE Access 11 (2023) 12039\u201312058.","DOI":"10.1109\/ACCESS.2023.3242240"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.95"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3566097.3567885"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Qiang Xu Todd Mytkowicz and Nam\u00a0Sung Kim. 2016. Approximate computing: A survey. IEEE Design & Test 33 1 (2016) 8\u201322.","DOI":"10.1109\/MDAT.2015.2505723"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Caleb\u00a0O Yenusah Nathaniel\u00a0R Morgan Ricardo\u00a0A Lebensohn Miroslav Zecevic and Marko Knezevic. 2024. A parallel and performance portable implementation of a full-field crystal plasticity model. Computer Physics Communications 300 (2024) 109190.","DOI":"10.1016\/j.cpc.2024.109190"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Davide Zoni Andrea Galimberti and William Fornaciari. 2021. An FPU design template to optimize the accuracy-efficiency-area trade-off. Sustainable Computing: Informatics and Systems 29 (2021) 100450.","DOI":"10.1016\/j.suscom.2020.100450"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767703","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:35:27Z","timestamp":1767987327000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767703"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":43,"alternative-id":["10.1145\/3731599.3767703","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767703","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}