{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T21:40:10Z","timestamp":1770068410564,"version":"3.49.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2023,8,19]],"date-time":"2023-08-19T00:00:00Z","timestamp":1692403200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,8,19]],"date-time":"2023-08-19T00:00:00Z","timestamp":1692403200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Department of Science and Technology, India","award":["IF180094"],"award-info":[{"award-number":["IF180094"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-023-02109-0","type":"journal-article","created":{"date-parts":[[2023,8,19]],"date-time":"2023-08-19T04:02:32Z","timestamp":1692417752000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Scalable Multi-node Fast Fourier Transform on GPUs"],"prefix":"10.1007","volume":"4","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8050-7546","authenticated-orcid":false,"given":"Manthan","family":"Verma","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7957-1727","authenticated-orcid":false,"given":"Soumyadeep","family":"Chatterjee","sequence":"additional","affiliation":[]},{"given":"Gaurav","family":"Garg","sequence":"additional","affiliation":[]},{"given":"Bharatkumar","family":"Sharma","sequence":"additional","affiliation":[]},{"given":"Nishant","family":"Arya","sequence":"additional","affiliation":[]},{"given":"Sashi","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Anish","family":"Saxena","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3380-4561","authenticated-orcid":false,"given":"Mahendra K.","family":"Verma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,19]]},"reference":[{"key":"2109_CR1","unstructured":"Private communication with PK, Yeung and Ravikumar K."},{"key":"2109_CR2","unstructured":"Highlights - November 2021. https:\/\/www.top500.org\/lists\/top500\/2021\/11\/. 2021."},{"key":"2109_CR3","unstructured":"NVIDIA A100 TENSOR CORE GPU. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf. 2021. Accessed 10 Mar 2022"},{"key":"2109_CR4","unstructured":"Self communication very slow for device buffers. https:\/\/github.com\/openucx\/ucx\/issues\/6972. 2021. Accessed 8 Jan 2022"},{"key":"2109_CR5","doi-asserted-by":"publisher","unstructured":"Aji AM, Panwar LS, Ji F, Chabbi M, Murthy KS, Balaji P, Bisset KR, Dinan J, Feng W-C, Mellor-Crummey JM, Ma X, and Thakur R. On the efficacy of GPU-integrated MPI for scientific applications. Proceedings of the 22nd international symposium on High-performance parallel and distributed computing. 2013;191\u2013202. https:\/\/doi.org\/10.1145\/2493123.2462915","DOI":"10.1145\/2493123.2462915"},{"key":"2109_CR6","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.6508","volume":"35","author":"S Aseeri","year":"2021","unstructured":"Aseeri S, Chatterjee A, Verma M, Keyes D. A scheduling policy to save 10% of communication time in parallel fast Fourier transform. Concurr Computat Pract Exp. 2021;35: e6508.","journal-title":"Concurr Computat Pract Exp"},{"key":"2109_CR7","series-title":"pp","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1007\/978-3-030-50371-0_19","volume-title":"Computational Science - ICCS 2020","author":"A Ayala","year":"2020","unstructured":"Ayala A, Tomov S, Haidar A, Dongarra J. heFFTe: Highly Efficient FFT for Exascale. In: Krzhizhanovskaya VV, Z\u00e1vodszky G, Lees MH, Dongarra JJ, Sloot PMA, Brissos S, Teixeira J, editors. Computational Science - ICCS 2020. pp. Cham: Springer International Publishing; 2020. p. 262\u201375."},{"key":"2109_CR8","doi-asserted-by":"crossref","unstructured":"Ayala A, Tomov S, Luo X, Shaeik H, Haidar A, Bosilca G, and Dongarra J. Impacts of Multi-GPU MPI Collective Communications on Large FFT Computation. In 2019 IEEE\/ACM Workshop on Exascale MPI (ExaMPI). 2019;12\u201318.","DOI":"10.1109\/ExaMPI49596.2019.00007"},{"key":"2109_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2021.102856","volume":"109","author":"S Bak","year":"2022","unstructured":"...Bak S, Bertoni C, Boehm S, Budiardja R, Chapman BM, Doerfert J, Eisenbach M, Finkel H, Hernandez O, Huber J, Iwasaki S, Kale V, Kent PR, Kwack J, Lin M, Luszczek P, Luo Y, Pham B, Pophale S, Ravikumar K, Sarkar V, Scogland T, Tian S, Yeung P. OpenMP application experiences: Porting to accelerated nodes. Parallel Comput. 2022;109: 102856.","journal-title":"Parallel Comput"},{"key":"2109_CR10","volume-title":"Chebyshev and Fourier Spectral Methods, 2nd","author":"JP Boyd","year":"2003","unstructured":"Boyd JP. Chebyshev and Fourier Spectral Methods, 2nd. revised. New York: Dover Publications; 2003.","edition":"revised"},{"issue":"1","key":"2109_CR11","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/j.jpdc.2012.04.003","volume":"73","author":"AR Brodtkorb","year":"2013","unstructured":"Brodtkorb AR, Hagen TR, S\u00e6tra ML. 2013 Graphics processing unit (GPU) programming strategies and trends in GPU computing. J Parallel Distribut Comput. 2013;73(1):4\u201313.","journal-title":"J Parallel Distribut Comput"},{"key":"2109_CR12","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1016\/j.jpdc.2017.10.014","volume":"113","author":"AG Chatterjee","year":"2018","unstructured":"Chatterjee AG, Verma MK, Kumar A, Samtaney R, Hadri B, Khurram R. Scaling of a Fast Fourier Transform and a pseudo-spectral fluid solver up to 196608 cores. J Parallel Distrib Comput. 2018;113:77\u201391.","journal-title":"J Parallel Distrib Comput"},{"issue":"90","key":"2109_CR13","doi-asserted-by":"publisher","first-page":"297","DOI":"10.1090\/S0025-5718-1965-0178586-1","volume":"19","author":"JW Cooley","year":"1965","unstructured":"Cooley JW, Tukey JW. An Algorithm for the Machine Calculation of Complex Fourier Series. Mathemat Comput. 1965;19(90):297\u2013301.","journal-title":"Mathemat Comput"},{"key":"2109_CR14","doi-asserted-by":"crossref","unstructured":"Czechowski K, Battaglino C, McClanahan C, Iyer K, Yeung PK, and Vuduc R. On the communication complexity of 3D FFTs and its implications for Exascale. In: Proceedings of the 26th ACM international conference on Supercomputing. 2012;205\u2013214, New York, New York, USA. ACM.","DOI":"10.1145\/2304576.2304604"},{"issue":"20","key":"2109_CR15","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevLett.115.204501","volume":"115","author":"V Dallas","year":"2015","unstructured":"Dallas V, Fauve S, Alexakis A. Statistical Equilibria of Large Scales in Dissipative Hydrodynamic Turbulence. Phys Rev Lett. 2015;115(20): 204501.","journal-title":"Phys Rev Lett"},{"key":"2109_CR16","doi-asserted-by":"publisher","first-page":"430","DOI":"10.17706\/IJCEE.2017.9.2.430-438","volume":"9","author":"T Dobravec","year":"2017","unstructured":"Dobravec T, Buli\u0107 P. Comparing CPU and GPU Implementations of a Simple Matrix Multiplication Algorithm. Int J Comput Elect Eng. 2017;9:430\u20138.","journal-title":"Int J Comput Elect Eng"},{"key":"2109_CR17","doi-asserted-by":"crossref","unstructured":"Doerfler D, Brightwell R. Measuring MPI Send and Receive Overhead and Application Availability in High Performance Network Interfaces. 2006;4192:331\u20138.","DOI":"10.1007\/11846802_46"},{"key":"2109_CR18","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1017\/S0022112010001400","volume":"657","author":"DA Donzis","year":"2010","unstructured":"Donzis DA, Sreenivasan KR. The bottleneck effect and the Kolmogorov constant in isotropic turbulence. J Fluid Mech. 2010;657:171\u201388.","journal-title":"J Fluid Mech"},{"issue":"3\u20134","key":"2109_CR19","doi-asserted-by":"publisher","first-page":"549","DOI":"10.1007\/s10494-010-9271-6","volume":"85","author":"DA Donzis","year":"2010","unstructured":"Donzis DA, Sreenivasan KR, Yeung PK. The Batchelor Spectrum for Mixing of Passive Scalars in Isotropic Turbulence. Flow Turbul Combust. 2010;85(3\u20134):549\u201366.","journal-title":"Flow Turbul Combust"},{"key":"2109_CR20","unstructured":"Donzis DA, Yeung PK, and Pekurovsky D. Turbulence simulations on $$O (10^4)$$ processors. In Proc TeraGrid. 2008."},{"issue":"4","key":"2109_CR21","doi-asserted-by":"publisher","DOI":"10.1063\/1.2907227","volume":"20","author":"DA Donzis","year":"2008","unstructured":"Donzis DA, Yeung PK, Sreenivasan KR. Dissipation and enstrophy in isotropic turbulence: Resolution effects and scaling in direct numerical simulations. Phys Fluids. 2008;20(4): 045108.","journal-title":"Phys Fluids"},{"key":"2109_CR22","unstructured":"Faraji I. Improving Communication Performance in GPU-Accelerated HPC Clusters. PhD thesis, Queen\u2019s University, Queen\u2019s University, Kingston, Ontario, Canada. 2018."},{"key":"2109_CR23","unstructured":"FFTW, The open source fast Fourier transform library. http:\/\/www.fftw.org\/. 2017. Accessed 10 Dec 2021"},{"issue":"2","key":"2109_CR24","doi-asserted-by":"publisher","first-page":"216","DOI":"10.1109\/JPROC.2004.840301","volume":"93","author":"M Frigo","year":"2005","unstructured":"Frigo M, Johnson SG. The design and implementation of FFTW3. Proc IEEE. 2005;93(2):216\u201331.","journal-title":"Proc IEEE"},{"key":"2109_CR25","unstructured":"Gholami A, Hill J, Malhotra D, and Biros G. AccFFT: A library for distributed-memory FFT on CPU and GPU architectures. 2015."},{"issue":"2","key":"2109_CR26","doi-asserted-by":"publisher","first-page":"L21","DOI":"10.1063\/1.1539855","volume":"15","author":"T Ishihara","year":"2003","unstructured":"Ishihara T, Yokokawa M, Itakura K, Uno A. Energy dissipation rate and energy spectrum in high resolution direct numerical simulations of turbulence in a periodic box. Phys Fluids. 2003;15(2):L21.","journal-title":"Phys Fluids"},{"key":"2109_CR27","doi-asserted-by":"crossref","unstructured":"Lustig D, and Martonosi M. Reducing GPU offload latency via fine-grained CPU-GPU synchronization. In: 2013 IEEE 19th International Symposium on High Performance Computer Architecture (HPCA), 2013;354\u2013365.","DOI":"10.1109\/HPCA.2013.6522332"},{"issue":"6\u20137","key":"2109_CR28","doi-asserted-by":"publisher","first-page":"316","DOI":"10.1016\/j.parco.2011.05.004","volume":"37","author":"PD Mininni","year":"2011","unstructured":"Mininni PD, Rosenberg DL, Reddy R, Pouquet AG. A hybrid MPI-OpenMP scheme for scalable parallel pseudospectral computations for fluid turbulence. Parall Comput. 2011;37(6\u20137):316\u201326.","journal-title":"Parall Comput"},{"key":"2109_CR29","unstructured":"Nvidia. Multinode Multi-GPU: Using NVIDIA cuFFTMp FFTs at Scale. https:\/\/developer.nvidia.com\/blog\/multinode-multi-gpu-using-nvidia-cufftmp-ffts-at-scale\/. Accessed 5 Mar 2022"},{"key":"2109_CR30","unstructured":"Nvidia. NVLINK and NVSWITCH Building Blocks of Advanced multi-GPU communication. https:\/\/www.nvidia.com\/en-in\/data-center\/nvlink\/. Accessed 16 Feb 2022"},{"key":"2109_CR31","unstructured":"Nvidia. cuFFT Documentation. https:\/\/docs.nvidia.com\/cuda\/cufft\/index.html. 2021. Accessed 12 Dec 2021"},{"issue":"4","key":"2109_CR32","doi-asserted-by":"publisher","first-page":"C192","DOI":"10.1137\/11082748X","volume":"34","author":"D Pekurovsky","year":"2012","unstructured":"Pekurovsky D. P3DFFT: a framework for parallel computations of Fourier transforms in three dimensions. Siam J Sci Comput. 2012;34(4):C192\u2013209.","journal-title":"Siam J Sci Comput"},{"issue":"3","key":"2109_CR33","doi-asserted-by":"publisher","first-page":"C213","DOI":"10.1137\/120885887","volume":"35","author":"M Pippig","year":"2013","unstructured":"Pippig M. PFFT: An Extension of FFTW to Massively Parallel Architectures. Siam J Sci Comput. 2013;35(3):C213\u201336.","journal-title":"Siam J Sci Comput"},{"key":"2109_CR34","doi-asserted-by":"crossref","unstructured":"Ravikumar K, Appelhans D, and Yeung PK. GPU Acceleration of Extreme Scale Pseudo-Spectral Simulations of Turbulence Using Asynchronism. In: SC \u201919: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201919, New York, NY, USA. 2019. Association for Computing Machinery.","DOI":"10.1145\/3295500.3356209"},{"issue":"1","key":"2109_CR35","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevE.92.013003","volume":"92","author":"C Rorai","year":"2015","unstructured":"Rorai C, Mininni PD, Pouquet AG. Stably stratified turbulence in the presence of large-scale forcing. Phys Rev E. 2015;92(1): 013003.","journal-title":"Phys Rev E"},{"issue":"5","key":"2109_CR36","doi-asserted-by":"publisher","DOI":"10.1063\/1.4921076","volume":"27","author":"DL Rosenberg","year":"2015","unstructured":"Rosenberg DL, Pouquet AG, Marino R, Mininni PD. Evidence for Bolgiano-Obukhov scaling in rotating stratified turbulence using high-resolution direct numerical simulations. Phys Fluids. 2015;27(5): 055105.","journal-title":"Phys Fluids"},{"key":"2109_CR37","doi-asserted-by":"crossref","unstructured":"Wang C, Chandrasekaran S, and Chapman BM. cusFFT: A High-Performance Sparse Fast Fourier Transform Algorithm on GPUs. In: 2016 IEEE International Parallel and Distributed Processing Symposium (IPDPS), 2016;963\u2013972.","DOI":"10.1109\/IPDPS.2016.95"},{"issue":"8","key":"2109_CR38","doi-asserted-by":"publisher","DOI":"10.1063\/1.2001690","volume":"17","author":"PK Yeung","year":"2005","unstructured":"Yeung PK, Donzis DA, Sreenivasan KR. High-Reynolds-number simulation of turbulent mixing. Phys Fluids. 2005;17(8): 081703.","journal-title":"Phys Fluids"},{"key":"2109_CR39","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevFluids.5.110517","volume":"5","author":"PK Yeung","year":"2020","unstructured":"Yeung PK, Ravikumar K. Advancing understanding of turbulence through extreme-scale computation: Intermittency and simulations at large problem sizes. Phys Rev Fluids. 2020;5: 110517.","journal-title":"Phys Rev Fluids"},{"key":"2109_CR40","doi-asserted-by":"publisher","first-page":"R14","DOI":"10.1017\/jfm.2012.632","volume":"716","author":"PK Yeung","year":"2013","unstructured":"Yeung PK, Sreenivasan KR. Spectrum of passive scalars of high molecular diffusivity in turbulent mixing. J Fluid Mech. 2013;716:R14.","journal-title":"J Fluid Mech"},{"issue":"41","key":"2109_CR41","doi-asserted-by":"publisher","first-page":"12633","DOI":"10.1073\/pnas.1517368112","volume":"112","author":"PK Yeung","year":"2015","unstructured":"Yeung PK, Zhai XM, Sreenivasan KR. Extreme events in computational turbulence. PNAS. 2015;112(41):12633.","journal-title":"PNAS"},{"key":"2109_CR42","doi-asserted-by":"crossref","unstructured":"Yokokawa M, Itakura K, Uno A, and Ishihara T. 16.4-Tflops Direct Numerical Simulation of Turbulence by a Fourier Spectral Method on the Earth Simulator. In: ACM\/IEEE 2002 Conference. IEEE. 2002.","DOI":"10.1109\/SC.2002.10052"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-023-02109-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-023-02109-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-023-02109-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,19]],"date-time":"2023-08-19T04:14:26Z","timestamp":1692418466000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-023-02109-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,19]]},"references-count":42,"journal-issue":{"issue":"5","published-online":{"date-parts":[[2023,9]]}},"alternative-id":["2109"],"URL":"https:\/\/doi.org\/10.1007\/s42979-023-02109-0","relation":{},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8,19]]},"assertion":[{"value":"7 October 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 June 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 August 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}}],"article-number":"625"}}