{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T05:55:00Z","timestamp":1769752500440,"version":"3.49.0"},"reference-count":28,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2013,1,1]],"date-time":"2013-01-01T00:00:00Z","timestamp":1356998400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["J. Comput. Sci. Technol."],"published-print":{"date-parts":[[2013,1]]},"DOI":"10.1007\/s11390-013-1314-8","type":"journal-article","created":{"date-parts":[[2013,2,1]],"date-time":"2013-02-01T11:52:54Z","timestamp":1359719574000},"page":"90-105","source":"Crossref","is-referenced-by-count":23,"title":["MPFFT: An Auto-Tuning FFT Library for OpenCL GPUs"],"prefix":"10.1007","volume":"28","author":[{"given":"Yan","family":"Li","sequence":"first","affiliation":[]},{"given":"Yun-Quan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yi-Qun","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Guo-Ping","family":"Long","sequence":"additional","affiliation":[]},{"given":"Hai-Peng","family":"Jia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2013,2,1]]},"reference":[{"issue":"14","key":"1314_CR1","doi-asserted-by":"crossref","first-page":"259","DOI":"10.1016\/0165-1684(90)90158-U","volume":"9","author":"P Duhamel","year":"1990","unstructured":"Duhamel P, Vetterli M. Fast fourier transforms: A tutorial review and a state of the art. Signal Processing, 1990, 9(14): 259-299.","journal-title":"Signal Processing"},{"key":"1314_CR2","doi-asserted-by":"crossref","unstructured":"Govindaraju N K, Lloyd B, Dotsenko Y, Smith B, Manferdelli J. High performance discrete Fourier transforms on graphics processors. In Proc. SC, Nov. 2008, Article No.2.","DOI":"10.1109\/SC.2008.5213922"},{"key":"1314_CR3","doi-asserted-by":"crossref","unstructured":"Nukada A, Matsuoka S. Auto-tuning 3-D FFT library for CUDA GPUs. In Proc. SC, Nov. 2009, Article No.30.","DOI":"10.1145\/1654059.1654090"},{"key":"1314_CR4","doi-asserted-by":"crossref","unstructured":"Dotsenko Y, Baghsorkhi S S, Lloyd B, Govindaraju N K. Auto-tuning of fast Fourier transform on graphics processors. In Proc PPoPP, Feb. 2011, pp.257-266.","DOI":"10.1145\/2038037.1941589"},{"key":"1314_CR5","doi-asserted-by":"crossref","unstructured":"Gu L, Li X M, Siegel J. An empirically tuned 2D and 3D FFT library on CUDA GPU. In Proc. the 24th ICS, June 2010, pp.305-314.","DOI":"10.1145\/1810085.1810127"},{"key":"1314_CR6","volume-title":"Heterogeneous Computing with OpenCL","author":"B Gaster","year":"2011","unstructured":"Gaster B, Howes L, Kaeli D R, Mistry P, Schaa D. Heterogeneous Computing with OpenCL. San Fransisco, USA: Morgan Kaufmann, 2011."},{"key":"1314_CR7","volume-title":"OpenCL Programming Guide","author":"A Munshi","year":"2011","unstructured":"Munshi A, Gaster B, Mattson T G, Fung J, Ginsburg D. OpenCL Programming Guide. Boston, USA: Addison-Wesley Professional, 2011."},{"key":"1314_CR8","doi-asserted-by":"crossref","unstructured":"Zhang E Z, Jiang Y L, Guo Z Y, Shen X P. Streamlining GPU applications on the fly: Thread divergence elimination through runtime thread-data remapping. In Proc. the 24th ICS, June 2010, pp.115-126.","DOI":"10.1145\/1810085.1810104"},{"key":"1314_CR9","doi-asserted-by":"crossref","unstructured":"Yang Y, Xiang P, Kong J F, Zhou H Y. A GPGPU compiler for memory optimization and parallelism management. In Proc. PLDI, June 2010, pp.86-97.","DOI":"10.1145\/1809028.1806606"},{"key":"1314_CR10","doi-asserted-by":"crossref","first-page":"297","DOI":"10.1090\/S0025-5718-1965-0178586-1","volume":"19","author":"JW Cooley","year":"1965","unstructured":"Cooley J W, Tukey J W. An algorithm for the machine calculation of complex Fourier series. Mathematics of Computation, 1965, 19: 297-301.","journal-title":"Mathematics of Computation"},{"key":"1314_CR11","doi-asserted-by":"crossref","DOI":"10.1137\/1.9781611970999","volume-title":"Computational Frameworks for the Fast Fourier Transform","author":"C Loan Van","year":"1992","unstructured":"Van Loan C. Computational Frameworks for the Fast Fourier Transform. Philadelphia, USA: SIAM, 1992."},{"issue":"4","key":"1314_CR12","doi-asserted-by":"crossref","first-page":"449","DOI":"10.1007\/BF01189337","volume":"9","author":"J Johnson","year":"1990","unstructured":"Johnson J, Johnson R W, Rodriguez D, Tolimieri R. A methodology for designing, modifying, and implementing Fourier transform algorithms on various architectures. Circuits, Systems and Signal Processing, 1990, 9(4): 449-500.","journal-title":"Circuits, Systems and Signal Processing"},{"issue":"6","key":"1314_CR13","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1109\/MSP.2009.934155","volume":"26","author":"F Franchetti","year":"2009","unstructured":"Franchetti F, P\u00c4uschel M, Voronenko Y, Chellappa S, Moura J M F. Discrete Fourier transform on multicore. IEEE Signal Processing Magazine, 2009, 26(6): 90-102.","journal-title":"Moura J M F. Discrete Fourier transform on multicore. IEEE Signal Processing Magazine"},{"key":"1314_CR14","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4757-3854-4","volume-title":"Algorithms for Discrete Fourier Transforms and Convolution","author":"R Tolimieri","year":"1989","unstructured":"Tolimieri R, An M, Lu C. Algorithms for Discrete Fourier Transforms and Convolution. Berlin: Springer-Verlag, 1989."},{"key":"1314_CR15","unstructured":"NVIDIA Corporation. NVIDIA CUDA compute unified device architecture\u2009\u2212\u2009Programming guide version 4.2. 2008, http:\/\/developer.nvidia.com\/cuda\/nvidia-gpu-computing-documentation , Sept. 2012."},{"key":"1314_CR16","doi-asserted-by":"crossref","unstructured":"Ji F, Ma X S. Using shared memory to accelerate MapReduce on graphics processing units. In Proc. IPDPS, May 2011, pp.805-816.","DOI":"10.1109\/IPDPS.2011.80"},{"key":"1314_CR17","doi-asserted-by":"crossref","unstructured":"Jiao Y, Lin H, Balaji P, Feng W. Power and performance characterization of computational kernels on the GPU. In Proc. GreenCom-CPSCOM, Dec. 2010, pp.221-228.","DOI":"10.1109\/GreenCom-CPSCom.2010.143"},{"key":"1314_CR18","doi-asserted-by":"crossref","unstructured":"Baghsorkhi S S, Delahaye M, Patel S J, Gropp W D, Hwu W W. An adaptive performance modeling tool for GPU architectures. In Proc. the 15th PPoPP, May 2010, pp.105-114.","DOI":"10.1145\/1693453.1693470"},{"key":"1314_CR19","doi-asserted-by":"crossref","first-page":"197","DOI":"10.1016\/0167-8191(87)90018-4","volume":"5","author":"PN Schwarztrauber","year":"1987","unstructured":"Schwarztrauber P N. Multiprocessor FFTs. Parallel Computing, 1987, 5: 197-210.","journal-title":"Multiprocessor FFTs. Parallel Computing"},{"key":"1314_CR20","doi-asserted-by":"crossref","unstructured":"Frigo M, Johnson S G. The design and implementation of FFTW3. In Proceedings of the IEEE, 2005, 93(2): 216-231.","DOI":"10.1109\/JPROC.2004.840301"},{"key":"1314_CR21","doi-asserted-by":"crossref","unstructured":"Frigo M. A fast Fourier transform compiler. In Proc. PLDI, May 1999, pp.169-180.","DOI":"10.1145\/301618.301661"},{"key":"1314_CR22","volume-title":"Encyclopedia of Parallel Computing","author":"F Mesmay","year":"2011","unstructured":"Mesmay F, Franchetti F, Voronenko Y. Encyclopedia of Parallel Computing. Berlin: Springer, 2011."},{"key":"1314_CR23","doi-asserted-by":"crossref","unstructured":"de Mesmay F, Voronenko Y, P\u00c4uschel M. Offline library adaptation using automatically generated heuristics. In Proc. IPDPS, Apr. 2010, pp.1-10.","DOI":"10.1109\/IPDPS.2010.5470479"},{"key":"1314_CR24","volume-title":"Programming Massively Parallel Processors: A Hands-on Approach","author":"DB Kirk","year":"2010","unstructured":"Kirk D B, Hwu W W. Programming Massively Parallel Processors: A Hands-on Approach. San Fransisco, USA: Morgan Kaufmann, 2010."},{"key":"1314_CR25","doi-asserted-by":"crossref","unstructured":"Purnomo B, Rubin N, Houston M. ATI stream profiler: A tool to optimize an OpenCL kernel on ATI radeon GPUs. In Proc. SIGGRAPH, July 2011, pp.26-30.","DOI":"10.1145\/1836845.1836904"},{"key":"1314_CR26","doi-asserted-by":"crossref","unstructured":"Mirkovic D, Johnsson S L. Automatic performance tuning in the UHFFT library. In Proc. ICCS, May 2001, pp.71-80.","DOI":"10.1007\/3-540-45545-0_17"},{"key":"1314_CR27","unstructured":"Ali A, Johnsson L, Mirkovic D. Empirical auto-tuning code generator for FFT and trigonometric transforms. In Proc. ODES, Mar. 2007."},{"key":"1314_CR28","doi-asserted-by":"crossref","unstructured":"Mirkovic D, Mahasoom R, Johnsson L. An adaptive software library for fast Fourier transforms. In Proc. the 14th ICS, May 2000, pp.215-224.","DOI":"10.1145\/335231.335252"}],"container-title":["Journal of Computer Science and Technology"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-013-1314-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11390-013-1314-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-013-1314-8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,7,8]],"date-time":"2019-07-08T23:53:09Z","timestamp":1562629989000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11390-013-1314-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2013,1]]},"references-count":28,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2013,1]]}},"alternative-id":["1314"],"URL":"https:\/\/doi.org\/10.1007\/s11390-013-1314-8","relation":{},"ISSN":["1000-9000","1860-4749"],"issn-type":[{"value":"1000-9000","type":"print"},{"value":"1860-4749","type":"electronic"}],"subject":[],"published":{"date-parts":[[2013,1]]}}}