{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2022,4,2]],"date-time":"2022-04-02T15:31:00Z","timestamp":1648913460317},"reference-count":13,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2012,7,5]],"date-time":"2012-07-05T00:00:00Z","timestamp":1341446400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2013,4]]},"DOI":"10.1007\/s11227-012-0807-5","type":"journal-article","created":{"date-parts":[[2012,7,5]],"date-time":"2012-07-05T01:43:17Z","timestamp":1341452597000},"page":"120-131","source":"Crossref","is-referenced-by-count":6,"title":["Influence of memory access patterns to small-scale FFT performance"],"prefix":"10.1007","volume":"64","author":[{"given":"J.","family":"Lobeiras","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"M.","family":"Amor","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"R.","family":"Doallo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2012,7,5]]},"reference":[{"key":"807_CR1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1654059.1654090","volume-title":"SC \u201909: proceedings of the conference on high performance computing networking, storage and analysis","author":"A Nukada","year":"2009","unstructured":"Nukada A, Matsuoka S (2009) Auto-tuning 3-D FFT library for CUDA GPUs. In:\u00a0SC \u201909: proceedings of the conference on high performance computing networking, storage and analysis, pp\u00a01\u201310"},{"key":"807_CR2","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1109\/ISPASS.2010.5452013","volume-title":"2010 IEEE international symposium on performance analysis of systems software (ISPASS)","author":"H Wong","year":"2010","unstructured":"Wong H, Papadopoulou M-M, Sadooghi-Alvandi M, Moshovos A (2010) Demystifying GPU microarchitecture through microbenchmarking. In: 2010 IEEE international symposium on performance analysis of systems software (ISPASS), pp\u00a0235\u2013246"},{"key":"807_CR3","series-title":"Signal processing","volume-title":"Intel integrated performance primitives for Intel architecture, reference manual","author":"Intel","year":"2009","unstructured":"Intel (2009) Intel integrated performance primitives for Intel architecture, reference manual. Signal processing, vol\u00a01"},{"key":"807_CR4","first-page":"381","volume-title":"PDP \u201911: proceedings of the 19th Euromicro conference on parallel, distributed and network-based processing","author":"J Lobeiras","year":"2011","unstructured":"Lobeiras J, Amor M, Doallo R (2011) FFT implementation on a streaming architecture. In: PDP \u201911: proceedings of the 19th Euromicro conference on parallel, distributed and network-based processing. IEEE Computer Society, Los Alamitos, pp\u00a0381\u2013388"},{"key":"807_CR5","first-page":"750","volume-title":"Proceedings of the international conference on computational and mathematical methods in science and engineering (CMMSE 2011)","author":"J Lobeiras","year":"2011","unstructured":"Lobeiras J, Amor M, Doallo R (2011) Performance evaluation of GPU memory hierarchy using the FFT. In: Proceedings of the international conference on computational and mathematical methods in science and engineering (CMMSE 2011), vol\u00a02, pp\u00a0750\u2013761"},{"key":"807_CR6","doi-asserted-by":"crossref","first-page":"115","DOI":"10.1145\/1693453.1693471","volume-title":"Proceedings of the 15th ACM SIGPLAN symposium on principles and practice of parallel programming (PPoPP 2010)","author":"JW Choi","year":"2010","unstructured":"Choi JW, Singh A, Vuduc RW (2010) Model-driven autotuning of sparse matrix-vector multiply on GPUs. In: Proceedings of the 15th ACM SIGPLAN symposium on principles and practice of parallel programming (PPoPP 2010), vol\u00a045, pp\u00a0115\u2013126"},{"issue":"90","key":"807_CR7","doi-asserted-by":"crossref","first-page":"297","DOI":"10.1090\/S0025-5718-1965-0178586-1","volume":"19","author":"JW Cooley","year":"1965","unstructured":"Cooley JW, Tukey JW (1965) An algorithm for the machine calculation of complex Fourier series. Math Comput 19(90):297\u2013301","journal-title":"Math Comput"},{"issue":"2","key":"807_CR8","doi-asserted-by":"crossref","first-page":"252","DOI":"10.1145\/321450.321457","volume":"15","author":"MC Pease","year":"1968","unstructured":"Pease MC (1968) An adaptation of the fast Fourier transform for parallel processing. J ACM 15(2):252\u2013264","journal-title":"J ACM"},{"key":"807_CR9","first-page":"152","volume-title":"Proceedings of the 36th international symposium on computer architecture (ISCA \u201909)","author":"S Hong","year":"2009","unstructured":"Hong S, Kim H (2009) An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness. In: Proceedings of the 36th international symposium on computer architecture (ISCA \u201909), vol\u00a037, pp\u00a0152\u2013163"},{"key":"807_CR10","doi-asserted-by":"crossref","first-page":"105","DOI":"10.1145\/1693453.1693470","volume-title":"Proceedings of the 15 th ACM SIGPLAN symposium on principles and practice of parallel programming (PPoPP 2010)","author":"SS Baghsorkhi","year":"2010","unstructured":"Baghsorkhi SS et al (2010) An adaptive performance modeling tool for GPU architectures. In: Proceedings of the 15 th ACM SIGPLAN symposium on principles and practice of parallel programming (PPoPP 2010), pp\u00a0105\u2013114"},{"key":"807_CR11","volume-title":"GPU technology conference (GTC 2010)","author":"V Volkov","year":"2010","unstructured":"Volkov V (2010) Better performance at lower occupancy. In: GPU technology conference (GTC 2010)"},{"key":"807_CR12","volume-title":"International workshop on parallel matrix algorithms and applications (PMAA\u201910)","author":"V Volkov","year":"2010","unstructured":"Volkov V (2010) Use registers and multiple outputs per thread on GPU. In: International workshop on parallel matrix algorithms and applications (PMAA\u201910)"},{"key":"807_CR13","volume-title":"Proceedings of the 17th IEEE international symposium on high-performance computer architecture (HPCA 17)","author":"Y Zhang","year":"2011","unstructured":"Zhang Y, Owens JD (2011) A quantitative performance analysis model for GPU architectures. In: Proceedings of the 17th IEEE international symposium on high-performance computer architecture (HPCA 17)"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-012-0807-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-012-0807-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-012-0807-5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T10:24:07Z","timestamp":1559384647000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-012-0807-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,7,5]]},"references-count":13,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2013,4]]}},"alternative-id":["807"],"URL":"https:\/\/doi.org\/10.1007\/s11227-012-0807-5","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2012,7,5]]}}}