{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T23:09:52Z","timestamp":1774307392826,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":77,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710897","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"355-368","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["FlashFFTStencil: Bridging Fast Fourier Transforms to Memory-Efficient Stencil Computations on Tensor Core Units"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3075-3385","authenticated-orcid":false,"given":"Haozhi","family":"Han","sequence":"first","affiliation":[{"name":"Peking University, Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1013-1325","authenticated-orcid":false,"given":"Kun","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9362-3585","authenticated-orcid":false,"given":"Wei","family":"Cui","sequence":"additional","affiliation":[{"name":"Microsoft Research, Vancouver, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3655-8677","authenticated-orcid":false,"given":"Donglin","family":"Bai","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2050-572X","authenticated-orcid":false,"given":"Yiwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3406-2907","authenticated-orcid":false,"given":"Liang","family":"Yuan","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2392-8472","authenticated-orcid":false,"given":"Yifeng","family":"Chen","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7520-9640","authenticated-orcid":false,"given":"Yunquan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6455-3898","authenticated-orcid":false,"given":"Mao","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Hamdy Abdelkhalik Yehia Arafa Nandakishore Santhi and Abdel-Hameed Badawy. 2022. Demystifying the Nvidia Ampere Architecture through Microbenchmarking and Instruction-level Analysis. arXiv:2208.11174 [cs.AR] https:\/\/arxiv.org\/abs\/2208.11174","DOI":"10.1109\/HPEC55821.2022.9926299"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3409964.3461803"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606338"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS55109.2022.00010"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.9"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1562764.1562783"},{"key":"e_1_3_2_1_7_1","unstructured":"Krste Asanovi\u0107 Ras Bodik Bryan Christopher Catanzaro Joseph James Gebis Parry Husbands Kurt Keutzer David A. Patterson William Lester Plishker John Shalf Samuel Webb Williams and Katherine A. Yelick. 2006. The Landscape of Parallel Computing Research: A View from Berkeley. Technical Report UCB\/EECS-2006-183. http:\/\/www2.eecs.berkeley.edu\/Pubs\/TechRpts\/2006\/EECS-2006-183.html"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.107"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00078"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2615094"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2014.6983061"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/31.85638"},{"key":"e_1_3_2_1_13_1","unstructured":"Charlene Yang. 2018. Introduction to the Roofline Model. https:\/\/www.nersc.gov\/assets\/Uploads\/Tutorial-ISC2018-Roofline-Model.pdf\/ Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356162"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638476"},{"key":"e_1_3_2_1_16_1","volume-title":"cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759","author":"Chetlur Sharan","year":"2014","unstructured":"Sharan Chetlur, Cliff Woolley, Philippe Vandermersch, Jonathan Cohen, John Tran, Bryan Catanzaro, and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/SYNASC.2014.70"},{"key":"e_1_3_2_1_19_1","volume-title":"FlashFFTConv: Efficient Convolutions for Long Sequences with Tensor Cores. In The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Fu Daniel Y.","year":"2024","unstructured":"Daniel Y. Fu, Hermann Kumbong, Eric Nguyen, and Christopher R\u00e9. 2024. FlashFFTConv: Efficient Convolutions for Long Sequences with Tensor Cores. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=gPKTTAfYBp"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-C.1971.223236"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458526"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3469030"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/MASSP.1984.1162257"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Tom Henretty Kevin Stock Louis-No\u00ebl Pouchet Franz Franchetti J. Ramanujam and P. Sadayappan. 2011. Data Layout Transformation for Stencil Computations on Short-Vector SIMD Architectures. In Compiler Construction Jens Knoop (Ed.). Springer Berlin Heidelberg Berlin Heidelberg 225--245.","DOI":"10.1007\/978-3-642-19861-8_13"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2467268"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.oompfluid.2013.12.007"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00035"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2001.10041"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/3PGCIC.2015.66"},{"key":"e_1_3_2_1_31_1","volume-title":"Digital signal processing algorithms: number theory, convolution, fast Fourier transforms, and applications","author":"Krishna Hari","unstructured":"Hari Krishna. 2017. Digital signal processing algorithms: number theory, convolution, fast Fourier transforms, and applications. Routledge."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476154"},{"key":"e_1_3_2_1_33_1","volume-title":"Reducing Redundancy in Data Organization and Arithmetic Calculation for Stencil Computations. In SC21: International Conference for High Performance Computing, Networking, Storage and Analysis. 01--15","author":"Li Kun","year":"2021","unstructured":"Kun Li, Liang Yuan, Yunquan Zhang, and Yue Yue. 2021. Reducing Redundancy in Data Organization and Arithmetic Calculation for Stencil Computations. In SC21: International Conference for High Performance Computing, Networking, Storage and Analysis. 01--15."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1986.1169089"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2023.04.019"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2021.108063"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.87"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the 1st international workshop on high-performance stencil computations","author":"Maruyama Naoya","year":"2014","unstructured":"Naoya Maruyama and Takayuki Aoki. 2014. Optimizing stencil computations for NVIDIA Kepler GPUs. In Proceedings of the 1st international workshop on high-performance stencil computations, Vienna. Citeseer, 89--95."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063398"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368826.3377904"},{"key":"e_1_3_2_1_42_1","unstructured":"Robert Matusiak. 2002. Implementing Fast Fourier Transform Algorithms of Real-Valued Sequences With the TMS 320 DSP Platform. https:\/\/api.semanticscholar.org\/CorpusID:11262963"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_2_1_44_1","unstructured":"Nvidia. 2023. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.cn\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_45_1","unstructured":"Nvidia. 2024. CUDA C++Best Practices Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-best-practices-guide\/index.html Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_46_1","unstructured":"Nvidia. 2024. CUDA C++ Programming Guide. https:\/\/https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/ Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_47_1","unstructured":"Nvidia. 2024. cuDNN. https:\/\/developer.nvidia.com\/cudnn Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_48_1","unstructured":"Nvidia. 2024. cuFFT. https:\/\/docs.nvidia.com\/cuda\/cufft\/index.html Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_49_1","unstructured":"Nvidia. 2024. NVIDIA Blackwell Platform Arrives to Power a New Era of Computing. https:\/\/nvidianews.nvidia.com\/news\/nvidia-blackwellplatform-arrives-to-power-a-new-era-of-computing Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_50_1","unstructured":"Nvidia. 2024. Parallel Thread Execution ISA Version 8.5. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/ Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2014.6844463"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1080\/00029890.1952.11988142"},{"key":"e_1_3_2_1_53_1","unstructured":"Peter Van Sandt and Zhe Jia. 2021. Dissecting the Ampere GPU Architecture through Microbenchmarking. https:\/\/www.nvidia.com\/en-us\/on-demand\/session\/gtcspring21-s33322\/ Last accessed on 2024-8-16."},{"key":"e_1_3_2_1_54_1","volume-title":"FFT-based 2D convolution. NVIDIA white paper 32, 1","author":"Podlozhnyuk Victor","year":"2007","unstructured":"Victor Podlozhnyuk. 2007. FFT-based 2D convolution. NVIDIA white paper 32, 1 (2007)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830018.2830025"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2884045.2884047"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00049"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2862896"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00073"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593719"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1987.1165220"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/2666356.2594342"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295816.3295817"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2022.3217824"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/1989493.1989508"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"e_1_3_2_1_68_1","unstructured":"Wikipedia. 2024. Top500 Supercomputers. https:\/\/en.wikipedia.org\/wiki\/TOP500."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1015460304860"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337835"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126920"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593705"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593716"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00059"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356210"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441598"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC.2018.00009"}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas NV USA","acronym":"PPoPP '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710897","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710897","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:13:46Z","timestamp":1755875626000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710897"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":77,"alternative-id":["10.1145\/3710848.3710897","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710897","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}