{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T09:22:20Z","timestamp":1754558540466,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,9,30]]},"DOI":"10.1145\/3695794.3695796","type":"proceedings-article","created":{"date-parts":[[2024,12,12]],"date-time":"2024-12-12T04:06:53Z","timestamp":1733976413000},"page":"13-25","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Pimacolaba: Collaborative Acceleration for FFT on Commercial Processing-In-Memory Architectures"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4129-0310","authenticated-orcid":false,"given":"Mohamed Assem","family":"Ibrahim","sequence":"first","affiliation":[{"name":"Advanced Micro Devices, Inc., Santa Clara, CA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9552-0508","authenticated-orcid":false,"given":"Shaizeen","family":"Aga","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc., Santa Clara, CA, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,12,11]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2018. Evaluating Attainable Memory Bandwidth of Parallel Programming Models via BabelStream. International Journal of Computational Science and Engineering (2018)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3357526.3357532"},{"key":"e_1_3_3_2_4_2","unstructured":"AMD. 2023. AMD Instinct\u2122 MI210 Accelerator. 
https:\/\/www.amd.com\/en\/products\/server-accelerators\/amd-instinct-mi210."},{"key":"e_1_3_3_2_5_2","unstructured":"AMD. 2023. AMD Optimizing CPU Libraries (AOCL) FFTW. https:\/\/www.amd.com\/en\/developer\/aocl\/fftw.html."},{"key":"e_1_3_3_2_6_2","unstructured":"AMD. 2023. BabelStream. https:\/\/www.amd.com\/en\/technologies\/infinity-hub\/babelstream."},{"key":"e_1_3_3_2_7_2","unstructured":"AMD. 2023. Omniperf. https:\/\/github.com\/AMDResearch\/omniperf."},{"key":"e_1_3_3_2_8_2","unstructured":"AMD. 2023. rocFFT Library. https:\/\/github.com\/ROCmSoftwarePlatform\/rocFFT."},{"key":"e_1_3_3_2_9_2","unstructured":"AMD. 2024. rocFFT Library Documentation. https:\/\/rocm.docs.amd.com\/projects\/rocFFT\/en\/latest\/."},{"key":"e_1_3_3_2_10_2","volume-title":"Proceedings of the IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)","author":"Anzt Hartwig","year":"2020","unstructured":"Hartwig Anzt, Yuhsiang\u00a0M. Tsai, Ahmad Abdelfattah, Terry Cojean, and Jack Dongarra. 2020. Evaluating the Performance of NVIDIA\u2019s A100 Ampere GPU for Sparse and Batched Computations. In Proceedings of the IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)."},{"key":"e_1_3_3_2_11_2","unstructured":"Apple. 2023. Apple Accelerate Libraries. https:\/\/developer.apple.com\/documentation\/accelerate\/vdsp."},{"key":"e_1_3_3_2_12_2","unstructured":"ARM. 2023. ARM Performance Libraries. https:\/\/developer.arm.com\/downloads\/-\/arm-performance-libraries."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-50371-0_19"},{"key":"e_1_3_3_2_14_2","volume-title":"Proceedings of the IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)","author":"Cheng Shenggan","year":"2020","unstructured":"Shenggan Cheng, Hao-Ran Yu, Derek Inman, Qiucheng Liao, Qiaoya Wu, and James Lin. 2020. 
CUBE \u2013 Towards an Optimal Scaling of Cosmological N-body Simulations. In Proceedings of the IEEE\/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"James\u00a0W. Cooley and John\u00a0W. Tukey. 1965. An Algorithm for the Machine Calculation of Complex Fourier Series. Math. Comp. (1965).","DOI":"10.2307\/2003354"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT52795.2021.00032"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1998.681704"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"crossref","unstructured":"M. Frigo and S.G. Johnson. 2005. The Design and Implementation of FFTW3. Proc. IEEE (2005).","DOI":"10.1109\/JPROC.2004.840301"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"K. Germaschewski B. Allen T. Dannert M. Hrywniak J. Donaghy G. Merlo S. Ethier E. D\u2019Azevedo F. Jenko and A. Bhattacharjee. 2021. Toward Exascale Whole-device Modeling of Fusion Devices: Porting the GENE Gyrokinetic Microturbulence Code to GPU. Physics of Plasmas (2021).","DOI":"10.1063\/5.0046327"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5213922"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00013"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Juan G\u00f3mez-Luna Izzat\u00a0El Hajj Ivan Fernandez Christina Giannoula Geraldo\u00a0F. Oliveira and Onur Mutlu. 2022. Benchmarking a New Paradigm: Experimental Analysis and Characterization of a Real Processing-in-Memory System. IEEE Access (2022).","DOI":"10.1109\/ACCESS.2022.3174101"},{"key":"e_1_3_3_2_23_2","volume-title":"Proceedings of Machine Learning and Systems","author":"Ibrahim Mohamed","year":"2024","unstructured":"Mohamed Ibrahim, Shaizeen Aga, Ada Li, Suchita Pati, and Mahzabeen Islam. 2024. 
JIT-Q: Just-in-time Quantization with Processing-In-Memory for Efficient ML Training. In Proceedings of Machine Learning and Systems."},{"key":"e_1_3_3_2_24_2","unstructured":"Intel. 2023. Intel oneAPI Math Kernel Library. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/onemkl\/get-started-guide\/2023-0\/overview.html."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607102"},{"key":"e_1_3_3_2_26_2","unstructured":"JEDEC. 2013. High Bandwidth Memory (HBM) DRAM. https:\/\/www.jedec.org\/standards-documents\/docs\/jesd235a."},{"key":"e_1_3_3_2_27_2","unstructured":"JEDEC. 2023. High Bandwidth Memory (HBM3) DRAM. https:\/\/www.jedec.org\/standards-documents\/docs\/jesd238a."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Liu Ke Xuan Zhang Jinin So Jong-Geon Lee Shin-Haeng Kang Sukhan Lee Songyi Han YeonGon Cho Jin\u00a0Hyun Kim Yongsuk Kwon KyungSoo Kim Jin Jung Ilkwon Yun Sung\u00a0Joo Park Hyunsun Park Joonho Song Jeonghyeon Cho Kyomin Sohn Nam\u00a0Sung Kim and Hsien-Hsin\u00a0S. Lee. 2022. Near-Memory Processing in Action: Accelerating Personalized Recommendation With AxDIMM. IEEE Micro (2022).","DOI":"10.1109\/MM.2021.3097700"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731711"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Orian Leitersdorf Yahav Boneh Gonen Gazit Ronny Ronen and Shahar Kvatinsky. 2023. FourierPIM: High-Throughput In-Memory Fast Fourier Transform and Polynomial Multiplication. 
Memories - Materials Devices Circuits and Systems (2023).","DOI":"10.1016\/j.memori.2023.100034"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/Cluster48925.2021.00035"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356138"},{"key":"e_1_3_3_2_34_2","unstructured":"Zongyi Li Nikola Kovachki Kamyar Azizzadenesheli Burigede Liu Kaushik Bhattacharya Andrew Stuart and Anima Anandkumar. 2020. Fourier Neural Operator for Parametric Partial Differential Equations. arXiv (2020)."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2008.4607357"},{"key":"e_1_3_3_2_36_2","unstructured":"John\u00a0D. McCalpin. 2023. STREAM. https:\/\/www.cs.virginia.edu\/~mccalpin\/papers\/bandwidth\/bandwidth.html."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00037"},{"key":"e_1_3_3_2_38_2","unstructured":"NVIDIA. 2023. cuFFT Library. https:\/\/docs.nvidia.com\/cuda\/cufft\/."},{"key":"e_1_3_3_2_39_2","unstructured":"Oak Ridge Leadership Computing Facility. 2023. Frontier. https:\/\/www.olcf.ornl.gov\/frontier\/."},{"key":"e_1_3_3_2_40_2","unstructured":"Oak Ridge Leadership Computing Facility. 2023. Update on Frontier and Early Science. https:\/\/indico.mit.edu\/event\/352\/contributions\/638\/attachments\/357\/661\/HEP-QCD.pdf."},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00033"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00059"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356209"},{"key":"e_1_3_3_2_44_2","unstructured":"Samsung. 2022. Samsung Electronics Semiconductor Unveils Cutting-edge Memory Technology to Accelerate Next-generation AI. 
https:\/\/semiconductor.samsung.com\/newsroom\/tech-blog\/samsung-electronics-semiconductor-unveils-cutting-edge-memory-technology-to-accelerate-next-generation-ai\/."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Evan\u00a0E. Schneider and Brant\u00a0E. Robertson. 2015. CHOLLA: A New Massively Parallel Hydrodynamics Code for Astrophysical Simulation. The Astrophysical Journal Supplement Series (2015).","DOI":"10.1088\/0067-0049\/217\/2\/24"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/HiPCW.2018.8634417"},{"key":"e_1_3_3_2_47_2","unstructured":"Stefan Seritan and Craig Ulmer. 2021. Benchmarking the NVIDIA A100 Graphics Processing Unit for High-Performance Computing and Data Analytics Workloads. https:\/\/www.craigulmer.com\/data\/2021\/SAND2021-1220_uur.pdf."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Dmitrii Tolmachev. 2023. VkFFT-A Performant Cross-Platform and Open-Source GPU FFT Library. IEEE Access (2023).","DOI":"10.1109\/ACCESS.2023.3242240"},{"key":"e_1_3_3_2_49_2","unstructured":"Tom Deakin. 2020. Performance Portability of OpenMP on CPUs and GPUs. https:\/\/www.openmp.org\/wp-content\/uploads\/OpenMPBoothTalk-Deakin-SC20.pdf."},{"key":"e_1_3_3_2_50_2","unstructured":"Tom Deakin. 2024. BabelStream. https:\/\/hpc.tomdeakin.com\/projects\/babelstream."},{"key":"e_1_3_3_2_51_2","unstructured":"Top500. 2023. The 61st Edition of the TOP500. 
https:\/\/www.top500.org\/lists\/top500\/2023\/06\/."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480071"}],"event":{"name":"MEMSYS '24: The International Symposium on Memory Systems","acronym":"MEMSYS '24","location":"Washington DC USA"},"container-title":["Proceedings of the International Symposium on Memory Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695794.3695796","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695794.3695796","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:06Z","timestamp":1750295406000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695794.3695796"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"references-count":51,"alternative-id":["10.1145\/3695794.3695796","10.1145\/3695794"],"URL":"https:\/\/doi.org\/10.1145\/3695794.3695796","relation":{},"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"2024-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}