{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T21:34:10Z","timestamp":1777066450871,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"name":"Guangdong S&T Program","award":["2024B0101040005"],"award-info":[{"award-number":["2024B0101040005"]}]},{"name":"National Natural Science Foundation of China","award":["62461146204"],"award-info":[{"award-number":["62461146204"]}]},{"name":"National Natural Science Foundation of China","award":["62502552"],"award-info":[{"award-number":["62502552"]}]},{"name":"Guangdong Province Special Support Program for Cultivating High-Level Talents","award":["2021TQ06X160"],"award-info":[{"award-number":["2021TQ06X160"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,27]]},"DOI":"10.1145\/3767295.3769378","type":"proceedings-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:20:04Z","timestamp":1777062004000},"page":"2073-2092","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Matrix\u2011PIC: Harnessing Matrix Outer-product for High\u2011Performance Particle\u2011in\u2011Cell Simulations"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3572-6969","authenticated-orcid":false,"given":"Yizhuo","family":"Rao","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1442-5526","authenticated-orcid":false,"given":"Xingjian","family":"Cui","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0770-3086","authenticated-orcid":false,"given":"Jiabin","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5488-9235","authenticated-orcid":false,"given":"Shangzhi","family":"Pang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1382-280X","authenticated-orcid":false,"given":"Guangnan","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9850-8384","authenticated-orcid":false,"given":"Jinhui","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-5715","authenticated-orcid":false,"given":"Zhiguang","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5315-3375","authenticated-orcid":false,"given":"Yutong","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,26]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","first-page":"2031","DOI":"10.1002\/jcc.21773","article-title":"Optimization of parameters for molecular dynamics simulation using smooth particle-mesh Ewald in GROMACS 4.5","volume":"32","author":"Abraham Mark J","year":"2011","unstructured":"Mark J Abraham and Jill E Gready. 2011. Optimization of parameters for molecular dynamics simulation using smooth particle-mesh Ewald in GROMACS 4.5. Journal of computational chemistry 32, 9 (2011), 2031\u20132040.","journal-title":"Journal of computational chemistry"},{"key":"e_1_3_2_1_2_1","volume-title":"Advanced Micro Devices","author":"Inc.","year":"2020","unstructured":"Inc. Advanced Micro Devices. 2020. Oak Ridge National Laboratory Case Study: ORNL Expands Possibilities for Plasma Physics with Open and Portable AMD ROCm. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/resources\/case-studies\/oak-ridge-national-laboratory-case-study.pdf. [Online; accessed 14 May 2025]."},{"key":"e_1_3_2_1_3_1","unstructured":"Advanced Micro Devices Inc. 2023. AMD Instinct\u2122 MI300X Accelerator Data Sheet. Technical Report. Advanced Micro Devices Inc. [Online; accessed 14 May 2025]."},{"key":"e_1_3_2_1_4_1","volume-title":"Apple introduces M4 chip. https:\/\/www.apple.com\/hk\/en\/newsroom\/2024\/05\/apple-introduces-m4-chip\/. [Online","author":"Apple Inc. 2024.","year":"2025","unstructured":"Apple Inc. 2024. Apple introduces M4 chip. https:\/\/www.apple.com\/hk\/en\/newsroom\/2024\/05\/apple-introduces-m4-chip\/. [Online; accessed 14 May 2025]."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-96983-1_53"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jocs.2018.06.004"},{"key":"e_1_3_2_1_7_1","volume-title":"2017 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE","author":"Barsamian Yann","year":"2017","unstructured":"Yann Barsamian, \u00c9ric Violard, and Sever A. Hirstoaga. 2017. Efficient Data Structures for a Hybrid Parallel and Vectorized Particle-in-Cell Code. In 2017 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE, Orlando, FL, USA, 1168\u20131177. 10.1109\/IPDPSW.2017.74"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2019.05.001"},{"key":"e_1_3_2_1_9_1","volume-title":"Cache-oblivious B-trees. In Proceedings 41st Annual Symposium on Foundations of Computer Science. 399\u2013409","author":"Bender M.A.","year":"2000","unstructured":"M.A. Bender, E.D. Demaine, and M. Farach-Colton. 2000. Cache-oblivious B-trees. In Proceedings 41st Annual Symposium on Foundations of Computer Science. 399\u2013409. 10.1109\/SFCS.2000.892128"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1292609.1292616"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3084795"},{"key":"e_1_3_2_1_12_1","volume-title":"Plasma physics via computer simulation","author":"Birdsall Charles K","unstructured":"Charles K Birdsall and A Bruce Langdon. 2018. Plasma physics via computer simulation. CRC press."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1006\/jcph.2001.6851"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1063\/1.2840133"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","first-page":"A170","DOI":"10.1051\/0004-6361\/202452770","article-title":"PySCo: A fast particle-mesh N-body code for modified gravity simulations in Python","volume":"695","author":"Breton Michel-Andr\u00e8s","year":"2025","unstructured":"Michel-Andr\u00e8s Breton. 2025. PySCo: A fast particle-mesh N-body code for modified gravity simulations in Python. Astronomy & Astrophysics 695 (2025), A170.","journal-title":"Astronomy & Astrophysics"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Bussmann M.","unstructured":"M. Bussmann, H. Burau, T. E. Cowan, A. Debus, A. Huebl, G. Juckeland, T. Kluge, W. E. Nagel, R. Pausch, F. Schmitt, U. Schramm, J. Schuchart, and R. Widera. 2013. Radiative signatures of the relativistic Kelvin-Helmholtz instability. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Denver, Colorado) (SC '13). Association for Computing Machinery, New York, NY, USA, Article 5, 12 pages. 10.1145\/2503210.2504564"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","first-page":"4193","DOI":"10.1021\/ja00119a045","article-title":"Molecular dynamics simulations on solvated biomolecular systems: the particle mesh Ewald method leads to stable trajectories of DNA, RNA, and proteins","volume":"117","author":"Cheatham TE III","year":"1995","unstructured":"TE III Cheatham, JL Miller, T Fox, TA Darden, and PA Kollman. 1995. Molecular dynamics simulations on solvated biomolecular systems: the particle mesh Ewald method leads to stable trajectories of DNA, RNA, and proteins. Journal of the American Chemical Society 117, 14 (1995), 4193\u20134194.","journal-title":"Journal of the American Chemical Society"},{"key":"e_1_3_2_1_18_1","first-page":"L26","volume-title":"Feb. 20, 1991","author":"Couchman HMP","year":"1991","unstructured":"HMP Couchman. 1991. Mesh-refined P3M-A fast adaptive N-body algorithm. Astrophysical Journal, Part 2-Letters (ISSN 0004-637X), vol. 368, Feb. 20, 1991, p. L23\u2013L26. Research supported by NSERC. 368 (1991), L23-L26."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2013.10.013"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","unstructured":"Jo\u00ebl Derouillat Antoine Beck Fr\u00e9d\u00e9ric P\u00e9rez Thomas Vinci Micka\u00ebl Chiaramello A. Grassi M. Flentje G. Bouchard I. Plotnikov N. Aunai J. Dargent C. Riconda and M. Grech. 2018. SMILEI: a collaborative open-source multi-purpose particle-in-cell code for plasma simulation. Computer Physics Communications 222 (jan 2018) 351\u2013373. 10.1016\/j.cpc.2017.09.024","DOI":"10.1016\/j.cpc.2017.09.024"},{"key":"e_1_3_2_1_21_1","volume-title":"Ninth Workshop on Virtual Reality Interaction and Physical Simulation (VRIPHYS) (Ninth Workshop on Virtual Reality Interaction and Physical Simulation (VRIPHYS)), Jan Bender, Arjan Kuijper, Dieter W","author":"Durand Marie","unstructured":"Marie Durand, Bruno Raffin, and Fran\u00e7ois Faure. 2012. A Packed Memory Array to Keep Moving Particles Sorted. In Ninth Workshop on Virtual Reality Interaction and Physical Simulation (VRIPHYS) (Ninth Workshop on Virtual Reality Interaction and Physical Simulation (VRIPHYS)), Jan Bender, Arjan Kuijper, Dieter W. Fellner, and Eric Gu\u00e9rin (Eds.). The Eurographics Association, Darmstadt, Germany, 69\u201377. https:\/\/inria.hal.science\/hal-00762593"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/27.509991"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0010-4655(00)00228-9"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00008"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the twenty-first annual symposium on Parallelism in algorithms and architectures. 79\u201390","author":"Frigo Matteo","year":"2009","unstructured":"Matteo Frigo, Pablo Halpern, Charles E Leiserson, and Stephen Lewin-Berlin. 2009. Reducers and other Cilk++ hyperobjects. In Proceedings of the twenty-first annual symposium on Parallelism in algorithms and architectures. 79\u201390."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1063\/5.0178288"},{"key":"e_1_3_2_1_27_1","volume-title":"Performance engineering on A64FX with SVE intrinsics (Early experience on Ookami). Presentation","author":"Harrison Robert J.","unstructured":"Robert J. Harrison. 2020. Performance engineering on A64FX with SVE intrinsics (Early experience on Ookami). Presentation. Stony Brook University, Institute for Advanced Computational Science. https:\/\/www.stonybrook.edu\/commcms\/ookami\/support\/_docs\/RJHACMCF21.pdf Accessed May 2024."},{"key":"e_1_3_2_1_28_1","volume-title":"Eastwood","author":"Hockney Roger W.","year":"1988","unstructured":"Roger W. Hockney and James W. Eastwood. 1988. Computer Simulation Using Particles. Taylor & Francis."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Alon Itai Alan G Konheim and Michael Rodeh. 1981. A sparse table implementation of priority queues. (1981) 417\u2013431.","DOI":"10.1007\/3-540-10843-2_34"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 27th ACM Symposium on Parallelism in Algorithms and Architectures. 111\u2013122","author":"Lee Ting Angelina","year":"2015","unstructured":"I-Ting Angelina Lee and Tao B Schardl. 2015. Efficiently detecting races in cilk programs that use reducer hyperobjects. In Proceedings of the 27th ACM Symposium on Parallelism in Algorithms and Architectures. 111\u2013122."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of HPC China","author":"Li Biao","year":"2021","unstructured":"Biao Li, Qingyang Zhang, Jie Liu, Xinhai Chen, Xiaoxiong Zhu, Qinglin Wang, and Hongbin Zhuo. 2021. Kinetic Simulations of Laser Plasma Interaction with 42.9 Trillion Particles and 10.4 Billion Grids. In Proceedings of HPC China 2021."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","unstructured":"R. L. Morse and C. W. Nielson. 1971. Numerical Simulation of the Weibel Instability in One and Two Dimensions. The Physics of Fluids 14 4 (04 1971) 830\u2013840. arXiv:https:\/\/pubs.aip.org\/aip\/pfl\/article-pdf\/14\/4\/830\/12783644\/830_1_online.pdf 10.1063\/1.1693518","DOI":"10.1063\/1.1693518"},{"key":"e_1_3_2_1_33_1","volume-title":"Kubrusly","author":"de Moura Carlos A.","year":"2012","unstructured":"Carlos A. de Moura and Carlos S. Kubrusly. 2012. The Courant-Friedrichs-Lewy (CFL) Condition: 80 Years After Its Discovery. Birkh\u00e4user Basel."},{"key":"e_1_3_2_1_34_1","volume-title":"Large Scale Manycore-Aware PIC Simulation with Efficient Particle Binning. In 2017 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE","author":"Nakashima Hiroshi","year":"2017","unstructured":"Hiroshi Nakashima, Yoshiki Summura, Keisuke Kikura, and Yohei Miyake. 2017. Large Scale Manycore-Aware PIC Simulation with Efficient Particle Binning. In 2017 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, Orlando, FL, USA, 202\u2013211. 10.1109\/IPDPS.2017.65"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/SCW63240.2024.00185"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the 35th ACM International Conference on Supercomputing","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Jiaolin Luo, Ivy Peng, Kai Wu, and Dong Li. 2021. Optimizing large-scale plasma simulations on persistent memory-based heterogeneous memory with effective data placement across memory hierarchy. In Proceedings of the 35th ACM International Conference on Supercomputing (Virtual Event, USA) (ICS '21). Association for Computing Machinery, New York, NY, USA, 203\u2013214. 10.1145\/3447818.3460356"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region Workshops","author":"Simakov Nikolay A.","unstructured":"Nikolay A. Simakov, Matthew D. Jones, Thomas R. Furlani, Eva Siegmann, and Robert J. Harrison. 2024. First Impressions of the NVIDIA Grace CPU Superchip and NVIDIA Grace Hopper Superchip for Scientific Workloads. In Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region Workshops (Nagoya, Japan) (HPCAsia '24 Workshops). Association for Computing Machinery, New York, NY, USA, 36\u201344. 10.1145\/3636480.3637097"},{"key":"e_1_3_2_1_38_1","volume-title":"AIP Conference Proceedings 91","author":"Tajima T.","year":"1982","unstructured":"T. Tajima and J. M. Dawson. 1982. Laser accelerator by plasma waves. AIP Conference Proceedings 91, 1 (Sept. 1982), 69\u201393. 10.1063\/1.33805"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2013.03.010"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2016.08.023"},{"key":"e_1_3_2_1_41_1","volume-title":"Introducing the Scalable Matrix Extension for the Armv9-A architecture. Arm Community Blog. (aug","author":"Weidman Martin","year":"2021","unstructured":"Martin Weidman. 2021. Introducing the Scalable Matrix Extension for the Armv9-A architecture. Arm Community Blog. (aug 2021). https:\/\/community.arm.com\/arm-community-blogs\/b\/architectures-and-processors-blog\/posts\/scalable-matrix-extension-armv9-a-architecture Accessed May 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"High Performance Computing. ISC High Performance 2016 (Lecture Notes in Computer Science","volume":"301","author":"Zenker E.","unstructured":"E. Zenker, M. Bussmann, G. Juckeland, A. Debus, A. Huebl, R. Widera, T. Kluge, and U. Schramm. 2016. Performance-Portable Many-Core Plasma Simulations: Porting PIConGPU to OpenPOWER and Beyond. In High Performance Computing. ISC High Performance 2016 (Lecture Notes in Computer Science, Vol. 9945), Michela Taufer, Bernd Mohr, and Julian M. Kunkel (Eds.). Springer, 293\u2013301. 10.1007\/978-3-319-46079-6_21"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 38th ACM International Conference on Supercomputing (ICS '24)","author":"Zhao Wenxuan","year":"2024","unstructured":"Wenxuan Zhao, Liang Yuan, Baicheng Yan, Penghao Ma, Yunquan Zhang, Long Wang, and Zhe Wang. 2024. Stencil Computation with Vector Outer Product. In Proceedings of the 38th ACM International Conference on Supercomputing (ICS '24). ACM, Kyoto, Japan, 247\u2013258. 10.1145\/3650200.3656611"}],"event":{"name":"EUROSYS '26: 21st European Conference on Computer Systems","location":"McEwan Hall\/The University of Edinburgh Edinburgh Scotland UK","acronym":"EUROSYS '26","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 21st European Conference on Computer Systems"],"original-title":[],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T20:35:36Z","timestamp":1777062936000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767295.3769378"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,26]]},"references-count":43,"alternative-id":["10.1145\/3767295.3769378","10.1145\/3767295"],"URL":"https:\/\/doi.org\/10.1145\/3767295.3769378","relation":{},"subject":[],"published":{"date-parts":[[2026,4,26]]},"assertion":[{"value":"2026-04-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}