{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T07:04:00Z","timestamp":1772867040418,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656611","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"247-258","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Stencil Computation with Vector Outer Product"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4464-7104","authenticated-orcid":false,"given":"Wenxuan","family":"Zhao","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China and University of Chinese Academy of Science, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3406-2907","authenticated-orcid":false,"given":"Liang","family":"Yuan","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1277-8357","authenticated-orcid":false,"given":"Baicheng","family":"Yan","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co. Ltd, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9684-2960","authenticated-orcid":false,"given":"Penghao","family":"Ma","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7520-9640","authenticated-orcid":false,"given":"Yunquan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1638-5993","authenticated-orcid":false,"given":"Long","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1888-4311","authenticated-orcid":false,"given":"Zhe","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/29873.29875"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"R. Andonov S. Balev S. Rajopadhye and N. Yanev. 2001. Optimal Semi-oblique Tiling. In SPAA \u201901. 153\u2013162.","DOI":"10.1145\/378580.378619"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2003.1233716"},{"key":"e_1_3_2_1_4_1","unstructured":"ARM. [n. d.]. A-Profile Architectures. https:\/\/www.arm.com\/architecture\/cpu\/a-profile"},{"key":"e_1_3_2_1_5_1","unstructured":"K. Asanovic R. Bodik B. Catanzaro J. Gebis P. Husbands K. Keutzer D.\u00a0Patterson andxs W.\u00a0Plishker J. Shalf S. Williams and K. Yelick. 2006. The Landscape of Parallel Computing Research: A View from Berkeley. Technical Report."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"V. Bandishti I. Pananilath and U. Bondhugula. 2012. Tiling stencil computations to maximize parallelism(SC \u201912). 1\u201311.","DOI":"10.1109\/SC.2012.107"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Protonu Basu Mary Hall Samuel Williams Brian\u00a0Van Straalen Leonid Oliker and Phillip Colella. 2015. Compiler-Directed Transformation for Higher-Order Stencils(IPDPS \u201915). 313\u2013323.","DOI":"10.1109\/IPDPS.2015.103"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Uday Bondhugula Albert Hartono J. Ramanujam and P. Sadayappan. 2008. A Practical Automatic Polyhedral Parallelizer and Locality Optimizer(PLDI \u201908).","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Diego Caballero Sara Royuela Roger Ferrer Alejandro Duran and Xavier Martorell. 2015. Optimizing overlapped memory accesses in user-directed vectorization. In ICS. 393\u2013404.","DOI":"10.1145\/2751205.2751224"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-010-9373-6"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Kaushik Datta Mark Murphy Vasily Volkov Samuel Williams Jonathan Carter Leonid Oliker David Patterson John Shalf and Katherine Yelick. 2008. Stencil Computation Optimization and Auto-tuning on State-of-the-art Multicore Architectures(SC \u201908). Article 4 12\u00a0pages.","DOI":"10.1109\/SC.2008.5222004"},{"key":"e_1_3_2_1_12_1","article-title":"Algorithm 942","volume":"40","author":"de\u00a0la Cruz Ra\u00fal","year":"2014","unstructured":"Ra\u00fal de\u00a0la Cruz and Mauricio Araya-Polo. 2014. Algorithm 942: Semi-Stencil. ACM Trans. Math. Softw. 40, 3, Article 23 (April 2014), 39\u00a0pages.","journal-title":"Semi-Stencil. ACM Trans. Math. Softw."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Steven\u00a0J. Deitz Bradford\u00a0L. Chamberlain and Lawrence Snyder. 2001. Eliminating Redundancies in Sum-of-product Array Computations(ICS \u201901). 65\u201377.","DOI":"10.1145\/377792.377807"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Chris Ding and Yun He. 2001. A Ghost Cell Expansion Method for Reducing Communications in Solving PDE Problems(SC \u201901). 50\u201350.","DOI":"10.1145\/582034.582084"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Matteo Frigo and Volker Strumpen. 2005. Cache oblivious stencil computations(ICS \u201905). 361\u2013366.","DOI":"10.1145\/1088149.1088197"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Matteo Frigo and Volker Strumpen. 2006. The cache complexity of multithreaded cache oblivious algorithms(SPAA \u201906). 271\u2013280.","DOI":"10.1145\/1148109.1148157"},{"key":"e_1_3_2_1_17_1","unstructured":"Google. 2021. Tensor Processing Units (TPUs). https:\/\/cloud.google.com\/tpu"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Tobias Grosser Albert Cohen Justin Holewinski P. Sadayappan and Sven Verdoolaege. 2014. Hybrid Hexagonal\/Classical Tiling for GPUs(CGO \u201914). 66\u201375.","DOI":"10.1145\/2581122.2544160"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Tobias Grosser Albert Cohen Paul H.\u00a0J. Kelly J. Ramanujam P. Sadayappan and Sven Verdoolaege. 2013. Split Tiling for GPUs: Automatic Parallelization Using Trapezoidal Tiles(GPGPU-6). 24\u201331.","DOI":"10.1145\/2458523.2458526"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1142\/S0129626414410023"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Mark Hampton and Krste Asanovic. 2008. Compiling for vector-thread architectures. In CGO. 205\u2013215.","DOI":"10.1145\/1356058.1356085"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Tom Henretty Kevin Stock Louis-No\u00ebl Pouchet Franz Franchetti J. Ramanujam and P. Sadayappan. 2011. Data Layout Transformation for Stencil Computations on Short-vector SIMD Architectures(CC\u201911\/ETAPS\u201911). 225\u2013245.","DOI":"10.1007\/978-3-642-19861-8_13"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Tom Henretty Richard Veras Franz Franchetti Louis-No\u00ebl Pouchet J. Ramanujam and P. Sadayappan. 2013. A Stencil Compiler for Short-vector SIMD Architectures(ICS \u201913). 13\u201324.","DOI":"10.1145\/2464996.2467268"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Justin Holewinski Louis-No\u00ebl Pouchet and P. Sadayappan. 2012. High-performance Code Generation for Stencil Computations on GPU Architectures(ICS \u201912). 311\u2013320.","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_2_1_25_1","unstructured":"IBM. 2020. Power10 with Matrix Math Accelerator. https:\/\/developer.ibm.com\/tutorials\/power10-business-inferencing-at-scale-with-mma\/"},{"key":"e_1_3_2_1_26_1","unstructured":"Intel. 2021. Intel\u00ae Advanced Matrix Extensions Overview. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/accelerator-engines\/advanced-matrix-extensions\/overview.html"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"F. Irigoin and R. Triolet. 1988. Supernode Partitioning(POPL \u201988). 319\u2013329.","DOI":"10.1145\/73560.73588"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Guohua Jin John Mellor-Crummey and Robert Fowler. 2001. Increasing Temporal Locality with Skewing and Recursive Blocking(SC \u201901). 43\u201343.","DOI":"10.1145\/582034.582077"},{"key":"e_1_3_2_1_29_1","volume-title":"Optimizing Compilers for Modern Architectures: A Dependence-Based Approach","author":"Kennedy Ken","unstructured":"Ken Kennedy and John\u00a0R. Allen. 2001. Optimizing Compilers for Modern Architectures: A Dependence-Based Approach. Morgan Kaufmann Publishers Inc."},{"key":"e_1_3_2_1_30_1","unstructured":"D. K\u00f6nig. 1931. Graphs and matrices. In Mat. Lapok. 116\u2013\u2013119."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Sriram Krishnamoorthy Muthu Baskaran Uday Bondhugula J. Ramanujam Atanas Rountev and P Sadayappan. 2007. Effective Automatic Parallelization of Stencil Computations(PLDI \u201907). 235\u2013244.","DOI":"10.1145\/1250734.1250761"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Monica\u00a0D. Lam Edward\u00a0E. Rothberg and Michael\u00a0E. Wolf. 1991. The Cache Performance and Optimizations of Blocked Algorithms(ASPLOS IV). 63\u201374.","DOI":"10.1145\/106972.106981"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Samuel Larsen and Saman Amarasinghe. 2000. Exploiting Superword Level Parallelism with Multimedia Instruction Sets. In PLDI. 145\u2013156.","DOI":"10.1145\/349299.349320"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Xiaoyan Liu Yi Liu Hailong Yang Jianjin Liao Mingzhen Li Zhongzhi Luan and Depei Qian. 2022. Toward Accelerated Stencil Computation by Adapting Tensor Core Unit on GPU. In ICS \u201922 . 1\u201312.","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_2_1_35_1","unstructured":"L. Lovasz. 1975. On minimax theorems of combinatorics. Doctoral Thesis (1975) 26:209\u2013264."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400718"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1137\/140991133"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/362875.362879"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6377(82)90039-6"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Jiayuan Meng and Kevin Skadron. 2009. Performance modeling and automatic ghost zone optimization for iterative stencil loops on GPUs(ICS \u201909). 256\u2013265.","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_2_1_41_1","volume-title":"Arxiv","author":"Moreira Jose","year":"2021","unstructured":"Jose Moreira. [n. d.]. A matrix math facility for Power ISA processors. In Arxiv 2021. 1\u201312."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"A. Nguyen N. Satish J. Chhugani C. Kim and P. Dubey. 2010. 3.5-D Blocking Optimization for Stencil Computations on Modern CPUs and GPUs(SC \u201910).","DOI":"10.1109\/SC.2010.2"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Dorit Nuzman and Ayal Zaks. 2008. Outer-loop vectorization: revisited for short SIMD architectures. In PACT. 2\u201311.","DOI":"10.1145\/1454115.1454119"},{"key":"e_1_3_2_1_44_1","unstructured":"NVIDIA. 2020. NVIDIA V100 TENSOR CORE GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/v100\/"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2739047"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"E.\u00a0H. Phillips and M. Fatica. 2010. Implementing the Himeno benchmark with CUDA on GPU clusters(IPDPS \u201910). 1\u201310.","DOI":"10.1109\/IPDPS.2010.5470394"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Fabrice Rastello and Thierry Dauxois. 2002. Efficient Tiling for an ODE Discrete Integration Program: Redundant Tasks Instead of Trapezoidal Shaped-Tiles(IPDPS \u201902).","DOI":"10.1109\/IPDPS.2002.1016667"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Gabriel Rivera and Chau-Wen Tseng. 2000. Tiling Optimizations for 3D Scientific Computations(SC \u201900). Article 32.","DOI":"10.1109\/SC.2000.10015"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Yonghong Song and Zhiyuan Li. 1999. New Tiling Techniques to Improve Cache Temporal Locality(PLDI \u201999). 215\u2013228.","DOI":"10.1145\/301618.301668"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007559022013"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Kevin Stock Martin Kong Tobias Grosser Louis-No\u00ebl Pouchet Fabrice Rastello Jagannathan Ramanujam and Ponnuswamy Sadayappan. 2014. A framework for enhancing data reuse via associative reordering. In PLDI. 65\u201376.","DOI":"10.1145\/2594291.2594342"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Robert Strzodka Mohammed Shaheen Dawid Pajak and Hans-Peter Seidel. 2010. Cache oblivious parallelograms in iterative stencil computations(ICS \u201910). 49\u201359.","DOI":"10.1145\/1810085.1810096"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Robert Strzodka Mohammed Shaheen Dawid Pajak and Hans-Peter Seidel. 2011. Cache Accurate Time Skewing in Iterative Stencil Computations(ICPP \u201911). 11\u00a0pages.","DOI":"10.1109\/ICPP.2011.47"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Yuan Tang Rezaul\u00a0Alam Chowdhury Bradley\u00a0C. Kuszmaul Chi-Keung Luk and Charles\u00a0E. Leiserson. 2011. The Pochoir Stencil Compiler(SPAA \u201911). 117\u2013128.","DOI":"10.1145\/1989493.1989508"},{"key":"e_1_3_2_1_55_1","unstructured":"Martin Weidmann. 2021. Scalable Matrix Extension for the Armv9-A Architecture. https:\/\/community.arm.com\/arm-community-blogs\/b\/architectures-and-processors-blog\/posts\/scalable-matrix-extension-armv9-a-architecture\/"},{"key":"e_1_3_2_1_56_1","volume-title":"An Initial Evaluation of Arm\u2019s Scalable Matrix Extension","author":"Wilkinson Finn","unstructured":"Finn Wilkinson and Simon McIntosh-Smith. 2022. An Initial Evaluation of Arm\u2019s Scalable Matrix Extension. In PMBS. IEEE, 135\u2013140."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Michael\u00a0E. Wolf and Monica\u00a0S. Lam. 1991. A Data Locality Optimizing Algorithm(PLDI \u201991). 30\u201344.","DOI":"10.1145\/113445.113449"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"M. Wolfe. 1989. More Iteration Space Tiling(Supercomputing \u201989). 655\u2013664.","DOI":"10.1145\/76263.76337"},{"key":"e_1_3_2_1_59_1","unstructured":"D. Wonnacott. 2000. Using time skewing to eliminate idle time due to memory bandwidth and network limitations(IPDPS \u201900). 171\u2013180."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1015460304860"},{"key":"e_1_3_2_1_61_1","volume-title":"IMPACT 2013","author":"Wonnacott G","year":"2013","unstructured":"David\u00a0G Wonnacott and Michelle\u00a0Mills Strout. 2013. On the scalability of loop tiling techniques. IMPACT 2013 (2013)."},{"key":"e_1_3_2_1_62_1","volume-title":"Vector Folding: Improving Stencil Performance via Multi-Dimensional SIMD-Vector Representation. In HPCC. 865\u2013870.","author":"Yount Charles","year":"2015","unstructured":"Charles Yount. 2015. Vector Folding: Improving Stencil Performance via Multi-Dimensional SIMD-Vector Representation. In HPCC. 865\u2013870."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Liang Yuan Hang Cao Yunquan Zhang Kun Li Pengqi Lu and Yue Yue. 2021. Temporal vectorization for stencils. In SC. Article 82 13\u00a0pages.","DOI":"10.1145\/3458817.3476149"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Liang Yuan Yunquan Zhang Peng Guo and Shan Huang. 2017. Tessellating stencils. In SC. 1\u201313.","DOI":"10.1145\/3126908.3126920"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Yongpeng Zhang and Frank Mueller. 2012. Auto-generation and Auto-tuning of 3D Stencil Codes on GPU Clusters(CGO \u201912). 155\u2013164.","DOI":"10.1145\/2259016.2259037"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Tuowen Zhao Protonu Basu Samuel Williams Mary Hall and Hans Johansen. 2019. Exploiting reuse and vectorization in blocked stencil computations on CPUs and GPUs. In SC. 1\u201344.","DOI":"10.1145\/3295500.3356210"},{"key":"e_1_3_2_1_67_1","volume-title":"Exploiting mixed SIMD parallelism by reducing data reorganization overhead","author":"Zhou Hao","unstructured":"Hao Zhou and Jingling Xue. 2016. Exploiting mixed SIMD parallelism by reducing data reorganization overhead. In CGO. IEEE, 59\u201369."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Xing Zhou Jean-Pierre Giacalone Mar\u00eda\u00a0Jes\u00fas Garzar\u00e1n Robert\u00a0H. Kuhn Yang Ni and David Padua. 2012. Hierarchical Overlapped Tiling(CGO \u201912). 207\u2013218.","DOI":"10.1145\/2259016.2259044"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656611","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656611","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:23:57Z","timestamp":1755876237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656611"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":68,"alternative-id":["10.1145\/3650200.3656611","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656611","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}