{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:13Z","timestamp":1755870013400,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFB3001501"],"award-info":[{"award-number":["2023YFB3001501"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62322201, U23B2020"],"award-info":[{"award-number":["62322201, U23B2020"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["YWF-23-L-1121, JKF-20240198, JK2024-58"],"award-info":[{"award-number":["YWF-23-L-1121, JKF-20240198, JK2024-58"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725760","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"250-264","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Locality-aware Instruction Stream Scheduling for Stencil Computation on ARM Processors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-2651-9651","authenticated-orcid":false,"given":"Shanghao","family":"Liu","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1101-7927","authenticated-orcid":false,"given":"Hailong","family":"Yang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5163-4607","authenticated-orcid":false,"given":"Xin","family":"You","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7186-0556","authenticated-orcid":false,"given":"Zhongzhi","family":"Luan","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1829-2817","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5382-1473","authenticated-orcid":false,"given":"Depei","family":"Qian","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Andrew Adams Karima Ma Luke Anderson Riyadh Baghdadi Tzu-Mao Li Micha\u00ebl Gharbi Benoit Steiner Steven Johnson Kayvon Fatahalian Fr\u00e9do Durand et\u00a0al. 2019. Learning to optimize halide with tree search and random programs. ACM Transactions on Graphics (TOG) 38 4 (2019) 1\u201312.","DOI":"10.1145\/3306346.3322967"},{"key":"e_1_3_3_1_3_2","unstructured":"Yulong Ao Chao Yang Xinliang Wang Wei Xue and Wenjing Ma. 2017. 26 PFLOPS Stencil Computations for Atmospheric Modeling on Sunway TaihuLight. IEEE (2017)."},{"key":"e_1_3_3_1_4_2","unstructured":"ARM. 2025. ARM cortex-A78 core software optimization guide. https:\/\/documentation-service.arm.com\/static\/60a5413bd63d3c31550c391e\/"},{"key":"e_1_3_3_1_5_2","unstructured":"ARM. 2025. Dual issue for NEON instructions version1.0. https:\/\/developer.arm.com\/documentation\/den0018\/a\/Optimizing-NEON-Code\/Scheduling\/Dual-issue-for-NEON-instructions"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Adri\u00e0 Armejach Helena Caminal Juan\u00a0M Cebrian Rub\u00e9n Langarita Rekai Gonz\u00e1lez-Alberquilla Chris Adeniyi-Jones Mateo Valero Marc Casas and Miquel Moret\u00f3. 2020. Using Arm\u2019s scalable vector extension on stencil codes. The Journal of Supercomputing 76 3 (2020) 2039\u20132062.","DOI":"10.1007\/s11227-019-02842-5"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.107"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751226"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.70"},{"key":"e_1_3_3_1_10_2","unstructured":"Phillip Colella. 2004. Defining software requirements for scientific computing. DARPA HPCS. (2004). https:\/\/www.krellinst.org\/doecsgf\/conf\/2013\/pres\/pcolella.pdf"},{"key":"e_1_3_3_1_11_2","volume-title":"In Proc. iWAPT2009: The Fourth International Workshop on Automatic Performance Tuning","author":"Datta Kaushik","year":"2009","unstructured":"Kaushik Datta, Samuel Williams, Vasily Volkov, Jonathan Carter, Leonid Oliker, John Shalf, and Katherine Yelick. 2009. Auto-tuning the 27-point stencil for multicore. In In Proc. iWAPT2009: The Fourth International Workshop on Automatic Performance Tuning , Vol.\u00a070."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Tyler Denniston Shoaib Kamil and Saman Amarasinghe. 2016. Distributed halide. ACM SIGPLAN Notices 51 8 (2016) 1\u201312.","DOI":"10.1145\/3016078.2851157"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Jian-Bin Fang Xiang-Ke Liao Chun Huang and De-Zun Dong. 2021. Performance evaluation of memory-centric armv8 many-core architectures: A case study with phytium 2000+. Journal of Computer Science and Technology 36 (2021) 33\u201343.","DOI":"10.1007\/s11390-020-0741-6"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/1088149.1088197"},{"key":"e_1_3_3_1_15_2","volume-title":"Processor microarchitecture: An implementation perspective","author":"Gonzalez Antonio","year":"2022","unstructured":"Antonio Gonzalez, Fernando Latorre, and Grigorios Magklis. 2022. Processor microarchitecture: An implementation perspective. Springer Nature."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Jia Guo Ganesh Bikshandi Basilio\u00a0B. Fraguela and David Padua. 2010. Writing productive stencil codes with overlapped tiling. Concurrency & Computation Practice & Experience 21 1 (2010) 25\u201339.","DOI":"10.1002\/cpe.1340"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Tobias Gysi Christoph M\u00fcller Oleksandr Zinenko Stephan Herhut Eddie Davis Tobias Wicky Oliver Fuhrer Torsten Hoefler and Tobias Grosser. 2021. Domain-specific multi-level IR rewriting for GPU: The Open Earth compiler for GPU-accelerated climate simulation. ACM Transactions on Architecture and Code Optimization (TACO) 18 4 (2021) 1\u201323.","DOI":"10.1145\/3469030"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_3_1_19_2","unstructured":"Mingzhen Li Yi Liu Hailong Yang Yongmin Hu Qingxiao Sun Bangduo Chen Xin You Xiaoyan Liu Zhongzhi Luan and D. Qian. 2021. Automatic Code Generation and Optimization of Large-scale Stencil Computation on Many-core Processors. Proceedings of the 50th International Conference on Parallel Processing (2021)."},{"key":"e_1_3_3_1_20_2","series-title":"(SC \u201924)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis","author":"Liu Xiaoyan","year":"2024","unstructured":"Xiaoyan Liu, Xinyu Yang, Kejie Ma, Shanghao Liu, Kaige Zhang, Hailong Yang, Yi Liu, Zhongzhi Luan, and Depei Qian. 2024. Moirae: Generating High-Performance Composite Stencil Programs with Global Optimizations. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis (Atlanta, GA, USA) (SC \u201924). IEEE Press, Article 20, 15\u00a0pages."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063398"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Ravi\u00a0Teja Mullapudi Andrew Adams Dillon Sharlet Jonathan Ragan-Kelley and Kayvon Fatahalian. 2016. Automatically scheduling halide image processing pipelines. ACM Transactions on Graphics (TOG) 35 4 (2016) 1\u201311.","DOI":"10.1145\/2897824.2925952"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Jonathan Ragan-Kelley Connelly Barnes Andrew Adams Sylvain Paris Fr\u00e9do Durand and Saman Amarasinghe. 2013. Halide: a language and compiler for optimizing parallelism locality and recomputation in image processing pipelines. Acm Sigplan Notices 48 6 (2013) 519\u2013530.","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/2884045.2884047"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178500"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00073"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2007.370291"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2011.59"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Mageda Sharafeddine Komal Jothi and Haitham Akkary. 2012. Disjoint out-of-order execution processor. ACM Transactions on Architecture and Code Optimization (TACO) 9 3 (2012) 1\u201332.","DOI":"10.1145\/2355585.2355592"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751240"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Larisa Stoltzfus Bastian Hagedorn Michel Steuwer Sergei Gorlatch and Christophe Dubach. 2019. Tiling Optimizations for Stencil Computations Using Rewrite Rules in Lift. ACM Transactions on Architecture and Code Optimization (TACO) (2019).","DOI":"10.1145\/3368858"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00090"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Gerhard Wellein Georg Hager Thomas Zeiser Markus Wittmann and Holger Fehske. 2009. Efficient Temporal Blocking for Stencil Computations by Multicore-Aware Wavefront Parallelization. IEEE (2009).","DOI":"10.1109\/COMPSAC.2009.82"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Samuel Williams Andrew Waterman and David Patterson. 2009. Roofline: an insightful visual performance model for multicore architectures. Commun. ACM 52 4 (2009) 65\u201376.","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Jing Xia Chuanning Cheng Xiping Zhou Yuxing Hu and Peter Chun. 2021. Kunpeng 920: The first 7-nm chiplet-based 64-core arm soc for cloud services. IEEE Micro 41 5 (2021) 67\u201375.","DOI":"10.1109\/MM.2021.3085578"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-CSS-ICESS.2015.27"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/WOLFHPC.2016.08"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476149"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126920"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Kaifang Zhang Huayou Su and Yong Dou. 2021. Multilevel parallelism optimization of stencil computations on SIMDlized NUMA architectures. The Journal of Supercomputing 77 11 (2021) 13584\u201313600.","DOI":"10.1007\/s11227-021-03823-3"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-SmartCity-DSS50907.2020.00023"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-SmartCity-DSS50907.2020.00015"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICBASE51474.2020.00078"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725760","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:01:44Z","timestamp":1755867704000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725760"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":42,"alternative-id":["10.1145\/3721145.3725760","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725760","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}