{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T03:31:06Z","timestamp":1769830266351,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T00:00:00Z","timestamp":1708387200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3,2]]},"DOI":"10.1145\/3627535.3638476","type":"proceedings-article","created":{"date-parts":[[2024,2,20]],"date-time":"2024-02-20T14:22:41Z","timestamp":1708438961000},"page":"333-347","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["ConvStencil: Transform Stencil Computation to Matrix Multiplication on Tensor Cores"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3818-4753","authenticated-orcid":false,"given":"Yuetao","family":"Chen","sequence":"first","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1013-1325","authenticated-orcid":false,"given":"Kun","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7753-0499","authenticated-orcid":false,"given":"Yuhao","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"},{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3655-8677","authenticated-orcid":false,"given":"Donglin","family":"Bai","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2313-5348","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"},{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9524-5476","authenticated-orcid":false,"given":"Lingxiao","family":"Ma","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3406-2907","authenticated-orcid":false,"given":"Liang","family":"Yuan","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7520-9640","authenticated-orcid":false,"given":"Yunquan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6455-3898","authenticated-orcid":false,"given":"Mao","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,2,20]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC55821.2022.9926299"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.9"},{"key":"e_1_3_2_1_3_1","volume-title":"Joseph James Gebis, Parry Husbands, Kurt Keutzer, David A Patterson, William Lester Plishker, John Shalf, Samuel Webb Williams, et al.","author":"Asanovic Krste","year":"2006","unstructured":"Krste Asanovic, Ras Bodik, Bryan Christopher Catanzaro, Joseph James Gebis, Parry Husbands, Kurt Keutzer, David A Patterson, William Lester Plishker, John Shalf, Samuel Webb Williams, et al. 2006. 
The landscape of parallel computing research: A view from berkeley. (2006)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1562764.1562783"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.107"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00078"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2615094"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_9_1","volume-title":"Tenth international workshop on frontiers in handwriting recognition. Suvisoft.","author":"Chellapilla Kumar","year":"2006","unstructured":"Kumar Chellapilla, Sidd Puri, and Patrice Simard. 2006. High performance convolutional neural networks for document processing. In Tenth international workshop on frontiers in handwriting recognition. Suvisoft."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356162"},{"key":"e_1_3_2_1_11_1","volume-title":"cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759","author":"Chetlur Sharan","year":"2014","unstructured":"Sharan Chetlur, Cliff Woolley, Philippe Vandermersch, Jonathan Cohen, John Tran, Bryan Catanzaro, and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3331057"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/SYNASC.2014.70"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458526"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3469030"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Tom Henretty Kevin Stock Louis-No\u00ebl Pouchet Franz Franchetti J. Ramanujam and P. Sadayappan. 2011. 
Data Layout Transformation for Stencil Computations on Short-Vector SIMD Architectures. In Compiler Construction Jens Knoop (Ed.). Springer Berlin Heidelberg Berlin Heidelberg 225--245.","DOI":"10.1007\/978-3-642-19861-8_13"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2467268"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.compfluid.2013.12.007"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00035"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2001.10041"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476154"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532392"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2021.108063"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the 1st international workshop on high-performance stencil computations","author":"Maruyama Naoya","year":"2014","unstructured":"Naoya Maruyama and Takayuki Aoki. 2014. Optimizing stencil computations for NVIDIA Kepler GPUs. In Proceedings of the 1st international workshop on high-performance stencil computations, Vienna. Citeseer, 89--95."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063398"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368826.3377904"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1542275.1542313"},{"key":"e_1_3_2_1_30_1","unstructured":"Nvidia. 2023. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/ Last accessed on 2023-7-24."},{"key":"e_1_3_2_1_31_1","unstructured":"Nvidia. 2023. cuDNN. https:\/\/developer.nvidia.com\/cudnn Last accessed on 2023-7-24."},{"key":"e_1_3_2_1_32_1","unstructured":"Nvidia. 2023. 
NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.cn\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf Last accessed on 2023-7-24."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830018.2830025"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2884045.2884047"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00049"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2862896"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00073"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2000.10015"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/2666356.2594342"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1015460304860"},{"key":"e_1_3_2_1_43_1","volume-title":"DRStencil: Exploiting Data Reuse within Low-order Stencil on GPU. In 2021 IEEE 23rd Int Conf on High Performance Computing & Communications","author":"You Xin","unstructured":"Xin You, Hailong Yang, Zhonghui Jiang, Zhongzhi Luan, and Depei Qian. 2021. DRStencil: Exploiting Data Reuse within Low-order Stencil on GPU. In 2021 IEEE 23rd Int Conf on High Performance Computing & Communications; 7th Int Conf on Data Science & Systems; 19th Int Conf on Smart City; 7th Int Conf on Dependability in Sensor, Cloud & Big Data Systems & Application (HPCC\/DSS\/SmartCity\/DependSys). 
IEEE, 63--70."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337835"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126920"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593705"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3577193.3593716"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00064"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356210"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441598"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC.2018.00009"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"}],"event":{"name":"PPoPP '24: 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Edinburgh United Kingdom","acronym":"PPoPP '24","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGPLAN ACM Special Interest Group on Programming Languages"]},"container-title":["Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel 
Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627535.3638476","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627535.3638476","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:26Z","timestamp":1750182566000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627535.3638476"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,20]]},"references-count":52,"alternative-id":["10.1145\/3627535.3638476","10.1145\/3627535"],"URL":"https:\/\/doi.org\/10.1145\/3627535.3638476","relation":{},"subject":[],"published":{"date-parts":[[2024,2,20]]},"assertion":[{"value":"2024-02-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}