{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:40:00Z","timestamp":1766220000919,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,8]]},"DOI":"10.1145\/3754598.3754599","type":"proceedings-article","created":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:34:32Z","timestamp":1766219672000},"page":"115-124","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["WinRS: Accelerate Winograd Backward-Filter Convolution with Tiny Workspace"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7849-3222","authenticated-orcid":false,"given":"Zhiyi","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6487-3658","authenticated-orcid":false,"given":"Junshi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5098-1503","authenticated-orcid":false,"given":"Jingwei","family":"Sun","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5415-9592","authenticated-orcid":false,"given":"Pengfei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1629-5988","authenticated-orcid":false,"given":"Zhuopin","family":"Xu","sequence":"additional","affiliation":[{"name":"Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9888-6238","authenticated-orcid":false,"given":"Jun","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5810-9223","authenticated-orcid":false,"given":"Qi","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,20]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"Roberto\u00a0L. Castro Diego Andrade and Basilio\u00a0B. Fraguela. 2021. OpenCNN: A Winograd Minimal Filtering Algorithm Implementation in CUDA. Mathematics 9 17 (2021). 10.3390\/math9172033","DOI":"10.3390\/math9172033"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASP-DAC47756.2020.9045214"},{"key":"e_1_3_3_2_4_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch and et al.2014. cuDNN: Efficient Primitives for Deep Learning. CoRR abs\/1410.0759 2014 (2014)."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00527"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT52863.2021.9609907"},{"key":"e_1_3_3_2_10_2","first-page":"4174","volume-title":"AAAI Conference on Artificial Intelligence","volume":"34","author":"Huang Di","year":"2020","unstructured":"Di Huang, Xishan Zhang, Rui Zhang, Tian Zhi, and et al.2020. DWM: A decomposable Winograd method for convolution acceleration. In AAAI Conference on Artificial Intelligence, Vol.\u00a034. 4174\u20134181."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","unstructured":"Liancheng Jia Yun Liang Xiuhong Li Liqiang Lu and Shengen Yan. 2020. Enabling Efficient Fast Convolution Algorithms on GPUs via MegaKernels. IEEE Trans. Comput. 69 7 (2020) 986\u2013997. 10.1109\/TC.2020.2973144","DOI":"10.1109\/TC.2020.2973144"},{"key":"e_1_3_3_2_12_2","first-page":"675","volume-title":"Proceedings of the 22nd ACM international conference on Multimedia Retrieval (ICMR)","author":"Jia Yangqing","year":"2014","unstructured":"Yangqing Jia, Evan Shelhamer, Jeff Donahue, and et al.2014. caffe: Convolutional architecture for fast feature embedding. In Proceedings of the 22nd ACM international conference on Multimedia Retrieval (ICMR) (Newark, NJ, USA). 675\u2013678."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSI-SoC57769.2023.10321932"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","unstructured":"S. Kala Babita\u00a0R. Jose Jimson Mathew and S. Nalesh. 2019. High-Performance CNN Accelerator on FPGA Using Unified Winograd-GEMM Architecture. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 27 12 (2019) 2816\u20132828. 10.1109\/TVLSI.2019.2941250","DOI":"10.1109\/TVLSI.2019.2941250"},{"key":"e_1_3_3_2_15_2","unstructured":"Alex Krizhevsky. [n. d.]. Cifar10. http:\/\/www.cs.toronto.edu\/\u00a0kriz\/cifar.html."},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/106972.106981"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472496"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054562"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","unstructured":"Yun Liang Liqiang Lu Qingcheng Xiao and Shengen Yan. 2020. Evaluating Fast Algorithms for Convolutional Neural Networks on FPGAs. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 39 4 (2020) 857\u2013870. 10.1109\/TCAD.2019.2897701","DOI":"10.1109\/TCAD.2019.2897701"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Yun Liang Liqiang Lu Qingcheng Xiao and Shengen Yan. 2020. Evaluating Fast Algorithms for Convolutional Neural Networks on FPGAs. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 39 4 (2020) 857\u2013870. 10.1109\/TCAD.2019.2897701","DOI":"10.1109\/TCAD.2019.2897701"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472473"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","unstructured":"Jinming Lu Hui Wang Jun Lin and Zhongfeng Wang. 2024. WinTA: An Efficient Reconfigurable CNN Training Accelerator With Decomposition Winograd. IEEE Transactions on Circuits and Systems I: Regular Papers 71 2 (2024) 634\u2013645. 10.1109\/TCSI.2023.3338471","DOI":"10.1109\/TCSI.2023.3338471"},{"key":"e_1_3_3_2_25_2","volume-title":"Proceedings of the 2rd International Conference on Learning Representations (ICLR)","author":"Mathieu Michael","year":"2014","unstructured":"Michael Mathieu, Mikael Henaff, and Yann LeCun. 2014. Fast Training of Convolutional Networks through FFTs. In Proceedings of the 2rd International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_26_2","unstructured":"NVIDIA. 2025. cuDNN. https:\/\/developer.nvidia.com\/cudnn."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"Bizhao Shi Jiaxi Zhang Zhuolun He and et al.2023. Efficient Super-Resolution System With Block-Wise Hybridization and Quantized Winograd on FPGA. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems 42 11 (2023) 3910\u20133924. 10.1109\/TCAD.2023.3247621","DOI":"10.1109\/TCAD.2023.3247621"},{"key":"e_1_3_3_2_28_2","volume-title":"Proceedings of the 3rd International Conference on Learning Representations (ICLR)","author":"Simonyan Karen","year":"2015","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In Proceedings of the 3rd International Conference on Learning Representations (ICLR) (San Diego, CA)."},{"key":"e_1_3_3_2_29_2","volume-title":"Proceedings of the 3rd International Conference on Learning Representations (ICLR)","author":"Vasilache Nicolas","year":"2015","unstructured":"Nicolas Vasilache, Jeff Johnson, Michael Mathieu, and et al.2015. Fast Convolutional Nets with fbfft: A GPU Performance Evaluation. In Proceedings of the 3rd International Conference on Learning Representations (ICLR) (San Diego, CA)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611970364"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511985"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178496"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374520"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","unstructured":"Chen Yang Yizhou Wang Xiaoli Wang and Li Geng. 2019. WRA: A 2.2-to-6.3 TOPS Highly Unified Dynamically Reconfigurable Accelerator Using a Novel Winograd Decomposition Algorithm for Convolutional Neural Networks. IEEE Transactions on Circuits and Systems I: Regular Papers 66 9 (2019) 3480\u20133493. 10.1109\/TCSI.2019.2928682","DOI":"10.1109\/TCSI.2019.2928682"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","unstructured":"Juan Yepez and Seok-Bum Ko. 2020. Stride 2 1-D 2-D and 3-D Winograd for Convolutional Neural Networks. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 28 4 (2020) 853\u2013863. 10.1109\/TVLSI.2019.2961602","DOI":"10.1109\/TVLSI.2019.2961602"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3673038.3673039"}],"event":{"name":"ICPP '25: 54th International Conference on Parallel Processing","location":"San Diego CA USA","acronym":"ICPP '25"},"container-title":["Proceedings of the 54th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3754598.3754599","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,20]],"date-time":"2025-12-20T08:38:26Z","timestamp":1766219906000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3754598.3754599"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,8]]},"references-count":35,"alternative-id":["10.1145\/3754598.3754599","10.1145\/3754598"],"URL":"https:\/\/doi.org\/10.1145\/3754598.3754599","relation":{},"subject":[],"published":{"date-parts":[[2025,9,8]]},"assertion":[{"value":"2025-12-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}