{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T08:17:37Z","timestamp":1769501857617,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","funder":[{"name":"Ministry of Education AcRF Tier 2 grant in Singapore","award":["MOE-000766-00"],"award-info":[{"award-number":["MOE-000766-00"]}]},{"name":"open research fund of Pengcheng Laboratory","award":["2025KF1B0030"],"award-info":[{"award-number":["2025KF1B0030"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756124","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1579-1591","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["HiPACK: Efficient Sub-8-Bit Direct Convolution with SIMD and Bitwise Management"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5798-2282","authenticated-orcid":false,"given":"Yao","family":"Chen","sequence":"first","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6594-8375","authenticated-orcid":false,"given":"Cheng","family":"Gong","sequence":"additional","affiliation":[{"name":"Tiangong University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8618-4581","authenticated-orcid":false,"given":"Bingsheng","family":"He","sequence":"additional","affiliation":[{"name":"School of Computing, National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Tahmid Abtahi Colin Shea Amey Kulkarni and Tinoosh Mohsenin. 2018. Accelerating convolutional neural network with FFT on embedded hardware. ISVLSI 26 9 (2018) 1737\u20131749.","DOI":"10.1109\/TVLSI.2018.2825145"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3289602.3293915"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI.2019.00012"},{"key":"e_1_3_3_2_5_2","unstructured":"Jungwook Choi Zhuo Wang Swagath Venkataramani Pierce\u00a0I-Jen Chuang Vijayalakshmi Srinivasan and Kailash Gopalakrishnan. 2018. PACT: Parameterized Clipping Activation for Quantized Neural Networks. CoRR abs\/1805.06085 (2018)."},{"key":"e_1_3_3_2_6_2","volume-title":"International Conference on Learning Representations","author":"Esser Steven\u00a0K","year":"2020","unstructured":"Steven\u00a0K Esser, Jeffrey\u00a0L McKinstry, Deepika Bablani, Rathinakumar Appuswamy, and Dharmendra\u00a0S Modha. 2020. Learned Step Size Quantization. In International Conference on Learning Representations."},{"key":"e_1_3_3_2_7_2","unstructured":"Facebook AI\u00a0Research (FAIR). 2018. QNNPACK: Quantized Neural Networks PACKage. https:\/\/github.com\/pytorch\/QNNPACK. Accessed: 2024-10-17."},{"key":"e_1_3_3_2_8_2","unstructured":"Amir Gholami Sehoon Kim Zhen Dong Zhewei Yao Michael\u00a0W. Mahoney and Kurt Keutzer. 2021. A Survey of Quantization Methods for Efficient Neural Network Inference. CoRR abs\/2103.13630 (2021)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Cheng Gong Yao Chen Ye Lu Tao Li Cong Hao and Deming Chen. 2021. VecQ: Minimal Loss DNN Model Compression With Vectorized Weight Quantization. IEEE Trans. Comput. 70 5 (2021) 696\u2013710.","DOI":"10.1109\/TC.2020.2995593"},{"key":"e_1_3_3_2_10_2","volume-title":"Proceedings of the International Joint Conference on Neural Networks","author":"Gong Cheng","year":"2019","unstructured":"Cheng Gong, Tao Li, Ye Lu, Cong Hao, Xiaofan Zhang, Deming Chen, and Yao Chen. 2019. \u03bcL2Q: An Ultra-Low Loss Quantization Method for DNN Compression. In Proceedings of the International Joint Conference on Neural Networks. IEEE."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Cheng Gong Ye Lu Kunpeng Xie Zongming Jin Tao Li and Yanzhi Wang. 2022. Elastic Significant Bit Quantization and Acceleration for Deep Neural Networks. IEEE Transactions on Parallel and Distributed Systems (2022).","DOI":"10.1109\/TPDS.2021.3129615"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00495"},{"key":"e_1_3_3_2_13_2","unstructured":"Google. 2017. gemmlowp: Low-precision matrix multiplication library. https:\/\/github.com\/google\/gemmlowp. [Online; accessed 30-April-2024]."},{"key":"e_1_3_3_2_14_2","unstructured":"Google. 2021. XNNPACK: High-efficiency floating-point neural network inference operators for mobile server and Web. https:\/\/github.com\/google\/XNNPACK."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317829"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00448"},{"key":"e_1_3_3_2_18_2","unstructured":"Hossein Katebi Navidreza Asadi and Maziar Goudarzi. 2022. FullPack: Full Vector Utilization for Sub-Byte Quantized Inference on General Purpose CPUs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.06982 (2022)."},{"key":"e_1_3_3_2_19_2","unstructured":"Liangzhen Lai Naveen Suda and Vikas Chandra. 2018. CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs. CoRR abs\/1801.06601 (2018)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00120"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASP-DAC52403.2022.9712553"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP52443.2021.00045"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Zechun Liu Wenhan Luo Baoyuan Wu Xin Yang Wei Liu and Kwang-Ting Cheng. 2020. Bi-real net: Binarizing deep network towards real-network performance. International Journal of Computer Vision 128 (2020) 202\u2013219.","DOI":"10.1007\/s11263-019-01227-8"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577435"},{"key":"e_1_3_3_2_26_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_2_27_2","unstructured":"ARM Software. 2019. ArmNN: Open-source neural network library. https:\/\/github.com\/ARM-software\/armnn. Accessed: 2024-10-17."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/SiPS.2018.8598402"},{"key":"e_1_3_3_2_29_2","first-page":"52","volume-title":"Proceedings of Machine Learning and Systems","volume":"4","author":"Won Jaeyeon","year":"2022","unstructured":"Jaeyeon Won, Jeyeon Si, Sam Son, Tae\u00a0Jun Ham, and Jae\u00a0W. Lee. 2022. ULPPACK: Fast Sub-8-bit Matrix Multiply on Commodity SIMD Hardware. In Proceedings of Machine Learning and Systems , D.\u00a0Marculescu, Y.\u00a0Chi, and C.\u00a0Wu (Eds.), Vol.\u00a04. 52\u201363."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05677-3"},{"key":"e_1_3_3_2_31_2","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","author":"Zhang Xiaofan","year":"2020","unstructured":"Xiaofan Zhang, Haoming Lu, Cong Hao, Jiachen Li, Bowen Cheng, Yuhong Li, Kyle Rupnow, Jinjun Xiong, Thomas Huang, Honghui Shi, Wen-Mei Hwu, and Deming Chen. 2020. SkyNet: a Hardware-Efficient Method for Object Detection and Tracking on Embedded Systems. In Proceedings of Machine Learning and Systems (MLSys)."},{"key":"e_1_3_3_2_32_2","volume-title":"Proceedings of IEEE\/ACM International Conference on Computer-Aided Design","author":"Zhang Xiaofan","year":"2017","unstructured":"Xiaofan Zhang, Anand Ramachandran, Chuanhao Zhuge, Di He, Wei Zuo, Zuofu Cheng, Kyle Rupnow, and Deming Chen. 2017. Machine Learning on FPGAs to Face the IoT Revolution. In Proceedings of IEEE\/ACM International Conference on Computer-Aided Design."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240801"},{"key":"e_1_3_3_2_34_2","unstructured":"Shuchang Zhou Zekun Ni Xinyu Zhou He Wen Yuxin Wu and Yuheng Zou. 2016. DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. CoRR abs\/1606.06160 (2016)."}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756124","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:45:46Z","timestamp":1769463946000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756124"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":33,"alternative-id":["10.1145\/3725843.3756124","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756124","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}