{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:47Z","timestamp":1776931187752,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,18]]},"DOI":"10.1145\/3725843.3756101","type":"proceedings-article","created":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T17:19:56Z","timestamp":1760721596000},"page":"1625-1639","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MHE-TPE: Multi-Operand High-Radix Encoder for Mixed-Precision Fixed-Point Tensor Processing Engines"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4977-5363","authenticated-orcid":false,"given":"Qizhe","family":"Wu","sequence":"first","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0685-1817","authenticated-orcid":false,"given":"Jinyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5352-3126","authenticated-orcid":false,"given":"Zhanhe","family":"Hu","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0023-2367","authenticated-orcid":false,"given":"Zhichen","family":"Zeng","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3196-2942","authenticated-orcid":false,"given":"Huawen","family":"Liang","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7130-1949","authenticated-orcid":false,"given":"Jiuru","family":"Zhu","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7001-8893","authenticated-orcid":false,"given":"Linfeng","family":"Tao","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1450-2740","authenticated-orcid":false,"given":"Xin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zbit Semiconductor, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9047-1783","authenticated-orcid":false,"given":"Zekang","family":"Cheng","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0352-8946","authenticated-orcid":false,"given":"Letian","family":"Zhao","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9357-5716","authenticated-orcid":false,"given":"Wei","family":"Yuan","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4024-4013","authenticated-orcid":false,"given":"Xiaotian","family":"Wang","sequence":"additional","affiliation":[{"name":"Raytron Technology, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1109-1328","authenticated-orcid":false,"given":"Xi","family":"Jin","sequence":"additional","affiliation":[{"name":"USTC, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,17]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"2017. Nvidia tesla v100 gpu architecture white paper. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf."},{"key":"e_1_3_3_1_3_2","unstructured":"2020. Nvidia A100 gpu architecture white paper. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Syed\u00a0Asad Alam Andrew Anderson Barbara Barabasz and David Gregg. 2022. Winograd convolution for deep neural networks: Efficient point selection. ACM Transactions on Embedded Computing Systems 21 6 (2022) 1\u201328.","DOI":"10.1145\/3524069"},{"key":"e_1_3_3_1_5_2","unstructured":"J. Albericio P. Judd A. Delm\u00e1s S. Sharify and A. Moshovos. 2016. Bit-pragmatic Deep Neural Network Computing. arxiv:https:\/\/arXiv.org\/abs\/1610.06920\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1610.06920"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Orest\u00a0J Bedrij. 1962. Carry-select adder. IRE Transactions on Electronic Computers3 (1962) 340\u2013346.","DOI":"10.1109\/IRETELC.1962.5407919"},{"key":"e_1_3_3_1_7_2","unstructured":"Yaniv Blumenfeld Itay Hubara and Daniel Soudry. 2024. Towards Cheaper Inference in Deep Networks with Lower Bit-Width Accumulators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.14110 (2024)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Stephen Cass. 2019. Taking AI to the edge: Google\u2019s TPU now comes in a maker-friendly package. IEEE Spectrum 56 5 (2019) 16\u201317.","DOI":"10.1109\/MSPEC.2019.8701189"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00047"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Fu-Chiung Cheng Stephen\u00a0H Unger and Michael Theobald. 2000. Self-timed carry-lookahead adders. IEEE Trans. Comput. 49 7 (2000) 659\u2013672.","DOI":"10.1109\/12.863035"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304041"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.1998.706891"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3623783"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1201\/9781003162810-13"},{"key":"e_1_3_3_1_15_2","unstructured":"Christopher Grimm Jinseok Lee and Naveen Verma. 2024. Training Neural Networks With In-Memory-Computing Hardware and Multi-Level Radix-4 Inputs. IEEE Transactions on Circuits and Systems I: Regular Papers (2024)."},{"key":"e_1_3_3_1_16_2","first-page":"II\u2013473","volume-title":"2004 IEEE International Symposium on Circuits and Systems (IEEE Cat. No. 04CH37512)","author":"Gustafsson Oscar","year":"2004","unstructured":"Oscar Gustafsson, Andrew\u00a0G Dempster, and Lars Wanhammar. 2004. Multiplier blocks using carry-save adders. In 2004 IEEE International Symposium on Circuits and Systems (IEEE Cat. No. 04CH37512) , Vol.\u00a02. IEEE, II\u2013473."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00063"},{"key":"e_1_3_3_1_18_2","unstructured":"Zhe Jia Blake Tillman Marco Maggioni and Daniele\u00a0Paolo Scarpazza. 2019. Dissecting the graphcore ipu architecture via microbenchmarking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1912.03413 (2019)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783722"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Shiann-Rong Kuang Jiun-Ping Wang and Cang-Yuan Guo. 2009. Modified booth multipliers with a regular partial product array. IEEE Transactions on Circuits and Systems II: Express Briefs 56 5 (2009) 404\u2013408.","DOI":"10.1109\/TCSII.2009.2019334"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Jinmook Lee Changhyeon Kim Sanghoon Kang Dongjoo Shin Sangyeob Kim and Hoi-Jun Yoo. 2018. UNPU: An energy-efficient deep neural network accelerator with fully variable weight bit precision. IEEE Journal of Solid-State Circuits 54 1 (2018) 173\u2013185.","DOI":"10.1109\/JSSC.2018.2865489"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Sae\u00a0Kyu Lee Ankur Agrawal Joel Silberman Matthew Ziegler Mingu Kang Swagath Venkataramani Nianzheng Cao Bruce Fleischer Michael Guillorn Matthew Cohen et\u00a0al. 2021. A 7-nm four-core mixed-precision AI chip with 26.2-TFLOPS hybrid-FP8 training 104.9-TOPS INT4 inference and workload-aware throttling. IEEE Journal of Solid-State Circuits 57 1 (2021) 182\u2013197.","DOI":"10.1109\/JSSC.2021.3120113"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00097"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-49556-5_5"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00071"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00082"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614249"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480123"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Arnab\u00a0Neelim Mazumder Jian Meng Hasib-Al Rashid Utteja Kallakuri Xin Zhang Jae-Sun Seo and Tinoosh Mohsenin. 2021. A survey on the optimization of neural network accelerators for micro-ai on-device inference. IEEE Journal on Emerging and Selected Topics in Circuits and Systems 11 4 (2021) 532\u2013547.","DOI":"10.1109\/JETCAS.2021.3129415"},{"key":"e_1_3_3_1_33_2","unstructured":"Zhiwen Mo Lei Wang Jianyu Wei Zhichen Zeng Shijie Cao Lingxiao Ma Naifeng Jing Ting Cao Jilong Xue Fan Yang et\u00a0al. 2024. Lut tensor core: Lookup table enables efficient low-bit llm inference acceleration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06003 (2024)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Thomas Norrie Nishant Patil Doe\u00a0Hyun Yoon George Kurian Sheng Li James Laudon Cliff Young Norman Jouppi and David Patterson. 2021. The design process for Google\u2019s training chips: TPUv2 and TPUv3. IEEE Micro 41 2 (2021) 56\u201363.","DOI":"10.1109\/MM.2021.3058217"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Yunjie Pan Jiecao Yu Andrew Lukefahr Reetuparna Das and Scott Mahlke. 2023. BitSET: Bit-serial early termination for computation reduction in convolutional neural networks. ACM Transactions on Embedded Computing Systems 22 5s (2023) 1\u201324.","DOI":"10.1145\/3609093"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Gunho Park Jaeha Kung and Youngjoo Lee. 2023. Simplified Compressor and Encoder Designs for Low-Cost Approximate Radix-4 Booth Multiplier. IEEE Transactions on Circuits and Systems II: Express Briefs 70 3 (2023) 1154\u20131158.","DOI":"10.1109\/TCSII.2022.3217696"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Jun-Seok Park Changsoo Park Suknam Kwon Taeho Jeon Yesung Kang Heonsoo Lee Dongwoo Lee James Kim Hyeong-Seok Kim YoungJong Lee et\u00a0al. 2022. A multi-mode 8k-MAC HW-utilization-aware neural processing unit with a unified multi-precision datapath in 4-nm flagship mobile SoC. IEEE Journal of Solid-State Circuits 58 1 (2022) 189\u2013202.","DOI":"10.1109\/JSSC.2022.3205713"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731612"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Mark\u00a0R Santoro and Mark\u00a0A Horowitz. 1989. SPIM: a pipelined 64* 64-bit iterative multiplier. IEEE journal of solid-state circuits 24 2 (1989) 487\u2013493.","DOI":"10.1109\/4.18614"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322255"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00069"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00062"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRTCST54752.2022.9781868"},{"key":"e_1_3_3_1_44_2","volume-title":"Design Compiler User Guide","author":"Inc. Synopsys","year":"2022","unstructured":"Synopsys Inc.2022. Design Compiler User Guide."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD57390.2023.10323910"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Fengbin Tu Shouyi Yin Peng Ouyang Shibin Tang Leibo Liu and Shaojun Wei. 2017. Deep convolutional neural network architecture with reconfigurable computation patterns. IEEE Transactions on Very Large Scale Integration (VLSI) Systems 25 8 (2017) 2220\u20132233.","DOI":"10.1109\/TVLSI.2017.2688340"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/VLSID.2007.116"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00021"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Christopher\u00a0S Wallace. 1964. A suggestion for a fast multiplier. IEEE Transactions on electronic Computers1 (1964) 14\u201317.","DOI":"10.1109\/PGEC.1964.263830"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Gang Wang Siqi Cai Wenjie Li Dongxu Lyu and Guanghui He. 2024. BSViT: A Bit-Serial Vision Transformer Accelerator Exploiting Dynamic Patch and Weight Bit-Group Quantization. IEEE Transactions on Circuits and Systems I: Regular Papers (2024).","DOI":"10.1109\/TCSI.2024.3426653"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"crossref","unstructured":"Junbin Wang Shaoxia Fang Xi Wang Jiangsha Ma Taobo Wang and Yi Shan. 2021. High-performance mixed-low-precision cnn inference accelerator on fpga. IEEE Micro 41 4 (2021) 31\u201338.","DOI":"10.1109\/MM.2021.3081735"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD63220.2024.00097"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00058"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"crossref","unstructured":"Rui Xu Sheng Ma Yang Guo and Dongsheng Li. 2023. A survey of design and optimization for systolic array-based dnn accelerators. Comput. Surveys 56 1 (2023) 1\u201337.","DOI":"10.1145\/3604802"},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00079"}],"event":{"name":"MICRO 2025: 58th IEEE\/ACM International Symposium on Microarchitecture","location":"Seoul Korea","acronym":"MICRO 2025","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"]},"container-title":["Proceedings of the 58th IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3725843.3756101","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T21:43:19Z","timestamp":1769463799000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3725843.3756101"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,17]]},"references-count":54,"alternative-id":["10.1145\/3725843.3756101","10.1145\/3725843"],"URL":"https:\/\/doi.org\/10.1145\/3725843.3756101","relation":{},"subject":[],"published":{"date-parts":[[2025,10,17]]},"assertion":[{"value":"2025-10-17","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}