{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T11:32:43Z","timestamp":1763724763320,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676733","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T12:53:56Z","timestamp":1744203236000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Co-Designing Binarized Transformer and Hardware Accelerator for Efficient End-to-End Edge Deployment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9376-753X","authenticated-orcid":false,"given":"Yuhao","family":"Ji","sequence":"first","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"},{"name":"Department of Computer Science and Engineering, The Chinese University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3430-1189","authenticated-orcid":false,"given":"Chao","family":"Fang","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6173-3983","authenticated-orcid":false,"given":"Shaobo","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6965-3436","authenticated-orcid":false,"given":"Haikuo","family":"Shao","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7227-4786","authenticated-orcid":false,"given":"Zhongfeng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China"},{"name":"School of Integrated Circuits, Sun Yat-sen University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"doi-asserted-by":"publisher","key":"e_1_3_2_1_1_1","DOI":"10.18653\/v1\/2021.acl-long.334"},{"unstructured":"Tom B. Brown Benjamin Mann et al. 2020. Language Models are Few-Shot Learners. In Advances in neural information processing systems (NeurIPS).","key":"e_1_3_2_1_2_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1109\/JETCAS.2019.2950386"},{"key":"e_1_3_2_1_4_1","volume-title":"PP-Transformer: Enable Efficient Deployment of Transformers Through Pattern Pruning. In IEEE\/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 1--9.","author":"Cao Jialin","year":"2023","unstructured":"Jialin Cao, Xuanda Lin, et al. 2023. PP-Transformer: Enable Efficient Deployment of Transformers Through Pattern Pruning. In IEEE\/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 1--9."},{"unstructured":"Tim Dettmers Mike Lewis et al. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. 
CoRR abs\/2208.07339 (2022).","key":"e_1_3_2_1_5_1"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT) (1). Association for Computational Linguistics, 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, et al. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT) (1). Association for Computational Linguistics, 4171--4186."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, et al. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Learning Representations (ICLR). OpenReview.net.","author":"Esser Steven K.","year":"2020","unstructured":"Steven K. Esser, Jeffrey L. McKinstry, et al. 2020. Learned Step Size quantization. In International Conference on Learning Representations (ICLR). OpenReview.net."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/MICRO56248.2022.00050"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_10_1","DOI":"10.1109\/TVLSI.2022.3197282"},{"key":"e_1_3_2_1_11_1","volume-title":"IEEE International Symposium on High Performance Computer Architecture (HPCA). IEEE, 328--341","author":"Ham Tae Jun","year":"2020","unstructured":"Tae Jun Ham, Sungjun Jung, et al. 2020. A3: Accelerating Attention Mechanisms in Neural Networks with Approximation. In IEEE International Symposium on High Performance Computer Architecture (HPCA). IEEE, 328--341."},{"unstructured":"Lu Hou Zhiqi Huang et al. 2020. DynaBERT: Dynamic BERT with Adaptive Width and Depth. In Advances in Neural Information Processing Systems (NeurIPS).","key":"e_1_3_2_1_12_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.18653\/v1\/2021.naacl-main.258"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.1109\/ASP-DAC58780.2024.10473817"},{"unstructured":"Itay Hubara Matthieu Courbariaux et al. 2016. Binarized Neural Networks. In Advances in neural information processing systems (NeurIPS) Vol. 29. Curran Associates Inc.","key":"e_1_3_2_1_15_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1109\/TCSI.2023.3335949"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of International Conference on Machine Learning (ICML) (Proceedings of Machine Learning Research)","volume":"139","author":"Kim Sehoon","year":"2021","unstructured":"Sehoon Kim, Amir Gholami, et al. 2021. I-BERT: Integer-only BERT Quantization. In Proceedings of International Conference on Machine Learning (ICML) (Proceedings of Machine Learning Research), Vol. 139. PMLR, 5506--5518."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_18_1","DOI":"10.1145\/3370748.3406567"},{"key":"e_1_3_2_1_19_1","volume-title":"I-ViT: Integer-only Quantization for Efficient Vision Transformer Inference. In IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE, 17019--17029","author":"Li Zhikai","year":"2023","unstructured":"Zhikai Li and Qingyi Gu. 2023. I-ViT: Integer-only Quantization for Efficient Vision Transformer Inference. 
In IEEE\/CVF International Conference on Computer Vision (ICCV). IEEE, 17019--17029."},{"key":"e_1_3_2_1_20_1","volume-title":"Automation & Test in Europe Conference (DATE). IEEE, 513--516","author":"Liu Zejian","year":"2021","unstructured":"Zejian Liu, Gang Li, et al. 2021. Hardware Acceleration of Fully Quantized BERT for Efficient Natural Language Processing. In Design, Automation & Test in Europe Conference (DATE). IEEE, 513--516."},{"unstructured":"Zechun Liu Barlas Oguz et al. 2022. BiT: Robustly Binarized Multi-distilled Transformer. In Advances in neural information processing systems (NeurIPS) Vol. 35. 14303--14316.","key":"e_1_3_2_1_21_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_22_1","DOI":"10.1007\/978-3-030-01267-0_44"},{"key":"e_1_3_2_1_23_1","volume-title":"Hardware Accelerator for Multi-Head Attention and Position-Wise Feed-Forward in the Transformer. In IEEE International System-on-Chip Conference (SOCC). IEEE, 84--89","author":"Lu Siyuan","year":"2020","unstructured":"Siyuan Lu, Meiqi Wang, et al. 2020. Hardware Accelerator for Multi-Head Attention and Position-Wise Feed-Forward in the Transformer. In IEEE International System-on-Chip Conference (SOCC). IEEE, 84--89."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1109\/TCSVT.2020.3020569"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys). mlsys.org.","author":"Park Junki","year":"2020","unstructured":"Junki Park, Hyunsung Yoon, et al. 2020. OPTIMUS: OPTImized matrix Multiplication Structure for Transformer neural network accelerator. In Proceedings of Machine Learning and Systems (MLSys). mlsys.org."},{"key":"e_1_3_2_1_26_1","volume-title":"BiBERT: Accurate Fully Binarized BERT. In International Conference on Learning Representations (ICLR).","author":"Qin Haotong","year":"2022","unstructured":"Haotong Qin, Yifu Ding, et al. 2022. BiBERT: Accurate Fully Binarized BERT. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_27_1","volume-title":"Going Deeper with Embedded FPGA Platform for Convolutional Neural Network. In International Symposium on Field-Programmable Gate Arrays (ISFPGA). ACM, 26--35","author":"Qiu Jiantao","year":"2016","unstructured":"Jiantao Qiu, Jie Wang, et al. 2016. Going Deeper with Embedded FPGA Platform for Convolutional Neural Network. In International Symposium on Field-Programmable Gate Arrays (ISFPGA). ACM, 26--35."},{"key":"e_1_3_2_1_28_1","volume-title":"XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks. In European Conference on Computer Vision (ECCV) (4)","volume":"9908","author":"Rastegari Mohammad","year":"2016","unstructured":"Mohammad Rastegari, Vicente Ordonez, et al. 2016. XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks. In European Conference on Computer Vision (ECCV) (4), Vol. 9908. Springer, 525--542."},{"doi-asserted-by":"crossref","unstructured":"Haikuo Shao Huihong Shi et al. 2024. An FPGA-Based Reconfigurable Accelerator for Convolution-Transformer Hybrid EfficientViT. 
CoRR abs\/2403.20230 (2024).","key":"e_1_3_2_1_29_1","DOI":"10.1109\/ISCAS58744.2024.10557992"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_30_1","DOI":"10.1609\/aaai.v34i05.6409"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_31_1","DOI":"10.1109\/ISCAS48785.2022.9937660"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_32_1","DOI":"10.1109\/ICASSP49357.2023.10096223"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_33_1","DOI":"10.1109\/JSSC.2020.3021661"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_34_1","DOI":"10.1109\/ACCESS.2022.3151916"},{"key":"e_1_3_2_1_35_1","volume-title":"GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In International Conference on Learning Representations (ICLR).","author":"Wang Alex","year":"2019","unstructured":"Alex Wang, Amanpreet Singh, et al. 2019. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In International Conference on Learning Representations (ICLR)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_36_1","DOI":"10.1109\/TCAD.2022.3197489"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_37_1","DOI":"10.1109\/TCSII.2023.3241487"},{"unstructured":"Yuzhuang Xu Xu Han et al. 2024. OneBit: Towards Extremely Low-bit Large Language Models. CoRR abs\/2402.11295 (2024).","key":"e_1_3_2_1_38_1"},{"key":"e_1_3_2_1_39_1","volume-title":"EFA-Trans: An Efficient and Flexible Acceleration Architecture for Transformers. Electronics 11, 21","author":"Yang Xin","year":"2022","unstructured":"Xin Yang and Tao Su. 2022. EFA-Trans: An Efficient and Flexible Acceleration Architecture for Transformers. Electronics 11, 21 (2022)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_40_1","DOI":"10.1109\/EMC2-NIPS53020.2019.00016"},{"doi-asserted-by":"crossref","unstructured":"Min Zhang Linpeng Li et al. 2019. Optimized Compression for Implementing Convolutional Neural Networks on FPGA. Electronics 8 3 (2019).","key":"e_1_3_2_1_41_1","DOI":"10.3390\/electronics8030295"},{"key":"e_1_3_2_1_42_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. CoRR abs\/2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, et al. 2022. OPT: Open Pre-trained Transformer Language Models. 
CoRR abs\/2205.01068 (2022)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_43_1","DOI":"10.18653\/v1\/2020.emnlp-main.37"}],"event":{"sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"],"acronym":"ICCAD '24","name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","location":"Newark Liberty International Airport Marriott New York NY USA"},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676733","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676733","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:43:57Z","timestamp":1750290237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676733"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":43,"alternative-id":["10.1145\/3676536.3676733","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676733","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
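Note: the record above is the raw JSON returned by the public Crossref REST API for DOI 10.1145/3676536.3676733. As a minimal sketch of how such a work record can be fetched and parsed, the Python below assumes the api.crossref.org endpoint and the third-party requests package; the helper names fetch_work and summarize are illustrative, not part of any Crossref client, and every field it reads (title, author, issued, container-title, reference-count, is-referenced-by-count) is a key visible in the record itself.

import requests

DOI = "10.1145/3676536.3676733"

def fetch_work(doi: str) -> dict:
    """Fetch a Crossref work record; the payload mirrors the JSON above."""
    resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    # Crossref wraps the record in an envelope: {"status", "message-type", "message", ...}
    assert payload["status"] == "ok" and payload["message-type"] == "work"
    return payload["message"]

def summarize(work: dict) -> str:
    """Build a one-line citation from the fields present in the record above."""
    authors = ", ".join(
        f"{a.get('given', '')} {a.get('family', '')}".strip()
        for a in work.get("author", [])
    )
    # "title" and "container-title" are arrays in the Crossref schema.
    title = work["title"][0] if work.get("title") else "(untitled)"
    venue = work["container-title"][0] if work.get("container-title") else ""
    year = work["issued"]["date-parts"][0][0]
    return f"{authors}. {year}. {title}. In {venue}. DOI: {work['DOI']}"

if __name__ == "__main__":
    work = fetch_work(DOI)
    print(summarize(work))
    print(f"{work['reference-count']} references, "
          f"cited {work['is-referenced-by-count']} times")

Run against a live deposit matching the record above, this would print the five-author ICCAD '24 citation followed by "43 references, cited 4 times"; the counts will drift as Crossref re-indexes the work.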