{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:00:18Z","timestamp":1750309218944,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Research Grants Council (RGC)","award":["T45-701\/22-R","17203224"],"award-info":[{"award-number":["T45-701\/22-R","17203224"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676747","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T13:21:20Z","timestamp":1744204880000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MCUBERT: Memory-Efficient BERT Inference on Commodity Microcontrollers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0672-6389","authenticated-orcid":false,"given":"Zebin","family":"Yang","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuits, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5938-7965","authenticated-orcid":false,"given":"Renze","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3664-3513","authenticated-orcid":false,"given":"Taiqiang","family":"Wu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong 
Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3026-0108","authenticated-orcid":false,"given":"Ngai","family":"Wong","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9076-7998","authenticated-orcid":false,"given":"Yun","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7514-0767","authenticated-orcid":false,"given":"Runsheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8146-4821","authenticated-orcid":false,"given":"Ru","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7212-2264","authenticated-orcid":false,"given":"Meng","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuits, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Adaptive input representations for neural language modeling. 
arXiv preprint arXiv:1809.10853","author":"Baevski Alexei","year":"2018","unstructured":"Alexei Baevski and Michael Auli. 2018. Adaptive input representations for neural language modeling. arXiv preprint arXiv:1809.10853 (2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Binarybert: Pushing the limit of bert quantization. arXiv preprint arXiv:2012.15701","author":"Bai Haoli","year":"2020","unstructured":"Haoli Bai, Wei Zhang, Lu Hou, Lifeng Shang, Jing Jin, Xin Jiang, Qun Liu, Michael Lyu, and Irwin King. 2020. Binarybert: Pushing the limit of bert quantization. arXiv preprint arXiv:2012.15701 (2020)."},{"key":"e_1_3_2_1_3_1","first-page":"517","article-title":"Micronets: Neural network architectures for deploying tinyml applications on commodity microcontrollers","volume":"3","author":"Banbury Colby","year":"2021","unstructured":"Colby Banbury, Chuteng Zhou, Igor Fedorov, Ramon Matas, Urmish Thakker, Dibakar Gope, Vijay Janapa Reddi, Matthew Mattina, and Paul Whatmough. 2021. Micronets: Neural network architectures for deploying tinyml applications on commodity microcontrollers. Proceedings of Machine Learning and Systems 3 (2021), 517--532.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Prajjwal Bhargava Aleksandr Drozd and Anna Rogers. 2021. Generalization in NLI: Ways (Not) To Go Beyond Simple Heuristics. arXiv:2110.01518 [cs.CL]","DOI":"10.18653\/v1\/2021.insights-1.18"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/COINS51742.2021.9524173"},{"key":"e_1_3_2_1_6_1","volume-title":"Sparse models are robust. arXiv preprint arXiv:2205.12452","author":"Campos Daniel","year":"2022","unstructured":"Daniel Campos, Alexandre Marques, Tuan Nguyen, Mark Kurtz, and ChengXiang Zhai. 2022. Sparse* bert: Sparse models are robust. 
arXiv preprint arXiv:2205.12452 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Groupreduce: Block-wise low-rank approximation for neural language model shrinking. Advances in Neural Information Processing Systems 31","author":"Chen Patrick","year":"2018","unstructured":"Patrick Chen, Si Si, Yang Li, Ciprian Chelba, and Cho-Jui Hsieh. 2018. Groupreduce: Block-wise low-rank approximation for neural language model shrinking. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. {TVM}: An automated {End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578--594."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1361"},{"key":"e_1_3_2_1_11_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. Advances in Neural Information Processing Systems 35 (2022), 16344--16359.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","first-page":"800","article-title":"Tensorflow lite micro: Embedded machine learning for tinyml systems","volume":"3","author":"David Robert","year":"2021","unstructured":"Robert David, Jared Duke, Advait Jain, Vijay Janapa Reddi, Nat Jeffries, Jian Li, Nick Kreeger, Ian Nappier, Meghna Natraj, Tiezhen Wang, et al. 2021. 
Tensorflow lite micro: Embedded machine learning for tinyml systems. Proceedings of Machine Learning and Systems 3 (2021), 800--811.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_15_1","volume-title":"Sparse: Sparse architecture search for cnns on resource-constrained microcontrollers. Advances in Neural Information Processing Systems 32","author":"Fedorov Igor","year":"2019","unstructured":"Igor Fedorov, Ryan P Adams, Matthew Mattina, and Paul Whatmough. 2019. Sparse: Sparse architecture search for cnns on resource-constrained microcontrollers. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_16_1","first-page":"18456","article-title":"UDC: Unified DNAS for Compressible TinyML Models for Neural Processing Units","volume":"35","author":"Fedorov Igor","year":"2022","unstructured":"Igor Fedorov, Ramon Matas, Hokchhay Tann, Chuteng Zhou, Matthew Mattina, and Paul Whatmough. 2022. UDC: Unified DNAS for Compressible TinyML Models for Neural Processing Units. Advances in Neural Information Processing Systems 35 (2022), 18456--18471.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"Language model compression with weighted low-rank factorization. arXiv preprint arXiv:2207.00112","author":"Hsu Yen-Chang","year":"2022","unstructured":"Yen-Chang Hsu, Ting Hua, Sungen Chang, Qian Lou, Yilin Shen, and Hongxia Jin. 2022. 
Language model compression with weighted low-rank factorization. arXiv preprint arXiv:2207.00112 (2022)."},{"key":"e_1_3_2_1_18_1","first-page":"711","article-title":"Data movement is all you need: A case study on optimizing transformers","volume":"3","author":"Ivanov Andrei","year":"2021","unstructured":"Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. 2021. Data movement is all you need: A case study on optimizing transformers. Proceedings of Machine Learning and Systems 3 (2021), 711--732.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)."},{"key":"e_1_3_2_1_20_1","volume-title":"Cmsis-nn: Efficient neural network kernels for arm cortex-m cpus. arXiv preprint arXiv:1801.06601","author":"Lai Liangzhen","year":"2018","unstructured":"Liangzhen Lai, Naveen Suda, and Vikas Chandra. 2018. Cmsis-nn: Efficient neural network kernels for arm cortex-m cpus. arXiv preprint arXiv:1801.06601 (2018)."},{"key":"e_1_3_2_1_21_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_1_22_1","volume-title":"MCUFormer: Deploying Vision Transformers on Microcontrollers with Limited Memory. arXiv preprint arXiv:2310.16898","author":"Liang Yinan","year":"2023","unstructured":"Yinan Liang, Ziwei Wang, Xiuwei Xu, Yansong Tang, Zhou Jie, and Jiwen Lu. 2023. MCUFormer: Deploying Vision Transformers on Microcontrollers with Limited Memory. arXiv preprint arXiv:2310.16898 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Mcunetv2: Memory-efficient patch-based inference for tiny deep learning. arXiv preprint arXiv:2110.15352","author":"Lin Ji","year":"2021","unstructured":"Ji Lin, Wei-Ming Chen, Han Cai, Chuang Gan, and Song Han. 2021. Mcunetv2: Memory-efficient patch-based inference for tiny deep learning. arXiv preprint arXiv:2110.15352 (2021)."},{"key":"e_1_3_2_1_24_1","first-page":"11711","article-title":"Mcunet: Tiny deep learning on iot devices","volume":"33","author":"Lin Ji","year":"2020","unstructured":"Ji Lin, Wei-Ming Chen, Yujun Lin, Chuang Gan, Song Han, et al. 2020. Mcunet: Tiny deep learning on iot devices. Advances in Neural Information Processing Systems 33 (2020), 11711--11722.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_25_1","volume-title":"Md Akmal Haidar, and Mehdi Rezagholizadeh","author":"Lioutas Vasileios","year":"2019","unstructured":"Vasileios Lioutas, Ahmad Rashid, Krtin Kumar, Md Akmal Haidar, and Mehdi Rezagholizadeh. 2019. Distilled embedding: non-linear embedding factorization using knowledge distillation. (2019)."},{"key":"e_1_3_2_1_26_1","volume-title":"Darts: Differentiable architecture search. arXiv preprint arXiv:1806.09055","author":"Liu Hanxiao","year":"2018","unstructured":"Hanxiao Liu, Karen Simonyan, and Yiming Yang. 2018. Darts: Differentiable architecture search. arXiv preprint arXiv:1806.09055 (2018)."},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 17573--17583","author":"Patil Shishir G","year":"2022","unstructured":"Shishir G Patil, Paras Jain, Prabal Dutta, Ion Stoica, and Joseph Gonzalez. 2022. 
Poet: Training neural networks on tiny devices with integrated rematerialization and paging. In International Conference on Machine Learning. PMLR, 17573--17583."},{"key":"e_1_3_2_1_28_1","volume-title":"Bibert: Accurate fully binarized bert. arXiv preprint arXiv:2203.06390","author":"Qin Haotong","year":"2022","unstructured":"Haotong Qin, Yifu Ding, Mingyuan Zhang, Qinghua Yan, Aishan Liu, Qingqing Dang, Ziwei Liu, and Xianglong Liu. 2022. Bibert: Accurate fully binarized bert. arXiv preprint arXiv:2203.06390 (2022)."},{"key":"e_1_3_2_1_29_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_31_1","first-page":"326","article-title":"Memory-driven mixed low precision quantization for enabling deep network inference on microcontrollers","volume":"2","author":"Rusci Manuele","year":"2020","unstructured":"Manuele Rusci, Alessandro Capotondi, and Luca Benini. 2020. Memory-driven mixed low precision quantization for enabling deep network inference on microcontrollers. Proceedings of Machine Learning and Systems 2 (2020), 326--335.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. 
PMLR, 9547--9557","author":"Shi Han","year":"2021","unstructured":"Han Shi, Jiahui Gao, Xiaozhe Ren, Hang Xu, Xiaodan Liang, Zhenguo Li, and James Tin-Yau Kwok. 2021. Sparsebert: Rethinking the importance analysis in self-attention. In International Conference on Machine Learning. PMLR, 9547--9557."},{"key":"e_1_3_2_1_35_1","volume-title":"Asian Conference on Machine Learning. PMLR, 1081--1093","author":"Shin Joonbo","year":"2019","unstructured":"Joonbo Shin, Yoonhyung Lee, and Kyomin Jung. 2019. Effective sentence scoring method using BERT for speech recognition. In Asian Conference on Machine Learning. PMLR, 1081--1093."},{"key":"e_1_3_2_1_36_1","volume-title":"Well-Read Students Learn Better: The Impact of Student Initialization on Knowledge Distillation. CoRR abs\/1908.08962","author":"Turc Iulia","year":"2019","unstructured":"Iulia Turc, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Well-Read Students Learn Better: The Impact of Student Initialization on Knowledge Distillation. CoRR abs\/1908.08962 (2019). arXiv:1908.08962 http:\/\/arxiv.org\/abs\/1908.08962"},{"key":"e_1_3_2_1_37_1","volume-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Wang Alex","year":"2018","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. 2018. GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)."},{"key":"e_1_3_2_1_38_1","volume-title":"BERT post-training for review reading comprehension and aspect-based sentiment analysis. arXiv preprint arXiv:1904.02232","author":"Xu Hu","year":"2019","unstructured":"Hu Xu, Bing Liu, Lei Shu, and Philip S Yu. 2019. BERT post-training for review reading comprehension and aspect-based sentiment analysis. 
arXiv preprint arXiv:1904.02232 (2019)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467262"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6479"},{"key":"e_1_3_2_1_41_1","volume-title":"Prune once for all: Sparse pre-trained language models. arXiv preprint arXiv:2111.05754","author":"Zafrir Ofir","year":"2021","unstructured":"Ofir Zafrir, Ariel Larey, Guy Boudoukh, Haihao Shen, and Moshe Wasserblat. 2021. Prune once for all: Sparse pre-trained language models. arXiv preprint arXiv:2111.05754 (2021)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00042"},{"key":"e_1_3_2_1_43_1","volume-title":"Ternarybert: Distillation-aware ultra-low bit bert. arXiv preprint arXiv:2009.12812","author":"Zhang Wei","year":"2020","unstructured":"Wei Zhang, Lu Hou, Yichun Yin, Lifeng Shang, Xiao Chen, Xin Jiang, and Qun Liu. 2020. Ternarybert: Distillation-aware ultra-low bit bert. arXiv preprint arXiv:2009.12812 (2020)."},{"key":"e_1_3_2_1_44_1","volume-title":"AutoDistill: An end-to-end framework to explore and distill hardware-efficient language models. arXiv preprint arXiv:2201.08539","author":"Zhang Xiaofan","year":"2022","unstructured":"Xiaofan Zhang, Zongwei Zhou, Deming Chen, and Yu Emma Wang. 2022. AutoDistill: An end-to-end framework to explore and distill hardware-efficient language models. arXiv preprint arXiv:2201.08539 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Incorporating bert into neural machine translation. arXiv preprint arXiv:2002.06823","author":"Zhu Jinhua","year":"2020","unstructured":"Jinhua Zhu, Yingce Xia, Lijun Wu, Di He, Tao Qin, Wengang Zhou, Houqiang Li, and Tie-Yan Liu. 2020. Incorporating bert into neural machine translation. 
arXiv preprint arXiv:2002.06823 (2020)."}],"event":{"name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"],"location":"Newark Liberty International Airport Marriott New York NY USA","acronym":"ICCAD '24"},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676747","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676747","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:43:58Z","timestamp":1750290238000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676747"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":45,"alternative-id":["10.1145\/3676536.3676747","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676747","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}