{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T04:26:52Z","timestamp":1775881612603,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676753","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T13:21:20Z","timestamp":1744204880000},"page":"1-9","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["An Agile Framework for Efficient LLM Accelerator Development and Model Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2480-6191","authenticated-orcid":false,"given":"Lvcheng","family":"Chen","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4898-9343","authenticated-orcid":false,"given":"Ying","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6508-2639","authenticated-orcid":false,"given":"Chenyi","family":"Wen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5853-4931","authenticated-orcid":false,"given":"Shizhang","family":"Wang","sequence":"additional","affiliation":[{"name":"Hubei University of Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8951-4969","authenticated-orcid":false,"given":"Li","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hubei University of Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6406-4810","authenticated-orcid":false,"given":"Bei","family":"Yu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5153-6698","authenticated-orcid":false,"given":"Qi","family":"Sun","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2610-7522","authenticated-orcid":false,"given":"Cheng","family":"Zhuo","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Attention is all you need,\" Advances in neural information processing systems","author":"Vaswani A.","year":"2017","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, \u0141. Kaiser, and I. Polosukhin, \"Attention is all you need,\" Advances in neural information processing systems, vol. 30, 2017."},{"key":"e_1_3_2_1_2_1","first-page":"1877","volume-title":"Askell et al., \"Language models are few-shot learners,\" Advances in neural information processing systems","author":"Brown T.","year":"2020","unstructured":"T. Brown, B. Mann, N. Ryder, M. Subbiah, J. D. Kaplan, P. Dhariwal, A. Neelakantan, P. Shyam, G. Sastry, A. Askell et al., \"Language models are few-shot learners,\" Advances in neural information processing systems, vol. 33, pp. 1877--1901, 2020."},{"key":"e_1_3_2_1_3_1","volume-title":"Azhar et al., \"Llama: Open and efficient foundation language models,\" arXiv preprint arXiv:2302.13971","author":"Touvron H.","year":"2023","unstructured":"H. Touvron, T. Lavril, G. Izacard, X. Martinet, M.-A. Lachaux, T. Lacroix, B. Rozi\u00e8re, N. Goyal, E. Hambro, F. Azhar et al., \"Llama: Open and efficient foundation language models,\" arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"For tech giants, ai like bing and bard poses billion-dollar search problem,\" https:\/\/www.reuters.com\/technology\/tech-giants-ai-like-bing-bard-poses-billion-dollar-search-problem-2023-02-22\/","year":"2023","unstructured":"\"Focus: For tech giants, ai like bing and bard poses billion-dollar search problem,\" https:\/\/www.reuters.com\/technology\/tech-giants-ai-like-bing-bard-poses-billion-dollar-search-problem-2023-02-22\/, 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"A survey on making deep learning models smaller, faster, and better,\" ACM Computing Surveys","author":"Menghani G.","year":"2023","unstructured":"G. Menghani, \"Efficient deep learning: A survey on making deep learning models smaller, faster, and better,\" ACM Computing Surveys, 2023."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3451179"},{"key":"e_1_3_2_1_7_1","volume-title":"Efficient memory management for large language model serving with pagedattention,\" in Proceedings of the 29th Symposium on Operating Systems Principles","author":"Kwon W.","year":"2023","unstructured":"W. Kwon, Z. Li, S. Zhuang, Y. Sheng, L. Zheng, C. H. Yu, J. Gonzalez, H. Zhang, and I. Stoica, \"Efficient memory management for large language model serving with pagedattention,\" in Proceedings of the 29th Symposium on Operating Systems Principles, 2023."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_9_1","volume-title":"BERT: Pre-training of deep bidirectional transformers for language understanding,\" arXiv preprint arXiv:1810.04805","author":"Devlin J.","year":"2018","unstructured":"J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova, \"BERT: Pre-training of deep bidirectional transformers for language understanding,\" arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_10_1","unstructured":"OpenAI \"GPT-4 Technical Report \" 2023."},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"Systolic arrays (for vlsi)","volume":"1978","author":"Kung H. T.","year":"1979","unstructured":"H. T. Kung and C. E. Leiserson, \"Systolic arrays (for vlsi),\" in Sparse Matrix Proceedings 1978, vol. 1, 1979.","journal-title":"Sparse Matrix Proceedings"},{"key":"e_1_3_2_1_12_1","volume-title":"Automated systolic array architecture synthesis for high throughput cnn inference on fpgas,\" in Proceedings of the 54th Annual Design Automation Conference","author":"Wei X.","year":"2017","unstructured":"X. Wei, C. H. Yu, P. Zhang, Y. Chen, Y. Wang, H. Hu, Y. Liang, and J. Cong, \"Automated systolic array architecture synthesis for high throughput cnn inference on fpgas,\" in Proceedings of the 54th Annual Design Automation Conference 2017, 2017."},{"key":"e_1_3_2_1_13_1","volume-title":"Automation & Test in Europe Conference & Exhibition (DATE)","author":"Peltekis C.","year":"2023","unstructured":"C. Peltekis, D. Filippas, G. Dimitrakopoulos, C. Nicopoulos, and D. Pnevmatikatos, \"Arrayflex: A systolic array architecture with configurable transparent pipelining,\" in 2023 Design, Automation & Test in Europe Conference & Exhibition (DATE), 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"Liang et al., \"Susy: A programming model for productive construction of highperformance systolic arrays on fpgas,\" in Proceedings of the 39th International Conference on Computer-Aided Design","author":"Lai Y.-H.","year":"2020","unstructured":"Y.-H. Lai, H. Rong, S. Zheng, W. Zhang, X. Cui, Y. Jia, J. Wang, B. Sullivan, Z. Zhang, Y. Liang et al., \"Susy: A programming model for productive construction of highperformance systolic arrays on fpgas,\" in Proceedings of the 39th International Conference on Computer-Aided Design, 2020."},{"key":"e_1_3_2_1_15_1","first-page":"460","volume-title":"Jin et al., \"onednn graph compiler: A hybrid approach for high-performance deep learning compilation,\" IEEE","author":"Li J.","year":"2024","unstructured":"J. Li, Z. Qin, Y. Mei, J. Cui, Y. Song, C. Chen, Y. Zhang, L. Du, X. Cheng, B. Jin et al., \"onednn graph compiler: A hybrid approach for high-performance deep learning compilation,\" IEEE, pp. 460--470, 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"GPU Technology Conference","volume":"1","author":"Vanholder H.","year":"2016","unstructured":"H. Vanholder, \"Efficient inference with TensorRT,\" in GPU Technology Conference, vol. 1, no. 2, 2016."},{"key":"e_1_3_2_1_17_1","volume-title":"Leveraging domain information for the efficient automated design of deep learning accelerators,\" in 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","author":"Sakhuja C.","year":"2023","unstructured":"C. Sakhuja, Z. Shi, and C. Lin, \"Leveraging domain information for the efficient automated design of deep learning accelerators,\" in 2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA), 2023."},{"key":"e_1_3_2_1_18_1","volume-title":"High performance depthwise and pointwise convolutions on mobile devices,\" in Proc. AAAI","author":"Zhang P.","unstructured":"P. Zhang, E. Lo, and B. Lu, \"High performance depthwise and pointwise convolutions on mobile devices,\" in Proc. AAAI, vol. 34, no. 04, 2020."},{"key":"e_1_3_2_1_19_1","volume-title":"A low-latency multi-fpga appliance for accelerating transformer-based text generation,\" in 2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"Hong S.","year":"2022","unstructured":"S. Hong, S. Moon, J. Kim, S. Lee, M. Kim, D. Lee, and J.-Y. Kim, \"DFX: A low-latency multi-fpga appliance for accelerating transformer-based text generation,\" in 2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO), 2022."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530584"},{"key":"e_1_3_2_1_21_1","volume-title":"A high-performance accelerator for super-resolution processing on embedded gpu,\" IEEE TCAD","author":"Zhao W.","year":"2023","unstructured":"W. Zhao, Y. Bai, Q. Sun, W. Li, H. Zheng, N. Jiang, J. Lu, B. Yu, and M. D. Wong, \"A high-performance accelerator for super-resolution processing on embedded gpu,\" IEEE TCAD, 2023."},{"key":"e_1_3_2_1_22_1","volume-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","author":"Dao T.","year":"2022","unstructured":"T. Dao, D. Fu, S. Ermon, A. Rudra, and C. R\u00e9, \"Flashattention: Fast and memory-efficient exact attention with io-awareness,\" vol. 35, 2022."},{"key":"e_1_3_2_1_23_1","volume-title":"Faster attention with better parallelism and work partitioning,\" arXiv preprint arXiv:2307.08691","author":"Dao T.","year":"2023","unstructured":"T. Dao, \"Flashattention-2: Faster attention with better parallelism and work partitioning,\" arXiv preprint arXiv:2307.08691, 2023."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"S. Zeng J. Liu G. Dai X. Yang T. Fu H. Wang W. Ma H. Sun S. Li Z. Huang et al. \"Flightllm: Efficient large language model inference with a complete mapping flow on fpgas \" in Proceedings of the 2024 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays 2024.","DOI":"10.1145\/3626202.3637562"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522302"},{"key":"e_1_3_2_1_26_1","volume-title":"spike, and the rocket core,\" Berkeley Architecture Group","author":"Keller B.","year":"2013","unstructured":"B. Keller, \"Risc-v, spike, and the rocket core,\" Berkeley Architecture Group, 2013."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2996616"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228584"},{"key":"e_1_3_2_1_30_1","volume-title":"The hwacha microarchitecture manual, version 3.8.\" EECS Department","author":"Lee Y.","year":"2015","unstructured":"Y. Lee et al., \"The hwacha microarchitecture manual, version 3.8.\" EECS Department, University of California, Berkeley, Tech. Rep. UCB\/EECS-2015-263, 2015."},{"key":"e_1_3_2_1_31_1","volume-title":"Mao et al., \"Gemmini: Enabling systematic deep-learning architecture evaluation via full-stack integration,\" in 2021 58th ACM\/IEEE Design Automation Conference (DAC)","author":"Genc H.","year":"2021","unstructured":"H. Genc, S. Kim, A. Amid, A. Haj-Ali, V. Iyer, P. Prakash, J. Zhao, D. Grubb, H. Liew, H. Mao et al., \"Gemmini: Enabling systematic deep-learning architecture evaluation via full-stack integration,\" in 2021 58th ACM\/IEEE Design Automation Conference (DAC), 2021."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1098\/rsta.2019.0155"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11265-015-1070-9"},{"key":"e_1_3_2_1_34_1","first-page":"927","article-title":"A precision-scalable risc-v dnn processor with on-device learning capability at the extreme edge","author":"Huang L.","year":"2024","unstructured":"L. Huang, C. Fang, Q. Li, J. Lin, and Z. Wang, \"A precision-scalable risc-v dnn processor with on-device learning capability at the extreme edge,\" IEEE, pp. 927--932, 2024.","journal-title":"IEEE"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1098\/rspa.2007.1900"},{"key":"e_1_3_2_1_36_1","volume-title":"AISTATS","author":"Li S.","year":"2022","unstructured":"S. Li, R. M. Kirby, and S. Zhe, \"Deep multi-fidelity active learning of high-dimensional outputs,\" Proc. AISTATS, 2022."},{"key":"e_1_3_2_1_37_1","volume-title":"Practical multi-fidelity bayesian optimization for hyperparameter tuning,\" in Uncertainty in Artificial Intelligence","author":"Wu J.","year":"2020","unstructured":"J. Wu, S. Toscano-Palmerin, P. I. Frazier, and A. G. Wilson, \"Practical multi-fidelity bayesian optimization for hyperparameter tuning,\" in Uncertainty in Artificial Intelligence, 2020."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.06.084"},{"key":"e_1_3_2_1_39_1","volume-title":"CVPR","author":"Zhang Y.","year":"2022","unstructured":"Y. Zhang et al., \"PokeBNN: A binary pursuit of lightweight accuracy,\" in Proc. CVPR, 2022."},{"key":"e_1_3_2_1_40_1","volume-title":"1-bit wavenet: compressing a generative neural network in speech recognition with two binarized methods,\" in 2021 IEEE 16th conference on industrial electronics and applications","author":"Gao S.","year":"2021","unstructured":"S. Gao, R. Wang, L. Jiang, and B. Zhang, \"1-bit wavenet: compressing a generative neural network in speech recognition with two binarized methods,\" in 2021 IEEE 16th conference on industrial electronics and applications (ICIEA), 2021."},{"key":"e_1_3_2_1_41_1","volume-title":"CVPR","author":"Chen T.","year":"2021","unstructured":"T. Chen et al., \"\"BNN-BN=?\": Training Binary Neural Networks Without Batch Normalization,\" in Proc. CVPR, 2021."},{"key":"e_1_3_2_1_42_1","volume-title":"Binarized neural machine translation,\" Advances in Neural Information Processing Systems","author":"Zhang Y.","year":"2024","unstructured":"Y. Zhang, A. Garg, Y. Cao, L. Lew, B. Ghorbani, Z. Zhang, and O. Firat, \"Binarized neural machine translation,\" Advances in Neural Information Processing Systems, vol. 36, 2024."},{"key":"e_1_3_2_1_43_1","volume-title":"Onebit: Towards extremely low-bit large language models,\" arXiv preprint arXiv:2402.11295","author":"Xu Y.","year":"2024","unstructured":"Y. Xu, X. Han, Z. Yang, S. Wang, Q. Zhu, Z. Liu, W. Liu, and W. Che, \"Onebit: Towards extremely low-bit large language models,\" arXiv preprint arXiv:2402.11295, 2024."},{"key":"e_1_3_2_1_44_1","volume-title":"Bitnet: Scaling 1-bit transformers for large language models,\" arXiv preprint arXiv:2310.11453","author":"Wang H.","year":"2023","unstructured":"H. Wang, S. Ma, L. Dong, S. Huang, H. Wang, L. Ma, F. Yang, R. Wang, Y. Wu, and F. Wei, \"Bitnet: Scaling 1-bit transformers for large language models,\" arXiv preprint arXiv:2310.11453, 2023."},{"key":"e_1_3_2_1_45_1","volume-title":"Pb-llm: Partially binarized large language models,\" arXiv preprint arXiv:2310.00034","author":"Shang Y.","year":"2023","unstructured":"Y. Shang, Z. Yuan, Q. Wu, and Z. Dong, \"Pb-llm: Partially binarized large language models,\" arXiv preprint arXiv:2310.00034, 2023."},{"key":"e_1_3_2_1_46_1","volume-title":"Billm: Pushing the limit of post-training quantization for llms,\" arXiv preprint arXiv:2402.04291","author":"Huang W.","year":"2024","unstructured":"W. Huang, Y. Liu, H. Qin, Y. Li, S. Zhang, X. Liu, M. Magno, and X. Qi, \"Billm: Pushing the limit of post-training quantization for llms,\" arXiv preprint arXiv:2402.04291, 2024."},{"key":"e_1_3_2_1_47_1","volume-title":"Lin et al., \"Opt: Open pre-trained transformer language models","author":"Zhang S.","year":"2022","unstructured":"S. Zhang, S. Roller, N. Goyal, M. Artetxe, M. Chen, S. Chen, C. Dewan, M. Diab, X. Li, X. V. Lin et al., \"Opt: Open pre-trained transformer language models,\" 2022."},{"key":"e_1_3_2_1_48_1","volume-title":"Li et al., \"Towards developing high performance risc-v processors using agile methodology,\" in 2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"Xu Y.","year":"2022","unstructured":"Y. Xu, Z. Yu, D. Tang, G. Chen, L. Chen, L. Gou, Y. Jin, Q. Li, X. Li, Z. Li et al., \"Towards developing high performance risc-v processors using agile methodology,\" in 2022 55th IEEE\/ACM International Symposium on Microarchitecture (MICRO), 2022."},{"key":"e_1_3_2_1_49_1","volume-title":"Asap7 predictive design kit development and cell design technology co-optimization,\" in 2017 IEEE\/ACM International Conference on Computer-Aided Design (ICCAD)","author":"Vashishtha V.","year":"2017","unstructured":"V. Vashishtha, M. Vangala, and L. T. Clark, \"Asap7 predictive design kit development and cell design technology co-optimization,\" in 2017 IEEE\/ACM International Conference on Computer-Aided Design (ICCAD), 2017."},{"key":"e_1_3_2_1_50_1","first-page":"508","volume-title":"Sage","author":"Abdi H.","year":"2007","unstructured":"H. Abdi, \"The kendall rank correlation coefficient,\" Encyclopedia of Measurement and Statistics. Sage, Thousand Oaks, CA, pp. 508--510, 2007."},{"key":"e_1_3_2_1_51_1","volume-title":"Sutskever et al., \"Improving language understanding by generative pre-training","author":"Radford A.","year":"2018","unstructured":"A. Radford, K. Narasimhan, T. Salimans, I. Sutskever et al., \"Improving language understanding by generative pre-training,\" 2018."}],"event":{"name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","location":"Newark Liberty International Airport Marriott New York NY USA","acronym":"ICCAD '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"]},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676753","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676753","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:43:58Z","timestamp":1750290238000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676753"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":50,"alternative-id":["10.1145\/3676536.3676753","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676753","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}