{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T12:35:55Z","timestamp":1770726955176,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,2,22]]},"DOI":"10.1145\/3748173.3779191","type":"proceedings-article","created":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T21:17:35Z","timestamp":1770326255000},"page":"247-257","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TeLLMe: An Efficient End-to-End Ternary LLM Prefill and Decode Accelerator with Table-Lookup Matmul on Edge FPGAs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6877-5764","authenticated-orcid":false,"given":"Ye","family":"Qiao","sequence":"first","affiliation":[{"name":"University of California, Irvine, Irvine, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7180-8655","authenticated-orcid":false,"given":"Zhiheng","family":"Chen","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7315-8151","authenticated-orcid":false,"given":"Yifan","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0506-2718","authenticated-orcid":false,"given":"Yian","family":"Wang","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7669-1467","authenticated-orcid":false,"given":"Sitao","family":"Huang","sequence":"additional","affiliation":[{"name":"University of California, Irvine, Irvine, California, 
USA"}]}],"member":"320","published-online":{"date-parts":[[2026,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877-1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877-1901, 2020."},{"key":"e_1_3_2_1_2_1","volume-title":"et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_3_1","volume-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948","author":"Guo Daya","year":"2025","unstructured":"Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948, 2025."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIT48603.2022.10002796"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETC.2023.3237778"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.36227\/techrxiv.176157881.11572886\/v1"},{"key":"e_1_3_2_1_7_1","volume-title":"Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453","author":"Wang Hongyu","year":"2023","unstructured":"Hongyu Wang, Shuming Ma, Li Dong, and et al. 
Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453, 2023."},{"key":"e_1_3_2_1_8_1","volume-title":"Cobra: Algorithm-architecture co-optimized binary transformer accelerator for edge inference. arXiv preprint arXiv:2504.16269","author":"Qiao Ye","year":"2025","unstructured":"Ye Qiao, Zhiheng Chen, Yian Wang, Yifan Zhang, Yunzhe Deng, and Sitao Huang. Cobra: Algorithm-architecture co-optimized binary transformer accelerator for edge inference. arXiv preprint arXiv:2504.16269, 2025."},{"key":"e_1_3_2_1_9_1","volume-title":"The era of 1-bit llms: All large language models are in 1.58 bits. arXiv preprint arXiv:2402.17764","author":"Ma Shuming","year":"2024","unstructured":"Shuming Ma, Hongyu Wang, Lingxiao Ma, and et al. The era of 1-bit llms: All large language models are in 1.58 bits. arXiv preprint arXiv:2402.17764, 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"Pushing up to the limit of memory bandwidth and capacity utilization for efficient llm decoding on embedded fpga. arXiv preprint arXiv:2502.10659","author":"Li Jindong","year":"2025","unstructured":"Jindong Li, Tenglong Li, Guobin Shen, Dongcheng Zhao, Qian Zhang, and Yi Zeng. Pushing up to the limit of memory bandwidth and capacity utilization for efficient llm decoding on embedded fpga. arXiv preprint arXiv:2502.10659, 2025."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00014"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3706628.3708874"},{"key":"e_1_3_2_1_13_1","first-page":"235","volume-title":"Table-lookup mac: Scalable processing of quantised neural networks in fpga soft logic","author":"Gerlinghoff Daniel","year":"2024","unstructured":"Daniel Gerlinghoff, Benjamin Choong, Rick Goh, Weng-Fai Wong, and Tao Luo. Table-lookup mac: Scalable processing of quantised neural networks in fpga soft logic. 
pages 235-245, 04 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"T-mac: Cpu renaissance via table lookup for low-bit llm deployment on edge. arXiv preprint arXiv:2407.00088","author":"Wei Jianyu","year":"2024","unstructured":"Jianyu Wei, Shijie Cao, Ting Cao, and et al. T-mac: Cpu renaissance via table lookup for low-bit llm deployment on edge. arXiv preprint arXiv:2407.00088, 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"Llamaf: An efficient llama2 architecture accelerator on embedded fpgas. arXiv preprint arXiv:2409.11424","author":"Xu Han","year":"2024","unstructured":"Han Xu, Yutong Li, and Shihao Ji. Llamaf: An efficient llama2 architecture accelerator on embedded fpgas. arXiv preprint arXiv:2409.11424, 2024."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD57390.2023.10323651"},{"key":"e_1_3_2_1_17_1","volume-title":"Designing efficient llm accelerators for edge devices. arXiv preprint arXiv:2408.00462","author":"Haris Jude","year":"2024","unstructured":"Jude Haris, Rappy Saha, Wenhao Hu, and Jos\u00e9 Cano. Designing efficient llm accelerators for edge devices. arXiv preprint arXiv:2408.00462, 2024. Accessed: 2025-04-20."},{"key":"e_1_3_2_1_18_1","volume-title":"Meadow: Memory-efficient dataflow and data packing for low power edge llms. arXiv preprint arXiv:2503.11663, feb","author":"Moitra Abhishek","year":"2025","unstructured":"Abhishek Moitra, Arkapravo Ghosh, Shrey Agarwal, Aporva Amarnath, Karthik Swaminathan, and Priyadarshini Panda. Meadow: Memory-efficient dataflow and data packing for low power edge llms. arXiv preprint arXiv:2503.11663, feb 2025."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. SmoothQuant: Accurate and efficient post-training quantization for large language models. 
In Proceedings of the 40th International Conference on Machine Learning, 2023."},{"key":"e_1_3_2_1_20_1","first-page":"1","volume-title":"Automation & Test in Europe Conference (DATE)","author":"Wei Renjie","year":"2025","unstructured":"Renjie Wei, Songqiang Xu, Linfeng Zhong, Zebin Yang, Qingyu Guo, Yuan Wang, Runsheng Wang, and Meng Li. Lightmamba: Efficient mamba acceleration on fpga with quantization and hardware co-design. In 2025 Design, Automation & Test in Europe Conference (DATE), pages 1-7, 2025."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2025.3546256"},{"key":"e_1_3_2_1_23_1","volume-title":"Tereffic: Highly efficient ternary llm inference on fpga. arXiv preprint arXiv:2502.16473","author":"Yin Chenyang","year":"2025","unstructured":"Chenyang Yin, Zhenyu Bai, Pranav Venkatram, Shivam Aggarval, Zhaoying Li, and Tulika Mitra. Tereffic: Highly efficient ternary llm inference on fpga. arXiv preprint arXiv:2502.16473, 2025."},{"key":"e_1_3_2_1_24_1","volume-title":"https:\/\/www.amd.com\/en\/products\/adaptive-socs-and-fpgas\/versal.html","author":"AMD.","year":"2024","unstructured":"AMD. Versal adaptive socs - AMD. https:\/\/www.amd.com\/en\/products\/adaptive-socs-and-fpgas\/versal.html, 2024. Accessed: 2024-01-20."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/FPL50879.2020.00055"},{"key":"e_1_3_2_1_26_1","unstructured":"Advanced Micro Devices. Ultrascale architecture libraries guide. Technical Report UG974 Advanced Micro Devices May 2024. Version 2024.1. Describes UltraScale primitives such as RAM32X1D."},{"key":"e_1_3_2_1_27_1","volume-title":"transformers\/models\/llama at main \u00b7 huggingface\/transformers","author":"Face Hugging","year":"2023","unstructured":"Hugging Face. transformers\/models\/llama at main \u00b7 huggingface\/transformers, 2023. 
GitHub repository."},{"key":"e_1_3_2_1_28_1","unstructured":"Advanced Micro Devices. Ultrascale architecture memory resources user guide. Technical Report UG573 Advanced Micro Devices September 2021. Version 1.13."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICFPT47387.2019.00029"},{"key":"e_1_3_2_1_30_1","volume-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. Flashattention: Fast and memory-efficient exact attention with io-awareness, 2022."},{"key":"e_1_3_2_1_31_1","first-page":"12","article-title":"Vivado Design Suite: AXI Reference Guide (UG1037)","author":"Devices Advanced Micro","year":"2022","unstructured":"Advanced Micro Devices. Vivado Design Suite: AXI Reference Guide (UG1037). Advanced Micro Devices, 12 2022. Version 2022.2.","journal-title":"Advanced Micro Devices"},{"key":"e_1_3_2_1_32_1","volume-title":"Local llms on raspberry pi: Qwen3. https:\/\/learn.adafruit.com\/local-llms-on-raspberry-pi\/qwen3","year":"2025","unstructured":"Adafruit. Local llms on raspberry pi: Qwen3. https:\/\/learn.adafruit.com\/local-llms-on-raspberry-pi\/qwen3, 2025. Accessed: 2025-9-01."},{"key":"e_1_3_2_1_33_1","volume-title":"Tutorial - small language models (slm). https:\/\/www.jetson-ai-lab.com\/tutorial_slm.html","author":"Lab NVIDIA","year":"2025","unstructured":"NVIDIA Jetson AI Lab. Tutorial - small language models (slm). https:\/\/www.jetson-ai-lab.com\/tutorial_slm.html, 2025. Accessed: 2025-10-01."},{"key":"e_1_3_2_1_34_1","volume-title":"Tenet: An efficient sparsity-aware lut-centric architecture for ternary llm inference on edge. arXiv:2509.13765","author":"Huang Zhirui","year":"2025","unstructured":"Zhirui Huang, Rui Ma, Shijie Cao, Ran Shu, Ian Wang, Ting Cao, Chixiao Chen, and Yongqiang Xiong. 
Tenet: An efficient sparsity-aware lut-centric architecture for ternary llm inference on edge. arXiv:2509.13765, 2025."}],"event":{"name":"FPGA '26: The 2026 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays","location":"Seaside CA USA","sponsor":["SIGDA ACM Special Interest Group on Design Automation"]},"container-title":["Proceedings of the 2026 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays"],"original-title":[],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T16:18:09Z","timestamp":1770653889000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3748173.3779191"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,21]]},"references-count":34,"alternative-id":["10.1145\/3748173.3779191","10.1145\/3748173"],"URL":"https:\/\/doi.org\/10.1145\/3748173.3779191","relation":{},"subject":[],"published":{"date-parts":[[2026,2,21]]},"assertion":[{"value":"2026-02-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}