{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T05:59:14Z","timestamp":1781157554610,"version":"3.54.1"},"reference-count":65,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T00:00:00Z","timestamp":1778630400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T00:00:00Z","timestamp":1778630400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,5,13]]},"DOI":"10.1109\/fccm68464.2026.00027","type":"proceedings-article","created":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T19:59:45Z","timestamp":1781121585000},"page":"109-118","source":"Crossref","is-referenced-by-count":0,"title":["LUT-LLM: Efficient Language Model Inference with Memory-based Computations on FPGAs"],"prefix":"10.1109","author":[{"given":"Zifan","family":"He","sequence":"first","affiliation":[{"name":"University of California,Los Angeles"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shengyu","family":"Ye","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Ma","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yang","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jason","family":"Cong","sequence":"additional","affiliation":[{"name":"University of California,Los Angeles"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A complete survey on LLM-based AI chatbots","author":"Dam","year":"2024"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3729220"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.737"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.189"},{"key":"ref5","article-title":"Deep Research Agents: A Systematic Examination And Roadmap","author":"Huang","year":"2025"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3656177"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3656401"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"ref9","article-title":"Autopal: Autonomous adaptation to users for personal AI companionship","author":"Cheng","year":"2024"},{"key":"ref10","article-title":"Towards Ethical Personal AI Applications: Practical Considerations for AI Assistants with Long-Term Memory","author":"Lee","year":"2024"},{"key":"ref11","article-title":"Smart Home Technology Solutions - Qualcomm","author":"Incorporated","year":"2025"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611386"},{"key":"ref13","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023"},{"key":"ref14","article-title":"Flashdecoding++: Faster large language model inference on GPUs","author":"Hong","year":"2023"},{"key":"ref15","article-title":"GPTQ: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2022"},{"key":"ref16","article-title":"AMD Alveo U280 Data Center Accelerator Card Data Sheet (DS963, v1.7)","year":"2023"},{"key":"ref17","article-title":"NVIDIA V100 Tesla Tensor Core GPU Data Sheet","year":"2020"},{"key":"ref18","article-title":"AMD Alveo V80 Data Center Accelerator Cards Data Sheet (DS1013, v1.0)","year":"2024"},{"key":"ref19","article-title":"NVIDIA A100 80 GB PCIe GPU Data Sheet","year":"2021"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00057"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637576"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM62733.2025.00077"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM60383.2024.00023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.467"},{"key":"ref25","article-title":"GPTVQ: The blessing of dimensionality for LLM quantization","author":"Van Baalen","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MASSP.1984.1162229"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM62733.2025.00047"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637569"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3725843.3762817"},{"key":"ref33","first-page":"7197","article-title":"Up or down? adaptive rounding for post-training quantization","volume-title":"International conference on machine learning","author":"Nagel"},{"key":"ref34","first-page":"87","article-title":"AWQ: Activation-aware weight quantization for on-device LLM compression and acceleration","volume-title":"Proceedings of machine learning and systems","volume":"6","author":"Lin"},{"key":"ref35","first-page":"38087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","volume-title":"International conference on machine learning","author":"Xiao"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3656643"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1093\/oed\/9355053924"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1904"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3613285"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696099"},{"key":"ref42","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3126531","article-title":"qLUT: Input-aware quantized table lookup for energy-efficient approximate accelerators","volume":"16","author":"Raha","year":"2017","journal-title":"ACM Transactions on Embedded Computing Systems (TECS)"},{"key":"ref43","article-title":"BitNet: Scaling 1-bit transformers for large language models","author":"Wang","year":"2023"},{"key":"ref44","article-title":"BitNet a4. 8: 4-bit Activations for 1-bit LLMs","author":"Wang","year":"2024"},{"key":"ref45","article-title":"Qserve: W4a8kv4 quantization and system co-design for efficient llm serving","author":"Lin","year":"2024"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref48","article-title":"Eq-bench creative writing benchmark v3","author":"Paech","year":"2025"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3706628.3708864"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1017\/9780511811487"},{"key":"ref51","article-title":"Qwen3 technical report","author":"Yang","year":"2025"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"ref53","article-title":"Understanding straight-through estimator in training activation quantized neural nets","author":"Yin","year":"2019"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3018"},{"key":"ref56","article-title":"SpinQuant: LLM quantization with learned rotations","author":"Liu","year":"2024"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3609335"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3490422.3502361"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3431920.3439289"},{"key":"ref60","article-title":"AMD Instinct MI210 Accelerator Product Brief","year":"2023"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3710848.3710871"},{"key":"ref63","article-title":"nvidia-ml-py: Python bindings for NVIDIA Management Library (NVML)","year":"2025"},{"key":"ref64","article-title":"A Systematic Characterization of LLM Inference on GPUs","author":"Wang","year":"2025"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.23919\/VLSITechnologyandCir65189.2025.11074854"}],"event":{"name":"2026 IEEE 34th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","location":"Atlanta, GA, USA","start":{"date-parts":[[2026,5,13]]},"end":{"date-parts":[[2026,5,16]]}},"container-title":["2026 IEEE 34th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11552597\/11552602\/11552691.pdf?arnumber=11552691","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T05:22:17Z","timestamp":1781155337000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11552691\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,13]]},"references-count":65,"URL":"https:\/\/doi.org\/10.1109\/fccm68464.2026.00027","relation":{},"subject":[],"published":{"date-parts":[[2026,5,13]]}}}