{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:35:21Z","timestamp":1773246921979,"version":"3.50.1"},"reference-count":57,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100010418","name":"Institute of Information and Communications Technology Planning and Evaluation and National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2021-II211343"],"award-info":[{"award-number":["RS-2021-II211343"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute of Information and Communications Technology Planning and Evaluation and National Research Foundation of Korea","doi-asserted-by":"publisher","award":["IITP-2023-RS-2023-00256081"],"award-info":[{"award-number":["IITP-2023-RS-2023-00256081"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute of Information and Communications Technology Planning and Evaluation and National Research Foundation of 
Korea","doi-asserted-by":"publisher","award":["RS-2024-00347394"],"award-info":[{"award-number":["RS-2024-00347394"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute of Information and Communications Technology Planning and Evaluation and National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2025-02218733"],"award-info":[{"award-number":["RS-2025-02218733"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute of Information and Communications Technology Planning and Evaluation and National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2024-00355678"],"award-info":[{"award-number":["RS-2024-00355678"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute of Information and Communications Technology Planning and Evaluation and National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2023-00228970"],"award-info":[{"award-number":["RS-2023-00228970"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. 
I"],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1109\/tcsi.2025.3576232","type":"journal-article","created":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T13:55:53Z","timestamp":1749563753000},"page":"6553-6566","source":"Crossref","is-referenced-by-count":4,"title":["CLAT: A Clustering-Based Attention Transformer Accelerator for Low-Latency Text Generation in LLMs"],"prefix":"10.1109","volume":"72","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7760-0168","authenticated-orcid":false,"given":"Sunwoo","family":"Lee","sequence":"first","affiliation":[{"name":"Department of Intelligence and Information, the Research Institute for Convergence Science, The Inter-University Semiconductor Research Center, Seoul National University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8824-7997","authenticated-orcid":false,"given":"Beomseok","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Intelligence and Information, the Research Institute for Convergence Science, The Inter-University Semiconductor Research Center, Seoul National University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9603-9588","authenticated-orcid":false,"given":"Jeongwoo","family":"Park","sequence":"additional","affiliation":[{"name":"Electrical and Computer Engineering Department, Sungkyunkwan University, Suwon, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0395-8076","authenticated-orcid":false,"given":"Dongsuk","family":"Jeon","sequence":"additional","affiliation":[{"name":"Department of Intelligence and Information, the Research Institute for Convergence Science, The Inter-University Semiconductor Research Center, Seoul National University, Seoul, South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","article-title":"BERT: Pre-training of deep bidirectional transformers for language 
understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref3","article-title":"Improving language understanding by generative pre-training","year":"2018"},{"issue":"8","key":"ref4","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref5","article-title":"ALBERT: A lite BERT for self-supervised learning of language representations","author":"Lan","year":"2019","journal-title":"arXiv:1909.11942"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref7","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"arXiv:2005.14165"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref11","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Li"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01102"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"ref14","article-title":"OPT: Open pre-trained transformer language models","author":"Zhang","year":"2022","journal-title":"arXiv:2205.01068"},{"key":"ref15","article-title":"Bloom: A 176b-parameter open-access multilingual language model","volume-title":"arXiv:2211.05100","author":"Scao","year":"2023"},{"key":"ref16","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00338"},{"key":"ref18","article-title":"The falcon series of open language models","author":"Almazrouei","year":"2023","journal-title":"arXiv:2311.16867"},{"key":"ref19","volume-title":"Vicuna: An Open-Source Chatbot Impressing GPT-4 With 90%* ChatGPT Quality","author":"Chiang et al","year":"2023"},{"key":"ref20","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv:2010.11929"},{"key":"ref21","article-title":"GPTQ: Accurate post-training quantization for generative pre-trained transformers","author":"Frantar","year":"2022","journal-title":"arXiv:2210.17323"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.26"},{"key":"ref23","first-page":"38087","article-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Xiao"},{"key":"ref24","first-page":"87","article-title":"AWQ: Activation-aware weight quantization for on-device LLM compression and acceleration","volume-title":"Proc. Mach. Learn. 
Syst.","volume":"66","author":"Lin"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2022.3170848"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589057"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00093"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507738"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/hpca47549.2020.00035"},{"key":"ref32","first-page":"250","article-title":"MnnFast: A fast and scalable system architecture for memory-augmented neural networks","volume-title":"Proc. ACM\/IEEE 46th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Jang"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref34","first-page":"34661","article-title":"H2O: Heavy-hitter Oracle for efficient generative inference of large language models","volume-title":"Proc. Adv. In Neural Inf. Process. Syst.","volume":"36","author":"Zhang"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"ref37","article-title":"RPTQ: Reorder-based post-training quantization for large language models","author":"Yuan","year":"2023","journal-title":"arXiv:2304.01089"},{"key":"ref38","article-title":"CLAQ: Pushing the limits of low-bit post-training quantization for LLMs","author":"Wang","year":"2024","journal-title":"arXiv:2405.17233"},{"key":"ref39","article-title":"SKVQ: Sliding-window key and value cache quantization for large language models","author":"Duanmu","year":"2024","journal-title":"arXiv:2405.06219"},{"key":"ref40","first-page":"21665","article-title":"Fast transformers with clustered attention","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"33","author":"Vyas"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00353"},{"key":"ref42","article-title":"Generating long sequences with sparse transformers","author":"Child","year":"2019","journal-title":"arXiv:1904.10509"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007612920971"},{"key":"ref44","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020","journal-title":"arXiv:2004.05150"},{"key":"ref45","first-page":"17283","article-title":"Big Bird: Transformers for longer sequences","volume-title":"Proc. 34th Adv. In Neural Inf. Process. Syst. (NeurIPS)","volume":"33","author":"Zaheer"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.sustainlp-1.5"},{"key":"ref47","article-title":"8-bit optimizers via block-wise quantization","author":"Dettmers","year":"2021","journal-title":"arXiv:2110.02861"},{"key":"ref48","article-title":"The Llama 3 herd of models","author":"Dubey","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref49","first-page":"19274","article-title":"Fast inference from transformers via speculative decoding","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Leviathan"},{"key":"ref50","article-title":"Pointer sentinel mixture models","author":"Merity","year":"2016","journal-title":"arXiv:1609.07843"},{"key":"ref51","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2020","journal-title":"arXiv:2009.03300"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctvcm4g18.8"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"ref55","volume-title":"NVIDIA TensorRT: A Platform for High-Performance Deep Learning Inference","year":"2023"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2024.3420728"},{"key":"ref57","volume-title":"Alveo U280 Data Center Accelerator Card Data Sheet","year":"2021"}],"container-title":["IEEE Transactions on Circuits and Systems I: Regular Papers"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/8919\/11217317\/11029639.pdf?arnumber=11029639","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T18:08:49Z","timestamp":1761588529000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11029639\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11]]},"references-count":57,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tcsi.2025.3576232","relation":{},"ISSN":["1549-8328","1558-0806"],"issn-type":[{"value":"1549-8328","type":"print"},{"value":"1558-0806","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11]]}}}