{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T16:13:25Z","timestamp":1774628005720,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":15,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100010418","name":"Institute for Information and communications Technology Promotion","doi-asserted-by":"publisher","award":["IITP-2020-0-01847"],"award-info":[{"award-number":["IITP-2020-0-01847"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["4120200113769"],"award-info":[{"award-number":["4120200113769"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"name":"IC Design Education Center"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3655953","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Token-Picker: Accelerating Attention in Text Generation with Minimized Memory Transfer via Probability Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7722-8702","authenticated-orcid":false,"given":"Junyoung","family":"Park","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Daejeon, Republic of Korea"},{"name":"System LSI, Samsung Electronics, Hwaseong, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3557-8526","authenticated-orcid":false,"given":"Myeonggu","family":"Kang","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Daejeon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0432-9324","authenticated-orcid":false,"given":"Yunki","family":"Han","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Chungcheongnam-do, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6165-6234","authenticated-orcid":false,"given":"Yang-Gon","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Daejeon, Republic of Korea"},{"name":"System LSI, Samsung Electronics, Hwaseong, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5943-1599","authenticated-orcid":false,"given":"Jaekang","family":"Shin","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Daejeon, Republic of Korea"},{"name":"System LSI, Samsung Electronics, Hwaseong, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9585-4591","authenticated-orcid":false,"given":"Lee-Sup","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Daejeon, Daejeon, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"2021 IEEE International Symposium on HighPerformance Computer Architecture (HPCA). IEEE, 97--110","author":"Hanrui","unstructured":"Hanrui Wang et al. 2021. Spatten: Efficient sparse attention architecture with cascade token and head pruning. In 2021 IEEE International Symposium on HighPerformance Computer Architecture (HPCA). IEEE, 97--110."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3085572"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.2973991"},{"key":"e_1_3_2_1_4_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Alec Radford","year":"2019","unstructured":"Alec Radford et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_5_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Gyeong-In","unstructured":"Gyeong-In Yu et al. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521--538."},{"key":"e_1_3_2_1_6_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Hugo Touvron","year":"2023","unstructured":"Hugo Touvron et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Jacob Devlin","year":"2018","unstructured":"Jacob Devlin et al. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"MICRO-54: 54th Annual IEEE\/ACM International Symposium on Microarchitecture. 977--991","author":"Liqiang","unstructured":"Liqiang Lu et al. 2021. Sanger: A co-design framework for enabling sparse attention using reconfigurable architecture. In MICRO-54: 54th Annual IEEE\/ACM International Symposium on Microarchitecture. 977--991."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Reiner","year":"2023","unstructured":"Reiner Pope et al. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843","author":"Stephen Merity","year":"2016","unstructured":"Stephen Merity et al. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_1_11_1","volume-title":"Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Susan Zhang","year":"2022","unstructured":"Susan Zhang et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_12_1","unstructured":"Thomas Wolf et al. 2020. HuggingFace's Transformers: State-of-the-art Natural Language Processing. arXiv:1910.03771 [cs.CL]"},{"key":"e_1_3_2_1_13_1","volume-title":"Efficient memory management for large language model serving with pagedattention. arXiv preprint arXiv:2309.06180","author":"Woosuk Kwon","year":"2023","unstructured":"Woosuk Kwon et al. 2023. Efficient memory management for large language model serving with pagedattention. arXiv preprint arXiv:2309.06180 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Zhenzhong Lan","year":"2019","unstructured":"Zhenzhong Lan et al. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 49th Annual International Symposium on Computer Architecture. 902--915","author":"Zheng","unstructured":"Zheng Li et al. 2022. Accelerating attention through gradient-based learned runtime pruning. In Proceedings of the 49th Annual International Symposium on Computer Architecture. 902--915."}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","location":"San Francisco CA USA","acronym":"DAC '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3655953","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3655953","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:55Z","timestamp":1750295875000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3655953"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":15,"alternative-id":["10.1145\/3649329.3655953","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3655953","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}