{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T04:52:06Z","timestamp":1783054326657,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Samsung Electronics","award":["IO231030-07528-01"],"award-info":[{"award-number":["IO231030-07528-01"]}]},{"name":"IITP","award":["IITP-2023-RS-2023-00256081"],"award-info":[{"award-number":["IITP-2023-RS-2023-00256081"]}]},{"name":"IITP","award":["No.2021-0-01343"],"award-info":[{"award-number":["No.2021-0-01343"]}]},{"DOI":"10.13039\/100000028","name":"Semiconductor Research Corporation","doi-asserted-by":"publisher","award":["PRISM"],"award-info":[{"award-number":["PRISM"]}],"id":[{"id":"10.13039\/100000028","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640422","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"103-119","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":110,"title":["AttAcc! Unleashing the Power of PIM for Batched Transformer-based Generative Model Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5623-6985","authenticated-orcid":false,"given":"Jaehyun","family":"Park","sequence":"first","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2447-4369","authenticated-orcid":false,"given":"Jaewan","family":"Choi","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4243-2111","authenticated-orcid":false,"given":"Kwanhee","family":"Kyung","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9987-9809","authenticated-orcid":false,"given":"Michael Jaemin","family":"Kim","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1956-4629","authenticated-orcid":false,"given":"Yongsuk","family":"Kwon","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0442-5634","authenticated-orcid":false,"given":"Nam Sung","family":"Kim","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana-Champaign, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1733-1394","authenticated-orcid":false,"given":"Jung Ho","family":"Ahn","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints.","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2019.2945617"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00080"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783753"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel Ziegler Jeffrey Wu Clemens Winter Chris Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. In NeurIPS. 10.48550\/arXiv.2005.14165","DOI":"10.48550\/arXiv.2005.14165"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2017.7870333"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","unstructured":"Kevin K Chang Prashant J Nair Donghyuk Lee Saugata Ghose Moinuddin K Qureshi and Onur Mutlu. 2016. Low-Cost Inter-Linked Sub-arrays (LISA): Enabling Fast Inter-Subarray Data Movement in DRAM. In HPCA. 10.1109\/HPCA.2016.7446095","DOI":"10.1109\/HPCA.2016.7446095"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476146"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2023.3305386"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.mejo.2016.04.006"},{"key":"e_1_3_2_1_11_1","unstructured":"Compute Express Link Consortium. 2022. Compute Express Link 3.0 White Paper. https:\/\/www.computeexpresslink.org\/_files\/ugd\/0c1418_a8713008916044ae9604405d10a7773b.pdf"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875680"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056040"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Fei Gao Georgios Tziantzioulis and David Wentzlaff. 2019. Compute-DRAM: In-Memory Compute Using Off-the-Shelf DRAMs. In MICRO. 100--113. 10.1145\/3352460.3358260","DOI":"10.1145\/3352460.3358260"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"e_1_3_2_1_16_1","unstructured":"SAFARI Research Group. 2023. Ramulator 2.0 --- GitHub Repository. https:\/\/github.com\/CMU-SAFARI\/ramulator2"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Peng Gu Xinfeng Xie Yufei Ding Guoyang Chen Weifeng Zhang Dimin Niu and Yuan Xie. 2020. iPIM: Programmable In-Memory Image Processing Accelerator Using Near-Bank Architecture. In ISCA. 804--817. 10.1109\/ISCA45697.2020.00071","DOI":"10.1109\/ISCA45697.2020.00071"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"Cong Guo Jiaming Tang Weiming Hu Jingwen Leng Chen Zhang Fan Yang Yunxin Liu Minyi Guo and Yuhao Zhu. 2023. OliVe: Accelerating Large Language Models via Hardware-Friendly Outlier-Victim Pair Quantization. In ISCA. 10.1145\/3579371.3589038","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446749"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00035"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00040"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"Mohsen Imani Saransh Gupta Yeseong Kim and Tajana Rosing. 2019. FloatPIM: In-Memory Acceleration of Deep Neural Network Training with High Precision. In ISCA. 802--815. 10.1145\/3307650.3322237","DOI":"10.1145\/3307650.3322237"},{"key":"e_1_3_2_1_26_1","unstructured":"JEDEC. 2022. High Bandwidth Memory DRAM (HBM3)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIT.2018.8510682"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","unstructured":"Norm Jouppi George Kurian Sheng Li Peter Ma Rahul Nagarajan Lifeng Nai Nishant Patil Suvinay Subramanian Andy Swing Brian Towles Clifford Young Xiang Zhou Zongwei Zhou and David A Patterson. 2023. TPU v4: An Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings. In ISCA. 10.1145\/3579371.3589350","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"Norman P. Jouppi Doe Hyun Yoon Matthew Ashcraft Mark Gottscho Thomas B. Jablin George Kurian James Laudon Sheng Li Peter C. Ma Xiaoyu Ma Thomas Norrie Nishant Patil Sushma Prasad Cliff Young Zongwei Zhou and David A. Patterson. 2021. Ten Lessons From Three Generations Shaped Google's TPUv4i: Industrial Product. In ISCA. 1--14. 10.1109\/ISCA52012.2021.00010","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/IMW.2017.7939084"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.2984496"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.41"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","unstructured":"Yoongu Kim Vivek Seshadri Donghyuk Lee Jamie Liu and Onur Mutlu. 2012. A Case for Exploiting Subarray-Level Parallelism (SALP) in DRAM. In ISCA. 10.1145\/2366231.2337202","DOI":"10.1145\/2366231.2337202"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2015.2414456"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","unstructured":"Youngeun Kwon Yunjae Lee and Minsoo Rhu. 2019. TensorDIMM: A Practical Near-Memory Processing Architecture for Embeddings and Tensor Operations in Deep Learning. In MICRO. 740--753. 10.1145\/3352460.3358284","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Sukhan Lee Shin-haeng Kang Jaehoon Lee Hyeonsu Kim Eojin Lee Seungwoo Seo Hosang Yoon Seungwon Lee Kyounghwan Lim Hyunsung Shin Jinhyun Kim O Seongil Anand Iyer David Wang Kyomin Sohn and Nam Sung Kim. 2021. Hardware Architecture and Software Stack for PIM based on Commercial DRAM Technology: Industrial Product. In ISCA. 10.1109\/ISCA52012.2021.00013","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731711"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123977"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI.2008.4541126"},{"key":"e_1_3_2_1_41_1","volume-title":"F. Nisa Bostanc\u0131, Ataberk Olgun, A. Giray Ya\u011fl\u0131k\u00e7\u0131, and Onur Mutlu.","author":"Luo Haocong","year":"2023","unstructured":"Haocong Luo, Yahya Can Tu\u011frul, F. Nisa Bostanc\u0131, Ataberk Olgun, A. Giray Ya\u011fl\u0131k\u00e7\u0131, and Onur Mutlu. 2023. Ramulator 2.0: A Modern, Modular, and Extensible DRAM Simulator."},{"key":"e_1_3_2_1_42_1","unstructured":"NVIDIA. 2023. NVIDIA DGX A100. https:\/\/resources.nvidia.com\/en-us-dgx-systems\/dgx-ai"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"Mike O'Connor Niladrish Chatterjee Donghyuk Lee John Wilson Aditya Agrawal Stephen W Keckler and William J Dally. 2017. FineGrained DRAM: Energy-Efficient DRAM for Extreme Bandwidth Systems. In MICRO. 10.1145\/3123939.3124545","DOI":"10.1145\/3123939.3124545"},{"key":"e_1_3_2_1_44_1","unstructured":"OpenAI. 2023. Models. https:\/\/platform.openai.com\/docs\/models\/."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","unstructured":"Jaehyun Park Byeongho Kim Sungmin Yun Eojin Lee Minsoo Rhu and Jung Ho Ahn. 2021. TRiM: Enhancing Processor-Memory Interfaces with Scalable Tensor Reduction in Memory. In MICRO. 10.1145\/3466752.3480080","DOI":"10.1145\/3466752.3480080"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731562"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2022.3193354"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460971"},{"key":"e_1_3_2_1_49_1","volume-title":"Splitwise: Efficient generative LLM inference using phase splitting. arXiv:2311.18677 [cs.AR]","author":"Patel Pratyush","year":"2023","unstructured":"Pratyush Patel, Esha Choukse, Chaojie Zhang, \u00cd\u00f1igo Goiri, Aashaka Shah, Saeed Maleki, and Ricardo Bianchini. 2023. Splitwise: Efficient generative LLM inference using phase splitting. arXiv:2311.18677 [cs.AR]"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589057"},{"key":"e_1_3_2_1_51_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving Language Understanding by Generative Pre-training. (2018)."},{"key":"e_1_3_2_1_52_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2022.3232096"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124544"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI.2014.94"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI51249.2020.00016"},{"key":"e_1_3_2_1_57_1","unstructured":"Noam Shazeer. 2019. Fast Transformer Decoding: One Write-Head is All You Need."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2857044"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2018.8310252"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2010.69"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1706.03762"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","unstructured":"Hanrui Wang Zhekai Zhang and Song Han. 2021. SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. In HPCA. 97--110. 10.1109\/HPCA51647.2021.00018","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2019.2939682"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/IEDM.2016.7838333"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning","volume":"202","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models. In Proceedings of the 40th International Conference on Machine Learning, Vol. 202. 38087--38099. https:\/\/proceedings.mlr.press\/v202\/xiao23c.html"},{"key":"e_1_3_2_1_66_1","volume-title":"Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun.","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. ORCA: A Distributed Serving System for Transformer-Based Generative Models. In OSDI. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2022.3182387"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00071"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","unstructured":"Minxuan Zhou Weihong Xu Jaeyoung Kang and Tajana Rosing. 2022. TransPIM: A Memory-based Acceleration via Software-Hardware Co-Design for Transformer. In HPCA. 10.1109\/HPCA53966.2022.00082","DOI":"10.1109\/HPCA53966.2022.00082"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640422","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640422","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:42Z","timestamp":1750291422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640422"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":68,"alternative-id":["10.1145\/3620665.3640422","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640422","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}