{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T04:39:53Z","timestamp":1768279193197,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"AI Chip Center for Emerging Smart Systems"},{"name":"Huawei Hong Kong Research Center"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656592","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"62-73","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["LCM: LLM-focused Hybrid SPM-cache Architecture with Cache Management for Multi-Core AI Accelerators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9547-9653","authenticated-orcid":false,"given":"Chengtao","family":"Lai","sequence":"first","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7037-7418","authenticated-orcid":false,"given":"Zhongchun","family":"Zhou","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1251-9210","authenticated-orcid":false,"given":"Akash","family":"Poptani","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology Dharwad, India"}]},
{"ORCID":"https:\/\/orcid.org\/0000-0002-7622-6714","authenticated-orcid":false,"given":"Wei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong Special Administrative Region of China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/1061267.1061271"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00081"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Sid Black Stella Biderman Eric Hallahan Quentin Anthony Leo Gao Laurence Golding Horace He Connor Leahy Kyle McDonell Jason Phang Michael Pieler USVSN\u00a0Sai Prashanth Shivanshu Purohit Laria Reynolds Jonathan Tow Ben Wang and Samuel Weinbach. 2022. GPT-NeoX-20B: An Open-Source Autoregressive Language Model. arxiv:2204.06745\u00a0[cs.CL]","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"e_1_3_2_1_4_1","volume-title":"Tech. Rep. MSU-CSE-99-31.","author":"Brehob M.","year":"1999","unstructured":"M. Brehob and R. Enbody. 1999. An Analytical Model of Locality and Caching. Tech. Rep. MSU-CSE-99-31. Michigan State University, Department of Computer Science and Engineering."},{"key":"e_1_3_2_1_5_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, 2018. { TVM} : An automated { End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},
{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220622"},{"key":"e_1_3_2_1_9_1","unstructured":"NVIDIA Corporation. 2022. NVIDIA H100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-tensor-core"},{"key":"e_1_3_2_1_10_1","unstructured":"Qualcomm Corporation. 2023. Snapdragon X Elite. https:\/\/www.qualcomm.com\/snapdragon\/laptops"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485832.3485902"},{"key":"e_1_3_2_1_12_1","unstructured":"Tim Dettmers Mike Lewis Younes Belkada and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale. arxiv:2208.07339\u00a0[cs.LG]"},{"key":"e_1_3_2_1_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805\u00a0[cs.CL]","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805\u00a0[cs.CL]"},{"key":"e_1_3_2_1_14_1","unstructured":"EleutherAI. 2023. GPT-J. https:\/\/www.eleuther.ai\/artifacts\/gpt-j"},{"key":"e_1_3_2_1_15_1","unstructured":"EleutherAI. 2023. GPT-Neo. https:\/\/www.eleuther.ai\/artifacts\/gpt-neo"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-015-1505-6"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3093337.3037702"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304014"},{"key":"e_1_3_2_1_19_1","volume-title":"Gemmini: An agile systolic array generator enabling systematic evaluations of deep-learning architectures. arXiv preprint arXiv:1911.09925 3","author":"Genc Hasan","year":"2019","unstructured":"Hasan Genc, Ameer Haj-Ali, Vighnesh Iyer, Alon Amid, Howard Mao, John Wright, Colin Schmidt, Jerry Zhao, Albert Ou, Max Banister, 2019. Gemmini: An agile systolic array generator enabling systematic evaluations of deep-learning architectures. arXiv preprint arXiv:1911.09925 3 (2019), 25."},
{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2002.1003579"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580017"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123942"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/264107.264207"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Norman\u00a0P. Jouppi George Kurian Sheng Li Peter Ma Rahul Nagarajan Lifeng Nai Nishant Patil Suvinay Subramanian Andy Swing Brian Towles Cliff Young Xiang Zhou Zongwei Zhou and David Patterson. 2023. TPU v4: An Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings. arxiv:2304.01433\u00a0[cs.AR]","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575747"},{"key":"e_1_3_2_1_27_1","volume-title":"FRAME: Fast Roofline Analytical Modeling and Estimation. https:\/\/github.com\/maestro-project\/frame","author":"Kao Sheng-Chun","year":"2022","unstructured":"Sheng-Chun Kao, Suvinay Subramanian, Abhimanyu Bambhaniya, and Tushar Krishna. 2022. FRAME: Fast Roofline Analytical Modeling and Estimation. https:\/\/github.com\/maestro-project\/frame"},
{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.24"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2007.70816"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001178"},{"key":"e_1_3_2_1_31_1","volume-title":"Full stack optimization of transformer inference: a survey. arXiv preprint arXiv:2302.14017","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Coleman Hooper, Thanakul Wattanawong, Minwoo Kang, Ruohan Yan, Hasan Genc, Grace Dinh, Qijing Huang, Kurt Keutzer, Michael\u00a0W Mahoney, 2023. Full stack optimization of transformer inference: a survey. arXiv preprint arXiv:2302.14017 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Maestro: A data-centric approach to understand reuse, performance, and hardware cost of dnn mappings","author":"Kwon Hyoukjun","year":"2020","unstructured":"Hyoukjun Kwon, Prasanth Chatarasi, Vivek Sarkar, Tushar Krishna, Michael Pellauer, and Angshuman Parashar. 2020. Maestro: A data-centric approach to understand reuse, performance, and hardware cost of dnn mappings. IEEE micro 40, 3 (2020), 20\u201329."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/339647.339669"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/379240.379259"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875654"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2008.4771793"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567961"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2717764.2717783"},{"key":"e_1_3_2_1_39_1","unstructured":"MindSpore. 2023. Parallel Distributed Training Example (Ascend). https:\/\/www.mindspore.cn\/docs\/programming_guide\/ en\/r1.5\/distributed_training_ascend.html"},
{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3530909"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC49654.2021.9622867"},{"key":"e_1_3_2_1_44_1","volume-title":"Xla: Compiling machine learning for peak performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne. 2020. Xla: Compiling machine learning for peak performance. (2020)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460227"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2678373.2665689"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783751"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/1150019.1136508"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00051"},{"key":"e_1_3_2_1_50_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv:2307.09288\u00a0[cs.CL]"},
{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080244"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2009.4798239"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3424669"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830807"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00042"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2022.3170848"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656592","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656592","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:24:55Z","timestamp":1755876295000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656592"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":56,"alternative-id":["10.1145\/3650200.3656592","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656592","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}