{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T15:01:43Z","timestamp":1773414103849,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T00:00:00Z","timestamp":1723420800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,12]]},"DOI":"10.1145\/3673038.3673053","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T18:29:01Z","timestamp":1723141741000},"page":"752-761","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["IMI: In-memory Multi-job Inference Acceleration for Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5009-3514","authenticated-orcid":false,"given":"Bin","family":"Gao","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7139-724X","authenticated-orcid":false,"given":"Zhehui","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of High-Performance Computing, A*STAR, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3477-0607","authenticated-orcid":false,"given":"Zhuomin","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3415-3676","authenticated-orcid":false,"given":"Tao","family":"Luo","sequence":"additional","affiliation":[{"name":"Institute of High-Performance Computing, A*STAR, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4281-2053","authenticated-orcid":false,"given":"Weng-Fai","family":"Wong","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0307-266X","authenticated-orcid":false,"given":"Zhi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Panther: A programmable architecture for neural network training harnessing energy-efficient reram","author":"Ankit Aayush","year":"2020","unstructured":"Aayush Ankit, Izzat El\u00a0Hajj, Sai\u00a0Rahul Chalamalasetti, Sapan Agarwal, Matthew Marinella, Martin Foltin, John\u00a0Paul Strachan, Dejan Milojicic, Wen-Mei Hwu, and Kaushik Roy. 2020. Panther: A programmable architecture for neural network training harnessing energy-efficient reram. IEEE Trans. Comput. (2020)."},{"key":"e_1_3_2_1_2_1","volume-title":"Model tells you what to discard: Adaptive kv cache compression for llms. arXiv preprint arXiv:2310.01801","author":"Ge Suyu","year":"2023","unstructured":"Suyu Ge, Yunan Zhang, Liyuan Liu, Minjia Zhang, Jiawei Han, and Jianfeng Gao. 2023. Model tells you what to discard: Adaptive kv cache compression for llms. arXiv preprint arXiv:2310.01801 (2023)."},{"key":"e_1_3_2_1_3_1","first-page":"15908","article-title":"Transformer in transformer","volume":"34","author":"Han Kai","year":"2021","unstructured":"Kai Han, An Xiao, Enhua Wu, Jianyuan Guo, Chunjing Xu, and Yunhe Wang. 2021. Transformer in transformer. Advances in Neural Information Processing Systems 34 (2021), 15908\u201315919.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","volume-title":"Newton: A DRAM-maker\u2019s accelerator-in-memory (AiM) architecture for machine learning","author":"He Mingxuan","year":"2020","unstructured":"Mingxuan He, Choungki Song, Ilkon Kim, Chunseok Jeong, Seho Kim, Il Park, Mithuna Thottethodi, and TN Vijaykumar. 2020. Newton: A DRAM-maker\u2019s accelerator-in-memory (AiM) architecture for machine learning. In IEEE\/ACM MICRO."},{"key":"e_1_3_2_1_5_1","unstructured":"[5] Intel. 2024. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/memory-performance-in-a-nutshell.html."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_7_1","volume-title":"EDA for IC implementation, circuit design, and process technology","author":"Lavagno Luciano","unstructured":"Luciano Lavagno, Louis Scheffer, and Grant Martin. 2018. EDA for IC implementation, circuit design, and process technology. CRC press."},{"key":"e_1_3_2_1_8_1","volume-title":"A system-level simulator for RRAM-based neuromorphic computing chips. ACM TACO","author":"Kay\u00a0Fei Lee Matthew","year":"2019","unstructured":"Matthew Kay\u00a0Fei Lee, Yingnan Cui, Thannirmalai Somu, Tao Luo, Jun Zhou, Wai\u00a0Teng Tang, Weng-Fai Wong, and Rick Siow\u00a0Mong Goh. 2019. A system-level simulator for RRAM-based neuromorphic computing chips. ACM TACO (2019)."},{"key":"e_1_3_2_1_9_1","volume-title":"Hardware architecture and software stack for PIM based on commercial DRAM technology: Industrial product","author":"Lee Sukhan","unstructured":"Sukhan Lee, Shin-haeng Kang, Jaehoon Lee, Hyeonsu Kim, Eojin Lee, Seungwoo Seo, Hosang Yoon, Seungwon Lee, Kyounghwan Lim, Hyunsung Shin, 2021. Hardware architecture and software stack for PIM based on commercial DRAM technology: Industrial product. In ACM\/IEEE ISCA."},{"key":"e_1_3_2_1_10_1","volume-title":"Hybrid memory buffer microarchitecture for high-radix routers","author":"Li Cunlu","year":"2021","unstructured":"Cunlu Li, Dezun Dong, Xiangke Liao, and John Kim. 2021. Hybrid memory buffer microarchitecture for high-radix routers. IEEE Trans. Comput. (2021)."},{"key":"e_1_3_2_1_11_1","unstructured":"Cong Li Zhe Zhou Size Zheng Jiaxi Zhang Yun Liang and Guangyu Sun. 2024. SpecPIM: Accelerating Speculative Inference on PIM-Enabled System via Architecture-Dataflow Co-Exploration. In ACM ASPLOS."},{"key":"e_1_3_2_1_12_1","volume-title":"Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. 2023. Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:2306.00978 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Bit-transformer: Transforming bit-level sparsity into higher preformance in reram-based accelerator","author":"Liu Fangxin","year":"2021","unstructured":"Fangxin Liu, Wenbo Zhao, Zhezhi He, Zongwu Wang, Yilong Zhao, Yongbiao Chen, and Li Jiang. 2021. Bit-transformer: Transforming bit-level sparsity into higher preformance in reram-based accelerator. In IEEE\/ACM ICCAD."},{"key":"e_1_3_2_1_14_1","volume-title":"Scissorhands: Exploiting the persistence of importance hypothesis for llm kv cache compression at test time. Advances in Neural Information Processing Systems","author":"Liu Zichang","year":"2024","unstructured":"Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava. 2024. Scissorhands: Exploiting the persistence of importance hypothesis for llm kv cache compression at test time. Advances in Neural Information Processing Systems (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Wai\u00a0Teng Tang, Weng-Fai Wong, and Rick Siow\u00a0Mong Goh.","author":"Luo Tao","year":"2018","unstructured":"Tao Luo, Xuan Wang, Chuping Qu, Matthew Kay\u00a0Fei Lee, Wai\u00a0Teng Tang, Weng-Fai Wong, and Rick Siow\u00a0Mong Goh. 2018. An FPGA-based hardware emulator for neuromorphic chip with RRAM. IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"A survey of techniques for modeling and improving reliability of computing systems","author":"Mittal Sparsh","year":"2015","unstructured":"Sparsh Mittal and Jeffrey\u00a0S Vetter. 2015. A survey of techniques for modeling and improving reliability of computing systems. IEEE TPDS (2015)."},{"key":"e_1_3_2_1_17_1","unstructured":"[17] OpenAI. 2024. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_1_18_1","unstructured":"philschmid. [n. d.]. ShareGPT Raw. https:\/\/huggingface.co\/datasets\/philschmid\/sharegpt-raw\/tree\/main\/sharegpt_90k_raw_dataset."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001139"},{"key":"e_1_3_2_1_20_1","volume-title":"Pipelayer: A pipelined reram-based accelerator for deep learning","author":"Song Linghao","year":"2017","unstructured":"Linghao Song, Xuehai Qian, Hai Li, and Yiran Chen. 2017. Pipelayer: A pipelined reram-based accelerator for deep learning. In IEEE HPCA."},{"key":"e_1_3_2_1_21_1","volume-title":"X-former: In-memory acceleration of transformers","author":"Sridharan Shrihari","year":"2023","unstructured":"Shrihari Sridharan, Jacob\u00a0R Stevens, Kaushik Roy, and Anand Raghunathan. 2023. X-former: In-memory acceleration of transformers. IEEE VLSI (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00339-016-9841-0"},{"key":"e_1_3_2_1_23_1","volume-title":"Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca.","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori\u00a0B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_24_1","volume-title":"Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805","author":"Team Gemini","year":"2023","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Yonghui Wu, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew\u00a0M Dai, Anja Hauth, 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Towards Efficient and Reliable LLM Serving: A Real-World Workload Study. arXiv preprint arXiv:2401.17644","author":"Wang Yuxin","year":"2024","unstructured":"Yuxin Wang, Yuhan Chen, Zeyu Li, Zhenheng Tang, Rui Guo, Xin Wang, Qiang Wang, Amelie\u00a0Chi Zhou, and Xiaowen Chu. 2024. Towards Efficient and Reliable LLM Serving: A Real-World Workload Study. arXiv preprint arXiv:2401.17644 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"ReNEW: Enhancing lifetime for ReRAM crossbar based neural network accelerators","author":"Wen Wen","unstructured":"Wen Wen, Youtao Zhang, and Jun Yang. 2019. ReNEW: Enhancing lifetime for ReRAM crossbar based neural network accelerators. In IEEE ICCD."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Xiaoxuan Yang Bonan Yan Hai Li and Yiran Chen. 2020. ReTransformer: ReRAM-based processing-in-memory architecture for transformer acceleration. In ICCAD.","DOI":"10.1145\/3400302.3415640"},{"key":"e_1_3_2_1_31_1","volume-title":"ChunkAttention: Efficient Self-Attention with Prefix-Aware KV Cache and Two-Phase Partition. arXiv preprint arXiv:2402.15220","author":"Ye Lu","year":"2024","unstructured":"Lu Ye, Ze Tao, Yong Huang, and Yang Li. 2024. ChunkAttention: Efficient Self-Attention with Prefix-Aware KV Cache and Two-Phase Partition. arXiv preprint arXiv:2402.15220 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Orca: A distributed serving system for Transformer-Based generative models. In USENIX OSDI.","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for Transformer-Based generative models. In USENIX OSDI."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001192"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218590"},{"key":"e_1_3_2_1_35_1","volume-title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv preprint arXiv:2401.09670","author":"Zhong Yinmin","year":"2024","unstructured":"Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao Zhang. 2024. DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving. arXiv preprint arXiv:2401.09670 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Transpim: A memory-based acceleration via software-hardware co-design for transformer","author":"Zhou Minxuan","year":"2022","unstructured":"Minxuan Zhou, Weihong Xu, Jaeyoung Kang, and Tajana Rosing. 2022. Transpim: A memory-based acceleration via software-hardware co-design for transformer. In IEEE HPCA."}],"event":{"name":"ICPP '24: the 53rd International Conference on Parallel Processing","location":"Gotland Sweden","acronym":"ICPP '24"},"container-title":["Proceedings of the 53rd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673053","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3673038.3673053","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T17:31:14Z","timestamp":1758648674000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3673038.3673053"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,12]]},"references-count":36,"alternative-id":["10.1145\/3673038.3673053","10.1145\/3673038"],"URL":"https:\/\/doi.org\/10.1145\/3673038.3673053","relation":{},"subject":[],"published":{"date-parts":[[2024,8,12]]},"assertion":[{"value":"2024-08-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}