{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T07:14:24Z","timestamp":1772694864466,"version":"3.50.1"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408592","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-21","source":"Crossref","is-referenced-by-count":0,"title":["PIMphony: Overcoming Bandwidth and Capacity Inefficiency in PIM-Based Long-Context LLM Inference System"],"prefix":"10.1109","author":[{"given":"Hyucksung","family":"Kwon","sequence":"first","affiliation":[{"name":"Hanyang University,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kyungmo","family":"Koo","sequence":"additional","affiliation":[{"name":"Hanyang University,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Janghyeon","family":"Kim","sequence":"additional","affiliation":[{"name":"Hanyang University,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Woongkyu","family":"Lee","sequence":"additional","affiliation":[{"name":"Hanyang University,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minjae","family":"Lee","sequence":"additional","affiliation":[{"name":"Hanyang University,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gyeonggeun","family":"Jung","sequence":"additional","affiliation":[{"name":"KAIST,Daejeon,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hyungdeok","family":"Lee","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yousub","family":"Jung","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jaehan","family":"Park","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yosub","family":"Song","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Byeongsu","family":"Yang","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haerang","family":"Choi","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guhyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jongsoon","family":"Won","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Woojae","family":"Shin","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changhyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gyeongcheol","family":"Shin","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongkee","family":"Kwon","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ilkon","family":"Kim","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Euicheol","family":"Lim","sequence":"additional","affiliation":[{"name":"Solution Advanced Technology, SK hynix,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"John","family":"Kim","sequence":"additional","affiliation":[{"name":"KAIST,Daejeon,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jungwook","family":"Choi","sequence":"additional","affiliation":[{"name":"Hanyang University,Seoul,Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"NVIDIA TensorRT-LLM"},{"key":"ref2","volume-title":"SGLang"},{"key":"ref3","volume-title":"vLLM"},{"key":"ref4","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints","author":"Ainslie","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.memori.2022.100022"},{"key":"ref6","volume-title":"Qwen technical report","author":"Bai","year":"2023"},{"key":"ref7","volume-title":"Longbench: A bilingual, multitask benchmark for long context understanding","author":"Bai","year":"2024"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3085572"},{"key":"ref9","article-title":"Claude 3.5 sonnet","year":"2024","journal-title":"Claude"},{"key":"ref10","volume-title":"Gemini 2.5: Pushing the frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities","author":"Comanici","year":"2025"},{"key":"ref11","article-title":"Flash-decoding for longcontext inference","author":"Dao","year":"2023","journal-title":"Online"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3622781.3674180"},{"key":"ref13","volume-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3489048.3522661"},{"key":"ref15","volume-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716267"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3174101"},{"key":"ref18","volume-title":"Fastdecode: High-throughput gpu-efficient 11 m serving using heterogeneous pipelines","author":"He","year":"2024"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00040"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716009"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651380"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/micro56248.2022.00051"},{"key":"ref23","article-title":"Gpipe: Easy scaling with microbatch pipeline parallelism","volume-title":"proceeding of Computer Science Computer Vision and Pattern Recognition","author":"Huang","year":"2019"},{"key":"ref24","article-title":"Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology","author":"Hyun","year":"2023","journal-title":"arXiv"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2024.3410842"},{"key":"ref26","volume-title":"Mistral 7b","author":"Jiang","year":"2023"},{"key":"ref27","doi-asserted-by":"crossref","first-page":"815","DOI":"10.1145\/3613424.3614314","article-title":"Aespa: Asynchronous execution scheme to exploit bank-level parallelism of processing-in-memory","volume-title":"Proceedings of the 56th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Kal","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2024.3375352"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2025.3571857"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/HCS61935.2024.10664793"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3164651"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254711"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2022.3200718"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/HCS59251.2023.10254717"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895629"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895629"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42613.2021.9365862"},{"key":"ref39","volume-title":"Pim-mmu: A memory management unit for accelerating data transfers in commercial pim systems","author":"Lee","year":"2024"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42614.2022.9731711"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640376"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651352"},{"key":"ref45","volume-title":"How long can open-source 11 ms truly promise on context length?","author":"Li","year":"2023"},{"key":"ref46","volume-title":"Accllm: Accelerating long-context llm inference via algorithm-hardware co-design","author":"Liang","year":"2025"},{"key":"ref47","article-title":"Repobench: Benchmarking repositorylevel code auto-completion systems","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref48","article-title":"The llama 4 herd: The beginning of a new era of natively multimodal ai innovation","year":"2025","journal-title":"released under the Llama 4 Community License Agreement"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731051"},{"key":"ref50","article-title":"Introducing mpt-7b: A new standard for open-source, commercially usable llms","year":"2023","journal-title":"MosaicML"},{"key":"ref51","article-title":"A Modern Primer on Processing in Memory","author":"Mutlu","year":"2020","journal-title":"arXiv"},{"key":"ref52","article-title":"Openai o1 system card","year":"2024","journal-title":"OpenAI"},{"key":"ref53","volume-title":"Gpt-4 technical report","author":"Achiam","year":"2024"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640422"},{"key":"ref55","volume-title":"Splitwise: Efficient generative 11 m inference using phase splitting","author":"Patel","year":"2024"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00079"},{"key":"ref57","article-title":"Mind the memory gap: Unveiling gpu bottlenecks in large-batch 1lm inference","author":"Recasens","year":"2025","journal-title":"arXiv preprint"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2025.3550414"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651324"},{"key":"ref60","article-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv"},{"key":"ref61","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv preprint"},{"key":"ref62","article-title":"Aimx platform white paper","year":"2023","journal-title":"SK hynix, Memory Solution Product Design (MSPD) Division"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00116"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3446804.3446845"},{"key":"ref65","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref66","volume-title":"Qwen3-max: Just scale it","author":"Team","year":"2025"},{"key":"ref67","article-title":"Iree","year":"2019","journal-title":"an MLIR-based compiler and runtime for ML models from multiple frameworks"},{"key":"ref68","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD56317.2022.00062"},{"key":"ref70","volume-title":"Qwen3 technical report","author":"Yang","year":"2025"},{"key":"ref71","article-title":"Flashinfer: Efficient and customizable attention engine for llm inference serving","author":"Ye","year":"2025","journal-title":"arXiv preprint"},{"key":"ref72","first-page":"521","article-title":"Orca: A distributed serving system for Transformer-Based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu"},{"key":"ref73","volume-title":"Lv-eval: A balanced long-context benchmark with 5 length levels up to 256k","author":"Yuan","year":"2024"},{"key":"ref74","first-page":"5905","article-title":"QMSum: A new benchmark for query-based multi-domain meeting summarization","volume-title":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Zhong"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00082"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408592.pdf?arnumber=11408592","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:36:08Z","timestamp":1772692568000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408592\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408592","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}