{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T16:50:02Z","timestamp":1774716602457,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731051","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:46:17Z","timestamp":1750437977000},"page":"808-820","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Hybe: GPU-NPU Hybrid System for Efficient LLM Inference with Million-Token Context Window"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5924-7000","authenticated-orcid":false,"given":"Seungjae","family":"Moon","sequence":"first","affiliation":[{"name":"HyperAccel, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4415-8552","authenticated-orcid":false,"given":"Junseo","family":"Cha","sequence":"additional","affiliation":[{"name":"HyperAccel, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4036-7590","authenticated-orcid":false,"given":"Hyunjun","family":"Park","sequence":"additional","affiliation":[{"name":"HyperAccel, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1099-1496","authenticated-orcid":false,"given":"Joo-Young","family":"Kim","sequence":"additional","affiliation":[{"name":"HyperAccel, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527405"},{"key":"e_1_3_3_1_3_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Joshua Ainslie James Lee-Thorp Michiel de Jong Yury Zemlyanskiy Federico Lebr\u00f3n and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13245 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_3_1_5_2","volume-title":"Poster: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Ali Ghazanfar","year":"2020","unstructured":"Ghazanfar Ali, Sridutt Bhalachandra, Nicholas Wright, Alan Sill, and Yong Chen. 2020. Evaluation of power controls and counters on general-purpose Graphics Processing Units (GPUs). In Poster: International Conference for High Performance Computing, Networking, Storage and Analysis."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_3_1_7_2","unstructured":"Rohan Anil Andrew\u00a0M Dai Orhan Firat Melvin Johnson Dmitry Lepikhin Alexandre Passos Siamak Shakeri Emanuel Taropa Paige Bailey Zhifeng Chen et\u00a0al. 2023. Palm 2 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.10403 (2023)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Sid Black Stella Biderman Eric Hallahan Quentin Anthony Leo Gao Laurence Golding Horace He Connor Leahy Kyle McDonell Jason Phang et\u00a0al. 2022. Gpt-neox-20b: An open-source autoregressive language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06745 (2022).","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"e_1_3_3_1_9_2","unstructured":"Shouyuan Chen Sherman Wong Liangjian Chen and Yuandong Tian. 2023. Extending context window of large language models via positional interpolation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.15595 (2023)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Jack Choquette. 2023. Nvidia hopper h100 gpu: Scaling performance. IEEE Micro (2023).","DOI":"10.1109\/MM.2023.3256796"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. NVIDIA A100 Tensor Core GPU: Performance and Innovation. IEEE Micro 41 2 (2021) 29\u201335. 10.1109\/MM.2021.3061394","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_3_1_12_2","unstructured":"Soham De Samuel\u00a0L Smith Anushan Fernando Aleksandar Botev George Cristian-Muraru Albert Gu Ruba Haroun Leonard Berrada Yutian Chen Srivatsan Srinivasan et\u00a0al. 2024. Griffin: Mixing gated linear recurrences with local attention for efficient language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.19427 (2024)."},{"key":"e_1_3_3_1_13_2","unstructured":"Yiran Ding Li\u00a0Lyna Zhang Chengruidong Zhang Yuanyuan Xu Ning Shang Jiahang Xu Fan Yang and Mao Yang. 2024. LongRoPE: Extending LLM Context Window Beyond 2 Million Tokens. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13753 (2024)."},{"key":"e_1_3_3_1_14_2","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et\u00a0al. 2024. The llama 3 herd of models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21783 (2024)."},{"key":"e_1_3_3_1_15_2","unstructured":"Michael Feil. 2024. Synthetic Data Generation for Contexts Up to 1 Million Tokens Using Short-Context Models. Gradient Blog (2024)."},{"key":"e_1_3_3_1_16_2","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.00752 (2023)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651380"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"e_1_3_3_1_19_2","unstructured":"Coleman Hooper Sehoon Kim Hiva Mohammadzadeh Michael\u00a0W Mahoney Yakun\u00a0Sophia Shao Kurt Keutzer and Amir Gholami. 2024. KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.18079 (2024)."},{"key":"e_1_3_3_1_20_2","unstructured":"Albert\u00a0Q Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Emma\u00a0Bou Hanna Florian Bressand et\u00a0al. 2024. Mixtral of experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.04088 (2024)."},{"key":"e_1_3_3_1_21_2","unstructured":"Andreas\u00a0Kosmas Kakolyris Dimosthenis Masouros Petros Vavaroutsos Sotirios Xydis and Dimitrios Soudris. 2024. SLO-aware GPU Frequency Scaling for Energy Efficient LLM Inference Serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.05235 (2024)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","unstructured":"Yoongu Kim Weikun Yang and Onur Mutlu. 2016. Ramulator: A Fast and Extensible DRAM Simulator. IEEE Computer Architecture Letters 15 1 (2016) 45\u201349. 10.1109\/LCA.2015.2414456","DOI":"10.1109\/LCA.2015.2414456"},{"key":"e_1_3_3_1_23_2","unstructured":"Oleksii Kuchaiev Jason Li Huyen Nguyen Oleksii Hrinchuk Ryan Leary Boris Ginsburg Samuel Kriman Stanislav Beliaev Vitaly Lavrukhin Jack Cook et\u00a0al. 2019. Nemo: a toolkit for building ai applications using neural modules. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1909.09577 (2019)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Ang Li Shuaiwen\u00a0Leon Song Jieyang Chen Jiajia Li Xu Liu Nathan\u00a0R Tallent and Kevin\u00a0J Barker. 2019. Evaluating modern gpu interconnect: Pcie nvlink nv-sli nvswitch and gpudirect. IEEE Transactions on Parallel and Distributed Systems 31 1 (2019) 94\u2013110.","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_3_1_26_2","unstructured":"Ji Lin Jiaming Tang Haotian Tang Shang Yang Xingyu Dang and Song Han. 2023. Awq: Activation-aware weight quantization for llm compression and acceleration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.00978 (2023)."},{"key":"e_1_3_3_1_27_2","unstructured":"Hao Liu Matei Zaharia and Pieter Abbeel. 2023. Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.01889 (2023)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Weile Luo Ruibo Fan Zeyu Li Dayou Du Qiang Wang and Xiaowen Chu. 2024. Benchmarking and Dissecting the Nvidia Hopper GPU Architecture. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13499 (2024).","DOI":"10.1109\/IPDPS57955.2024.00064"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Eitan Medina and Eran Dagan. 2020. Habana labs purpose-built ai inference and training processor architectures: Scaling ai training systems using standard ethernet with gaudi processor. IEEE Micro 40 2 (2020) 17\u201324.","DOI":"10.1109\/MM.2020.2975185"},{"key":"e_1_3_3_1_30_2","unstructured":"Tsendsuren Munkhdalai Manaal Faruqui and Siddharth Gopal. 2024. Leave no context behind: Efficient infinite context transformers with infini-attention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.07143 (2024)."},{"key":"e_1_3_3_1_31_2","unstructured":"Gunho Park Baeseong Park Minsub Kim Sungjae Lee Jeonghoon Kim Beomseok Kwon Se\u00a0Jung Kwon Byeongwook Kim Youngjoo Lee and Dongsoo Lee. 2022. Lut-gemm: Quantized matrix multiplication based on luts for efficient inference in large-scale generative language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2206.09557 (2022)."},{"key":"e_1_3_3_1_32_2","unstructured":"Pratyush Patel Esha Choukse Chaojie Zhang \u00cd\u00f1igo Goiri Aashaka Shah Saeed Maleki and Ricardo Bianchini. 2023. Splitwise: Efficient generative llm inference using phase splitting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.18677 (2023)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Bo Peng Eric Alcaide Quentin Anthony Alon Albalak Samuel Arcadinho Stella Biderman Huanqi Cao Xin Cheng Michael Chung Matteo Grella et\u00a0al. 2023. Rwkv: Reinventing rnns for the transformer era. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13048 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.936"},{"key":"e_1_3_3_1_34_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_35_2","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy Lillicrap Jean-baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser et\u00a0al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.05530 (2024)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651324"},{"key":"e_1_3_3_1_37_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_1_38_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_39_2","first-page":"38087","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. 2023. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning. PMLR, 38087\u201338099."},{"key":"e_1_3_3_1_40_2","unstructured":"Daliang Xu Wangsong Yin Xin Jin Ying Zhang Shiyun Wei Mengwei Xu and Xuanzhe Liu. 2023. Llmcad: Fast and scalable on-device large language model inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.04255 (2023)."},{"key":"e_1_3_3_1_41_2","unstructured":"Alex Young Bei Chen Chao Li Chengen Huang Ge Zhang Guanwei Zhang Heng Li Jiangcheng Zhu Jianqun Chen Jing Chang et\u00a0al. 2024. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.04652 (2024)."},{"key":"e_1_3_3_1_42_2","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for { Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"e_1_3_3_1_44_2","unstructured":"Peitian Zhang Zheng Liu Shitao Xiao Ninglu Shao Qiwei Ye and Zhicheng Dou. 2024. Soaring from 4K to 400K: Extending LLM\u2019s Context with Activation Beacon. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.03462 (2024)."},{"key":"e_1_3_3_1_45_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi\u00a0Victoria Lin et\u00a0al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.01068 (2022)."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Youpeng Zhao Di Wu and Jun Wang. 2024. ALISA: Accelerating Large Language Model Inference via Sparsity-Aware KV Caching. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.17312 (2024).","DOI":"10.1109\/ISCA59077.2024.00077"},{"key":"e_1_3_3_1_47_2","unstructured":"Yinmin Zhong Shengyu Liu Junda Chen Jianbo Hu Yibo Zhu Xuanzhe Liu Xin Jin and Hao Zhang. 2024. Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.09670 (2024)."},{"key":"e_1_3_3_1_48_2","unstructured":"Yanqi Zhou Tao Lei Hanxiao Liu Nan Du Yanping Huang Vincent Zhao Andrew\u00a0M Dai Quoc\u00a0V Le James Laudon et\u00a0al. 2022. Mixture-of-experts with expert choice routing. Advances in Neural Information Processing Systems 35 (2022) 7103\u20137114."}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731051","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:06:19Z","timestamp":1750503979000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731051"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":47,"alternative-id":["10.1145\/3695053.3731051","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731051","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}