{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T15:46:02Z","timestamp":1782920762159,"version":"3.54.5"},"reference-count":103,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23A6007,U24A20234"],"award-info":[{"award-number":["U23A6007,U24A20234"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,1,31]]},"DOI":"10.1109\/hpca68181.2026.11408539","type":"proceedings-article","created":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T20:47:22Z","timestamp":1772657242000},"page":"1-15","source":"Crossref","is-referenced-by-count":1,"title":["AUM: Unleashing the Efficiency Potential of Shared Processors with Accelerator Units for LLM Serving"],"prefix":"10.1109","author":[{"given":"Xinkai","family":"Wang","sequence":"first","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yiming","family":"Zhuansun","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinyang","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaofeng","family":"Hou","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jing","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Luping","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weigao","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cheng","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guodong","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liping","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Sparamx: Accelerating compressed llms token generation on amx-powered cpus","author":"AbouElhamayed","year":"2025"},{"key":"ref2","article-title":"The risc-v instruction set manual: Volume i,\u201d Chapter 9. \u201cM","volume-title":"Standard Extension for Integer Multiplication and Division, Version 2.0","author":"Andrew waterman","year":"2022"},{"key":"ref3","volume-title":"The concern around gpu shortages and how these could impact the ai revolution","author":"Arik","year":"2023"},{"key":"ref4","article-title":"The c1-sme2 unit","volume-title":"ARM","year":"2025"},{"key":"ref5","article-title":"Sme and sme2","volume-title":"ARM","year":"2025"},{"key":"ref6","volume-title":"Longbench: A bilingual, multitask benchmark for long context understanding","author":"Bai","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-01761-2"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3185768.3185771"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3731569.3764843"},{"key":"ref10","volume-title":"Evaluating large language models trained on code","author":"Chen","year":"2021"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304005"},{"key":"ref12","article-title":"The ai code editor","volume-title":"Cursor","year":"2025"},{"key":"ref13","volume-title":"How aws and intel make 11 ms more accessible and cost-effective with deepseek","author":"Dylan Souvage","year":"2025"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/hpca.2007.346201"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CloudCom.2015.60"},{"key":"ref16","first-page":"745","article-title":"Fair scheduling for avx2 and avx-512 workloads","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC)","author":"Gottschlag","year":"2021"},{"key":"ref17","first-page":"955","article-title":"Translation leak-aside buffer: Defeating cache side-channel protections with TLB attacks","volume-title":"27th USENIX Security Symposium (USENIX Security 18)","author":"Gras"},{"key":"ref18","volume-title":"Systems Performance: Enterprise and the Cloud","author":"Gregg","year":"2013"},{"key":"ref19","volume-title":"perf examples","author":"Gregg","year":"2024"},{"key":"ref20","volume-title":"Deepfm: A factorizationmachine based neural network for ctr prediction","author":"Guo","year":"2017"},{"key":"ref21","volume-title":"Inference performance optimization for large language models on cpus","author":"He","year":"2024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/sc.2016.83"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475437"},{"key":"ref24","article-title":"Intel\u00ae 64 and ia-32 architectures software developer\u2019s manual","year":"2022","journal-title":"Volume 1 (3 A, 3 B, 3 C&3D): Basic Architecture, Chapter 18: Programming With Intel\u00ae Advanced Matrix Extensions"},{"key":"ref25","article-title":"Intel\u00ae architecture instruction set extensions and future features 64 and ia-32 architectures software developer\u2019s manual","year":"2024","journal-title":"Chapter 3: Intel\u00ae AMX INSTRUCTION SET REFERENCE"},{"key":"ref26","article-title":"Intel\u00ae rdt software package","volume-title":"Intel","year":"2024"},{"key":"ref27","article-title":"Intel\u00ae resource director technology framework","volume-title":"Intel","year":"2024"},{"key":"ref28","article-title":"Intel unveils future-generation xeon with robust performance and efficiency architectures","volume-title":"Intel","year":"2024"},{"key":"ref29","article-title":"oneapi deep neural network library (onednn)","volume-title":"Intel","year":"2024"},{"key":"ref30","article-title":"Overview of accelerating ai inference and llm applications with cpus","year":"2025","journal-title":"Intel"},{"key":"ref31","article-title":"Support for next generation intel xeon scalable processors","volume-title":"Intel","year":"2025"},{"key":"ref32","article-title":"Unlock your ai potential with a winning combination","volume-title":"Intel","year":"2025"},{"key":"ref33","article-title":"xfastertransformer","volume-title":"Intel","year":"2025"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2015.2417545"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750392"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731092"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/lca.2024.3397747"},{"key":"ref38","volume-title":"Intel pmu profiling tools","author":"Kleen","year":"2024"},{"key":"ref39","volume-title":"sysbench","author":"Kopytov","year":"2025"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640401"},{"key":"ref41","article-title":"Qwen 3 + ktransformers 0.3 (+amx)= ai workstation\/pc","volume-title":"kvcache ai","year":"2025"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref43","article-title":"Summarize text","volume-title":"LangChain","year":"2025"},{"key":"ref44","volume-title":"Ecoserve: Designing carbon-aware ai inference systems","author":"Li","year":"2025"},{"key":"ref45","article-title":"turbostat - report processor frequency and idle statistics","volume-title":"Linux","year":"2024"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507752"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749475"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587437"},{"key":"ref49","first-page":"57","article-title":"Harvesting memorybound cpu stall cycles in software with msh","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Luo","year":"2024"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/hpca.2019.00024"},{"key":"ref51","article-title":"Introducing llama 3.2","volume-title":"Meta","year":"2024"},{"key":"ref52","article-title":"Our next-generation meta training and inference accelerator","volume-title":"Meta","year":"2024"},{"key":"ref53","article-title":"Faiss","volume-title":"Meta","year":"2025"},{"key":"ref54","article-title":"Introducing the new bing. the ai-powered assistant for your search","volume-title":"Microsoft","year":"2024"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/2788396"},{"key":"ref56","article-title":"Flexinfer: Flexible LLM inference with CPU computations","volume-title":"Eighth Conference on Machine Learning and Systems (MLSYS)","author":"Na","year":"2025"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/iiswc63097.2024.00024"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750382"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/isscc42614.2022.9731107"},{"key":"ref60","article-title":"Triton inference server","volume-title":"Nvidia","year":"2017"},{"key":"ref61","article-title":"Nvidia h100 tensor core gpu","volume-title":"Nvidia","year":"2024"},{"key":"ref62","article-title":"Gpt-4","volume-title":"OpenAI","year":"2024"},{"key":"ref63","article-title":"Api pricing","volume-title":"OpenAI","year":"2025"},{"key":"ref64","article-title":"Sharegpt","volume-title":"OpenAI","year":"2025"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1049\/iet-cdt.2014.0074"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651329"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/isca59077.2024.00019"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3695053.3731028"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3502181.3531464"},{"key":"ref70","volume-title":"Intel\u00ae avx-512 instructions","author":"Reinders","year":"2017"},{"key":"ref71","article-title":"riscv-v-spec","volume-title":"RISC-V","year":"2025"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/VLSID.2014.42"},{"key":"ref73","article-title":"Flexgen: high-throughput generative inference of large language models with a single gpu","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Sheng","year":"2023"},{"key":"ref74","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2020"},{"key":"ref75","article-title":"Specjbb 2015","volume-title":"SPEC","year":"2024"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/mm.2017.35"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00040"},{"key":"ref78","first-page":"1266","volume-title":"TAPAS: Thermal- and Power-Aware Scheduling for LLM Inference in Cloud Platforms","author":"Stojkovic","year":"2025"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/RTSS62706.2024.00032"},{"key":"ref80","article-title":"Ieit systems launches cpu inference servers to accelerate enterprise ai adoption","volume-title":"I. SYSTEMS","year":"2025"},{"key":"ref81","first-page":"3165","article-title":"Secsmt: Securing smt processors against contention-based covert channels","volume-title":"31st USENIX Security Symposium (USENIX Security 22)","author":"Taram"},{"key":"ref82","year":"2024","journal-title":"Gemma 2: Improving open language models at a practical size"},{"key":"ref83","volume-title":"Phi-3 technical report: A highly capable language model locally on your phone","year":"2024"},{"key":"ref84","volume-title":"Qwen3","year":"2025"},{"key":"ref85","volume-title":"Perf pmu events on sapphirerapids","author":"Torvalds","year":"2023"},{"key":"ref86","volume-title":"Tpc-h version 2 and version 3","year":"2024"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/hpca.2019.00026"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2014.6853218"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/3620678.3624650"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1145\/3676641.3716283"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD63220.2024.00082"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS53621.2022.00030"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-95-1021-4_17"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456225"},{"key":"ref95","first-page":"945","article-title":"Mlaas in the wild: Workload analysis and scheduling in large-scale heterogeneous gpu clusters","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Weng"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00011"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ispass.2014.6844459"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378466"},{"key":"ref99","author":"Zhang","year":"2025","journal-title":"Covomix2: Advancing zero-shot dialogue generation with fully non-autoregressive flow matching"},{"key":"ref100","article-title":"Covomix: advancing zeroshot speech generation for human-like multi-talker conversations","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems","author":"Zhang"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/micro.2014.53"},{"key":"ref102","first-page":"193","article-title":"Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zhong","year":"2024"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00059"}],"event":{"name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","location":"Sydney, Australia","start":{"date-parts":[[2026,1,31]]},"end":{"date-parts":[[2026,2,4]]}},"container-title":["2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11408404\/11408433\/11408539.pdf?arnumber=11408539","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T06:50:45Z","timestamp":1772693445000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11408539\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,31]]},"references-count":103,"URL":"https:\/\/doi.org\/10.1109\/hpca68181.2026.11408539","relation":{},"subject":[],"published":{"date-parts":[[2026,1,31]]}}}