{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:44:58Z","timestamp":1772725498077,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":34,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819510207","type":"print"},{"value":"9789819510214","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,4]],"date-time":"2025-11-04T00:00:00Z","timestamp":1762214400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,4]],"date-time":"2025-11-04T00:00:00Z","timestamp":1762214400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-1021-4_17","type":"book-chapter","created":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T10:28:57Z","timestamp":1762165737000},"page":"231-245","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["AsymServe: Demystifying and\u00a0Optimizing LLM Serving Efficiency on\u00a0CPU Acceleration Units"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3764-8065","authenticated-orcid":false,"given":"Xinkai","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiming","family":"Zhuansun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6218-4659","authenticated-orcid":false,"given":"Chao","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7260-0521","authenticated-orcid":false,"given":"Jing","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4372-7851","authenticated-orcid":false,"given":"Xiaofeng","family":"Hou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lingyu","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Luping","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,4]]},"reference":[{"key":"17_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint: arXiv:2303.08774 (2023)"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"DeVuyst, M., Venkat, A., Tullsen, D.M.: Execution migration in a heterogeneous-ISA chip multiprocessor. In: Proceedings of the Seventeenth International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 261\u2013272 (2012)","DOI":"10.1145\/2150976.2151004"},{"key":"17_CR3","unstructured":"Google: assistant with bard: a step toward a more personal assistant (2024). https:\/\/bit.ly\/4h3Pti8"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Gottschlag, M., Brantsch, P., Bellosa, F.: Automatic core specialization for AVX-512 applications. In: Proceedings of the 13th ACM International Systems and Storage Conference, pp. 25\u201335 (2020)","DOI":"10.1145\/3383669.3398282"},{"key":"17_CR5","unstructured":"Gottschlag, M., Machauer, P., Khalil, Y., Bellosa, F.: Fair scheduling for AVX2 and AVX-512 workloads. In: 2021 USENIX Annual Technical Conference (USENIX ATC 2021), pp. 745\u2013758 (2021)"},{"key":"17_CR6","unstructured":"Gregg, B.: Perf examples (2024). https:\/\/www.brendangregg.com\/perf.html"},{"key":"17_CR7","unstructured":"He, P., et al.: Inference performance optimization for large language models on CPUs. In: ICML 2024 Workshop on Foundation Models in the Wild (2024)"},{"key":"17_CR8","unstructured":"Iliescu, D.A., Petrogalli, F.: Arm scalable vector extension and application to machine learning. Retrieved October (2018)"},{"key":"17_CR9","unstructured":"Intel: intel\u00ae RDT software package (2024). https:\/\/github.com\/intel\/intel-cmt-cat"},{"key":"17_CR10","unstructured":"Intel: intel unveils future-generation Xeon with robust performance and efficiency architectures (2024). https:\/\/bit.ly\/4gobDeL"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Jouppi, N., et al.: TPU v4: an optically reconfigurable supercomputer for machine learning with hardware support for embeddings. In: Proceedings of the 50th Annual International Symposium on Computer Architecture. ISCA \u201923 (2023)","DOI":"10.1145\/3579371.3589350"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Kanev, S., et al.: Profiling a warehouse-scale computer. In: Proceedings of the 42nd Annual International Symposium on Computer Architecture, pp. 158\u2013169 (2015)","DOI":"10.1145\/2749469.2750392"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Kim, H., Ye, G., Wang, N., Yazdanbakhsh, A., Kim, N.S.: Exploiting intel\u00ae advanced matrix extensions (AMX) for large language model inference. IEEE Comput. Archit. Lett. (2024)","DOI":"10.1109\/LCA.2024.3397747"},{"key":"17_CR14","unstructured":"Kleen, A.: Intel PMU profiling tools (2024). https:\/\/github.com\/andikleen\/pmu-tools"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Kwon, W., et al.: Efficient memory management for large language model serving with pagedAttention. In: Proceedings of the 29th Symposium on Operating Systems Principles, pp. 611\u2013626 (2023)","DOI":"10.1145\/3600006.3613165"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Liu, J., Tang, P., Hou, X., Li, C., Heng, P.A.: LoRAExit: empowering dynamic modulation of llms in resource-limited settings using low-rank adapters. In: Findings of the Association for Computational Linguistics: EMNLP 2024, pp. 9211\u20139225 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.539"},{"key":"17_CR17","unstructured":"Meta: Introducing llama 3.2 (2024). https:\/\/www.llama.com\/"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Microsoft: introducing the new Bing. the AI-powered assistant for your search (2024). https:\/\/bit.ly\/3DHVp26","DOI":"10.1007\/979-8-8688-0419-9_1"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Na, S., Jeong, G., Ahn, B.H., Young, J., Krishna, T., Kim, H.: Understanding performance implications of LLM inference on CPUs. In: 2024 IEEE International Symposium on Workload Characterization (IISWC), pp. 169\u2013180. IEEE (2024)","DOI":"10.1109\/IISWC63097.2024.00024"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Nassif, N., et\u00a0al.: Sapphire rapids: the next-generation intel Xeon scalable processor. In: 2022 IEEE International Solid-State Circuits Conference (ISSCC), vol.\u00a065, pp. 44\u201346. IEEE (2022)","DOI":"10.1109\/ISSCC42614.2022.9731107"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Padoin, E.L., Pilla, L.L., Castro, M., Boito, F.Z., Alexandre\u00a0Navaux, P.O., M\u00e9haut, J.F.: Performance\/energy trade-off in scientific computing: the case of arm big. little and intel sandy bridge. IET Comput. Dig. Tech. 9(1), 27\u201335 (2015)","DOI":"10.1049\/iet-cdt.2014.0074"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Patel, P., et al.: Splitwise: efficient generative LLM inference using phase splitting. In: 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pp. 118\u2013132. IEEE (2024)","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"17_CR23","unstructured":"Reinders, J.R.: Intel\u00ae AVX-512 instructions (2017). https:\/\/bit.ly\/3DbYbfL"},{"key":"17_CR24","unstructured":"Shen, H., Chang, H., Dong, B., Luo, Y., Meng, H.: Efficient LLM inference on CPUs. arXiv preprint: arXiv:2311.00502 (2023)"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Sriraman, A., Dhanotia, A.: Accelerometer: understanding acceleration opportunities for data center overheads at hyperscale. In: Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 733\u2013750 (2020)","DOI":"10.1145\/3373376.3378450"},{"issue":"3","key":"17_CR26","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1145\/2678373.2665692","volume":"42","author":"A Venkat","year":"2014","unstructured":"Venkat, A., Tullsen, D.M.: Harnessing ISA diversity: design of a heterogeneous-ISA chip multiprocessor. ACM SIGARCH Comput. Architect. News 42(3), 121\u2013132 (2014)","journal-title":"ACM SIGARCH Comput. Architect. News"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Not all resources are visible: exploiting fragmented shadow resources in shared-state scheduler architecture. In: Proceedings of the 2023 ACM Symposium on Cloud Computing (SoCC), pp. 109\u2013124 (2023)","DOI":"10.1145\/3620678.3624650"},{"key":"17_CR28","doi-asserted-by":"publisher","unstructured":"Wang, X., et al.: Exist: enabling extremely efficient intra-service tracing observability in datacenters. In: Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), Vol. 2, pp. 355\u2013372 (2025). https:\/\/doi.org\/10.1145\/3676641.3716283","DOI":"10.1145\/3676641.3716283"},{"key":"17_CR29","doi-asserted-by":"publisher","unstructured":"Wang, X., Li, C., Sun, L., Lyu, Q., Hou, X., Leng, J., Guo, M.: SHEEO: continuous energy efficiency optimization in autonomous embedded systems. In: 2024 IEEE 42nd International Conference on Computer Design (ICCD), pp. 496\u2013503 (2024). https:\/\/doi.org\/10.1109\/ICCD63220.2024.00082","DOI":"10.1109\/ICCD63220.2024.00082"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Yasin, A.: A top-down method for performance analysis and counters architecture. In: 2014 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 35\u201344. IEEE (2014)","DOI":"10.1109\/ISPASS.2014.6844459"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Yuan, Y., et\u00a0al.: Intel accelerators ecosystem: an SOC-oriented perspective: industry product. In: 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pp. 848\u2013862. IEEE (2024)","DOI":"10.1109\/ISCA59077.2024.00066"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Zeng, S., et\u00a0al.: FlightLLM: efficient large language model inference with a complete mapping flow on FPGAs. In: Proceedings of the 2024 ACM\/SIGDA International Symposium on Field Programmable Gate Arrays, pp. 223\u2013234 (2024)","DOI":"10.1145\/3626202.3637562"},{"key":"17_CR33","unstructured":"Zhong, Y., et al.: DistServe: disaggregating prefill and decoding for goodput-optimized large language model serving. In: 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pp. 193\u2013210 (2024)"},{"key":"17_CR34","unstructured":"Zhou, Z., et\u00a0al.: A survey on efficient inference for large language models. arXiv preprint: arXiv:2404.14294 (2024)"}],"container-title":["Lecture Notes in Computer Science","Advanced Parallel Processing Technologies"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-1021-4_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T10:29:12Z","timestamp":1762165752000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-1021-4_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,4]]},"ISBN":["9789819510207","9789819510214"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-1021-4_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,4]]},"assertion":[{"value":"4 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APPT","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Advanced Parallel Processing Technologies","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 July 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 July 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"appt2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.appt-conference.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}