{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:12Z","timestamp":1755870012396,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","funder":[{"name":"Ministerio de Ciencia, Innovaci\u00f3n y Universidades \/ Agencia Estatal de Investigaci\u00f3n","award":["PID2022-139664NB-I00"],"award-info":[{"award-number":["PID2022-139664NB-I00"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,8]]},"DOI":"10.1145\/3721145.3725751","type":"proceedings-article","created":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:57:17Z","timestamp":1755867437000},"page":"764-775","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Server Consolidation through a balanced mix of Transformer-based and Conventional Applications"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1262-1256","authenticated-orcid":false,"given":"Pablo","family":"Abad","sequence":"first","affiliation":[{"name":"Computer Engineering Group, Universidad de Cantabria, Santander, Spain,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5818-1188","authenticated-orcid":false,"given":"Pablo","family":"Prieto","sequence":"additional","affiliation":[{"name":"Computer Engineering Group, Universidad de Cantabria, Santander, Spain,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6904-3282","authenticated-orcid":false,"given":"Valentin","family":"Puente","sequence":"additional","affiliation":[{"name":"Computer Engineering Group, Universidad de Cantabria, Santander, Spain,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2214-303X","authenticated-orcid":false,"given":"Jose Angel","family":"Gregorio","sequence":"additional","affiliation":[{"name":"Computer Engineering Group, Universidad de Cantabria, Santander, Spain,"}]}],"member":"320","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"e_1_3_3_1_1_2","first-page":"6000","article-title":"Attention is all you need","author":"Vaswani A.","year":"2017","unstructured":"[1] A. Vaswani et al., \u201cAttention is all you need,\u201d in Advances in Neural Information Processing Systems, 2017, pp. 6000\u20136010.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.2005.14165"},{"key":"e_1_3_3_1_3_2","volume-title":"33rd International Conference on Machine Learning, ICML 2016","volume":"1","author":"Amodei D.","year":"2016","unstructured":"D. Amodei et al., \u201cDeep speech 2: End-to-end speech recognition in English and Mandarin,\u201d in 33rd International Conference on Machine Learning, ICML 2016, 2016, vol. 1."},{"key":"e_1_3_3_1_4_2","volume-title":"Large Language Model Inference Acceleration: A Comprehensive Hardware Perspective","author":"Li J.","year":"2024","unstructured":"J. Li et al., \u201cLarge Language Model Inference Acceleration: A Comprehensive Hardware Perspective.\u201d 2024."},{"key":"e_1_3_3_1_5_2","volume-title":"Intel White Paper","author":"Rodriguez A.","year":"2018","unstructured":"A. Rodriguez, E. Segal, E. Meiri, E. Fomenko, Y. J. Kim, and H. Shen, \u201cLower Numerical Precision Deep Learning Inference and Training,\u201d Intel White Paper, 2018."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ARITH.2019.00019"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749475"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304005"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541941"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-024-01013-5"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3071762"},{"key":"e_1_3_3_1_12_2","volume-title":"Deep Learning Inference in Facebook Data Centers: Characterization, Performance Optimizations and Hardware Implications","author":"Park J.","year":"2018","unstructured":"J. Park et al., \u201cDeep Learning Inference in Facebook Data Centers: Characterization, Performance Optimizations and Hardware Implications.\u201d 2018."},{"key":"e_1_3_3_1_13_2","first-page":"729","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Hu Q.","year":"2024","unstructured":"Q. Hu et al., \u201cCharacterization of large language model development in the datacenter,\u201d in 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), 2024, pp. 709\u2013729."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC47752.2019.9042047"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC63097.2024.00024"},{"key":"e_1_3_3_1_16_2","volume-title":"Efficient LLM Inference on CPUs","author":"Shen H.","year":"2023","unstructured":"H. Shen, H. Chang, B. Dong, Y. Luo, and H. Meng, \u201cEfficient LLM Inference on CPUs,\u201d arXiv, vol. abs\/2311.0, 2023."},{"key":"e_1_3_3_1_17_2","volume-title":"SPEC CPU 2017","year":"2017","unstructured":"\u201cSPEC CPU 2017,\u201d 2017. https:\/\/www.spec.org\/"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3080702"},{"key":"e_1_3_3_1_19_2","unstructured":"G. Gerganov \u201cLlama.cpp: Inference in pure C\/C++.\u201d"},{"key":"e_1_3_3_1_20_2","unstructured":"G. Gerganov \u201cggml: tensor library for machine learning.\u201d"},{"key":"e_1_3_3_1_21_2","volume-title":"TinyLlama: An Open-Source Small Language Model","author":"Zhang P.","year":"2024","unstructured":"P. Zhang, G. Zeng, T. Wang, and W. Lu, \u201cTinyLlama: An Open-Source Small Language Model,\u201d arXiv, vol. 2401.02385, 2024."},{"key":"e_1_3_3_1_22_2","volume-title":"StarCoder: may the source be with you!","author":"Li R.","year":"2023","unstructured":"R. Li et al., \u201cStarCoder: may the source be with you!,\u201d arXiv, vol. abs\/2305.0, 2023."},{"key":"e_1_3_3_1_23_2","volume-title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","author":"Touvron H.","year":"2023","unstructured":"H. Touvron et al., \u201cLlama 2: Open Foundation and Fine-Tuned Chat Models.\u201d 2023."},{"key":"e_1_3_3_1_24_2","volume-title":"Mistral 7B","author":"Jiang A. Q.","year":"2023","unstructured":"A. Q. Jiang et al., \u201cMistral 7B.\u201d 2023."},{"key":"e_1_3_3_1_25_2","volume-title":"The Llama 3 Herd of Models","author":"Grattafiori A.","year":"2024","unstructured":"A. Grattafiori et al., \u201cThe Llama 3 Herd of Models.\u201d 2024."},{"key":"e_1_3_3_1_26_2","volume-title":"GPT-NeoX-20B: An Open-Source Autoregressive Language Model","author":"Black S.","year":"2022","unstructured":"S. Black et al., \u201cGPT-NeoX-20B: An Open-Source Autoregressive Language Model.\u201d 2022."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3534879.3534882"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1016\/j.jml.2019.104047"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2014.6844459"},{"key":"e_1_3_3_1_30_2","unstructured":"\u201cIntel PMU Profiling Tools Source Code and Documentation.\u201d https:\/\/github.com\/andikleen\/pmu-tools"},{"key":"e_1_3_3_1_31_2","volume-title":"17 International Linux System Technology Conference. Nuremberg","author":"de Melo A. C.","year":"2010","unstructured":"A. C. de Melo, \u201cThe New Linux \u2018perf\u2019 Tools,\u201d 17 International Linux System Technology Conference. Nuremberg, 2010."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337863"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303963"},{"key":"e_1_3_3_1_34_2","volume-title":"Intel\u00ae 64 and IA-32 Architectures Software Developer's Manual","author":"I. Corporation","unstructured":"I. Corporation, \u201cIntel\u00ae 64 and IA-32 Architectures Software Developer's Manual, Volume 3 (3A, 3B & 3C): System Programming Guide,\u201d vol. 3, no. 253665, pp. 1\u20131386, 2013."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/RTAS.2017.15"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317840"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2018.00028"}],"event":{"name":"ICS '25: 2025 International Conference on Supercomputing","location":"Salt Lake City USA","acronym":"ICS '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 39th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721145.3725751","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:00:37Z","timestamp":1755867637000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721145.3725751"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,8]]},"references-count":37,"alternative-id":["10.1145\/3721145.3725751","10.1145\/3721145"],"URL":"https:\/\/doi.org\/10.1145\/3721145.3725751","relation":{},"subject":[],"published":{"date-parts":[[2025,6,8]]},"assertion":[{"value":"2025-08-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}