{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T13:13:54Z","timestamp":1776950034483,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","funder":[{"name":"EU Pilot for exascale EuroHPC EUPEX","award":["101033975"],"award-info":[{"award-number":["101033975"]}]},{"name":"EuroHPC JU SEANERGYS","award":["101177590"],"award-info":[{"award-number":["101177590"]}]},{"name":"DARE","award":["101143421"],"award-info":[{"award-number":["101143421"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,4]]},"DOI":"10.1145\/3777884.3797011","type":"proceedings-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:27:26Z","timestamp":1776947246000},"page":"83-95","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SweetSpot: An Analytical Model for Predicting Energy Efficiency of LLM Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2768-0418","authenticated-orcid":false,"given":"Hiari","family":"Pizzini Cavagna","sequence":"first","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6367-762X","authenticated-orcid":false,"given":"Andrea","family":"Proia","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1289-6354","authenticated-orcid":false,"given":"Giacomo","family":"Madella","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9595-2962","authenticated-orcid":false,"given":"Giovanni Battista","family":"Esposito","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1125-0588","authenticated-orcid":false,"given":"Francesco","family":"Antici","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1294-372X","authenticated-orcid":false,"given":"Daniele","family":"Cesarini","sequence":"additional","affiliation":[{"name":"Cineca, Casalecchio di Reno, BO, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0412-4396","authenticated-orcid":false,"given":"Zeynep","family":"Kiziltan","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1148-2450","authenticated-orcid":false,"given":"Andrea","family":"Bartolini","sequence":"additional","affiliation":[{"name":"University of Bologna, Bologna, Italy"}]}],"member":"320","published-online":{"date-parts":[[2026,5,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arXiv:2305.13245 [cs.CL] https:\/\/arxiv.org\/abs\/2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arXiv:2305.13245 [cs.CL] https:\/\/arxiv.org\/abs\/2305.13245"},{"key":"e_1_3_2_1_2_1","unstructured":"Ebtesam Almazrouei Hamza Alobeidli Abdulaziz Alshamsi Alessandro Cappelli Ruxandra Cojocaru M\u00e9rouane Debbah \u00c9tienne Goffinet Daniel Hesslow et al. 2023. The Falcon Series of Open Language Models. arXiv:2311.16867 [cs.CL] https:\/\/arxiv.org\/abs\/2311.16867"},{"key":"e_1_3_2_1_3_1","volume-title":"Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, and Yuxiong He.","author":"Aminabadi Reza Yazdani","year":"2022","unstructured":"Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, and Yuxiong He. 2022. DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. arXiv:2207.00032 [cs.LG] https:\/\/arxiv.org\/abs\/2207.00032"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3339186.3339215"},{"key":"e_1_3_2_1_5_1","unstructured":"NVIDIA Corporation. 2025. Python Bindings for NVIDIA Management Library (pynvml). https:\/\/pypi.org\/project\/nvidia-ml-py3\/. Accessed: 2025-09-12."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Tri Dao Daniel Y. Fu Stefano Ermon Atri Rudra and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. arXiv:2205.14135 [cs.LG] https:\/\/arxiv.org\/abs\/2205.14135","DOI":"10.52202\/068431-1189"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Jared Fernandez Clara Na Vashisth Tiwari Yonatan Bisk Sasha Luccioni and Emma Strubell. 2025. Energy Considerations of Large Language Model Inference and Efficiency Optimizations. arXiv:2504.17674 [cs.CL] https:\/\/arxiv.org\/abs\/2504.17674","DOI":"10.18653\/v1\/2025.acl-long.1563"},{"key":"e_1_3_2_1_8_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_9_1","unstructured":"Nidhal Jegham Marwan Abdelatti Lassad Elmoubarki and Abdeltawab Hendawi. 2025. How Hungry is AI? Benchmarking Energy Water and Carbon Footprint of LLM Inference. arXiv:2505.09598 [cs.CY] https:\/\/arxiv.org\/abs\/2505.09598"},{"key":"e_1_3_2_1_10_1","volume-title":"Joseph E. Gonzalez, Hao Zhang, and Ion Stoica.","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. arXiv:2309.06180 [cs.LG] https:\/\/arxiv.org\/abs\/2309.06180"},{"key":"e_1_3_2_1_11_1","unstructured":"Paul Joe Maliakel Shashikant Ilager and Ivona Brandic. 2025. Investigating Energy Efficiency and Performance Trade-offs in LLM Inference Across Tasks and DVFS Settings. arXiv:2501.08219 [cs.LG] https:\/\/arxiv.org\/abs\/2501.08219"},{"key":"e_1_3_2_1_12_1","volume-title":"Michele Merler, Parameswaran Selvam, et al.","author":"Mishra Mayank","year":"2024","unstructured":"Mayank Mishra, Matt Stallone, Gaoyuan Zhang, Yikang Shen, Aditya Prasad, Adriana Meza Soria, Michele Merler, Parameswaran Selvam, et al., 2024. Granite Code Models: A Family of Open Foundation Models for Code Intelligence. arXiv:2405.04324 [cs.AI] https:\/\/arxiv.org\/abs\/2405.04324"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3757892.3757900"},{"key":"e_1_3_2_1_14_1","volume-title":"https:\/\/github.com\/NVIDIA\/TensorRT-LLM. Version 0.21.0, accessed","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. 2025. TensorRT-LLM. https:\/\/github.com\/NVIDIA\/TensorRT-LLM. Version 0.21.0, accessed September 15, 2025."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. arXiv:1910.02054 [cs.LG] https:\/\/arxiv.org\/abs\/1910.02054","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_16_1","volume-title":"Fast Transformer Decoding: One Write-Head is All You Need. CoRR","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. 2019. Fast Transformer Decoding: One Write-Head is All You Need. CoRR, Vol. abs\/1911.02150 (2019), 0. arXiv:1911.02150 http:\/\/arxiv.org\/abs\/1911.02150"},{"key":"e_1_3_2_1_17_1","unstructured":"Jovan Stojkovic Esha Choukse Chaojie Zhang Inigo Goiri and Josep Torrellas. 2024. Towards Greener LLMs: Bringing Energy-Efficiency to the Forefront of LLM Inference. arXiv:2403.20306 [cs.AI] https:\/\/arxiv.org\/abs\/2403.20306"},{"key":"e_1_3_2_1_18_1","volume-title":"Gemma: Open Models Based on Gemini Research and Technology. arXiv:2403.08295 [cs.CL] https:\/\/arxiv.org\/abs\/2403.08295","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Thomas Mesnard, Cassidy Hardin, Robert Dadashi, Surya Bhupatiraju, Shreya Pathak, Laurent Sifre, Morgane Rivi\u00e8re, et al., 2024. Gemma: Open Models Based on Gemini Research and Technology. arXiv:2403.08295 [cs.CL] https:\/\/arxiv.org\/abs\/2403.08295"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3726523"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3721146.3721953"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Grant Wilkins Srinivasan Keshav and Richard Mortier. 2024. Offline Energy-Optimal LLM Serving: Workload-Based Energy Models for LLM Inference on Heterogeneous Systems. arXiv:2407.04014 [cs.DC] https:\/\/arxiv.org\/abs\/2407.04014","DOI":"10.1145\/3727200.3727217"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38-45","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Online, 38-45. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_24_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li et al. 2024. Qwen2 Technical Report. arXiv:2407.10671 [cs.CL] https:\/\/arxiv.org\/abs\/2407.10671"},{"key":"e_1_3_2_1_25_1","volume-title":"Yan Yan, Beidi Chen, Guangyu Sun, and Kurt Keutzer.","author":"Yuan Zhihang","year":"2024","unstructured":"Zhihang Yuan, Yuzhang Shang, Yang Zhou, Zhen Dong, Zhe Zhou, Chenhao Xue, Bingzhe Wu, Zhikai Li, Qingyi Gu, Yong Jae Lee, Yan Yan, Beidi Chen, Guangyu Sun, and Kurt Keutzer. 2024. LLM Inference Unveiled: Survey and Roofline Model Insights. arXiv:2402.16363 [cs.CL] https:\/\/arxiv.org\/abs\/2402.16363"},{"key":"e_1_3_2_1_26_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL] https:\/\/arxiv.org\/abs\/2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, et al., 2022. OPT: Open Pre-trained Transformer Language Models. arXiv:2205.01068 [cs.CL] https:\/\/arxiv.org\/abs\/2205.01068"}],"event":{"name":"ICPE '26: 17th ACM\/SPEC International Conference on Performance Engineering","location":"Florence Italy","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SPEC"]},"container-title":["Proceedings of the 17th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"deposited":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T12:28:31Z","timestamp":1776947311000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777884.3797011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,3]]},"references-count":26,"alternative-id":["10.1145\/3777884.3797011","10.1145\/3777884"],"URL":"https:\/\/doi.org\/10.1145\/3777884.3797011","relation":{},"subject":[],"published":{"date-parts":[[2026,5,3]]},"assertion":[{"value":"2026-05-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}