{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T15:15:02Z","timestamp":1776784502123,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,21]]},"DOI":"10.1145\/3695053.3731064","type":"proceedings-article","created":{"date-parts":[[2025,6,20]],"date-time":"2025-06-20T16:46:17Z","timestamp":1750437977000},"page":"1495-1508","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["AMALI: An Analytical Model for Accurately Modeling LLM Inference on Modern GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9705-7216","authenticated-orcid":false,"given":"Shiheng","family":"Cao","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3136-6721","authenticated-orcid":false,"given":"Junmin","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and Suzhou Institute for Advanced Research, University of Science and Technology of China, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6487-3658","authenticated-orcid":false,"given":"Junshi","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3900-3722","authenticated-orcid":false,"given":"Hong","family":"An","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8067-9612","authenticated-orcid":false,"given":"Zhibin","family":"Yu","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology(SIAT), Chinese Academy of Science(CAS), Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,20]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC55821.2022.9926299"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2016.7482092"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476221"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2019.8916466"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","unstructured":"Yehia Arafa Abdel-Hameed\u00a0A. Badawy Gopinath Chennupati Nandakishore Santhi and Stephan Eidenbenz. 2019. PPT-GPU: Scalable GPU Performance Modeling. IEEE Computer Architecture Letters 18 1 (2019) 55\u201358. 10.1109\/LCA.2019.2904497","DOI":"10.1109\/LCA.2019.2904497"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830780"},{"key":"e_1_3_3_1_8_2","unstructured":"Arun Chandrasekaran. 2024. Spotlight on 2024 Gartner Hype Cycle\u2122 for Emerging Technologies. https:\/\/www.gartner.com\/en\/articles\/hype-cycle-for-emerging-technologies. [Accessed: 2025-02-09]."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/1693453.1693470"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. NVIDIA A100 Tensor Core GPU: Performance and Innovation. IEEE Micro 41 2 (2021) 29\u201335. 10.1109\/MM.2021.3061394","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_3_1_13_2","unstructured":"CloudCores. 2024. CuAssembler: A CUDA PTX Assembly Tool. https:\/\/github.com\/cloudcores\/CuAssembler. [Accessed: 2024-11-10]."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2006.1620807"},{"key":"e_1_3_3_1_15_2","volume-title":"The Llama 3 Herd of Models","author":"Dubey A.","year":"2024","unstructured":"A. Dubey, A. Jauhri, A. Pandey, et\u00a0al. 2024. The Llama 3 Herd of Models. Technical Report. Meta Platforms, Inc. arXiv:https:\/\/arXiv.org\/abs\/2407.21783https:\/\/arxiv.org\/abs\/2407.21783 [Accessed: 2024-11-10]."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","unstructured":"Michael Garland Scott Le\u00a0Grand John Nickolls Joshua Anderson Jim Hardwick Scott Morton Everett Phillips Yao Zhang and Vasily Volkov. 2008. Parallel Computing Experiences with CUDA. IEEE Micro 28 4 (2008) 13\u201327. 10.1109\/MM.2008.57","DOI":"10.1109\/MM.2008.57"},{"key":"e_1_3_3_1_17_2","unstructured":"GitHub. 2021. GitHub Copilot. https:\/\/github.com\/features\/copilot\/. Accessed: 2024-11-16."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2017.7975298"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322221"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"Jo\u00e3o Guerreiro Aleksandar Ilic Nuno Roma and Pedro Tom\u00e1s. 2019. GPU Static Modeling Using PTX and Deep Structured Learning. IEEE Access 7 (2019) 159150\u2013159161. 10.1109\/ACCESS.2019.2951218","DOI":"10.1109\/ACCESS.2019.2951218"},{"key":"e_1_3_3_1_21_2","volume-title":"Computer Architecture: A Quantitative Approach (5th ed.)","author":"Hennessy John\u00a0L.","year":"2011","unstructured":"John\u00a0L. Hennessy and David\u00a0A. Patterson. 2011. Computer Architecture: A Quantitative Approach (5th ed.). Morgan Kaufmann Publishers Inc., San Mateo, CA, USA."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555775"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","unstructured":"Sunpyo Hong and Hyesoon Kim. 2009. An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness. SIGARCH Comput. Archit. News 37 3 (June 2009) 152\u2013163. 10.1145\/1555815.1555775","DOI":"10.1145\/1555815.1555775"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815998"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.59"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","unstructured":"Akshay Jain Mahmoud Khairy and Timothy\u00a0G. Rogers. 2018. A Quantitative Evaluation of Contemporary GPU Simulation Methodology. Proc. ACM Meas. Anal. Comput. Syst. 2 2 Article 35 (June 2018) 28\u00a0pages. 10.1145\/3224430","DOI":"10.1145\/3224430"},{"key":"e_1_3_3_1_27_2","unstructured":"Zhe Jia Marco Maggioni Benjamin Staiger and Daniele\u00a0Paolo Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. ArXiv abs\/1804.06826 (2018). https:\/\/api.semanticscholar.org\/CorpusID:4930164"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2004.1310786"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527384"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/2925426.2926267"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00041"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","unstructured":"Xinxin Mei and Xiaowen Chu. 2017. Dissecting GPU Memory Hierarchy Through Microbenchmarking. IEEE Transactions on Parallel and Distributed Systems 28 1 (2017) 72\u201386. 10.1109\/TPDS.2016.2549523","DOI":"10.1109\/TPDS.2016.2549523"},{"key":"e_1_3_3_1_35_2","unstructured":"Meta-Llama. 2023. Llama. GitHub Repository. https:\/\/github.com\/Meta-Llama\/Llama [Accessed: 2023-11-10]."},{"key":"e_1_3_3_1_36_2","unstructured":"Sharan Narang and Greg Diamos. 2016. Baidu DeepBench. https:\/\/svail.github.io\/DeepBench. [Accessed: 2024-11-10]."},{"key":"e_1_3_3_1_37_2","volume-title":"Nsight Compute CLI","year":"2020","unstructured":"NVIDIA Corporation 2020. Nsight Compute CLI. NVIDIA Corporation. https:\/\/developer.nvidia.com\/nsight-compute-cli Updated in 2021."},{"key":"e_1_3_3_1_38_2","unstructured":"NVIDIA Corporation. 2024. CUDA Binary Utilities Documentation. https:\/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/index.html. Accessed: 2024-11-19."},{"key":"e_1_3_3_1_39_2","unstructured":"NVIDIA Corporation. 2024. CUDA C++ Best Practices Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-best-practices-guide\/index.html. [Accessed: 2024-11-10]."},{"key":"e_1_3_3_1_40_2","unstructured":"NVIDIA Corporation. 2025. NVIDIA Ampere Architecture Whitepaper. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf. Accessed: February 15 2025."},{"key":"e_1_3_3_1_41_2","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/index\/chatgpt\/. Accessed: 2024-11-16."},{"key":"e_1_3_3_1_42_2","first-page":"8024","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward\u00a0Z. Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada. Curran Associates, Inc., Vancouver, BC, Canada, 8024\u20138035."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC55918.2022.00033"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","unstructured":"Jason Power Joel Hestness Marc\u00a0S. Orr Mark\u00a0D. Hill and David\u00a0A. Wood. 2015. gem5-gpu: A Heterogeneous CPU-GPU Simulator. IEEE Computer Architecture Letters 14 1 (2015) 34\u201336. 10.1109\/LCA.2014.2299539","DOI":"10.1109\/LCA.2014.2299539"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00043"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_3_1_47_2","unstructured":"Run:AI. 2024. PyTorch GPU: A Guide to Multi GPU Training. https:\/\/www.run.ai\/guides\/gpu-deep-learning\/pytorch-gpu. Accessed: 2024-11-19."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00088"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145819"},{"key":"e_1_3_3_1_50_2","volume-title":"IMPACT Technical Report, IMPACT-12-01","author":"Stratton John\u00a0A.","year":"2012","unstructured":"John\u00a0A. Stratton, Christopher Rodrigues, I-Jui Sung, Nady Obeid, Li-Wen Chang, Nasser Anssari, Geng\u00a0Daniel Liu, and Wen mei W.\u00a0Hwu. 2012. IMPACT Technical Report, IMPACT-12-01. Technical Report IMPACT-12-01. University of Illinois at Urbana-Champaign, Urbana, IL, USA. https:\/\/api.semanticscholar.org\/CorpusID:497928"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/2370816.2370865"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00077"},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_3_1_55_2","unstructured":"vLLM Team. 2024. Easy fast and cheap LLM serving for everyone. https:\/\/docs.vllm.ai\/en\/latest\/. [Accessed: 2025-02-09]."},{"key":"e_1_3_3_1_56_2","doi-asserted-by":"publisher","unstructured":"Lu Wang Magnus Jahre Almutaz Adileh Zhiying Wang and Lieven Eeckhout. 2019. Modeling Emerging Memory-Divergent GPU Applications. IEEE Computer Architecture Letters 18 2 (2019) 95\u201398. 10.1109\/LCA.2019.2923618","DOI":"10.1109\/LCA.2019.2923618"},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00085"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00062"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056063"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00082"},{"key":"e_1_3_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749745"}],"event":{"name":"ISCA '25: Proceedings of the 52nd Annual International Symposium on Computer Architecture","location":"Tokyo Japan","acronym":"SIGARCH '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 52nd Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3695053.3731064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:04:41Z","timestamp":1750503881000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3695053.3731064"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,20]]},"references-count":60,"alternative-id":["10.1145\/3695053.3731064","10.1145\/3695053"],"URL":"https:\/\/doi.org\/10.1145\/3695053.3731064","relation":{},"subject":[],"published":{"date-parts":[[2025,6,20]]},"assertion":[{"value":"2025-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}