{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:34:24Z","timestamp":1773192864639,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707265","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"493-508","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Forecasting GPU Performance for Deep Learning Training and Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0035-1359","authenticated-orcid":false,"given":"Seonho","family":"Lee","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2777-1118","authenticated-orcid":false,"given":"Amar","family":"Phanishayee","sequence":"additional","affiliation":[{"name":"Meta, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8184-0528","authenticated-orcid":false,"given":"Divya","family":"Mahajan","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AMD. 2022. AMD CDNA 2 ARCHITECTURE. https:\/\/www.amd.com\/content\/dam\/amd\/en\/documents\/instinct-business-docs\/white-papers\/amd-cdna2-white-paper.pdf."},{"key":"e_1_3_2_1_2_1","unstructured":"AMD. 2024. AMD Instinct Accelerators. https:\/\/www.amd.com\/en\/products\/accelerators\/instinct.html."},{"key":"e_1_3_2_1_3_1","unstructured":"AMD. 2024. AMD Instinct HPC Solutions. https:\/\/www.amd.com\/en\/graphics\/servers-instinct-mi-powered-servers."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830780"},{"key":"e_1_3_2_1_5_1","volume-title":"International Symposium on Microarchitecture (MICRO).","author":"Baddouh Cesar Avalos","unstructured":"Cesar Avalos Baddouh, Mahmoud Khairy, Roland N. Green, Mathias Payer, and Timothy G. Rogers. 2021. Principal Kernel Analysis: A Tractable Methodology to Simulate Scaled GPU Workloads. In International Symposium on Microarchitecture (MICRO)."},{"key":"e_1_3_2_1_6_1","volume-title":"International Symposium on Performance Analysis of Systems Software (ISPASS).","author":"Bakhoda Ali","unstructured":"Ali Bakhoda, George L. Yuan, Wilson W. L. Fung, Henry Wong, and Tor M. Aamodt. 2009. Analyzing CUDA workloads using a detailed GPU simulator. In International Symposium on Performance Analysis of Systems Software (ISPASS)."},{"key":"e_1_3_2_1_7_1","volume-title":"Language Models are Few-Shot Learners. arxiv","author":"Brown Tom B.","year":"2005","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. 
Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv: 2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_8_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In USENIX Symposium on Operating Systems Design and Implementation (OSDI).","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In USENIX Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"e_1_3_2_1_10_1","unstructured":"Tri Dao Daniel Y. Fu Stefano Ermon Atri Rudra and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_11_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)."},{"key":"e_1_3_2_1_12_1","unstructured":"Forbes. 2024. Meta To Build Open-Source Artificial General Intelligence For All Zuckerberg Says. https:\/\/www.forbes.com\/sites\/johnkoetsier\/2024\/01\/18\/zuckerberg-on-ai-meta-building-agi-for-everyone-and-open-sourcing-it."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00012"},{"key":"e_1_3_2_1_14_1","unstructured":"Google. 2024. TensorFlow. https:\/\/www.tensorflow.org\/."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555775"},{"key":"e_1_3_2_1_16_1","volume-title":"MAD Max Beyond Single-Node: Enabling Large Machine Learning Model Acceleration on Distributed Systems. In International Symposium on Computer Architecture (ISCA).","author":"Hsia Samuel","year":"2024","unstructured":"Samuel Hsia, Alicia Golden, Bilge Acun, Newsha Ardalani, Zachary DeVito, Gu-Yeon Wei, David Brooks, and Carole-Jean Wu. 2024. MAD Max Beyond Single-Node: Enabling Large Machine Learning Model Acceleration on Distributed Systems. In International Symposium on Computer Architecture (ISCA)."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Neural Information Processing Systems (NeuRIPS).","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Xu Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. 
In International Conference on Neural Information Processing Systems (NeuRIPS)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607102"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2012.6189201"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Norman P. Jouppi George Kurian Sheng Li Peter Ma Rahul Nagarajan Lifeng Nai Nishant Patil Suvinay Subramanian Andy Swing Brian Towles Cliff Young Xiang Zhou Zongwei Zhou and David Patterson. 2023. TPU v4: An Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings. arxiv: 2304.01433 [cs.AR]","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_21_1","volume-title":"Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In International Symposium on Computer Architecture (ISCA).","author":"Khairy Mahmoud","unstructured":"Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, and Timothy G. Rogers. 2020. Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In International Symposium on Computer Architecture (ISCA)."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Neural Information Processing Systems (NeuRIPS).","author":"Krizhevsky Alex","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. ImageNet classification with deep convolutional neural networks. In International Conference on Neural Information Processing Systems (NeuRIPS)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_24_1","volume-title":"Data-Driven Offline Optimization For Architecting Hardware Accelerators. In International Conference on Neural Information Processing Systems (NeuRIPS).","author":"Kumar Aviral","year":"2022","unstructured":"Aviral Kumar, Amir Yazdanbakhsh, Milad Hashemi Kevin Swersky, and Sergey Levine. 2022. Data-Driven Offline Optimization For Architecting Hardware Accelerators. In International Conference on Neural Information Processing Systems (NeuRIPS)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358252"},{"key":"e_1_3_2_1_26_1","volume-title":"Path Forward Beyond Simulators: Fast and Accurate GPU Execution Time Prediction for DNN Workloads. In International Symposium on Microarchitecture (MICRO) (2023--10--28)","author":"Li Ying","year":"2023","unstructured":"Ying Li, Yifan Sun, and Adwait Jog. 2023. Path Forward Beyond Simulators: Fast and Accurate GPU Execution Time Prediction for DNN Workloads. In International Symposium on Microarchitecture (MICRO) (2023--10--28)."},{"key":"e_1_3_2_1_27_1","volume-title":"Jacob Sacks, Adel Ardalan, Arun Kumar, and Hadi Esmaeilzadeh.","author":"Mahajan Divya","year":"2018","unstructured":"Divya Mahajan, Joon Kyung Kim, Jacob Sacks, Adel Ardalan, Arun Kumar, and Hadi Esmaeilzadeh. 2018. In-RDBMS Hardware Acceleration of Advanced Analytics."},{"key":"e_1_3_2_1_28_1","volume-title":"Tabla: A Unified Template-based Framework for Accelerating Statistical Machine Learning. In International Symposium on High-Performance Computer Architecture (HPCA).","author":"Mahajan Divya","year":"2016","unstructured":"Divya Mahajan, Jongse Park, Emmanuel Amaro, Hardik Sharma, Amir Yazdanbakhsh, Joon Kim, and Hadi Esmaeilzadeh. 2016. Tabla: A Unified Template-based Framework for Accelerating Statistical Machine Learning. 
In International Symposium on High-Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Machine Learning. PMLR, 7937--7947","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. 2021. Memory-efficient pipeline-parallel DNN training. In International Conference on Machine Learning. PMLR, 7937--7947."},{"key":"e_1_3_2_1_31_1","unstructured":"nsnam. 2024. ns-3. https:\/\/www.nsnam.org\/."},{"key":"e_1_3_2_1_32_1","volume-title":"CUTLASS: Fast Linear Algebra in CUDA C. https:\/\/developer.nvidia.com\/blog\/cutlass-linear-algebra-cuda\/.","author":"NVIDIA.","year":"2017","unstructured":"NVIDIA. 2017. CUTLASS: Fast Linear Algebra in CUDA C. https:\/\/developer.nvidia.com\/blog\/cutlass-linear-algebra-cuda\/."},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA. 2023a. GPU Performance Background. https:\/\/docs.nvidia.com\/deeplearning\/performance\/pdf\/GPU-Performance-Background-User-Guide.pdf."},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA. 2023b. NVIDIA DGX H100. https:\/\/resources.nvidia.com\/en-us-dgx-systems\/ai-enterprise-dgx."},{"key":"e_1_3_2_1_35_1","unstructured":"NVIDIA. 2023c. NVIDIA H100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-tensor-core."},{"key":"e_1_3_2_1_36_1","unstructured":"NVIDIA. 2024a. GPU Performance Background User's Guide. https:\/\/docs.nvidia.com\/deeplearning\/performance\/dl-performance-gpu-background\/index.html##gpu-execution."},{"key":"e_1_3_2_1_37_1","unstructured":"NVIDIA. 2024b. NVIDIA Blackwell Architecture Technical Brief. https:\/\/resources.nvidia.com\/en-us-blackwell-architecture."},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA. 2024c. NVIDIA cuDNN. https:\/\/developer.nvidia.com\/cudnn."},{"key":"e_1_3_2_1_39_1","volume-title":"2024 d","author":"NVIDIA.","unstructured":"NVIDIA. 2024 d. NVIDIA Data Center GPUs. https:\/\/www.nvidia.com\/en-us\/data-center\/data-center-gpus\/."},{"key":"e_1_3_2_1_40_1","unstructured":"Nvidia. 2024. Nvidia DGX Platform. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-platform\/."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA. 2024. NVlink. https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_42_1","volume-title":"Sunstone: A Scalable and Versatile Scheduler for Mapping Tensor Algebra on Spatial Accelerators. In International Symposium on Performance Analysis of Systems Software (ISPASS).","author":"Olyaiy MohammadHossein","year":"2023","unstructured":"MohammadHossein Olyaiy, Christopher Ng, Alexandra Sasha Fedorova, and Mieszko Lis. 2023. Sunstone: A Scalable and Versatile Scheduler for Mapping Tensor Algebra on Spatial Accelerators. In International Symposium on Performance Analysis of Systems Software (ISPASS)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062257"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Kishore Punniyamurthy Behzad Boroujerdian and Andreas Gerstlauer. 2017. GATSim: abstract timing simulation of GPUs.","DOI":"10.23919\/DATE.2017.7926956"},{"key":"e_1_3_2_1_45_1","unstructured":"PyTorch. 2023a. PyTorch Profiler. https:\/\/pytorch.org\/tutorials\/recipes\/recipes\/profiler_recipe.html"},{"key":"e_1_3_2_1_46_1","unstructured":"PyTorch. 2023b. torch.fx-PyTorch 2.2 documentation. https:\/\/pytorch.org\/docs\/stable\/fx.html."},{"key":"e_1_3_2_1_47_1","unstructured":"PyTorch. 2024a. 
Introduction to torch.compile. https:\/\/pytorch.org\/tutorials\/intermediate\/torch_compile_tutorial.html."},{"key":"e_1_3_2_1_48_1","unstructured":"PyTorch. 2024b. PyTorch. https:\/\/pytorch.org\/."},{"key":"e_1_3_2_1_49_1","volume-title":"PALEO: A PERFORMANCE MODEL FOR DEEP NEURAL NETWORKS. In International Conference on Learning Representations (ICLR).","author":"Qi Hang","year":"2017","unstructured":"Hang Qi, Evan R Sparks, and Ameet Talwalkar. 2017. PALEO: A PERFORMANCE MODEL FOR DEEP NEURAL NETWORKS. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_50_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_1_51_1","volume-title":"Liu","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arxiv: 1910.10683 [cs.LG]"},{"key":"e_1_3_2_1_52_1","volume-title":"ASTRA-SIM: Enabling SW\/HW Co-Design Exploration for Distributed DL Training Platforms. In International Symposium on Performance Analysis of Systems Software (ISPASS).","author":"Rashidi Saeed","year":"2020","unstructured":"Saeed Rashidi, Srinivas Sridharan, Sudarshan Srinivasan, and Tushar Krishna. 2020. ASTRA-SIM: Enabling SW\/HW Co-Design Exploration for Distributed DL Training Platforms. In International Symposium on Performance Analysis of Systems Software (ISPASS)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"e_1_3_2_1_54_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. arxiv: 1409.1556 [cs.CV]"},{"key":"e_1_3_2_1_55_1","volume-title":"Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro.","author":"Smith Shaden","year":"2022","unstructured":"Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, and Bryan Catanzaro. 2022. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model. arxiv: 2201.11990 [cs.CL]"},{"key":"e_1_3_2_1_56_1","volume-title":"A better measure of relative prediction accuracy for model selection and model estimation. Journal of the Operational Research Society (JORS)","author":"Tofallis Chris","year":"2015","unstructured":"Chris Tofallis. 2015. A better measure of relative prediction accuracy for model selection and model estimation. Journal of the Operational Research Society (JORS) (2015)."},{"key":"e_1_3_2_1_57_1","volume-title":"TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. In USENIX Symposium on Networked Systems Design and Implementation (NSDI).","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang, Moein Khazraee, Zhizhen Zhong, Manya Ghobadi, Zhihao Jia, Dheevatsa Mudigere, Ying Zhang, and Anthony Kewitsch. 2023. TopoOpt: Co-optimizing Network Topology and Parallelization Strategy for Distributed Training Jobs. 
In USENIX Symposium on Networked Systems Design and Implementation (NSDI)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2019.00062"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056063"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489048.3530962"},{"key":"e_1_3_2_1_62_1","volume-title":"Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training. In Annual Technical Conference (USENIX ATC).","author":"Yu Geoffrey X.","year":"2021","unstructured":"Geoffrey X. Yu, Yubo Gao, Pavel Golikov, and Gennady Pekhimenko. 2021. Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training. In Annual Technical Conference (USENIX ATC)."},{"key":"e_1_3_2_1_63_1","volume-title":"Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In Annual Technical Conference (USENIX ATC).","author":"Zhu Hongyu","year":"2020","unstructured":"Hongyu Zhu, Amar Phanishayee, and Gennady Pekhimenko. 2020. Daydream: Accurately Estimating the Efficacy of Optimizations for DNN Training. In Annual Technical Conference (USENIX ATC)."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707265","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707265","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:46:21Z","timestamp":1755787581000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707265"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":63,"alternative-id":["10.1145\/3669940.3707265","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707265","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
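The record above is a standard Crossref REST API "work" response. As a minimal sketch of how such a record can be fetched and summarized (assuming Python 3, network access, and the public api.crossref.org endpoint; the field names used below are those present in the record above):

import json
import urllib.request

# DOI taken from the record above.
DOI = "10.1145/3669940.3707265"

# Public Crossref REST API; returns the same envelope as the record above:
# {"status": ..., "message-type": "work", ..., "message": {...}}
with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    record = json.load(resp)

# The bibliographic payload lives under the "message" key.
work = record["message"]

# "title" and "container-title" are lists; "author" entries carry given/family name parts.
title = work["title"][0]
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])

print(title)
print(authors)
print(f'{work["container-title"][0]}, pp. {work["page"]}')
print(f'DOI: https://doi.org/{work["DOI"]} ({work["references-count"]} references)')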