{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:08:01Z","timestamp":1768345681697,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772233","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"283-298","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Snap &amp; Replay: A new way to analyze uarch-scale performance bottlenecks for ML accelerators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2364-3891","authenticated-orcid":false,"given":"Ioannis","family":"Zarkadas","sequence":"first","affiliation":[{"name":"Columbia University, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8209-5006","authenticated-orcid":false,"given":"Amanda","family":"Tomlinson","sequence":"additional","affiliation":[{"name":"University of California, San Diego, San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4046-2022","authenticated-orcid":false,"given":"Asaf","family":"Cidon","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6122-8998","authenticated-orcid":false,"given":"Baris","family":"Kasikci","sequence":"additional","affiliation":[{"name":"University of Washington, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7564-6007","authenticated-orcid":false,"given":"Ofir","family":"Weisse","sequence":"additional","affiliation":[{"name":"Google, Sunnyvale, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023. Debugging with Cerebras SDK. https:\/\/sdk.cerebras.net\/debug\/debugging#csdb-debugger. Accessed: 2024-10-29."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: a system for large-scale machine learning. In Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation (Savannah, GA, USA) (OSDI'16). USENIX Association, USA, 265\u2013283."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/1753228.1753233"},{"key":"e_1_3_2_1_4_1","volume-title":"Advanced Micro Devices","author":"Inc.","year":"2024","unstructured":"Inc. Advanced Micro Devices. 2024. AMD Official Website. https:\/\/www.amd.com Accessed: 2024-11-26."},{"key":"e_1_3_2_1_5_1","volume-title":"AMD ROCm Profiler. AMD. https:\/\/rocm.docs.amd.com\/projects\/rocprofiler\/en\/docs-5.0.2\/Accessed on","author":"AMD","year":"2024","unstructured":"AMD 2024. AMD ROCm Profiler. AMD. https:\/\/rocm.docs.amd.com\/projects\/rocprofiler\/en\/docs-5.0.2\/Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.)","volume":"4","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, Jeff Dean, Sanjay Ghemawat, Steven Hand, Daniel Hurt, Michael Isard, Hyeontaek Lim, Ruoming Pang, Sudip Roy, Brennan Saeta, Parker Schuh, Ryan Sepassi, Laurent Shafey, Chandu Thekkath, and Yonghui Wu. 2022. Pathways: Asynchronous Distributed Dataflow for ML. In Proceedings of Machine Learning and Systems, D. Marculescu, Y. Chi, and C. Wu (Eds.), Vol. 4. 430\u2013449. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2022\/file\/37385144cac01dff38247ab11c119e3c-Paper.pdf"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_2_1_9_1","volume-title":"d.]. Generative AI Races Toward $1.3 Trillion in Revenue by","author":"Intelligence Bloomberg","year":"2032","unstructured":"Bloomberg Intelligence. [n. d.]. Generative AI Races Toward $1.3 Trillion in Revenue by 2032. https:\/\/www.bloomberg.com\/professional\/insights\/data\/generative-ai-races-toward-1-3-trillion-in-revenue-by-2032\/. Accessed: 2024-11-22."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063454"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: an automated end-to-end optimizing compiler for deep learning. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (Carlsbad, CA, USA) (OSDI'18). USENIX Association, USA, 579\u2013594."},{"key":"e_1_3_2_1_12_1","unstructured":"Intel Corporation. 2024. Intel Official Website. https:\/\/www.intel.com Accessed: 2024-11-26."},{"key":"e_1_3_2_1_13_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA Official Website. https:\/\/www.nvidia.com Accessed: 2024-11-26."},{"key":"e_1_3_2_1_14_1","unstructured":"NVIDIA Corporation. 2025. CUDA-GDB. https:\/\/developer.nvidia.com\/cuda-gdb Accessed: 2025-02-14."},{"key":"e_1_3_2_1_15_1","unstructured":"NVIDIA Corporation. 2025. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl Accessed: 2025-01-21."},{"key":"e_1_3_2_1_16_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=mZn2Xyh9Ec"},{"key":"e_1_3_2_1_17_1","volume-title":"Oh (Eds.)","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 16344\u201316359. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/67d57c32e20fd0a7a302cb81d36e40d5-Paper-Conference.pdf"},{"key":"e_1_3_2_1_18_1","volume-title":"Arcadia: End-to-End AI System Performance Simulator. https:\/\/engineering.fb.com\/2023\/09\/07\/data-infrastructure\/arcadia-end-to-end-ai-system-performance-simulator\/.","author":"Engineering Facebook","year":"2023","unstructured":"Facebook Engineering. 2023. Arcadia: End-to-End AI System Performance Simulator. https:\/\/engineering.fb.com\/2023\/09\/07\/data-infrastructure\/arcadia-end-to-end-ai-system-performance-simulator\/."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589348"},{"key":"e_1_3_2_1_20_1","unstructured":"Roy Frostig Matthew Johnson and Chris Leary. 2018. Compiling machine learning programs via high-level tracing. https:\/\/mlsys.org\/Conferences\/doc\/2018\/146.pdf"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"e_1_3_2_1_22_1","volume-title":"https:\/\/www.tensorflow.org\/tensorboard Accessed on","author":"Google TensorBoard Google","year":"2024","unstructured":"Google 2024. Google TensorBoard. Google. https:\/\/www.tensorflow.org\/tensorboard Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578244.3583736"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASICON.2017.8252580"},{"key":"e_1_3_2_1_25_1","volume-title":"Intel GTPin: Graphics Program Instrumentation","author":"Intel Corporation 2024.","unstructured":"Intel Corporation 2024. Intel GTPin: Graphics Program Instrumentation. Intel Corporation. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/tool\/gtpin.html Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_26_1","volume-title":"Intel VTune Profiler","author":"Intel Corporation 2024.","unstructured":"Intel Corporation 2024. Intel VTune Profiler. Intel Corporation. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/vtune-profiler.html Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_27_1","unstructured":"JAX Developers. [n.d.]. Pallas: A Kernel Language for JAX. https:\/\/jax.readthedocs.io\/en\/latest\/pallas\/index.html. Accessed: 2024-11-22."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Norman P. Jouppi Cliff Young Nishant Patil David Patterson Gaurav Agrawal Raminder Bajwa Sarah Bates Suresh Bhatia Nan Boden Al Borchers Rick Boyle Pierre luc Cantin Clifford Chao Chris Clark Jeremy Coriell Mike Daley Matt Dau Jeffrey Dean Ben Gelb Tara Vazir Ghaemmaghami Rajendra Gottipati William Gulland Robert Hagmann C. Richard Ho Doug Hogberg John Hu Robert Hundt Dan Hurt Julian Ibarz Aaron Jaffey Alek Jaworski Alexander Kaplan Harshit Khaitan Andy Koch Naveen Kumar Steve Lacy James Laudon James Law Diemthu Le Chris Leary Zhuyuan Liu Kyle Lucke Alan Lundin Gordon MacKean Adriana Maggiore Maire Mahony Kieran Miller Rahul Nagarajan Ravi Narayanaswami Ray Ni Kathy Nix Thomas Norrie Mark Omernick Narayana Penukonda Andy Phelps and Jonathan Ross. 2017. In-Datacenter Performance Analysis of a Tensor Processing Unit. https:\/\/arxiv.org\/pdf\/1704.04760.pdf","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2985963"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00028"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3178068"},{"key":"e_1_3_2_1_36_1","volume-title":"https:\/\/llvm.org Accessed on","author":"Project LLVM","year":"2024","unstructured":"LLVM Foundation 2024. LLVM Project. LLVM Foundation. https:\/\/llvm.org Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"Ayaz Akram, Mohammad Alian, Rico Amslinger, Matteo Andreozzi, Adri\u00e0 Armejach, Nils Asmussen, Brad Beckmann, Srikant Bharadwaj, et al.","author":"Lowe-Power Jason","year":"2020","unstructured":"Jason Lowe-Power, Abdul Mutaal Ahmad, Ayaz Akram, Mohammad Alian, Rico Amslinger, Matteo Andreozzi, Adri\u00e0 Armejach, Nils Asmussen, Brad Beckmann, Srikant Bharadwaj, et al. 2020. The gem5 simulator: Version 20.0+. arXiv preprint arXiv:2007.03152 (2020)."},{"key":"e_1_3_2_1_38_1","volume-title":"https:\/\/developer.nvidia.com\/cupti Accessed on","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2020. NVIDIA CUPTI. NVIDIA. https:\/\/developer.nvidia.com\/cupti Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_39_1","volume-title":"NVIDIA nvprof. NVIDIA. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/ Accessed on","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2020. NVIDIA nvprof. NVIDIA. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/ Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"NVIDIA Nsight Compute. NVIDIA. https:\/\/developer.nvidia.com\/nsight-compute Accessed on","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2024. NVIDIA Nsight Compute. NVIDIA. https:\/\/developer.nvidia.com\/nsight-compute Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_41_1","volume-title":"NVIDIA Nsight Systems. NVIDIA. https:\/\/developer.nvidia.com\/nsight-systems Accessed on","author":"NVIDIA","year":"2024","unstructured":"NVIDIA 2024. NVIDIA Nsight Systems. NVIDIA. https:\/\/developer.nvidia.com\/nsight-systems Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"NVIDIA Compute Sanitizer API","author":"NVIDIA Corporation 2024.","unstructured":"NVIDIA Corporation 2024. NVIDIA Compute Sanitizer API. NVIDIA Corporation. https:\/\/docs.nvidia.com\/compute-sanitizer\/SanitizerApiGuide\/index.html Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_43_1","volume-title":"SASSI: A Low-Level GPU Instrumentation Framework","author":"NVIDIA Corporation 2024.","year":"2024","unstructured":"NVIDIA Corporation 2024. SASSI: A Low-Level GPU Instrumentation Framework. NVIDIA Corporation. https:\/\/github.com\/NVlabs\/SASSI Accessed on: October 15, 2024."},{"key":"e_1_3_2_1_44_1","unstructured":"NVIDIA Corporation. 2025. CUPTI Documentation - Multi-Pass Collection. https:\/\/docs.nvidia.com\/cupti\/main\/main.html#multi-pass-collection Accessed: 2025-02-17."},{"key":"e_1_3_2_1_45_1","unstructured":"OpenXLA Project. [n. d.]. OpenXLA: An Open Ecosystem for Machine Learning Infrastructure. https:\/\/openxla.org\/. Accessed: 2024-11-22."},{"key":"e_1_3_2_1_46_1","volume-title":"PyTorch: an imperative style, high-performance deep learning library","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: an imperative style, high-performance deep learning library. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_47_1","unstructured":"OpenXLA Project. [n.d.]. StableHLO: A Portability Layer for ML Frameworks and Compilers. https:\/\/openxla.org\/stablehlo. Accessed: 2024-11-22."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/54.895008"},{"key":"e_1_3_2_1_49_1","volume-title":"XLA : Compiling Machine Learning for Peak Performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne. 2020. XLA : Compiling Machine Learning for Peak Performance."},{"key":"e_1_3_2_1_50_1","unstructured":"Amazon Web Services. [n. d.]. AWS Inferentia. https:\/\/aws.amazon. com\/ai\/machine-learning\/inferentia Accessed: 2024-11-22."},{"key":"e_1_3_2_1_51_1","unstructured":"Amazon Web Services. [n. d.]. AWS Trainium. https:\/\/aws.amazon.com\/ai\/machine-learning\/trainium Accessed: 2024-11-22."},{"key":"e_1_3_2_1_52_1","unstructured":"Amazon Web Services. 2025. Collective Communication. https:\/\/awsdocs-neuron.readthedocs-hosted.com\/en\/latest\/general\/arch\/neuron-features\/collective-communication.html Accessed: 2025-01-21."},{"key":"e_1_3_2_1_53_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=tVConYid20","author":"Shah Jay","year":"2024","unstructured":"Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, and Tri Dao. 2024. FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=tVConYid20"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3168831"},{"key":"e_1_3_2_1_55_1","unstructured":"Anton Shilov. 2025. DeepSeek's AI breakthrough bypasses industry-standard CUDA for some functions uses Nvidia's assembly-like PTX programming instead. https:\/\/www.tomshardware.com\/tech-industry\/artificial-intelligence\/deepseeks-ai-breakthrough-bypasses-industry-standard-cuda-uses-assembly-like-ptx-programming-instead Accessed: 2025-02-17."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_57_1","volume-title":"The Huzz: Instruction Fuzzing of Processors Using Golden-Reference Models for Finding Software-Exploitable Vulnerabilities. arXiv:2201.09941 [cs.CR] https:\/\/arxiv.org\/abs\/2201.09941","author":"Tyagi Aakash","year":"2022","unstructured":"Aakash Tyagi, Addison Crump, Ahmad-Reza Sadeghi, Garrett Persyn, Jeyavijayan Rajendran, Patrick Jauernig, and Rahul Kande. 2022. The Huzz: Instruction Fuzzing of Processors Using Golden-Reference Models for Finding Software-Exploitable Vulnerabilities. arXiv:2201.09941 [cs.CR] https:\/\/arxiv.org\/abs\/2201.09941"},{"key":"e_1_3_2_1_58_1","volume-title":"ChatGPT and Generative AI Are Booming, but the Costs Can Be Extraordinary. CNBC","author":"Vanian Jonathan","year":"2023","unstructured":"Jonathan Vanian and Kif Leswing. 2023. ChatGPT and Generative AI Are Booming, but the Costs Can Be Extraordinary. CNBC (2023). https:\/\/www.cnbc.com\/2023\/03\/13\/chatgpt-and-generative-ai-are-booming-but-at-a-very-expensive-price.html Published: Mar 13, 2023, Updated: Apr 17, 2023, Accessed: 2025-02-10"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00077"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD45719.2019.8942149"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507708"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370339"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772233","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:54Z","timestamp":1768321194000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772233"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":63,"alternative-id":["10.1145\/3772052.3772233","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772233","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}