{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T15:30:39Z","timestamp":1769787039049,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2411134"],"award-info":[{"award-number":["2411134"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,6]]},"DOI":"10.1145\/3676642.3736127","type":"proceedings-article","created":{"date-parts":[[2025,8,6]],"date-time":"2025-08-06T22:19:59Z","timestamp":1754518799000},"page":"48-63","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["DeepContext: A Context-aware, Cross-platform, and Cross-framework Tool for Performance Profiling and Analysis of Deep Learning Workloads"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0872-1246","authenticated-orcid":false,"given":"Qidong","family":"Zhao","sequence":"first","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6318-4505","authenticated-orcid":false,"given":"Hao","family":"Wu","sequence":"additional","affiliation":[{"name":"George Mason University, Fairfax, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4792-6244","authenticated-orcid":false,"given":"Yueming","family":"Hao","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5437-0445","authenticated-orcid":false,"given":"Zilingfeng","family":"Ye","sequence":"additional","affiliation":[{"name":"Independent, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1270-4147","authenticated-orcid":false,"given":"Jiajia","family":"Li","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1487-963X","authenticated-orcid":false,"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, NC, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7977-3182","authenticated-orcid":false,"given":"Keren","family":"Zhou","sequence":"additional","affiliation":[{"name":"George Mason University, Fairfax, VA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,8,6]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. Eclipse Theia: Cloud and Desktop IDE Platform. https:\/\/theiaide.org\/. Accessed: 2024--10--17."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. VSCodium: Open-Source Visual Studio Code Without Microsoft Branding. https:\/\/vscodium.com\/. Accessed: 2024--10--17."},{"key":"e_1_3_2_1_3_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2025. Using rocprofv3. https:\/\/rocm.docs.amd.com\/projects\/rocprofiler-sdk\/en\/latest\/how-to\/using-rocprofv3.html. Accessed: 2025-03-02."},{"key":"e_1_3_2_1_5_1","unstructured":"AMD Inc. 2024. ROCm Tracer (Roctracer). https:\/\/github.com\/ROCm-Developer-Tools\/roctracer. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Jason Ansel Edward Yang Horace He Natalia Gimelshein Animesh Jain Michael Voznesensky Bin Bao Peter Bell David Berard Evgeni Burovski et al. 2024. PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation. (2024).","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_7_1","unstructured":"Peter W Battaglia Jessica B Hamrick Victor Bapst Alvaro Sanchez-Gonzalez Vinicius Zambaldi Mateusz Malinowski Andrea Tacchetti David Raposo Adam Santoro Ryan Faulkner et al. 2018. Relational inductive biases deep learning and graph networks. arXiv preprint arXiv:1806.01261 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"Scalene: Scripting-language aware profiling for python. arXiv preprint arXiv:2006.03879","author":"Berger Emery D","year":"2020","unstructured":"Emery D Berger. 2020. Scalene: Scripting-language aware profiling for python. arXiv preprint arXiv:2006.03879 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"d.]. Monkey Patching in Python. Software Technology Tips ([n. d.]). Archived from the original on","author":"Biswal Bimal","year":"2012","unstructured":"Bimal Biswal. [n. d.]. Monkey Patching in Python. Software Technology Tips ([n. d.]). Archived from the original on 22 August 2012. Retrieved 9 December 2013."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4717"},{"key":"e_1_3_2_1_11_1","volume-title":"Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang.","author":"Bradbury James","year":"2018","unstructured":"James Bradbury, Roy Frostig, Peter Hawkins, Matthew James Johnson, Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang. 2018. JAX: composable transformations of PythonNumPy programs. http:\/\/github.com\/google\/jax"},{"key":"e_1_3_2_1_12_1","unstructured":"PyTorch Contributors. 2024. Normalization. cuh. https:\/\/github.com\/pytorch\/pytorch\/blob\/44483972bdd3dcd0c047020694817210846b5d70\/aten\/src\/ATen\/native\/cuda\/Normalization.cuh#L356. Accessed: 2024-06--26."},{"key":"e_1_3_2_1_13_1","unstructured":"NVIDIA Corporation. 2021. NVIDIA DLProf: Deep Learning Profiler. https:\/\/developer.nvidia.com\/DLProf. Accessed: 2024--10--16."},{"key":"e_1_3_2_1_14_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute. Accessed: 2024--10--16."},{"key":"e_1_3_2_1_15_1","volume-title":"Lab","author":"Criteo A.","year":"2014","unstructured":"Criteo A. I. Lab. 2014. Criteo 1TB Click Logs dataset. https:\/\/labs.criteo.com\/2014\/12\/download-terabyte-click-logs\/."},{"key":"e_1_3_2_1_16_1","unstructured":"George E. Dahl Frank Schneider Zachary Nado Naman Agarwal Chandramouli Shama Sastry Philipp Hennig Sourabh Medapati Runa Eschenhagen Priya Kasimbeg Daniel Suo Juhan Bae Justin Gilmer Abel L. Peirson Bilal Khan Rohan Anil Mike Rabbat Shankar Krishnan Daniel Snider Ehsan Amid Kongtao Chen Chris J. Maddison Rakshith Vasudev Michal Badura Ankush Garg and Peter Mattson. 2023. Benchmarking Neural Network Training Algorithms. arXiv:2306.07179"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_18_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_19_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_20_1","unstructured":"Hugging Face. 2024. LLaMA Model Implementation in Transformers Library. https:\/\/github.com\/huggingface\/transformers\/blob\/main\/src\/transformers\/models\/llama\/modeling_llama.py#L69. Accessed: 2024--10--19."},{"key":"e_1_3_2_1_21_1","unstructured":"Python Software Foundation. 2024. cProfile: Python Code Profiler. https:\/\/docs.python.org\/3\/library\/profile.html. Accessed: 2024--10--16."},{"key":"e_1_3_2_1_22_1","first-page":"783","article-title":"RL-Scope: Cross-stack profiling for deep reinforcement learning workloads","volume":"3","author":"Gleeson James","year":"2021","unstructured":"James Gleeson, Moshe Gabel, Gennady Pekhimenko, Eyal de Lara, Srivatsan Krishnan, and Vijay Janapa Reddi. 2021. RL-Scope: Cross-stack profiling for deep reinforcement learning workloads. Proceedings of Machine Learning and Systems 3 (2021), 783--799.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2909476"},{"key":"e_1_3_2_1_24_1","unstructured":"Sam Gross. 2020. Issue #41162: Advanced indexing gradient is extremely slow when there are many duplicate indices. https:\/\/github. com\/pytorch\/pytorch\/issues\/41162#issuecomment-655834491. Meta."},{"key":"e_1_3_2_1_25_1","volume-title":"Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100","author":"Gulati Anmol","year":"2020","unstructured":"Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu,Wei Han, ShiboWang, Zhengdong Zhang, YonghuiWu, et al. 2020. Conformer: Convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_27_1","volume-title":"Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al.","author":"Hoffmann Jordan","year":"2022","unstructured":"Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. 2022. Training compute-optimal large language models. arXiv preprint arXiv:2203.15556 (2022)."},{"key":"e_1_3_2_1_28_1","volume-title":"Open graph benchmark: Datasets for machine learning on graphs. Advances in neural information processing systems 33","author":"Hu Weihua","year":"2020","unstructured":"Weihua Hu, Matthias Fey, Marinka Zitnik, Yuxiao Dong, Hongyu Ren, Bowen Liu, Michele Catasta, and Jure Leskovec. 2020. Open graph benchmark: Datasets for machine learning on graphs. Advances in neural information processing systems 33 (2020), 22118--22133."},{"key":"e_1_3_2_1_29_1","unstructured":"Intel Corporation. 2024. Intel VTune Profiler. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/vtune-profiler.html. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_30_1","unstructured":"IO Visor Project. 2024. BCC (BPF Compiler Collection). https:\/\/github.com\/iovisor\/bcc. Accessed: 2024-03--10."},{"key":"e_1_3_2_1_31_1","unstructured":"IO Visor Project. 2024. bpftrace: High-level tracing language for eBPF. https:\/\/github.com\/iovisor\/bpftrace. Accessed: 2024-03--10."},{"key":"e_1_3_2_1_32_1","unstructured":"JAX Team. 2024. JAX Profiler. https:\/\/jax.readthedocs.io\/en\/latest\/profiler.html. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_33_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_34_1","unstructured":"Andrej Karpathy. 2022. NanoGPT. https:\/\/github.com\/karpathy\/nanoGPT."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00042"},{"key":"e_1_3_2_1_36_1","volume-title":"Linux Perf: Performance Monitoring for Linux. https:\/\/perf.wiki.kernel.org\/index.php\/Main_Page. Accessed: 2024--10--12.","author":"Linux Kernel Organization","year":"2024","unstructured":"Linux Kernel Organization. 2024. Linux Perf: Performance Monitoring for Linux. https:\/\/perf.wiki.kernel.org\/index.php\/Main_Page. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_37_1","unstructured":"Microsoft Corporation. 2024. Visual Studio Code. https:\/\/code.visualstudio.com. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_38_1","unstructured":"David Mosberger. [n. d.]. Libunwind. http:\/\/www.nongnu.org\/libunwind. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_39_1","volume-title":"Jianyu Huang, Narayanan Sundaraman, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, Alisson G Azzolini, et al.","author":"Naumov Maxim","year":"2019","unstructured":"Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, Narayanan Sundaraman, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, Alisson G Azzolini, et al. 2019. Deep learning recommendation model for personalization and recommendation systems. arXiv preprint arXiv:1906.00091 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"NVIDIA CUPTI: CUDA Profiling Tools Interface. https:\/\/developer.nvidia.com\/cupti. Accessed: 2024--10--12.","author":"NVIDIA Corporation","year":"2024","unstructured":"NVIDIA Corporation. 2024. NVIDIA CUPTI: CUDA Profiling Tools Interface. https:\/\/developer.nvidia.com\/cupti. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_41_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_42_1","volume-title":"NVIDIA Tools Extension","author":"NVIDIA Corporation","unstructured":"NVIDIA Corporation. 2025. NVIDIA Tools Extension (NVTX) Library. https:\/\/docs.nvidia.com\/tools-extension\/index.html. Accessed: 2025-03-02."},{"key":"e_1_3_2_1_43_1","unstructured":"NVIDIA Corporation. 2025. Parallel Thread Execution ISA: Data Movement and Conversion Instructions (cvt). https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html#datamovement- and-conversion-instructions-cvt. Accessed: 2025-03-02."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_45_1","volume-title":"Pytorch: An imperative style, highperformance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, highperformance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_47_1","unstructured":"PyTorch Team. 2024. PyTorch Profiler. https:\/\/pytorch.org\/tutorials\/recipes\/recipes\/profiler.html. Accessed: 2024--10--12."},{"key":"e_1_3_2_1_48_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention--MICCAI 2015: 18th international conference","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention--MICCAI 2015: 18th international conference, Munich, Germany, October 5--9, 2015, proceedings, part III 18. Springer, 234--241."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"e_1_3_2_1_50_1","first-page":"104","article-title":"Hotline Profiler: Automatic Annotation and A Multi-Scale Timeline for Visualizing Time-Use in DNN Training","volume":"5","author":"Snider Daniel","year":"2023","unstructured":"Daniel Snider, Fanny Chevalier, and Gennady Pekhimenko. 2023. Hotline Profiler: Automatic Annotation and A Multi-Scale Timeline for Visualizing Time-Use in DNN Training. Proceedings of Machine Learning and Systems 5 (2023), 104--126.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_51_1","unstructured":"stas00. [n. d.]. [Feature Request] Implement RMSNorm Fused CUDA Kernel. https:\/\/github.com\/NVIDIA\/apex\/issues\/1271. Accessed: 2025-03-02."},{"key":"e_1_3_2_1_52_1","volume-title":"Juliette Love, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Thomas Mesnard, Cassidy Hardin, Robert Dadashi, Surya Bhupatiraju, Shreya Pathak, Laurent Sifre, Morgane Rivi\u00e8re, Mihir Sanjay Kale, Juliette Love, et al. 2024. Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:2403.08295 (2024)."},{"key":"e_1_3_2_1_53_1","unstructured":"PyTorch Team. 2024. Memory Format Tutorial. https:\/\/pytorch.org\/tutorials\/intermediate\/memory_format_tutorial.html. Accessed: 2024-06--23."},{"key":"e_1_3_2_1_54_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2012.39"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415890"},{"key":"e_1_3_2_1_57_1","unstructured":"Jure Zbontar Florian Knoll Anuroop Sriram Tullie Murrell Zhengnan Huang Matthew J Muckley Aaron Defazio Ruben Stern Patricia Johnson Mary Bruno et al. 2018. fastMRI: An open dataset and benchmarks for accelerated MRI. arXiv preprint arXiv:1811.08839 (2018)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00034"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2021.102837"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524059.3532388"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370339"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676642.3736127","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T22:24:42Z","timestamp":1757543082000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676642.3736127"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,6]]},"references-count":61,"alternative-id":["10.1145\/3676642.3736127","10.1145\/3676642"],"URL":"https:\/\/doi.org\/10.1145\/3676642.3736127","relation":{},"subject":[],"published":{"date-parts":[[2025,8,6]]},"assertion":[{"value":"2025-08-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}