{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T03:22:34Z","timestamp":1781234554802,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,5,8]],"date-time":"2023-05-08T00:00:00Z","timestamp":1683504000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100004836","name":"Danmarks Frie Forskningsfond","doi-asserted-by":"publisher","award":["0171-00061B"],"award-info":[{"award-number":["0171-00061B"]}],"id":[{"id":"10.13039\/501100004836","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,5,8]]},"DOI":"10.1145\/3578356.3592589","type":"proceedings-article","created":{"date-parts":[[2023,5,4]],"date-time":"2023-05-04T19:44:37Z","timestamp":1683229477000},"page":"18-25","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["Profiling and Monitoring Deep Learning Training Tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0156-1435","authenticated-orcid":false,"given":"Ehsan","family":"Yousefzadeh-Asl-Miandoab","sequence":"first","affiliation":[{"name":"Computer Science, IT University of Copenhagen, Copenhagen, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3451-5602","authenticated-orcid":false,"given":"Ties","family":"Robroek","sequence":"additional","affiliation":[{"name":"IT University of Copenhagen, Copenhagen, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6838-4854","authenticated-orcid":false,"given":"Pinar","family":"Tozun","sequence":"additional","affiliation":[{"name":"IT University of Copenhagen, Copenhagen, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,5,8]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. NVIDIA Data Center GPU Manager. https:\/\/github.com\/NVIDIA\/DCGM. [n. d.]. NVIDIA Data Center GPU Manager. https:\/\/github.com\/NVIDIA\/DCGM."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. NVIDIA Management Library (NVML). https:\/\/developer.nvidia.com\/nvidia-management-library-nvml. [n. d.]. NVIDIA Management Library (NVML). https:\/\/developer.nvidia.com\/nvidia-management-library-nvml."},{"key":"e_1_3_2_1_3_1","unstructured":"[n. d.]. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute. [n. d.]. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-compute."},{"key":"e_1_3_2_1_4_1","unstructured":"[n. d.]. NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems. [n. d.]. NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems."},{"key":"e_1_3_2_1_5_1","unstructured":"[n. d.]. NVIDIA System Management Interface. https:\/\/developer.download.nvidia.com\/compute\/DCGM\/docs\/nvidia-smi-367.38.pdf. [n. d.]. NVIDIA System Management Interface. https:\/\/developer.download.nvidia.com\/compute\/DCGM\/docs\/nvidia-smi-367.38.pdf."},{"key":"e_1_3_2_1_6_1","unstructured":"[n. d.]. NVTOP: Neat Videocard TOP. https:\/\/github.com\/Syllo\/nvtop. [n. d.]. NVTOP: Neat Videocard TOP. https:\/\/github.com\/Syllo\/nvtop."},{"key":"e_1_3_2_1_7_1","volume-title":"d.]. NVTX","unstructured":"[n. d.]. NVTX ( NVIDIA Tools Extension Library) . https:\/\/nvidia.github.io\/NVTX\/. [n. d.]. NVTX (NVIDIA Tools Extension Library). https:\/\/nvidia.github.io\/NVTX\/."},{"key":"e_1_3_2_1_8_1","unstructured":"[n. d.]. PyTorch Profiler. https:\/\/pytorch.org\/tutorials\/recipes\/recipes\/profiler_recipe.html. [n. d.]. PyTorch Profiler. https:\/\/pytorch.org\/tutorials\/recipes\/recipes\/profiler_recipe.html."},{"key":"e_1_3_2_1_9_1","unstructured":"[n. d.]. ROCm Documentation. https:\/\/sep5.readthedocs.io\/en\/latest\/ROCm_System_Managment\/ROCm-System-Managment.html. [n. d.]. ROCm Documentation. https:\/\/sep5.readthedocs.io\/en\/latest\/ROCm_System_Managment\/ROCm-System-Managment.html."},{"key":"e_1_3_2_1_10_1","unstructured":"[n. d.]. TensorBoard: TensorFlow's visualization toolkit. https:\/\/www.tensorflow.org\/tensorboard. [n. d.]. TensorBoard: TensorFlow's visualization toolkit. https:\/\/www.tensorflow.org\/tensorboard."},{"key":"e_1_3_2_1_11_1","unstructured":"[n. d.]. top(1) --- Linux manual page. https:\/\/man7.org\/linux\/man-pages\/man1\/top.1.html. [n. d.]. top(1) --- Linux manual page. https:\/\/man7.org\/linux\/man-pages\/man1\/top.1.html."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/1753228.1753233"},{"key":"e_1_3_2_1_13_1","volume-title":"Wood","author":"Ailamaki Anastassia","year":"1999","unstructured":"Anastassia Ailamaki , David J. DeWitt , Mark D. Hill , and David A . Wood . 1999 . DBMSs on a Modern Processor : Where Does Time Go?. In VLDB. 266--277. Anastassia Ailamaki, David J. DeWitt, Mark D. Hill, and David A. Wood. 1999. DBMSs on a Modern Processor: Where Does Time Go?. In VLDB. 266--277."},{"key":"e_1_3_2_1_14_1","volume-title":"Sebastian Benjamin Wrede, and P\u0131nar T\u00f6z\u00fcn","author":"Baunsgaard Sebastian","year":"2020","unstructured":"Sebastian Baunsgaard , Sebastian Benjamin Wrede, and P\u0131nar T\u00f6z\u00fcn . 2020 . Training for Speech Recognition on Coprocessors. In ADMS. Sebastian Baunsgaard, Sebastian Benjamin Wrede, and P\u0131nar T\u00f6z\u00fcn. 2020. Training for Speech Recognition on Coprocessors. In ADMS."},{"key":"e_1_3_2_1_15_1","volume-title":"Quasar: Resource-Efficient and QoS-Aware Cluster Management. In ASPLOS. 127--144.","author":"Delimitrou Christina","year":"2014","unstructured":"Christina Delimitrou and Christos Kozyrakis . 2014 . Quasar: Resource-Efficient and QoS-Aware Cluster Management. In ASPLOS. 127--144. Christina Delimitrou and Christos Kozyrakis. 2014. Quasar: Resource-Efficient and QoS-Aware Cluster Management. In ASPLOS. 127--144."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2211477"},{"key":"e_1_3_2_1_17_1","volume-title":"Anastasia Ailamaki, and Babak Falsafi.","author":"Ferdman Michael","year":"2012","unstructured":"Michael Ferdman , Almutaz Adileh , Onur Kocberber , Stavros Volos , Mohammad Alisafaee , Djordje Jevdjic , Cansu Kaynak , Adrian Daniel Popescu , Anastasia Ailamaki, and Babak Falsafi. 2012 . Clearing the Clouds : A Study of Emerging Scale-out Workloads on Modern Hardware. In ASPLOS. 37--48. Michael Ferdman, Almutaz Adileh, Onur Kocberber, Stavros Volos, Mohammad Alisafaee, Djordje Jevdjic, Cansu Kaynak, Adrian Daniel Popescu, Anastasia Ailamaki, and Babak Falsafi. 2012. Clearing the Clouds: A Study of Emerging Scale-out Workloads on Modern Hardware. In ASPLOS. 37--48."},{"key":"e_1_3_2_1_18_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778. Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778."},{"key":"e_1_3_2_1_19_1","unstructured":"Myeongjae Jeon Shivaram Venkataraman Amar Phanishayee Junjie Qian Wencong Xiao and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In USENIX ATC. 947--960. Myeongjae Jeon Shivaram Venkataraman Amar Phanishayee Junjie Qian Wencong Xiao and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In USENIX ATC. 947--960."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544497.3544501"},{"key":"e_1_3_2_1_21_1","volume-title":"Kim Hazelwood, Parthasarathy Ranganathan, Tipp Moseley, Gu-Yeon Wei, and David Brooks.","author":"Kanev Svilen","year":"2015","unstructured":"Svilen Kanev , Juan Pablo Darago , Kim Hazelwood, Parthasarathy Ranganathan, Tipp Moseley, Gu-Yeon Wei, and David Brooks. 2015 . Profiling a Warehouse-Scale Computer . In ISCA. 158--169. Svilen Kanev, Juan Pablo Darago, Kim Hazelwood, Parthasarathy Ranganathan, Tipp Moseley, Gu-Yeon Wei, and David Brooks. 2015. Profiling a Warehouse-Scale Computer. In ISCA. 158--169."},{"key":"e_1_3_2_1_22_1","volume-title":"Roger C. Raphael, and Walter E. Baker.","author":"Keeton Kimberly","year":"1998","unstructured":"Kimberly Keeton , David A. Patterson , Yong Qiang He , Roger C. Raphael, and Walter E. Baker. 1998 . Performance Characterization of a Quad Pentium Pro SMP Using OLTP Workloads. In ISCA. 15--26. Kimberly Keeton, David A. Patterson, Yong Qiang He, Roger C. Raphael, and Walter E. Baker. 1998. Performance Characterization of a Quad Pentium Pro SMP Using OLTP Workloads. In ISCA. 15--26."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.14778\/3342263.3342276"},{"key":"e_1_3_2_1_24_1","volume-title":"Ching-Hsiang Chu, Arpan Jain, Nick Sarkauskas, Hari Subramoni, and Dhabaleswar K. Panda.","author":"Kousha Pouya","year":"2019","unstructured":"Pouya Kousha , Bharath Ramesh , Kaushik Kandadi Suresh , Ching-Hsiang Chu, Arpan Jain, Nick Sarkauskas, Hari Subramoni, and Dhabaleswar K. Panda. 2019 . Designing a Profiling and Visualization Tool for Scalable and In-depth Analysis of High-Performance GPU Clusters. In IEEE HiPC. 93--102. Pouya Kousha, Bharath Ramesh, Kaushik Kandadi Suresh, Ching-Hsiang Chu, Arpan Jain, Nick Sarkauskas, Hari Subramoni, and Dhabaleswar K. Panda. 2019. Designing a Profiling and Visualization Tool for Scalable and In-depth Analysis of High-Performance GPU Clusters. In IEEE HiPC. 93--102."},{"key":"e_1_3_2_1_25_1","unstructured":"Bryan McCann and contributors. 2022. PyTorch Example. https:\/\/github.com\/pytorch\/examples\/tree\/main\/mnist. Bryan McCann and contributors. 2022. PyTorch Example. https:\/\/github.com\/pytorch\/examples\/tree\/main\/mnist."},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA. [n. d.]. CUDA Profiling Tools Interface (CUPTI). https:\/\/docs.nvidia.com\/cupti\/. NVIDIA. [n. d.]. CUDA Profiling Tools Interface (CUPTI). https:\/\/docs.nvidia.com\/cupti\/."},{"key":"e_1_3_2_1_28_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , Alban Desmaison , Andreas K\u00f6pf , Edward Yang , Zach DeVito , Martin Raison , Alykhan Tejani , Sasank Chilamkurthy , Benoit Steiner , Lu Fang , Junjie Bai , and Soumith Chintala . 2019. PyTorch: An Imperative Style , High-Performance Deep Learning Library . In NIPS. 8026--8037. Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NIPS. 8026--8037."},{"key":"e_1_3_2_1_29_1","volume-title":"An Analysis of Collocation on GPUs for Deep Learning Training. CoRR","author":"Robroek Ties","year":"2022","unstructured":"Ties Robroek , Ehsan Yousefzadeh-Asl-Miandoab , and P\u0131nar T\u00f6z\u00fcn . 2022. An Analysis of Collocation on GPUs for Deep Learning Training. CoRR ( 2022 ). Ties Robroek, Ehsan Yousefzadeh-Asl-Miandoab, and P\u0131nar T\u00f6z\u00fcn. 2022. An Analysis of Collocation on GPUs for Deep Learning Training. CoRR (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_31_1","first-page":"287","article-title":"The TAU parallel performance system","volume":"20","author":"Shende Sameer S","year":"2006","unstructured":"Sameer S Shende and Allen D Malony . 2006 . The TAU parallel performance system . IJHPCA 20 , 2 (2006), 287 -- 311 . Sameer S Shende and Allen D Malony. 2006. The TAU parallel performance system. IJHPCA 20, 2 (2006), 287--311.","journal-title":"IJHPCA"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Utku Sirin Ahmad Yasin and Anastasia Ailamaki. 2017. A methodology for OLTP micro-architectural analysis. In DaMoN @ ACM SIGMOD. 1:1--1:10. Utku Sirin Ahmad Yasin and Anastasia Ailamaki. 2017. A methodology for OLTP micro-architectural analysis. In DaMoN @ ACM SIGMOD. 1:1--1:10.","DOI":"10.1145\/3076113.3076116"},{"key":"e_1_3_2_1_33_1","volume-title":"Mark Arnold, Jonathan Perkins, Xiaoyi Lu, Khaled Hamidouche, and Dhabaleswar K Panda.","author":"Subramoni Hari","year":"2016","unstructured":"Hari Subramoni , Albert Mathews Augustine , Mark Arnold, Jonathan Perkins, Xiaoyi Lu, Khaled Hamidouche, and Dhabaleswar K Panda. 2016 . INAM2: InfiniBand network analysis and monitoring with MPI. In High Performance Computing . 300--320. Hari Subramoni, Albert Mathews Augustine, Mark Arnold, Jonathan Perkins, Xiaoyi Lu, Khaled Hamidouche, and Dhabaleswar K Panda. 2016. INAM2: InfiniBand network analysis and monitoring with MPI. In High Performance Computing. 300--320."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"P\u0131nar T\u00f6z\u00fcn Brian Gold and Anastasia Ailamaki. 2013. OLTP in Wonderland: Where Do Cache Misses Come from in Major OLTP Components?. In DaMoN @ ACM SIGMOD. Article 8 6 pages. P\u0131nar T\u00f6z\u00fcn Brian Gold and Anastasia Ailamaki. 2013. OLTP in Wonderland: Where Do Cache Misses Come from in Major OLTP Components?. In DaMoN @ ACM SIGMOD. Article 8 6 pages.","DOI":"10.1145\/2485278.2485286"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"P\u0131nar T\u00f6z\u00fcn Ippokratis Pandis Cansu Kaynak Djordje Jevdjic and Anastasia Ailamaki. 2013. From A to E: Analyzing TPC's OLTP Benchmarks: The Obsolete the Ubiquitous the Unexplored. In EDBT. 17--28. P\u0131nar T\u00f6z\u00fcn Ippokratis Pandis Cansu Kaynak Djordje Jevdjic and Anastasia Ailamaki. 2013. From A to E: Analyzing TPC's OLTP Benchmarks: The Obsolete the Ubiquitous the Unexplored. In EDBT. 17--28.","DOI":"10.1145\/2452376.2452380"},{"key":"e_1_3_2_1_36_1","first-page":"599","article-title":"Horizontally Fused Training Array: An Effective Hardware Utilization Squeezer for Training Novel Deep Learning Models","volume":"3","author":"Wang Shang","year":"2021","unstructured":"Shang Wang , Peiming Yang , Yuxuan Zheng , Xin Li , and Gennady Pekhimenko . 2021 . Horizontally Fused Training Array: An Effective Hardware Utilization Squeezer for Training Novel Deep Learning Models . MLSys 3 (2021), 599 -- 623 . Shang Wang, Peiming Yang, Yuxuan Zheng, Xin Li, and Gennady Pekhimenko. 2021. Horizontally Fused Training Array: An Effective Hardware Utilization Squeezer for Training Novel Deep Learning Models. MLSys 3 (2021), 599--623.","journal-title":"MLSys"},{"key":"e_1_3_2_1_37_1","unstructured":"Ross Wightman. 2019. PyTorch Image Models. https:\/\/github.com\/rwightman\/pytorch-image-models. Ross Wightman. 2019. PyTorch Image Models. https:\/\/github.com\/rwightman\/pytorch-image-models."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Hui Zhang and Jeffrey Hollingsworth. 2019. Understanding the performance of GPGPU applications from a data-centric view. In ProTools. 1--8. Hui Zhang and Jeffrey Hollingsworth. 2019. Understanding the performance of GPGPU applications from a data-centric view. In ProTools. 1--8.","DOI":"10.1109\/ProTools49597.2019.00006"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Keren Zhou Mark Krentel and John Mellor-Crummey. 2020. A Tool for Top-down Performance Analysis of GPU-Accelerated Applications. In PPoPP. 415--416. Keren Zhou Mark Krentel and John Mellor-Crummey. 2020. A Tool for Top-down Performance Analysis of GPU-Accelerated Applications. In PPoPP. 415--416.","DOI":"10.1145\/3332466.3374534"}],"event":{"name":"EuroMLSys '23: 3rd Workshop on Machine Learning and Systems","location":"Rome Italy","acronym":"EuroMLSys '23","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 3rd Workshop on Machine Learning and Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3578356.3592589","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:51Z","timestamp":1750178811000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3578356.3592589"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,8]]},"references-count":38,"alternative-id":["10.1145\/3578356.3592589","10.1145\/3578356"],"URL":"https:\/\/doi.org\/10.1145\/3578356.3592589","relation":{},"subject":[],"published":{"date-parts":[[2023,5,8]]},"assertion":[{"value":"2023-05-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}