{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T05:39:59Z","timestamp":1773207599475,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,14]],"date-time":"2022-08-14T00:00:00Z","timestamp":1660435200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,14]]},"DOI":"10.1145\/3534678.3539036","type":"proceedings-article","created":{"date-parts":[[2022,8,12]],"date-time":"2022-08-12T19:06:12Z","timestamp":1660331172000},"page":"3801-3809","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Profiling Deep Learning Workloads at Scale using Amazon SageMaker"],"prefix":"10.1145","author":[{"given":"Nathalie","family":"Rauschmayr","sequence":"first","affiliation":[{"name":"Amazon Web Services, Vancouver, BC, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sami","family":"Kama","sequence":"additional","affiliation":[{"name":"Amazon Web Services, Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Muhyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Amazon Web Services, Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miyoung","family":"Choi","sequence":"additional","affiliation":[{"name":"Amazon Web Services, Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Krishnaram","family":"Kenthapadi","sequence":"additional","affiliation":[{"name":"Fiddler AI, Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,8,14]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2022. Penn-Fudan Database for Pedestrian Detection and Segmentation. https:\/\/www.cis.upenn.edu\/~jshi\/ped_html\/."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData47090.2019.9005703"},{"key":"e_1_3_2_1_3_1","unstructured":"Dario Amodei Danny Hernandez Girish Sastry Jack Clark Greg Brockman and Ilya Sutskever. 2018. AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute."},{"key":"e_1_3_2_1_4_1","volume-title":"The Lustre storage architecture. arXiv preprint arXiv:1903.01955","author":"Braam Peter","year":"2019","unstructured":"Peter Braam. 2019. The Lustre storage architecture. arXiv preprint arXiv:1903.01955 (2019)."},{"key":"e_1_3_2_1_5_1","volume-title":"2012 proceedings of the 35th international convention MIPRO. IEEE, 1725--1730","author":"Culjak Ivan","year":"2012","unstructured":"Ivan Culjak, David Abram, Tomislav Pribanic, Hrvoje Dzapo, and Mario Cifrek. 2012. A brief introduction to OpenCV. In 2012 proceedings of the 35th international convention MIPRO. IEEE, 1725--1730."},{"key":"e_1_3_2_1_6_1","volume-title":"Linux Plumbers Conference, Vol.","volume":"118","author":"de Melo Arnaldo Carvalho","year":"2009","unstructured":"Arnaldo Carvalho de Melo. 2009. Performance counters on Linux. In Linux Plumbers Conference, Vol., Vol. 118."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_8_1","volume-title":"BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","unstructured":"Nathalie~Rauschmayr et al. 2021. Amazon SageMaker Debugger: A System for Real-time Insights into Machine Learning Model Training. In MLSys ."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2014.06.007"},{"key":"e_1_3_2_1_11_1","unstructured":"Google. 2021. Chrome Frame Viewer Overview and Getting Started. https:\/\/www.chromium.org\/developers\/how-tos\/trace-event-profiling-tool\/frame-viewer."},{"key":"e_1_3_2_1_12_1","unstructured":"Google AI. 2021. Optimize TensorFlow performance using the Profiler. https:\/\/www.tensorflow.org\/guide\/profiler."},{"key":"e_1_3_2_1_13_1","volume-title":"DeepProf: Performance Analysis for Deep Learning Applications via Mining GPU Execution Patterns. arXiv preprint arXiv:1707.03750","author":"Gu Jiazhen","year":"2017","unstructured":"Jiazhen Gu, Huan Liu, Yangfan Zhou, and Xin Wang. 2017. DeepProf: Performance Analysis for Deep Learning Applications via Mining GPU Execution Patterns. arXiv preprint arXiv:1707.03750 (2017)."},{"key":"e_1_3_2_1_14_1","unstructured":"Joaquin~Anton Guirao Krzysztof Lecki Janusz Lisiecki Serge Panev Albert Wolant and Micha? Zientkiewicz. 2019. Fast AI Data Preprocessing with NVIDIA DALI. https:\/\/developer.nvidia.com\/blog\/fast-ai-data-preprocessing-with-nvidia-dali\/."},{"key":"e_1_3_2_1_15_1","volume-title":"Mask R-CNN. In Proceedings of the IEEE International Conference on Computer Vision. 2961--2969","author":"He Kaiming","year":"2017","unstructured":"Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1r, and Ross Girshick. 2017. Mask R-CNN. In Proceedings of the IEEE International Conference on Computer Vision. 2961--2969."},{"key":"e_1_3_2_1_16_1","unstructured":"Daniel Horowitz. 2018. Nsight Systems Exposes New GPU Optimization Opportunities. https:\/\/developer.nvidia.com\/blog\/nsight-systems-exposes-gpu-optimization\/."},{"key":"e_1_3_2_1_17_1","unstructured":"Jiri Kraus. 2013. CUDA Pro Tip: Generate Custom Application Profile Timelines with NVTX. https:\/\/developer.nvidia.com\/blog\/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx\/."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374528"},{"key":"e_1_3_2_1_19_1","volume-title":"2020 b. PyTorch Distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. 2020 b. PyTorch Distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA. 2020. CUPTI. https:\/\/docs.nvidia.com\/cupti\/Cupti\/index.html."},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2021. nvprof. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html#nvprof-overview."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-67"},{"key":"e_1_3_2_1_24_1","volume-title":"VTune performance analyzer essentials","author":"Reinders James","year":"2005","unstructured":"James Reinders. 2005. VTune performance analyzer essentials. Intel Press (2005)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342006064482"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i09.7123"},{"key":"e_1_3_2_1_27_1","volume-title":"Journal of Physics: Conference Series, Vol.","author":"Tallent Nathan","unstructured":"Nathan Tallent, John Mellor-Crummey, Laksono Adhianto, Michael Fagan, and Mark Krentel. 2008. HPCToolkit: Performance tools for scientific computing. In Journal of Physics: Conference Series, Vol., Vol. 125. IOP Publishing."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Thangakrishnan Indu","year":"2020","unstructured":"Indu Thangakrishnan, Derya Cavdar, Can Karakus, Piyush Ghai, Yauheni Selivonchyk, and Cory Pruce. 2020. Herring: Rethinking the parameter server at scale for the cloud. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Atlanta, GA, USA. 1--13."},{"key":"e_1_3_2_1_29_1","unstructured":"The PyTorch team. 2021. Automatic differentiation package - torch.autograd. https:\/\/pytorch.org\/docs\/stable\/autograd.html."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2019.00037"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3423211.3425693"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415890"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSRE.2019.00020"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2018.8573476"}],"event":{"name":"KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Washington DC USA","acronym":"KDD '22","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539036","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3534678.3539036","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:09:50Z","timestamp":1750183790000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539036"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,14]]},"references-count":34,"alternative-id":["10.1145\/3534678.3539036","10.1145\/3534678"],"URL":"https:\/\/doi.org\/10.1145\/3534678.3539036","relation":{},"subject":[],"published":{"date-parts":[[2022,8,14]]},"assertion":[{"value":"2022-08-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}